# Scrape the top-100 lists
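Scrape the ranking tables from https://en.wikipedia.org/wiki/Wikipedia:Popular_pages, labelling each table with the section (h2) and subsection (h3) heading it appears under. Tables under excluded headings (e.g. "Categories") are skipped.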

In [178]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [None]:
# Download the "Popular pages" project page and parse it into a BeautifulSoup tree.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() makes an HTTP error fail here instead of silently parsing
# an error page further down.
response = requests.get('https://en.wikipedia.org/wiki/Wikipedia:Popular_pages', timeout=30)
response.raise_for_status()
popular_pages = response.content

# Name the parser explicitly so the resulting tree does not depend on which
# optional parsers happen to be installed (and to avoid bs4's parser warning).
soup = BeautifulSoup(popular_pages, 'html.parser')

In [182]:
# Walk every <table> on the page and record, for each one:
#   'h2'      - text of the closest preceding <h2> (the section)
#   'h3'      - text of the closest preceding <h3>, if it belongs to that section
#   'headers' - cleaned <th> texts plus a trailing 'href' column
#   'content' - one list per data row: the <td> texts followed by a link (or None)
tables = soup.find_all('table')
top_lists = []
h2s_to_exclude = ['Categories']
h3s_to_exclude = ['Historical most-viewed 3rd-millennium persons']

for table in tables:

    table_data = {}

    # find subjects

    h2 = table.find_previous('h2')
    h2_text = h2.text if h2 else None
    table_data['h2'] = h2_text
    if h2_text in h2s_to_exclude:
        continue

    h3 = table.find_previous('h3')
    # Use the h3 only if it appears after the h2, i.e. its own preceding h2 is
    # the very same element. Identity (`is`) is used because bs4 tags compare
    # equal by content, which could match two distinct but identical headings.
    if h3 and h3.find_previous('h2') is h2:
        h3_text = h3.text
    else:
        h3_text = None
    table_data['h3'] = h3_text
    if h3_text in h3s_to_exclude:
        continue

    # Strip newlines and asterisks from the header cells; the "As of ..." cell
    # is a caption rather than a column header, so it is dropped.
    # NOTE(review): the date string is hard-coded and will stop matching once
    # the page is updated.
    table_data['headers'] = [re.sub(r'(\n|\*)', '', th.text) for th in table.find_all('th')
                             if 'As of 24 February 2024' not in th.text] + ['href']

    # get table content
    table_content = []
    for tr in table.find_all('tr'):
        row = [td.text.strip('\n') for td in tr.find_all('td')]
        if not row:
            # rows without any <td> cell (e.g. header rows) carry no data
            continue

        # Take the last link in the row that is not an in-page anchor ('#...').
        # href=True skips <a> tags without an href attribute, which previously
        # raised KeyError.
        row_links = tr.find_all('a', href=True)
        row.append(next((link['href'] for link in reversed(row_links)
                         if not link['href'].startswith('#')), None))
        table_content.append(row)

    table_data['content'] = table_content
    top_lists.append(table_data)

In [183]:
# Overview: one row per scraped table, showing its section headings (h2/h3),
# its column headers and the raw row lists.
pd.DataFrame(top_lists)

Unnamed: 0,h2,h3,headers,content
0,Top-100 list,,"[Rank, Page, Views in millions, href]","[[[a], Main Page, 46,800, /wiki/Main_Page], [[..."
1,Universe,,"[Rank, Page, Views in millions, href]","[[1, Earth, 80, /wiki/Earth], [2, Sun, 40, /wi..."
2,Earth,,"[Rank, Page, Views in millions, href]","[[1, Mount Everest, 46, /wiki/Mount_Everest], ..."
3,Life,,"[Rank, Page, Views in millions, href]","[[1, Cat, 61, /wiki/Cat], [2, Dog, 50, /wiki/D..."
4,Civilization,Wars,"[Rank, Page, Views in millions, href]","[[1, World War II, 145, /wiki/World_War_II], [..."
5,Civilization,Empires and hegemonies,"[Rank, Page, Views in millions, href]","[[1, Soviet Union, 63, /wiki/Soviet_Union], [2..."
6,Civilization,Present countries,"[Rank, Page, Views in millions, href]","[[1, United States, 254, /wiki/United_States],..."
7,Civilization,Cities,"[Rank, Page, Continent, Views in millions, href]","[[1, New York City, North America, 96, /wiki/N..."
8,Civilization,Buildings and structures,"[Rank, Page, Date of completion [a], Views in ...","[[1, Taj Mahal, 1653, 53, /wiki/Taj_Mahal], [2..."
9,People,,"[Rank, Page, Views in millions, href]","[[1, Donald Trump, 243, /wiki/Donald_Trump], [..."


In [184]:
# Preview: render the first two scraped tables as DataFrames.
for table_data in top_lists[:2]:
    preview_df = pd.DataFrame(table_data['content'], columns=table_data['headers'])
    display(preview_df)

Unnamed: 0,Rank,Page,Views in millions,href
0,[a],Main Page,46800,/wiki/Main_Page
1,[a],Special:Search,15000,/wiki/Special:Search
2,[a],Special:Random,7900,/wiki/Special:Random
3,[b],-,2900,/wiki/-
4,[a],Undefined,1800,/wiki/Undefined
...,...,...,...,...
126,97,John Cena,77,/wiki/John_Cena
127,97,Charles Manson,77,/wiki/Charles_Manson
128,97,Ryan Reynolds,77,/wiki/Ryan_Reynolds
129,97,Brad Pitt,77,/wiki/Brad_Pitt


Unnamed: 0,Rank,Page,Views in millions,href
0,1,Earth,80.0,/wiki/Earth
1,2,Sun,40.0,/wiki/Sun
2,3,Moon,39.0,/wiki/Moon
3,[a],Skathi (moon),38.0,/wiki/Skathi_(moon)
4,4,Black hole,37.0,/wiki/Black_hole
5,4,Mars,37.0,/wiki/Mars
6,6,Solar System,36.0,/wiki/Solar_System
7,7,Pluto,31.0,/wiki/Pluto
8,8,Big Bang,30.0,/wiki/Big_Bang
9,9,Milky Way,26.0,/wiki/Milky_Way


In [204]:
# Combine all scraped tables into one long DataFrame (combined_df_all) and
# derive a cleaned version (combined_df) that keeps only regularly ranked rows.

# Map the page's column headers to short snake_case names.
# NOTE(review): 'data_completed' looks like a typo for 'date_completed'; it is
# left unchanged because the name ends up as a column in the exported CSV.
rename_dict = {'Rank': 'rank', 'Page': 'title', 'Views in millions': 'views_mn',
               'Continent': 'continent', 'Date of completion [a]': 'data_completed',
               'Sport': 'sport', 'Country': 'country', 'Time': 'time', 'Artist': 'artist'}

# Build one frame per table, then concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows on every iteration.
frames = []
for lst in top_lists:
    df = (pd.DataFrame(lst['content'], columns=lst['headers'])
          .rename(columns=rename_dict))
    df['group'] = lst['h2']
    df['subgroup'] = lst['h3']
    frames.append(df)

# Each table keeps its own row index (no ignore_index), so index values repeat
# across tables. pd.concat raises on an empty list, hence the fallback.
combined_df_all = pd.concat(frames, axis=0) if frames else pd.DataFrame()

# Clean the combined dataframe
combined_df = combined_df_all.copy()
# Remove special rows: their rank is a footnote marker such as "[a]" rather
# than a number, so it contains a lowercase letter.
combined_df = combined_df[~combined_df['rank'].str.contains(r'[a-z]')]
# remove unnecessary columns
combined_df = combined_df[['title', 'href', 'views_mn', 'group', 'subgroup', 'rank']]

combined_df

Unnamed: 0,title,href,views_mn,group,subgroup,rank
11,United States,/wiki/United_States,254,Top-100 list,,1
12,Donald Trump,/wiki/Donald_Trump,243,Top-100 list,,2
17,Elizabeth II,/wiki/Elizabeth_II,198,Top-100 list,,3
21,India,/wiki/India,165,Top-100 list,,4
22,Barack Obama,/wiki/Barack_Obama,161,Top-100 list,,5
...,...,...,...,...,...,...
25,States and territories of the United States,/wiki/List_of_states_and_territories_of_the_Un...,33,Lists,,26
26,Suits episodes,/wiki/List_of_Suits_episodes,32,Lists,,27
27,NBA champions,/wiki/List_of_NBA_champions,30,Lists,,28
28,Pornographic actresses by decade,/wiki/List_of_pornographic_actresses_by_decade,30,Lists,,28


In [205]:
# Persist both tables: the unfiltered one first, then the cleaned one.
# The DataFrame index is written as the first (unnamed) CSV column.
for csv_path, frame in (('../data/top100_all.csv', combined_df_all),
                        ('../data/top100.csv', combined_df)):
    frame.to_csv(csv_path)