# Scraping Wikipedia: Names by Year

This notebook demonstrates how to collect names of people born each year from 1865 to 2015 using the Wikipedia API.

In [None]:
import requests
from collections import Counter
import pandas as pd
import time
import random

# Endpoint of the English Wikipedia MediaWiki Action API (api.php); the
# scraping loop sends its category queries to this URL.
wikipedia_api_url = "https://en.wikipedia.org/w/api.php"

In [None]:
# Initial state for collecting "<year>@<page title>" entries via the Wikipedia API

# Birth-year category currently being fetched; the scraping loop advances it.
year = 1865

# Collected entries, one "<year>@<page title>" string per category member.
year_list = []

# Continuation tokens from the most recent response; an empty dict means
# "start at the first page of the current category".
last_continue = {}

# Query parameters for the `categorymembers` list: JSON output, up to 500
# members per request, starting with the 1865 births category.
req = {
    'action': 'query',
    'format': 'json',
    'list': 'categorymembers',
    'cmlimit': 500,
    'cmtitle': 'Category:1865_births',
}

print(f"Start year: {year}")

Start = 1865


In [None]:
# Walk the "Category:<year>_births" categories from the current `year` up to
# and including 2015, one page of results per iteration. Each category member
# is appended to `year_list` as "<year>@<page title>".
while year <= 2015:
    # Merge the continuation tokens into a per-request copy, so tokens from
    # one category never linger in `req` when the loop moves to the next one.
    params = {**req, **last_continue}

    # A timeout keeps a stalled connection from hanging the cell forever, and
    # raise_for_status() surfaces HTTP failures instead of parsing an error page.
    response = requests.get(wikipedia_api_url, params=params, timeout=30)
    response.raise_for_status()
    result = response.json()

    # The API can report a failure inside the JSON body; fail with a clear
    # message rather than a KeyError on result['query'].
    if 'error' in result:
        raise RuntimeError(
            f"Wikipedia API error for {req['cmtitle']}: {result['error']}"
        )

    for item in result['query']['categorymembers']:
        year_list.append(f"{year}@{item['title']}")

    if 'continue' in result:
        # More pages remain for this category: keep the tokens and fetch again.
        last_continue = result['continue']
        continue

    # The current category is exhausted.
    if year == 2015:
        # Printed once, after the last page of the final year.
        print(f"End year: {year}")

    year += 1
    req['cmtitle'] = f'Category:{year}_births'
    last_continue = {}

    if year % 10 == 0:
        print(f"Current year: {year}")
    if year % 20 == 0:
        # Announce the pause before sleeping so the reader knows why the
        # cell appears idle; the pause lasts 301-358 seconds.
        print('Sleeping for five-ish min')
        time.sleep(300 + random.randrange(1, 60, 3))

current year = 1870
current year = 1880
sleeping for five-ish min
current year = 1890
current year = 1900
sleeping for five-ish min
current year = 1910
current year = 1920
sleeping for five-ish min
current year = 1930
current year = 1940
sleeping for five-ish min
current year = 1950
current year = 1960
sleeping for five-ish min


In [None]:
# Save results to a tab-separated file
#
# Every entry in `year_list` is "<year>@<page title>". The loop variables are
# deliberately NOT called `year`/`name`: `year` is the integer counter used by
# the scraping loop, and overwriting it with a string would make that cell
# fail with a TypeError if it were run again.
with open("wikipedia_names.txt", "w", encoding="utf-8") as out_file:
    out_file.write("Year\tName\n")
    for entry in year_list:
        # Split on the first "@" only, so a title containing "@" stays intact.
        birth_year, title = entry.split("@", 1)
        out_file.write(f"{birth_year}\t{title}\n")

In [None]:
# Clean up names for further processing


def _to_year_name_occ(entry):
    """Convert a "<year>@<page title>" entry to "<year>@<name>@<qualifier>".

    The qualifier is the text of the first parenthetical in the title, cut at
    its first comma (so "politician, born 1865" becomes "politician"). Entries
    without a parenthetical, or with an empty one, get the qualifier "NA".

    Unlike a comma split over the whole string, a comma inside the name itself
    (e.g. "Smith, John (politician)") no longer discards the qualifier, and a
    second parenthetical no longer adds an extra "@" field.
    """
    head, paren, rest = entry.partition('(')
    if not paren:
        return entry + '@NA'
    # Keep only what is inside the first pair of parentheses, up to a comma.
    qualifier = rest.split(')', 1)[0].split(',', 1)[0].strip()
    return f"{head.rstrip()}@{qualifier or 'NA'}"


# One "<year>@<name>@<qualifier>" string per collected entry.
clean = [_to_year_name_occ(entry) for entry in year_list]

In [None]:
# Switch the field separator from '@' to ';' so the entries can be split
# on ';' afterwards.

clean_yr_name_occ = [';'.join(entry.split('@')) for entry in clean]

In [None]:
# Break every ';'-separated entry into its list of fields
# (year first, then name, then the '@NA'-style qualifier field).

final_list = list(map(lambda record: record.split(';'), clean_yr_name_occ))

In [None]:
# Report the number of collected entries, formatted with thousands separators.
total_message = f'Total number of people in Wikipedia from 1865 to 2015: {len(final_list):,}'
print(total_message)

In [None]:
# Read back the saved file if needed
#
# The file is written as UTF-8, so it must be read as UTF-8 too; relying on
# the platform default encoding can mis-decode or fail on non-ASCII names.
# `lines` holds every line of the file, including the "Year\tName" header
# row, each with its trailing newline.

with open('wikipedia_names.txt', encoding='utf-8') as f:
    lines = f.readlines()

In [None]:
# Tally how many entries fall in each year and show the 15 largest years

# The year is the first field of every non-empty record.
years = [fields[0] for fields in final_list if fields]

year_counts = Counter(years)
print(year_counts.most_common(15))

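Note: The Wikipedia API returns at most 500 category members per request (`cmlimit`). The scraping loop follows the API's `continue` tokens to fetch every page of a category, so the number of names collected for a year is not capped at 500.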