In [1]:
from datasets import load_dataset
import pandas as pd
import requests

WIKIPEDIA_URL = "https://en.wikipedia.org/w/api.php"
session = requests.Session()

In [2]:
dataset = load_dataset("wikipedia", "20220301.en")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
len(dataset["train"])

6458670

In [4]:
dataset = dataset["train"]

In [5]:
def load_dataframe(shard_index: int):
    shard = dataset.shard(30, shard_index)
    return pd.DataFrame(shard)
    

In [6]:
df_wikipedia = load_dataframe(0)

In [7]:
df_wikipedia

Unnamed: 0,id,url,title,text
0,12,https://en.wikipedia.org/wiki/Anarchism,Anarchism,Anarchism is a political philosophy and moveme...
1,597,https://en.wikipedia.org/wiki/Austroasiatic%20...,Austroasiatic languages,"The Austroasiatic languages , also known as Mo..."
2,666,https://en.wikipedia.org/wiki/Alkali%20metal,Alkali metal,The alkali metals consist of the chemical elem...
3,734,https://en.wikipedia.org/wiki/Actinopterygii,Actinopterygii,"Actinopterygii (; ), members of which are know..."
4,795,https://en.wikipedia.org/wiki/Affidavit,Affidavit,"An ( ; Medieval Latin for ""he has declared un..."
...,...,...,...,...
215284,70198872,https://en.wikipedia.org/wiki/Sunny%20Prajapati,Sunny Prajapati,Sunny Prajapati (born 25 December 1995) is an ...
215285,70199540,https://en.wikipedia.org/wiki/2022%20British%2...,2022 British Indoor Athletics Championships,The 2022 British Indoor Athletics Championship...
215286,70200137,https://en.wikipedia.org/wiki/Geertruida%20H.%...,Geertruida H. Springer,Geertruida H. Springer (1895 – 1988) was a Dut...
215287,70200675,https://en.wikipedia.org/wiki/Sceloporus%20coz...,Sceloporus cozumelae,"Sceloporus cozumelae, the Cozumel spiny lizar..."


In [8]:
df_wikipedia["title"].iloc[0]

'Anarchism'

In [9]:
def search_categories(title: str):
    params = {
    "action": "query",
    "format": "json",
    "prop": "categories",
    "cllimit": "max",
    "clshow": "!hidden",
    "titles": title
    }
    response = session.get(url=WIKIPEDIA_URL, params=params)
    json_response = response.json()
    pages = json_response["query"]["pages"]
    
    categories = []
    
    for k, v in pages.items():
        cats = v.get("categories")
        if cats is None:
            continue
        for category in cats:
            categories.append(category["title"])
    print("Processed", title)
    return categories

In [10]:
def build_hashmap(df, nb_rows):
    categories_hashmap = {}
    for i in range(nb_rows):
        title = df["title"].iloc[i]
        categories_list = search_categories(title)
        for cat in categories_list:
            categories_hashmap[cat] = categories_hashmap[cat] + 1 if categories_hashmap.get(cat) is not None else 1
        #if (i % 10 == 0):
            #print(i)
    return categories_hashmap

In [11]:
hashmap = build_hashmap(df_wikipedia, 1000)

Processed Anarchism
Processed Austroasiatic languages
Processed Alkali metal
Processed Actinopterygii
Processed Affidavit
Processed Ancient Egypt
Processed A. A. Milne
Processed August
Processed André Gide
Processed Abbot
Processed Angle
Processed April 26
Processed Arabian Prince
Processed Army
Processed Absalom
Processed Albert
Processed Agesilaus II
Processed Alexander of Pherae
Processed August 23
Processed Alfonso XII
Processed Ambrosius Aurelianus
Processed Andronikos II Palaiologos
Processed Actium
Processed Aimery of Cyprus
Processed Argo Navis
Processed Apollo 12
Processed Atle Selberg
Processed Aeronautics
Processed Ani DiFranco
Processed Arcadius
Processed Afghans
Processed August 10
Processed Albert Pike
Processed Albrecht Achilles
Processed April 21
Processed Ab urbe condita
Processed Aedicula
Processed Abd al-Latif al-Baghdadi
Processed Abersychan
Processed ACE inhibitor
Processed Antigonid dynasty
Processed Amoxicillin
Processed Ariel Sharon
Processed Ann Druyan
Processe

KeyboardInterrupt: 

In [117]:
sorted_hashmap = dict(reversed(sorted(hashmap.items(), key=lambda item: item[1])))

In [109]:
sorted_hashmap

{'Category:Disambiguation pages': 72,
 'Category:Living people': 35,
 'Category:Days of the year': 16,
 'Category:Place name disambiguation pages': 11,
 'Category:20th-century American male writers': 10,
 'Category:American people of English descent': 10,
 'Category:American male non-fiction writers': 9,
 'Category:Members of the American Philosophical Society': 8,
 'Category:Fellows of the Royal Society': 7,
 'Category:American male screenwriters': 6,
 'Category:American male novelists': 6,
 'Category:21st-century American male writers': 6,
 'Category:Columbia Records artists': 6,
 'Category:American people of Scottish descent': 6,
 'Category:20th-century American novelists': 6,
 'Category:20th-century American male musicians': 5,
 'Category:Philosophers of science': 5,
 'Category:American Nobel laureates': 5,
 'Category:Fellows of the American Academy of Arts and Sciences': 5,
 'Category:Chemical elements': 5,
 'Category:Decca Records artists': 5,
 'Category:American male film actors

In [130]:
to_delete = []
for k, v in sorted_hashmap.items():
    if ("article" in k.lower()) or\
    ("wiki" in k.lower()) or\
    ("sources" in k.lower()) or\
    ("cs" in k.lower()) or\
    ("webarchive" in k.lower()) or\
    ("pages" in k.lower()) or\
    ("use dmy dates" in k.lower()) or\
    ("use mdy dates" in k.lower()) or\
    ("all accuracy disputes" in k.lower()) or\
    ("american" in k.lower()) or\
    ("births" in k.lower()):
        to_delete.append(k)

for k in to_delete:
    del sorted_hashmap[k]

In [131]:
sorted_hashmap

{'Category:Living people': 35,
 'Category:Days of the year': 16,
 'Category:Fellows of the Royal Society': 7,
 'Category:Columbia Records artists': 6,
 'Category:Philosophers of science': 5,
 'Category:Chemical elements': 5,
 'Category:Decca Records artists': 5,
 'Category:Hugo Award-winning writers': 5,
 'Category:Grammy Award winners': 5,
 'Category:United States National Film Registry films': 4,
 'Category:Telecommunications stubs': 4,
 'Category:Films whose editor won the Best Film Editing Academy Award': 4,
 'Category:Centuries': 4,
 'Category:Leap years in the Gregorian calendar': 4,
 'Category:Tony Award winners': 4,
 'Category:1990s English-language films': 4,
 'Category:20th-century short story writers': 4,
 'Category:Knights Bachelor': 4,
 'Category:Presidential Medal of Freedom recipients': 4,
 'Category:Harvard College alumni': 4,
 'Category:Latin words and phrases': 4,
 'Category:Alumni of Trinity College, Cambridge': 4,
 'Category:New York (state) counties': 3,
 'Category

In [128]:
def remove_maintenance_categories(categories):
    URL = "https://en.wikipedia.org/wiki/"
    cat_to_delete = []
    for k, v in categories.items():
        body = session.get(URL+k)
        if "maintenance" in body.text:
            cat_to_delete.append(k)
            print(k)
    for k in cat_to_delete:
        del categories[k]
    return categories, cat_to_delete

In [129]:
len(sorted_hashmap)

6862