In [47]:
import wikipediaapi
import random 

wikipedia = wikipediaapi.Wikipedia(user_agent='TUM_NLP (moritz.ladenburger@tum.de)', language='en')

API_URL = "https://en.wikipedia.org/w/api.php"

wiki_html = wikipediaapi.Wikipedia(
    user_agent='TUM_NLP (moritz.ladenburger@tum.de)',
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI
)

def get_category_articles(category):
    cat = wikipedia.page(f"Category:{category}")
    return cat.categorymembers.values()

def get_children_articles(children):
    articles = []
    for child in children:
        child_articles = get_category_articles(child)  # Missing assignment
        articles.extend(child_articles)  # Use extend instead of append
    return articles

def get_page_text(title):
    page = wiki_html.page(title)
    if page.exists():
        return page.text
    else:
        return None
    

In [51]:
def get_texts(articles, no_words):
    text = ""
    text_len = 0
    children = []
    counter = 0
    for article in articles:
        counter += 1
        if counter % 50 == 0:
            print(f"Processed {counter} articles, current text length: {text_len} words.")
        if article.title.startswith("Category:"):
            children.append(article.title.replace("Category:", ""))
            continue
        page_text = get_page_text(article.title)
        if page_text:
            text += page_text + "\n\n "
            text_len += len(page_text.split())
        if text_len >= no_words:
            break
    print(f"Collected {len(text.split())} words from {len(articles)} articles.")
    return " ".join(text.split()[:no_words]), get_children_articles(children)

def create_dataset_from_category(category, name, no_words):
    total_words = int(no_words * 1.2)
    articles = list(get_category_articles(category))
    random.seed(42)
    text = ""
    text_len = 0
    while text_len < total_words and articles: 
        random.shuffle(articles)    
        for i in range(0, len(articles), 20):
            print(f"Processing article: {articles[i].title})")
        print(f"Current layer has {len(articles)} articles, total words collected: {text_len}")   
        article_texts, children_articles = get_texts(articles, total_words - text_len)
        text += article_texts + "\n\n "
        text_len += len(article_texts.split())
        articles = children_articles
        print(f" Finished layer, moving further. Total words collected: {text_len}")


    with open(f"../datasets/wiki_categories/{name}.train", 'w') as f:
        train_text = " ".join(text.split()[:no_words])
        print(f"Total words: {len(train_text.split())}")
        f.write(train_text)
    with open(f"../datasets/wiki_categories/{name}_dev.train", 'w') as f:
        train_text = " ".join(text.split()[no_words:total_words])
        print(f"Total dev words: {len(train_text.split())}")
        f.write(train_text)
    
    


In [49]:
create_dataset_from_category("Quantum mechanics", "wiki_subfields_of_physics", 1_000_000)

Current layer has 369 articles, total words collected: 0
Processed 10 articles, current text length: 7314 words.
Processed 20 articles, current text length: 24040 words.
Processed 30 articles, current text length: 40955 words.
Processed 40 articles, current text length: 51588 words.
Processed 50 articles, current text length: 69937 words.
Processed 60 articles, current text length: 96834 words.
Processed 70 articles, current text length: 123660 words.
Processed 80 articles, current text length: 141046 words.
Processed 90 articles, current text length: 174505 words.
Processed 100 articles, current text length: 193274 words.
Processed 110 articles, current text length: 204575 words.
Processed 120 articles, current text length: 225192 words.
Processed 130 articles, current text length: 241026 words.
Processed 140 articles, current text length: 272835 words.
Processed 150 articles, current text length: 279868 words.
Processed 160 articles, current text length: 294353 words.
Processed 170 a

In [52]:
create_dataset_from_category("History", "wiki_history", 1_000_000)

Processing article: Muhammad ibn al-Alqami)
Processing article: Category:People by historical event)
Current layer has 38 articles, total words collected: 0
Collected 44010 words from 38 articles.
 Finished layer, moving further. Total words collected: 44010
Processing article: Horned God)
Processing article: Category:History of Algeria by location)
Processing article: Category:Military history lists)
Processing article: Porter–MacKenzie debate)
Processing article: History of communication)
Processing article: Category:History of Israel by location)
Processing article: Artis Historicae Penus)
Processing article: Chronology of the ancient Near East)
Processing article: Gender: A Useful Category of Historical Analysis)
Processing article: Post-classical history)
Processing article: Category:Lists of expeditions)
Processing article: List of the oldest mosques)
Processing article: Category:History of Germany by location)
Processing article: Shubhodaya controversy)
Processing article: Categ

In [53]:
create_dataset_from_category("Culture", "wiki_culture", 1_000_000)

Processing article: Semiotics of culture)
Processing article: Highbrow)
Processing article: Hypermobility (travel))
Processing article: Design culture)
Processing article: Category:Consensus reality)
Current layer has 100 articles, total words collected: 0
Processed 50 articles, current text length: 30717 words.
Processed 100 articles, current text length: 77468 words.
Collected 77468 words from 100 articles.
 Finished layer, moving further. Total words collected: 77468
Processing article: Category:Nigerian culture by ethnicity)
Processing article: Category:Images of clothing)
Processing article: Category:Transport in culture)
Processing article: Samizdat)
Processing article: Cultural critic)
Processing article: Seven Natural Wonders of Africa)
Processing article: South Park and Philosophy: Bigger, Longer, and More Penetrating)
Processing article: Category:Arts occupations)
Processing article: Category:Romani-related controversies)
Processing article: Category:Food and drink by depende

In [54]:
create_dataset_from_category("Society", "wiki_society", 1_000_000)

Processing article: Category:Society by topic)
Processing article: Category:Society-related lists)
Processing article: Humanities)
Processing article: Civilization)
Current layer has 71 articles, total words collected: 0
Processed 50 articles, current text length: 49490 words.
Collected 68039 words from 71 articles.
 Finished layer, moving further. Total words collected: 68039
Processing article: Eye-gouging)
Processing article: Divorce demography)
Processing article: Demographic surveillance system)
Processing article: Sadistic personality disorder)
Processing article: Trusona)
Processing article: Convention on Psychotropic Substances)
Processing article: Demographics of scientific divers)
Processing article: Protection of Homes, Small Businesses, and Private Property Act of 2005)
Processing article: Category:Sociolects)
Processing article: Category:Medical districts)
Processing article: Simonyi Professor for the Public Understanding of Science)
Processing article: Inherent bad faith 

In [57]:
create_dataset_from_category("Linguistics", "wiki_linguistics", 1_000_000)

Processing article: Category:Language contact)
Processing article: Al-Douri 'an Abi 'Amr recitation)
Processing article: Mathematical linguistics)
Processing article: Node (linguistics))
Processing article: Theta criterion)
Processing article: Filled pause)
Processing article: Metafunction)
Processing article: Linguistic description)
Processing article: Figurae)
Processing article: Type–token distinction)
Current layer has 192 articles, total words collected: 0
Processed 50 articles, current text length: 48062 words.
Processed 100 articles, current text length: 95256 words.
Processed 150 articles, current text length: 169876 words.
Collected 208514 words from 192 articles.
 Finished layer, moving further. Total words collected: 208514
Processing article: Category:Works about translation)
Processing article: Usko-Mediterranean languages)
Processing article: Linguistics and the Book of Mormon)
Processing article: LMS Center)
Processing article: Category:Linguistics books by writer)
Proce