In [1]:
import os
import pandas as pd
import regex as re
import wikipediaapi
import requests
import nltk
# nltk.download('punkt') # needed to download punkt once

In [2]:
def clean_string(
    text, remove_headers=True, remove_enumerations=True
):
    """Strip MediaWiki markup from HTML-escaped wikitext and return plain text.

    The input is expected to be wikitext in which ``<`` appears as ``&lt;``
    (the patterns below match ``&lt;ref`` / ``&lt;!--`` literally).
    The cleaning is heuristic and may not work for every article.

    Steps, in order: cut everything from the "See also" heading on, rewrite
    ``{{Asof|YYYY|MM}}``, drop templates, comments, list items, references,
    headings, bold/italic quotes, file links, link brackets, tables and a few
    leftovers, then collapse runs of newlines.

    Args:
        text: Wikitext to clean.
        remove_headers: If true, delete whole heading lines; otherwise only
            delete the ``==`` markers and keep the heading text.
        remove_enumerations: If true, delete whole ``*`` list lines; otherwise
            only delete the leading ``* `` marker.

    Returns:
        The cleaned text.
    """
    # Drop the "See also" section and everything after it; the heading may be
    # written with or without spaces around the title.
    text = re.split(r"==[ \t]*See also[ \t]*==", text)[0]
    text = re.sub(
        r"\{\{Asof\|(\d{4})\|(\d{1,2})\}\}", r"as of \2/\1", text
    )  # replace {{Asof|YYYY|MM}} with as of MM/YYYY

    # Remove templates innermost-first until none are left. This handles
    # nested and multi-line templates and keeps the prose that sits between
    # two templates on the same line.
    removed = 1
    while removed:
        text, removed = re.subn(r"\{\{[^{}]*\}\}", "", text)
    # Fallback for templates the loop cannot match because they contain single
    # braces (e.g. {{math|x^{2}}}). Greedy within one line, so it can still
    # swallow text between two such templates.
    text = re.sub(r"\{\{.*}}", "", text)

    # Comments may span several lines, hence DOTALL.
    text = re.sub(r"&lt;!--.*?-->", "", text, flags=re.DOTALL)
    text = re.sub(r"&lt;!--", "", text)  # stray opening marker without a close
    text = re.sub(r"-->", "", text)  # stray closing marker

    if remove_enumerations:
        text = re.sub(r"\n\*.*", "\n", text)  # drop whole "*" list lines
    else:
        text = re.sub(r"\n\* ", "\n", text)  # remove enumeration symbol *
    text = re.sub(r"\n#", "\n", text)  # remove enumeration symbol #

    # Self-closing refs first and without crossing a ">", so that a
    # "<ref name=x/>" is never paired with the "</ref>" of a later reference.
    text = re.sub(r"&lt;ref[^>]*/>", "", text)  # remove <ref ... />
    text = re.sub(r"&lt;ref.*?&lt;/ref>", "", text)  # remove <ref>...</ref>

    if remove_headers:
        text = re.sub(
            r"==+.*==+\n", "", text
        )  # remove lines containing ==, ===, ====, ...
    else:
        text = re.sub(r"==+", "", text)  # remove ==, ===, ====, ...

    # Bold/italic markup is two or more quotes; single apostrophes are text.
    text = re.sub(r"'{2,}", "", text)
    text = re.sub(r"\xa0", " ", text)  # replace non-breaking space with space
    text = re.sub(
        r"\[\[File:(?:\[\[[^\]]*?\]\]|.)*?\]\]", "", text
    )  # remove [[File:...]] including links nested in the caption
    text = re.sub(
        r"\[\[[^\|\]]*\|([^\]]+)\]\]", r"[[\1]]", text
    )  # replace [[left|right]] with [[right]]
    text = text.replace(r"[[", "").replace("]]", "")  # remove [[ and ]]
    text = re.sub(
        r"\{\|(?:(?:\{\|(?:(?:\{\|(?:[^{}])*\|\})|(?:[^{}]))*\|\})|(?:[^{}]))*\|\}",
        "",
        text,
    )  # remove {| ... |} tables, nested up to three levels
    text = re.sub(r"\{\{(?:\n|.)*?\}\}", "", text)  # any remaining {{ ... }}
    # Presumably leftovers of German-language image options ("mini") - TODO confirm.
    text = re.sub(r"mini\|.*\|", "", text)
    text = re.sub(r"mini\|", "", text)
    text = re.sub(r":\* .*ISBN.*", "", text)  # remove reference books
    text = re.sub(r"Kategorie:.*", "", text)  # remove category lines
    text = re.sub(r"\n+", "\n", text)  # replace multiple newlines with one
    return text


def extract_source(text):
    """Return the text that follows the first ``<textarea ...>`` tag in ``text``.

    The result ends at the next ``</textarea>`` (or at the next opening
    ``<textarea ...>`` tag or the end of the input, whichever comes first).

    Raises:
        IndexError: If ``text`` contains no opening ``<textarea ...>`` tag.
    """
    pieces = re.split(r"<textarea[^>]*>", text)
    after_opening_tag = pieces[1]  # IndexError here means: no textarea found
    contents, _, _ = after_opening_tag.partition("</textarea>")
    return contents



def tokenize_articles(articles):
    """Split each article into sentences with ``nltk.sent_tokenize``.

    Args:
        articles: Iterable of article texts.

    Returns:
        A list with one entry per article, each entry being whatever
        ``nltk.sent_tokenize`` returns for that article.
    """
    return [nltk.sent_tokenize(article_text) for article_text in articles]



In [29]:
def fetch_article_api(article_title, language="en"):
    """Fetch one article's text through the ``wikipediaapi`` client.

    For English articles the text is cut at the first occurrence of each of a
    fixed set of trailing-section markers ("See also", "References", ...), so
    those sections and everything after them are dropped.

    Args:
        article_title: Title of the page to request.
        language: Wikipedia language edition passed to the client.

    Returns:
        The (possibly truncated) ``page.text`` of the requested page.
    """
    wiki = wikipediaapi.Wikipedia(
        "FramingAnalysis (riedl.manuel.privat@gmail.com)",
        language,
        extract_format=wikipediaapi.ExtractFormat.WIKI,
    )
    article = wiki.page(article_title).text
    if language == "en":
        # The markers are English headings, so other languages are left
        # untouched. They are applied in this order, each cutting at its
        # first occurrence.
        trailing_section_markers = (
            r"\nSee also\n",
            r"\nReferences\n",
            r"\nSignificant publications\n",
            r"\nPublications\n",
            r"\n== References ==",
        )
        for marker in trailing_section_markers:
            article = re.split(marker, article)[0]

    return article


def fetch_wiki_articles_http(
    article_title, language="en", timeout=30
):
    """Download an article's wikitext from the edit page and clean it.

    Requests ``index.php?title=<title>&action=edit`` on the given language
    edition, takes the contents of the edit form's textarea via
    ``extract_source`` and passes them through ``clean_string``.

    Args:
        article_title: Title of the page to request.
        language: Sub-domain of wikipedia.org to query.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so that one unresponsive request cannot block a whole batch.

    Returns:
        The cleaned article text.
    """
    # Pass the title via ``params`` so requests URL-encodes it. Titles that
    # contain "&", "?", "#" or "+" would otherwise corrupt the query string.
    response = requests.get(
        f"https://{language}.wikipedia.org/w/index.php",
        params={"title": article_title, "action": "edit"},
        timeout=timeout,
    )
    text = extract_source(response.text)
    return clean_string(text)

def fetch_article_locally(article_title, path="articles/"):
    """Read a previously saved article from ``<path>/<article_title>.txt``.

    Args:
        article_title: File name without the ``.txt`` extension.
        path: Directory containing the saved articles. Defaults to the same
            directory ``save_articles_locally`` writes to by default; a
            trailing separator is optional.

    Returns:
        The file contents, decoded as UTF-8.
    """
    file_path = os.path.join(path, article_title + ".txt")
    with open(file_path, "r", encoding="utf8") as file:
        return file.read()

def fetch_articles(article_titles, language="en", fetch_method="api"):
    """Fetch every title in ``article_titles`` with the selected method.

    Args:
        article_titles: Iterable of article titles.
        language: Passed on to the "api" and "http" fetchers; not used by
            the "local" fetcher.
        fetch_method: One of "api", "http" or "local".

    Returns:
        A list of article texts in the order of ``article_titles``.

    Raises:
        ValueError: If ``fetch_method`` is not one of the supported values.
            The check happens per title, so an empty ``article_titles``
            never raises.
    """
    dispatch = (
        ("api", lambda title: fetch_article_api(title, language)),
        ("http", lambda title: fetch_wiki_articles_http(title, language)),
        ("local", lambda title: fetch_article_locally(title)),
    )
    collected = []
    for title in article_titles:
        fetcher = next(
            (handler for name, handler in dispatch if fetch_method == name), None
        )
        if fetcher is None:
            raise ValueError("Invalid fetch method")
        collected.append(fetcher(title))
    return collected


def save_articles_locally(articles, file_names, path="articles/"):
    """Write each article to ``<path>/<file_name>.txt`` as UTF-8.

    Args:
        articles: Article texts to write.
        file_names: File names (without extension); ``file_names[i]`` is used
            for the i-th article.
        path: Target directory, created if missing. A trailing separator is
            optional.

    Raises:
        IndexError: If ``file_names`` has fewer entries than ``articles``
            (articles before the missing name have already been written).
    """
    os.makedirs(path, exist_ok=True)
    for index, article in enumerate(articles):
        # os.path.join puts the file inside ``path`` even when the caller
        # omitted the trailing "/", which plain concatenation would not.
        target = os.path.join(path, file_names[index] + ".txt")
        with open(target, "w", encoding="utf8") as file:
            file.write(article)


In [27]:
# Collect the article titles to fetch: one title per line, from every file in
# the category directory.
path = "./article_titles_by_category/"
article_titles = []
# os.listdir returns entries in arbitrary order; sort for reproducible runs.
for file in sorted(os.listdir(path)):
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which open()
    # cannot read.
    if not os.path.isfile(os.path.join(path, file)):
        continue
    # Explicit UTF-8, matching how article files are read and written in this
    # notebook, instead of the platform-dependent default encoding.
    with open(os.path.join(path, file), "r", encoding="utf8") as f:
        # Ignore blank lines so no empty title is fetched or saved.
        article_titles.extend(
            line.strip() for line in f.read().splitlines() if line.strip()
        )

In [31]:
# Alternative source: scrape the wikitext from the edit page and store it in a
# separate directory. Disabled by default.
# articles_http = fetch_articles(article_titles, fetch_method="http")
# save_articles_locally(articles_http, article_titles, path="articles_http/")

# Fetch every collected title through the API and write the texts to the
# default output directory of save_articles_locally.
articles_api = fetch_articles(article_titles, language="en", fetch_method="api")
save_articles_locally(articles=articles_api, file_names=article_titles)