In [None]:
import pandas as pd
import numpy as np
import time
import string
import re
from IPython.display import clear_output
import requests
import spacy
import gensim
from gensim import corpora
from nltk.corpus import stopwords
import concurrent.futures
#nltk.download("stopwords")

import warnings
warnings.filterwarnings("ignore")

In [None]:
def update_progress(progress):
    """Redraw a 50-character text progress bar in the cell output.

    Parameters
    ----------
    progress : int or float
        Completed fraction. Ints are converted to float, any other
        non-float type is treated as 0, and the value is clamped to the
        range 0..1 before rendering.

    The previous cell output is cleared (deferred until new output arrives)
    and a single line such as ``Progress: [###---] 50.0%`` is printed.
    """
    total_width = 50

    # Normalise the input type first: only ints and floats are meaningful.
    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0

    # Clamp to the displayable range.
    if fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(total_width * fraction))
    bar = "#" * filled + "-" * (total_width - filled)

    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

In [None]:
# Load the German spaCy pipeline 'de_core_news_md' once at notebook level;
# preprocess() calls it to obtain the lemma of every word.
nlp = spacy.load('de_core_news_md')

In [None]:
def preprocess(text, idx):
    """Clean, lemmatise and tokenise one article and store the result.

    The text is lower-cased and stripped of "&nbsp" fragments, URLs,
    line-break fragments, file-name-like tokens, "ref" fragments, hyphens
    and any remaining punctuation. German stop words are dropped, every
    remaining word is lemmatised with the notebook-level spaCy pipeline
    ``nlp`` and then tokenised with ``gensim.utils.simple_preprocess``.

    Parameters
    ----------
    text : str
        Raw article content.
    idx : hashable
        Key under which the result is stored in ``preprocessed_content``.

    Returns
    -------
    None
        The result is written to the notebook-level dict
        ``preprocessed_content`` as ``{idx: [[token, ...], ...]}``, with one
        non-empty token list per surviving word.

    Notes
    -----
    Relies on the notebook-level names ``german_stopwords``, ``nlp`` and
    ``preprocessed_content`` being defined before the first call.
    """
    text = text.lower()

    # remove "&nbsp"
    text = re.sub(r"\&nbsp", "", text)
    # remove urls
    # source: url_extract_pattern from https://uibakery.io/regex-library/url-regex-python
    url_extract_pattern = "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)"
    text = re.sub(url_extract_pattern, '', text)
    # remove "\n" together with the non-space characters directly before it
    text = re.sub(r"[^ ]*\n", "", text)
    # remove file names with common endings of 4 or 3 characters
    text = re.sub(r"[^ ]*\..{4}|[^ ]*\..{3}", "", text)
    # remove any refs
    text = re.sub(r"[^ ]*ref", "", text)
    # remove -
    text = re.sub(r"-", "", text)
    # remove punctuation that is left
    text = text.translate(str.maketrans('', '', string.punctuation))

    # remove stopwords (set lookup instead of scanning the list per word);
    # empty strings produced by consecutive spaces are dropped here as well,
    # since they can never yield a token further down
    # source: https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
    stopword_set = set(german_stopwords)
    words = [word for word in text.split(" ") if word and word not in stopword_set]

    # lemmatization: every word is run through the pipeline on its own
    text_lemma = [' '.join(token.lemma_ for token in nlp(word)) for word in words]

    tokenised = (gensim.utils.simple_preprocess(lemma, deacc = True) for lemma in text_lemma)

    # Keep only non-empty token lists. The previous version removed items
    # from the list while iterating over it, which skips the element after
    # every removal and therefore left some empty lists behind.
    final = [tokens for tokens in tokenised if len(tokens) > 0]

    preprocessed_content.update({idx: final})

---

In [None]:
# Endpoint of the German Wikipedia API; every request in the visible cells is sent here.
URL = "https://de.wikipedia.org/w/api.php"

In [None]:
# NLTK's German stop word list; preprocess() drops every word contained in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded
# first (see the commented-out nltk.download call in the import cell).
german_stopwords = stopwords.words("german")

In [None]:
# Request the page ids of all excellent articles from the German Wikipedia
# via the search API, following the API's continuation until exhausted.
S = requests.Session()

params = {
    "action": "query",
    "format": "json",
    "formatversion": 2,
    "list": "search",
    "srsearch": "incategory:Wikipedia:Exzellent",
    # Ask for the largest page size the API allows instead of the default
    # of 10 results per request; this cuts the number of round trips.
    "srlimit": "max",
    "sroffset": 0
}

# get ids from excellent articles
ids = []

while True:
    response = S.get(url = URL, params = params)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    data = response.json()

    # The API reports request-level problems in an "error" object.
    if "error" in data:
        raise RuntimeError("Wikipedia API error: %s" % (data["error"],))

    ids.extend(entry["pageid"] for entry in data["query"]["search"])

    if not data.get("continue"):
        break
    # Merge the complete continuation object (not just "sroffset") into the
    # next request, as the API expects all returned continue values back.
    params.update(data["continue"])

print("Anzahl gesammelter Exzellenter Artikel: %s" %(len(ids)))

In [None]:
# Request the content of every excellent article in the German Wikipedia
# via the API, one request per page id. ``params`` defined here is also the
# template used by get_pages_by_id().
params = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "rvslots": "*",
    "format": "json",
    "formatversion": 2,
    "pageids": 0
}

# NOTE(review): ``data`` is not read by any visible cell; kept so the
# notebook-level name keeps its previous value.
data = pd.DataFrame()
content = {}

# ``page_id`` instead of ``id``: a notebook-level loop variable named ``id``
# would shadow the builtin for every later cell.
for ix, page_id in enumerate(ids):
    update_progress(ix / len(ids))
    params.update({"pageids": page_id})
    response = S.get(url = URL, params = params)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    page = response.json()

    content.update({page_id: page["query"]["pages"][0]["revisions"][0]["slots"]["main"]["content"]})

# The loop reports progress before each request, so it ends below 100%.
update_progress(1)

In [None]:
def get_pages_by_id(id):
    """Fetch the content of one page and store it in ``content``.

    Parameters
    ----------
    id : int
        Page id to request from the API.

    Returns
    -------
    None
        The main-slot content of the page's returned revision is written to
        the notebook-level dict ``content`` under the key ``id``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Uses the notebook-level session ``S``, endpoint ``URL`` and the request
    template ``params``.
    """
    # Build a per-call copy of the template. Updating the shared ``params``
    # dict from several worker threads lets one thread overwrite another's
    # "pageids" between the update and the request, which stores the wrong
    # article under ``id``.
    request_params = {**params, "pageids": id}
    response = S.get(url = URL, params = request_params)
    response.raise_for_status()
    page = response.json()
    content.update({id: page["query"]["pages"][0]["revisions"][0]["slots"]["main"]["content"]})

In [None]:
# Fetch all articles concurrently and report the elapsed wall-clock time.
content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Drain the result iterator: Executor.map only re-raises a worker's
    # exception when its result is retrieved, so without this a failed
    # download would pass silently and leave ``content`` incomplete.
    list(executor.map(get_pages_by_id, ids))

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

In [None]:
# Turn the {pageid: content} mapping into a two-column DataFrame.
df = pd.DataFrame(content.items()).rename(columns = {0: "pageid", 1: "content"})

In [None]:
# Preview the first rows of the fetched articles.
df.head()

In [None]:
# Preprocess every fetched article and report the elapsed wall-clock time.
preprocessed_content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Pair every text with the page id from the SAME DataFrame row. The rows
    # follow the order in which the download threads finished, not the order
    # of ``ids``, so zipping with ``ids`` stores texts under the wrong id.
    # list(...) drains the iterator so that exceptions raised inside
    # preprocess() are re-raised here instead of being silently dropped.
    list(executor.map(preprocess, df["content"], df["pageid"]))

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

In [None]:
# save data to csv for faster loading
#df.to_csv("./preprocessed_excellent_article-109.csv")

In [None]:
# Tokenise the lemmas, build a gensim dictionary from the token lists and
# convert every token list into its bag-of-words representation.
# NOTE(review): ``test_page_lemma`` is not defined in any visible cell, so
# this cell fails on a fresh kernel — confirm which input was intended.
final = [
    gensim.utils.simple_preprocess(lemma, deacc = True)
    for lemma in test_page_lemma
]

id2word = corpora.Dictionary(final)

corpus = [id2word.doc2bow(tokens) for tokens in final]

In [None]:
# Rough estimate in hours: number of collected ids times 53.56..., divided by 60 twice.
# NOTE(review): 53.56... is presumably a measured per-article duration in seconds — confirm its origin.
(len(ids) * 53.56256318092346) / 60 / 60