In [1]:
import pandas as pd
import numpy as np
import time
import string
import re
from IPython.display import clear_output
import requests
import spacy
import gensim
from gensim import corpora
from nltk.corpus import stopwords
import concurrent.futures
#nltk.download("stopwords")

import warnings
warnings.filterwarnings("ignore")

In [30]:
# One-time setup: the German spaCy model must be installed before it can be
# loaded; the commented-out command below downloads it.
#!python3 -m spacy download de_core_news_md
# Shared spaCy pipeline; preprocess() uses it for lemmatisation.
nlp = spacy.load('de_core_news_md')

In [52]:
def preprocess(text, idx):
    """Clean one raw article text and store the result under its page id.

    Steps: lower-case, strip ``&nbsp``, URLs, newline-terminated tokens,
    file-name-like tokens, ``ref`` tokens, hyphens and remaining punctuation;
    drop stop words; lemmatise every remaining word with the module-level
    spaCy pipeline ``nlp``; normalise each lemma with
    ``gensim.utils.simple_preprocess(..., deacc=True)``.

    Args:
        text: Raw article text (str).
        idx: Key under which the result is stored (the page id in this notebook).

    Returns:
        The cleaned text as one space-separated string. The same string is
        also written to the module-level dict ``preprocessed_content[idx]``,
        which is the side effect the notebook cells rely on.
    """
    text = text.lower()
    print("lower - %s" % (idx))

    # remove "&nbsp"
    text = re.sub(r"\&nbsp", "", text)
    # remove urls
    # source: url_extract_pattern from https://uibakery.io/regex-library/url-regex-python
    url_extract_pattern = "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)"
    text = re.sub(url_extract_pattern, '', text)
    # remove "\n"
    # NOTE(review): this also deletes the non-space run directly in front of
    # every newline (i.e. the last token of each line) - confirm that is intended.
    text = re.sub(r"[^ ]*\n", "", text)
    # remove file names with common 3- or 4-character endings
    # NOTE(review): "." after the escaped dot matches ANY character, so a
    # sentence-ending period also swallows the next 3-4 characters - confirm.
    text = re.sub(r"[^ ]*\..{4}|[^ ]*\..{3}", "", text)
    # remove any refs
    # NOTE(review): matches every token prefix ending in "ref", not only
    # <ref> markup - confirm.
    text = re.sub(r"[^ ]*ref", "", text)
    # remove -
    text = re.sub(r"-", "", text)
    # remove the punctuation that is left
    text = text.translate(str.maketrans('', '', string.punctuation))

    # remove stopwords (set lookup instead of scanning the list for every word);
    # empty fragments produced by consecutive spaces are dropped here as well,
    # they would yield no tokens further down anyway.
    # source: https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
    stopword_set = set(german_stopwords)
    words = [word for word in text.split(" ") if word and word not in stopword_set]

    # lemmatization: every word is still its own spaCy document, but nlp.pipe
    # processes them as a batch instead of one nlp() call per word.
    lemmatized = [
        ' '.join(token.lemma_ for token in doc)
        for doc in nlp.pipe(words)
    ]

    token_groups = (
        gensim.utils.simple_preprocess(entry, deacc = True) for entry in lemmatized
    )
    # Skip entries that produced no tokens. The previous version removed them
    # from the list while iterating over it, which skips the element following
    # each removal and left stray empty strings (double spaces) in the output.
    # NOTE(review): tokens belonging to one entry are concatenated without a
    # separator, as before - confirm this is wanted.
    final_text = " ".join(''.join(tokens) for tokens in token_groups if tokens)
    print("final - %s" % (idx))

    preprocessed_content.update({idx: final_text})
    return final_text

In [13]:
def get_pages_by_id(id):
    """Fetch the main-slot content of one page and store it in ``content``.

    Uses the module-level session ``S``, endpoint ``URL`` and the base query
    parameters in ``params``; the result is written to the module-level dict
    ``content`` under the key ``id``.

    Args:
        id: Page id to request (name kept for backward compatibility even
            though it shadows the built-in).

    Raises:
        requests.HTTPError: if the API answers with an error status.
        KeyError / IndexError: if the response holds no revision content
            for the page.
    """
    # Build a per-call parameter dict. This function runs in several worker
    # threads at once; writing "pageids" into the shared `params` dict let one
    # thread overwrite the id of another before its request was sent, so
    # content could be stored under the wrong page id.
    # The search-only keys are left out: they are not needed to read page
    # revisions and would trigger an extra search on every call.
    search_only_keys = ("list", "srsearch", "sroffset")
    request_params = {
        key: value for key, value in params.items() if key not in search_only_keys
    }
    request_params["pageids"] = id

    response = S.get(url = URL, params = request_params)
    # Fail loudly on HTTP errors instead of trying to parse an error body.
    response.raise_for_status()
    page = response.json()
    content.update({id: page["query"]["pages"][0]["revisions"][0]["slots"]["main"]["content"]})

---

In [4]:
# API endpoint on de.wikipedia.org; used as `url` for every request in this notebook.
URL = "https://de.wikipedia.org/w/api.php"

In [9]:
# German stop-word list from the NLTK corpus; preprocess() filters words against it.
# Requires the "stopwords" corpus (see the commented-out nltk.download call in the import cell).
german_stopwords = stopwords.words("german")

In [10]:
# Extra tokens that are filtered out together with the NLTK stop words.
additional_stopwords = ["isbn", "url", "infobox", "dateila", "di"]
for word in additional_stopwords:
    # Only add words that are not in the list yet, so re-running this cell
    # does not keep appending duplicates.
    if word not in german_stopwords:
        german_stopwords.append(word)

In [12]:
# Collect the page ids of all "excellent" articles of German Wikipedia through
# the search API, following the continuation offset until no page is left.
S = requests.Session()

params = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "rvslots": "*",
    "format": "json",
    "formatversion": 2,
    "srsearch": "incategory:Wikipedia:Exzellent",
    "list": "search",
    "sroffset": 0
}

# page ids of the excellent articles
ids = []

while True:
    response = S.get(url = URL, params = params)
    data = response.json()
    ids.extend(hit["pageid"] for hit in data["query"]["search"])

    # The API signals further result pages with a "continue" entry.
    next_page = data.get("continue")
    if not next_page:
        break
    params.update({"sroffset": next_page["sroffset"]})

print("Anzahl gesammelter Exzellenter Artikel: %s" %(len(ids)))

#if DATA['query']['search'][0]['title'] == SEARCHPAGE:
#    print("Your search page '" + SEARCHPAGE + "' exists on English Wikipedia")

Anzahl gesammelter Exzellenter Artikel: 2803


In [14]:
# Download the content of every collected page id in parallel worker threads.
# get_pages_by_id() writes each result into `content`.
content = {}
failed_ids = []
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_id = {
        executor.submit(get_pages_by_id, page_id): page_id for page_id in ids
    }
    # Inspect every future: with a bare executor.map() whose results are never
    # consumed, exceptions raised in the workers are silently discarded and
    # pages go missing without any hint.
    for future in concurrent.futures.as_completed(future_to_id):
        if future.exception() is not None:
            failed_ids.append(future_to_id[future])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))
if failed_ids:
    print("Failed to fetch %s of %s pages: %s" % (len(failed_ids), len(ids), failed_ids))

DUR: 77.34979796409607


In [15]:
# Number of pages stored in `content`; compare with the number of collected ids.
len(content)

2782

In [16]:
# save requested data in df
df = pd.DataFrame(content.items())
df = df.rename({0: "pageid", 1:"content"}, axis = 1)

In [17]:
# Preview the first rows of the downloaded articles.
df.head()

Unnamed: 0,pageid,content
0,1428,"[[Datei:Danimarca XIII secolo, plinio historia..."
1,2677,{{Begriffsklärungshinweis}}\n{{Infobox Staat\n...
2,2391,{{Dieser Artikel|behandelt das Land – zu ander...
3,490244,{{Begriffsklärungshinweis}}\n[[Datei:La scuola...
4,3221050,{{Begriffsklärungshinweis}}\n{{Infobox Ort in ...


In [25]:
# Take the first five article texts as a small sample.
# NOTE(review): this rebinds `content`, which was the {pageid: text} dict filled
# by get_pages_by_id(); cells that expect the dict break if re-run after this one.
content = df["content"].iloc[:5]
content

0    [[Datei:Danimarca XIII secolo, plinio historia...
1    {{Begriffsklärungshinweis}}\n{{Infobox Staat\n...
2    {{Dieser Artikel|behandelt das Land – zu ander...
3    {{Begriffsklärungshinweis}}\n[[Datei:La scuola...
4    {{Begriffsklärungshinweis}}\n{{Infobox Ort in ...
Name: content, dtype: object

In [26]:
# Page ids belonging to the five sampled article texts.
# NOTE(review): this rebinds `ids`, which previously held the full list of page ids.
ids = df["pageid"].iloc[:5]
ids

0       1428
1       2677
2       2391
3     490244
4    3221050
Name: pageid, dtype: int64

In [53]:
# Preprocess the sampled articles in worker threads; preprocess() writes each
# result into `preprocessed_content` under the article's page id.
preprocessed_content = {}
failed_ids = []
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_id = {
        executor.submit(preprocess, article_text, page_id): page_id
        for article_text, page_id in zip(content, ids)
    }
    # Inspect every future so that exceptions raised inside preprocess() are
    # reported; a bare executor.map() whose results are never consumed hides them.
    for future in concurrent.futures.as_completed(future_to_id):
        if future.exception() is not None:
            failed_ids.append(future_to_id[future])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))
if failed_ids:
    print("Failed to preprocess %s of %s articles: %s" % (len(failed_ids), len(ids), failed_ids))

lower - 1428lower - 2677

lower - 2391
lower - 490244
lower - 3221050
final - 490244
final - 3221050
final - 1428
final - 2677
final - 2391
DUR: 213.14298605918884


In [54]:
# Show the mapping page id -> preprocessed text.
# NOTE(review): this dumps the complete text of every article into the cell
# output; consider displaying a truncated preview instead.
preprocessed_content

{490244: 'scuola raffaelraffael schule athen idealisierten darstellung grundervater abendlandisch ohl seit platon sache schriftlich abhandlung angeregen gesprach heute wichtig bestandteil philosophisch ersicht bedeutend philosoph philosophie grcsφιλοσοφια philosophia wortlich lieb weisheit versuchen welt menschlich existenz ergrunden deuten unterscheiden philosophie dadurch oft speziell gebiet bestimmen methodologie begrenzen art fragestellung besonderer vielfaltig artikel gehen westlich abendlandisch philosophie rhunderen antik griechenland ht behandeln abendlandisch philosophie mannigfaltig zusammenhang stehend tradition judisch islamisch philosophie sowie ursprunglich unabhangig tradition afrikanisch ostlich philosophie antikeantik philosophie entfalten systematisch orientiert laufen jahrhundert differenziert unterschiedlich methode disziplin wissenschaft direkt mittelbar philosophie teil abgrenzung irrationale religios weltbildern kerngebieen philosophie logik wissenschaft folgeric

In [56]:
# Turn the {pageid: preprocessed text} mapping into a two-column DataFrame.
df_pp = pd.DataFrame(preprocessed_content.items()).rename(
    columns = {0: "pageid", 1: "preprocessed_content"}
)

In [57]:
# Preview the first rows of the preprocessed articles.
df_pp.head()

Unnamed: 0,pageid,preprocessed_content
0,490244,scuola raffaelraffael schule athen idealisiert...
1,3221050,ort deutsch name name landessprache wappe coat...
2,1428,dateidanimarca xiii secolo plinio historia min...
3,2677,nameamtssprach namedeutsch bildflagge flag of ...
4,2391,land bedeutung sehen indien nameamtssprach spa...


In [59]:
# Persist the preprocessed articles as CSV in the working directory.
# index=False is not passed, so the DataFrame index is written as an extra
# unnamed first column.
df_pp.to_csv("./all_preprocessed_excellent_article_as_text.csv")