# <Hier Titel einfügen>
<Hier Kurzbeschreibung einfügen>

In [1]:
import pandas as pd
import numpy as np
import time
import requests
import concurrent.futures
import re
import string
import spacy
import gensim
import yake
from gensim import corpora
from nltk.corpus import stopwords
#nltk.download("stopwords")

import warnings
warnings.filterwarnings("ignore")

## request data

In [3]:
URL = "https://de.wikipedia.org/w/api.php"

```python
S = requests.Session()
```

```python
# request excellent arictles from german wikipedia via wiki api (10 at a time)
params_pageid = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "rvslots": "*",
    "format": "json",
    "formatversion": 2,
    "srsearch": "incategory:Wikipedia:Exzellent",
    "list": "search",
    "sroffset": 0
}

response = S.get(url = URL, params = params_pageid)
data = response.json()

# get ids from excellent articles
ids = []

for entry in data["query"]["search"]:
    ids.append(entry["pageid"])

while data.get("continue"):
    params_pageid.update({"sroffset": data["continue"]["sroffset"]})
    
    #print("\n%s" % (PARAMS))
    response = S.get(url = URL, params = params_pageid)
    data = response.json()
    
    for entry in data["query"]["search"]:
        ids.append(entry["pageid"])

print("Anzahl gesammelter Exzellenter Artikel: %s" %(len(ids)))
```

In [4]:
def get_pages_by_id(id):
    params_content.update({"pageids": id})
    response = S.get(url = URL, params = params_content)
    page = response.json()
    content.update({id: page["query"]["pages"][0]["extract"]})

In [5]:
# Source: https://stackoverflow.com/questions/4452102/how-to-get-plain-text-out-of-wikipedia
# Source: https://www.mediawiki.org/wiki/API:Parsing_wikitext
params_content = {
    "action": "query",
    "prop": "extracts",
    "format": "json",
    "formatversion": 2,
    "pageids": 0,
    "explaintext": True
}

```python
content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(get_pages_by_id, ids)

end = time.time()
dur = end - start
print("DUR: %s" % (dur))
```

```python
# save requested data (wikipage content) in df
df = pd.DataFrame(content.items())
df = df.rename({0: "pageid", 1:"content"}, axis = 1)
```

```python
# save data to csv for faster loading
# TODO: Den Dateinamen dynamisch anpassen
# --> Die Zahl vorne dynamisch durch die Anzahl an Seiten in content ersetzten
df.to_csv("./2783_excellent_article_extract.csv")
```

Die angefragten Daten können auch vom filesystem geladen werden

In [3]:
# load data from file
data = pd.read_csv("./2783_excellent_article_extract.csv")

data = data[["pageid", "content"]]

In [4]:
data.head()

Unnamed: 0,pageid,content
0,1428,Kanada (englisch und französisch Canada) ist e...
1,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."
2,2996,Leipzig ([ˈlaɪ̯pt͡sɪç]; im sächsischen Dialekt...
3,1200964,Australien (amtlicher deutscher Name; englisch...
4,290,Argentinien (spanisch [aɾxenˈtina]) ist eine ...


## preprocess data

In [29]:
def preprocess_data(text, idx):
    text = re.sub(r"\n", "", text)
    #t = re.sub(r"\{{2}.*?\}{2}", "", t)
    #t = re.sub(r"\}{2}", "", t)
    #t = re.sub(r"\[\[[0-9A-Za-z\s()]*?\|", "", t)
    #t = re.sub(r"\[\[", "", t)
    #t = re.sub(r"\]\]", "", t)
    #t = re.sub(r"\[.*?\]", "", t)
    #t = re.sub(r"<ref.*?/ref>", "", t)
    #t = re.sub(r"<.*?>", "", t)
    #t = re.sub(r"\|", " ", t)
    #t = re.sub(r"\'", "", t)

    # source: url_extract_pattern from https://uibakery.io/regex-library/url-regex-python
    #url_extract_pattern = "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)"
    #t = re.sub(url_extract_pattern, '', t)

    text = re.sub(r"\(=.*?\)", "", text)
    text = re.sub(r"--+", "", text)
    text = re.sub(r"==.*?==", "", text)
    #t = re.sub(r"==+", "", t)
    #t = re.sub(r"\&nbsp;", "", t)
    # Dateinamen
    #t = re.sub(r"[^ ]*\..{4}|[^ ]*\..{3}", "", t)

    #source: https://stackoverflow.com/questions/2077897/substitute-multiple-whitespace-with-single-whitespace-in-python
    _RE_COMBINE_WHITESPACE = re.compile(r"\s+")
    text = _RE_COMBINE_WHITESPACE.sub(" ", text).strip()
    
    preprocessed_content.update({idx: text})

In [32]:
preprocessed_content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(preprocess_data, data["content"], data["pageid"])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

DUR: 6.13576602935791


In [34]:
len(preprocessed_content)

2794

In [35]:
preprocessed_df = pd.DataFrame(preprocessed_content.items())
preprocessed_df = preprocessed_df.rename({0: "pageid", 1:"content"}, axis = 1)

preprocessed_df.head()

Unnamed: 0,pageid,content
0,1428,Kanada (englisch und französisch Canada) ist e...
1,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."
2,1200964,Australien (amtlicher deutscher Name; englisch...
3,2996,Leipzig ([ˈlaɪ̯pt͡sɪç]; im sächsischen Dialekt...
4,2677,Kanada (englisch und französisch Canada) ist e...


In [36]:
# save preproessed wiki pages to filesystem
preprocessed_df.to_csv("./data/preprocessed_wiki_pages.csv")

backup
```python
#!python3 -m spacy download de_core_news_md
nlp = spacy.load('de_core_news_md')
```

```python
t_word_list = t.split(" ")

# lemmatization
text_lemma = []

for ix, word in enumerate(t_word_list):
    doc = nlp(word)
    result = ' '.join([x.lemma_ for x in doc]) 
    text_lemma.append(result)

lemma_word_list = [gensim.utils.simple_preprocess(word, deacc = True) for word in text_lemma]

# remove stopwords
# source: https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
#final = [word for word in lemma_word_list if word not in german_stopwords]

final = []
for word in lemma_word_list:
    if len(word) > 0 and word[0] not in german_stopwords:
        final.append(word[0])

text_without_stopwords = " ".join(final)
```

## extract keywords
using yakes keyword extractor  
(https://liaad.github.io/yake/)

In [30]:
german_stopwords = stopwords.words("german")

In [40]:
kw_extractor = yake.KeywordExtractor()

('Kanada', 0.0007969276779531967)
('und die', 0.001910502339844232)
('von Kanada', 0.0019105629687418378)
('Ontario und Québec', 0.0020771841265886668)
('den Vereinigten Staaten', 0.0025137544434408374)
('und der', 0.002543935783646112)
('Kanadas', 0.002846170278404274)
('Kanada ist', 0.003506146900051947)
('englisch und französisch', 0.0039061489078895313)
('mit der', 0.003954407244586976)
##############################
('Moskau', 0.0003090203792953505)
('der Stadt Moskau', 0.0003528573847636329)
('der Stadt', 0.0004907477457929229)
('die Stadt', 0.0005513636053280246)
('Moskau die Stadt', 0.0009694463559016367)
('Stadt', 0.0011065952798747713)
('Stadt Moskau', 0.0012596861120265533)
('von Moskau', 0.0014695805909670332)
('Zentrum von Moskau', 0.0015359446017669347)
('Moskaus', 0.0017478965203893261)
('Die Stadt ist', 0.0018941819312858752)
('die Stadt mit', 0.001904495773274296)
('Moskau die', 0.0021195300520005706)
('Moskau ist', 0.0021851486954712015)
##############################

keyword extraction for the first 10 pages

```python
start = time.time()
for wiki_page in preprocessed_df["content"][:10]:
    keywords = kw_extractor.extract_keywords(wiki_page)

    for kw in keywords:
        if kw[0] not in german_stopwords:
            print(kw)
            
    print("#" * 30)
end = time.time()

print("DUR: %s" % (end - start))
```

In [63]:
def extract_keywords(text, idx):
    keywords = []
    extracted_keywords = kw_extractor.extract_keywords(text)
    
    for keyword in extracted_keywords:
        # remove keywords that resamble stopwords from the german language
        if keyword[0] not in german_stopwords:
            keywords.append(keyword)
    
    wikipage_keywords.update({idx: keywords})

In [45]:
wikipage_keywords = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(extract_keywords, preprocessed_df["content"], preprocessed_df["pageid"])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

DUR: 2620.6581139564514


In [46]:
len(wikipage_keywords)

2794

In [50]:
keywords_df = pd.DataFrame(wikipage_keywords.items())
keywords_df = keywords_df.rename({0: "pageid", 1:"keywords"}, axis = 1)

In [58]:
keywords_df.head()

Unnamed: 0,pageid,keywords
0,290,"[(Provinz Buenos Aires, 0.0005268788558665713)..."
1,880316,"[(Finnland, 0.0009182279474707477), (Finnlands..."
2,1200964,"[(Australien, 0.0009412149837407555), (der Ein..."
3,490244,"[(die Philosophie, 0.00030965526458213934), (d..."
4,29938,"[(die Philosophie, 0.00030965526458213934), (d..."


In [59]:
# save keywords with coresponding page id
keywords_df.to_csv("./data/wikipage_keywords.csv")

## results

In [52]:
# join wikipages with (predicted) keywords on pageid
results = pd.concat(
    [preprocessed_df.set_index("pageid"), keywords_df.set_index("pageid")], axis = 1, join = "inner"
).reset_index()

In [60]:
results.head()

Unnamed: 0,pageid,content,keywords
0,1428,Kanada (englisch und französisch Canada) ist e...,"[(Kanada, 0.0007969276779531967), (und die, 0...."
1,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i...","[(Moskau, 0.0003090203792953505), (der Stadt M..."
2,1200964,Australien (amtlicher deutscher Name; englisch...,"[(Australien, 0.0009412149837407555), (der Ein..."
3,2996,Leipzig ([ˈlaɪ̯pt͡sɪç]; im sächsischen Dialekt...,"[(der Stadt Leipzig, 1.658497245537868e-05), (..."
4,2677,Kanada (englisch und französisch Canada) ist e...,"[(Kanada, 0.0007969276779531967), (und die, 0...."


## evaluation