# <Hier Titel einfügen>
<Hier Kurzbeschreibung einfügen>

In [1]:
import pandas as pd
import numpy as np
import time
import requests
import concurrent.futures
import re
import string
import spacy
import gensim
import yake
from gensim import corpora
from nltk.corpus import stopwords
from IPython.display import clear_output
#nltk.download("stopwords")

import warnings
warnings.filterwarnings("ignore")

In [2]:
# source: IBM internal
def update_progress(progress):
    """Print a 50-character text progress bar in place of the previous cell output.

    Args:
        progress: Completed fraction. An ``int`` is converted to ``float``;
            any other non-float value is treated as 0. Values below 0 are
            displayed as 0 % and values of 1 or more as 100 %.
    """
    bar_length = 50

    # normalise the argument to a number
    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0

    # clamp to the displayable range
    if fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(bar_length * fraction))
    bar = "#" * filled + "-" * (bar_length - filled)

    # replace the previous output of the cell with the updated bar
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

## request data

In [3]:
# API endpoint of the German Wikipedia; every request in this notebook goes here
URL = "https://de.wikipedia.org/w/api.php"

```python
# one HTTP session, reused by all API requests below
S = requests.Session()
```

```python
# Request the excellent articles of the German Wikipedia via the wiki API.
# The search result is paged (10 hits at a time), so the request is repeated
# with the offset from the "continue" element until the API stops sending one.
params_pageid = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "rvslots": "*",
    "format": "json",
    "formatversion": 2,
    "srsearch": "incategory:Wikipedia:Exzellent",
    "list": "search",
    "sroffset": 0
}

# page ids of the excellent articles
ids = []

while True:
    response = S.get(url = URL, params = params_pageid)
    # fail with a clear HTTP error instead of a KeyError on the payload below
    response.raise_for_status()
    data = response.json()

    ids.extend(entry["pageid"] for entry in data["query"]["search"])

    # no "continue" element means this was the last result page
    if not data.get("continue"):
        break
    params_pageid.update({"sroffset": data["continue"]["sroffset"]})

print("Anzahl gesammelter Exzellenter Artikel: %s" %(len(ids)))
```

In [6]:
def get_pages_by_id(id):
    """Request the extract of one wiki page and store it in ``content``.

    The request uses ``params_content`` with ``pageids`` set to ``id``. After
    every tenth stored page the progress bar is refreshed.

    Args:
        id: Page id to request; also the key under which the extract is stored.
    """
    query = {**params_content, "pageids": id}
    payload = S.get(url = URL, params = query).json()
    content[id] = payload["query"]["pages"][0]["extract"]

    fetched = len(content)
    if fetched % 10 == 0:
        update_progress(fetched / len(ids))

In [7]:
# Query parameters for requesting the plain-text extract of a single page.
# "pageids" is only a placeholder here: get_pages_by_id overwrites it in a
# copy of this dict for every request.
# Source: https://stackoverflow.com/questions/4452102/how-to-get-plain-text-out-of-wikipedia
# Source: https://www.mediawiki.org/wiki/API:Parsing_wikitext
params_content = dict(
    action = "query",
    prop = "extracts",
    format = "json",
    formatversion = 2,
    pageids = 0,
    explaintext = True,
)

```python
# Fetch the extracts of all excellent articles in parallel.
content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    # submit() instead of executor.map(): map only re-raises a worker's
    # exception when its result is retrieved, and the results were never
    # retrieved, so failed requests went unnoticed.
    futures = {executor.submit(get_pages_by_id, page_id): page_id for page_id in ids}

# leaving the with-block waits for all workers, so every future is done here
end = time.time()
dur = end - start
print("DUR: %s" % (dur))

failed_ids = [page_id for future, page_id in futures.items() if future.exception() is not None]
if failed_ids:
    print("WARNING: %s of %s pages could not be fetched (see failed_ids)" % (len(failed_ids), len(ids)))
```

In [9]:
# number of page extracts stored in `content`
len(content)

2796

```python
# turn the {pageid: extract} mapping into a two-column frame, one row per page
df = pd.DataFrame(content.items()).rename(columns = {0: "pageid", 1: "content"})
```

```python
# save data to csv for faster loading
# TODO: adapt the file name dynamically
# --> replace the leading number with the number of pages in content
# NOTE(review): the load cell below reads this exact file name, so both have
# to be changed together
df.to_csv("./2783_excellent_article_extract.csv")
```

Die angefragten Daten können auch aus dem Dateisystem geladen werden.

In [12]:
# read the saved extracts back in and keep only the two payload columns
# (any other column in the file, e.g. a written index, is dropped)
data = pd.read_csv("./2783_excellent_article_extract.csv").loc[:, ["pageid", "content"]]

In [13]:
# preview the first 15 rows of the loaded pages
data.head(15)

Unnamed: 0,pageid,content
0,2677,Kanada (englisch und französisch Canada) ist e...
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."
3,16565,Frankfurt am Main () ist mit 759.224 Einwohner...
4,1200964,Australien (amtlicher deutscher Name; englisch...
5,880316,"Finnland (finnisch [ˈsuɔmi], schwedisch Finla..."
6,1428,"Eine Enzyklopädie (), früher auch aus dem Fran..."
7,290,Argentinien (spanisch [aɾxenˈtina]) ist eine ...
8,29938,Die Republik Südafrika (RSA) ist ein Staat im ...
9,2391,Indien [ˈɪndi̯ən] (Eigennamen unter anderem Hi...


## preprocess data

In [14]:
def preprocess_data(text, idx):
    """Clean the plain-text extract of one wiki page and store it in ``preprocessed_content``.

    Section headings, "(= ...)" remnants and runs of dashes are removed and all
    whitespace (including line breaks) is collapsed to single spaces. Wikitext
    markup does not have to be stripped because the pages are requested as
    plain text (``explaintext``).

    Args:
        text: Page extract to clean.
        idx: Page id; used as key in ``preprocessed_content``.
    """
    # Section headings ("== Title ==", "=== Sub ===", ...) are removed while
    # the line structure is still intact. Matching "==.*?==" on the flattened
    # text left a stray "=" behind for level-3 headings and kept the title of
    # level-4 headings.
    text = re.sub(r"^[ \t]*==+[^\n]*?==+[ \t]*$", " ", text, flags = re.MULTILINE)

    # Replace line breaks with a space (not with ""), otherwise the last word
    # of a line is glued to the first word of the next one.
    text = re.sub(r"\n", " ", text)

    text = re.sub(r"\(=.*?\)", "", text)
    text = re.sub(r"--+", "", text)

    #source: https://stackoverflow.com/questions/2077897/substitute-multiple-whitespace-with-single-whitespace-in-python
    text = re.sub(r"\s+", " ", text).strip()

    preprocessed_content.update({idx: text})

    if len(preprocessed_content) % 10 == 0:
        # `data` is the frame that is being processed; `ids` can have a
        # different length (or be undefined) when the pages come from the CSV
        update_progress(len(preprocessed_content) / len(data))

In [15]:
# Clean all loaded pages in parallel.
preprocessed_content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    # submit() instead of executor.map(): map only re-raises a worker's
    # exception when its result is retrieved, and the results were never
    # retrieved, so pages that failed to preprocess were dropped silently.
    futures = [
        executor.submit(preprocess_data, text, page_id)
        for text, page_id in zip(data["content"], data["pageid"])
    ]

# leaving the with-block waits for all workers, so every future is done here
end = time.time()
dur = end - start
print("DUR: %s" % (dur))

errors = [future.exception() for future in futures if future.exception() is not None]
if errors:
    print("WARNING: preprocessing failed for %s pages, first error: %r" % (len(errors), errors[0]))

Progress: [#################################################-] 98.8%
Progress: [##################################################] 99.1%
Progress: [#################################################-] 98.0%
Progress: [##################################################] 99.5%
Progress: [#################################################-] 98.4%
DUR: 5.9012627601623535


In [16]:
# number of pages stored in `preprocessed_content`
len(preprocessed_content)

2796

In [17]:
# turn the {pageid: cleaned text} mapping into a two-column frame
preprocessed_df = pd.DataFrame(preprocessed_content.items()).rename(
    columns = {0: "pageid", 1: "content"}
)

preprocessed_df.head()

Unnamed: 0,pageid,content
0,2677,Kanada (englisch und französisch Canada) ist e...
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."
3,1200964,Australien (amtlicher deutscher Name; englisch...
4,880316,"Finnland (finnisch [ˈsuɔmi], schwedisch Finlan..."


In [18]:
# save preprocessed wiki pages to filesystem
preprocessed_df.to_csv("./data/preprocessed_wiki_pages.csv")

backup
```python
# load the German spaCy model; the commented-out command below downloads it
#!python3 -m spacy download de_core_news_md
nlp = spacy.load('de_core_news_md')
```

```python
# NOTE(review): `t` is not defined in the visible notebook and
# `german_stopwords` is only defined in a later cell.
t_word_list = t.split(" ")

# lemmatization: every word is sent through the spaCy pipeline on its own and
# the lemmas of the tokens it yields are joined again
text_lemma = [
    ' '.join(token.lemma_ for token in nlp(raw_word))
    for raw_word in t_word_list
]

# one token list per lemmatized word
lemma_word_list = [
    gensim.utils.simple_preprocess(lemma, deacc = True) for lemma in text_lemma
]

# remove stopwords; only the first token of each list is considered
# source: https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
final = [
    tokens[0]
    for tokens in lemma_word_list
    if tokens and tokens[0] not in german_stopwords
]

text_without_stopwords = " ".join(final)
```

## extract keywords
using YAKE's keyword extractor  
(https://liaad.github.io/yake/)

YAKE bringt eine eigene Stopwort-Liste mit (https://github.com/LIAAD/yake/blob/master/yake/StopwordsList/stopwords_de.txt)  
--> Keine weitere Stopwort-Suche benötigt

Eine Lemmatisierung wird ebenfalls nicht benötigt, weil YAKE ohne eine solche trainiert wurde  

Je niedriger der Score, desto relevanter das Keyword (https://liaad.github.io/yake/docs/getting_started.html#output)

```python
# German stopword list from the NLTK stopwords corpus (see the commented-out
# nltk.download call in the import cell)
german_stopwords = stopwords.words("german")
```

```python
# settings of the experimental extractor: German text, n-grams of up to 20
# words, deduplication limit 0.9, ten keywords per text
language, numOfKeywords = "de", 10
max_ngram_size, deduplication_threshold = 20, 0.9

custom_keyword_extractor = yake.KeywordExtractor(
    lan = language, n = max_ngram_size,
    dedupLim = deduplication_threshold, top = numOfKeywords,
    features = None,
)
```

In [19]:
# extractor configuration: German text, keywords of up to two words,
# ten keywords per text
kw_extractor_config = {
    "lan": "de",
    "dedupLim": 0.99,
    "top": 10,
    "dedupFunc": "seqm", # default: seqm, alternative: jaro
    "n": 2,
    "windowsSize": 1,
}
kw_extractor = yake.KeywordExtractor(**kw_extractor_config)

keyword extraction for the first 5 pages

```python
# Trial run: print the keywords of the first five pages together with the
# mean of their scores.
start = time.time()
for wiki_page in preprocessed_df["content"][:5]:
    keywords = kw_extractor.extract_keywords(wiki_page)

    for kw in keywords:
        print(kw)

    # kw is a (keyword, score) tuple. The scores are averaged over the list;
    # np.mean of the already summed-up value would just return the sum.
    scores = [kw[1] for kw in keywords]
    print("MEAN: %s" % (np.mean(scores) if scores else float("nan")))
    print("#" * 30 + "\n")
end = time.time()

print("DUR: %s" % (end - start))
```

In [20]:
def extract_keywords(text, idx):
    """Extract the keywords of one wiki page with YAKE and store them in ``wikipage_keywords``.

    Args:
        text: Preprocessed page text.
        idx: Page id; used as key in ``wikipage_keywords``.
    """
    # a fresh extractor is created for every call, so no extractor instance
    # is shared between the worker threads
    kw_extractor = yake.KeywordExtractor(
        lan = "de",
        dedupLim = 0.99,
        top = 10,
        dedupFunc = "seqm", # default: seqm, alternative: jaro
        n = 2,
        windowsSize = 1
    )

    wikipage_keywords.update({idx: kw_extractor.extract_keywords(text)})

    # `preprocessed_df` is the frame that is being processed; `ids` can have a
    # different length, which kept the bar from reaching 100 %
    update_progress(len(wikipage_keywords) / len(preprocessed_df))

In [21]:
# Extract the keywords of all preprocessed pages in parallel.
wikipage_keywords = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    # submit() instead of executor.map(): map only re-raises a worker's
    # exception when its result is retrieved, and the results were never
    # retrieved, so pages whose extraction failed were dropped silently.
    futures = [
        executor.submit(extract_keywords, text, page_id)
        for text, page_id in zip(preprocessed_df["content"], preprocessed_df["pageid"])
    ]

# leaving the with-block waits for all workers, so every future is done here
end = time.time()
dur = end - start
print("DUR: %s" % (dur))

errors = [future.exception() for future in futures if future.exception() is not None]
if errors:
    print("WARNING: keyword extraction failed for %s pages, first error: %r" % (len(errors), errors[0]))

Progress: [##################################################] 99.7%
DUR: 2454.962177991867


In [22]:
# number of pages stored in `wikipage_keywords`
len(wikipage_keywords)

2796

In [23]:
# turn the {pageid: keyword list} mapping into a two-column frame
keywords_df = pd.DataFrame(wikipage_keywords.items()).rename(
    columns = {0: "pageid", 1: "keywords"}
)

In [24]:
# preview the extracted keywords; each entry is a list of (keyword, score) tuples
keywords_df.head()

Unnamed: 0,pageid,keywords
0,290,"[(Buenos Aires, 0.0003009614302623394), (Argen..."
1,880316,"[(Finnland, 0.0006628069903539394), (Finnlands..."
2,1200964,"[(Australien, 0.0006621153549226484), (Austral..."
3,2391,"[(Indien, 0.00048282594130112937), (Indiens, 0..."
4,490244,"[(Philosophie, 0.0002460226827235382), (prakti..."


In [25]:
# save keywords with corresponding page id
keywords_df.to_csv("./data/wikipage_keywords.csv")

## results

In [26]:
# join wikipages with (predicted) keywords on pageid:
# index both frames by pageid, keep only ids present in both, then turn the
# index back into a regular column
pages_by_id = preprocessed_df.set_index("pageid")
keywords_by_id = keywords_df.set_index("pageid")
results = pd.concat([pages_by_id, keywords_by_id], axis = 1, join = "inner").reset_index()

In [27]:
# preview the joined frame (page content next to its keywords)
results.head()

Unnamed: 0,pageid,content,keywords
0,2677,Kanada (englisch und französisch Canada) ist e...,"[(Kanada, 0.0005067610926873967), (Kanadas, 0...."
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...,"[(Philosophie, 0.0002460226827235382), (prakti..."
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i...","[(Moskau, 0.00022634304800017076), (Stadt Mosk..."
3,1200964,Australien (amtlicher deutscher Name; englisch...,"[(Australien, 0.0006621153549226484), (Austral..."
4,880316,"Finnland (finnisch [ˈsuɔmi], schwedisch Finlan...","[(Finnland, 0.0006628069903539394), (Finnlands..."


## evaluation

In [28]:
#test