# Abgabe zur Vorlesung "Forschungsthemen Informatik" von Jasmin Noll

In diesem Notebook wird der **2. Shared Task: Top Modelling auf Artikel aus DE-Wikipedia** bearbeitet.  
<Kurze Erläuterung der Aufgabe und was in diesem Notebook erwartet wird>

In [1]:
import pandas as pd
import numpy as np
import time
import requests
import concurrent.futures
import re
import string
import spacy
import gensim
import yake
from gensim import corpora
from nltk.corpus import stopwords
from IPython.display import clear_output
from sklearn.metrics import mean_squared_error
#nltk.download("stopwords")

import warnings
warnings.filterwarnings("ignore")

In [2]:
# source: IBM internal
def update_progress(progress):
    bar_length = 50
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1

    block = int(round(bar_length * progress))

    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

## request data

In [3]:
URL = "https://de.wikipedia.org/w/api.php"

```python
S = requests.Session()
```

```python
# request excellent arictles from german wikipedia via wiki api (10 at a time)
params_pageid = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "rvslots": "*",
    "format": "json",
    "formatversion": 2,
    "srsearch": "incategory:Wikipedia:Exzellent",
    "list": "search",
    "sroffset": 0
}

response = S.get(url = URL, params = params_pageid)
data = response.json()

# get ids from excellent articles
ids = []

for entry in data["query"]["search"]:
    ids.append(entry["pageid"])

while data.get("continue"):
    params_pageid.update({"sroffset": data["continue"]["sroffset"]})
    
    #print("\n%s" % (PARAMS))
    response = S.get(url = URL, params = params_pageid)
    data = response.json()
    
    for entry in data["query"]["search"]:
        ids.append(entry["pageid"])

print("Anzahl gesammelter Exzellenter Artikel: %s" %(len(ids)))
```

In [4]:
def get_pages_by_id(id):
    request_params = params_content.copy()
    request_params.update({"pageids": id})
    response = S.get(url = URL, params = request_params)
    page = response.json()
    content.update({id: page["query"]["pages"][0]["extract"]})
    if len(content) % 10 == 0:
        update_progress(len(content) / len(ids))

In [5]:
# Source: https://stackoverflow.com/questions/4452102/how-to-get-plain-text-out-of-wikipedia
# Source: https://www.mediawiki.org/wiki/API:Parsing_wikitext
params_content = {
    "action": "query",
    "prop": "extracts",
    "format": "json",
    "formatversion": 2,
    "pageids": 0,
    "explaintext": True
}

```python
content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(get_pages_by_id, ids)

end = time.time()
dur = end - start
print("DUR: %s" % (dur))
```

In [7]:
#len(content)

```python
# save requested data (wikipage content) in df
df = pd.DataFrame(content.items())
df = df.rename({0: "pageid", 1:"content"}, axis = 1)
```

```python
# save data to csv for faster loading
# TODO: Den Dateinamen dynamisch anpassen
# --> Die Zahl vorne dynamisch durch die Anzahl an Seiten in content ersetzten
df.to_csv("./2783_excellent_article_extract.csv")
```

TODO: Einige Statistiken der Daten aufzeigen (Wie viele Artikel?, Wie lang sind die Artikel (im Durchschnitt)?, Duplikate, etc.)

Die angefragten Daten können auch vom filesystem geladen werden

In [8]:
# load data from file
data = pd.read_csv("./2783_excellent_article_extract.csv")

data = data[["pageid", "content"]]

In [9]:
data.head(15)

Unnamed: 0,pageid,content
0,2677,Kanada (englisch und französisch Canada) ist e...
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."
3,16565,Frankfurt am Main () ist mit 759.224 Einwohner...
4,1200964,Australien (amtlicher deutscher Name; englisch...
5,880316,"Finnland (finnisch [ˈsuɔmi], schwedisch Finla..."
6,1428,"Eine Enzyklopädie (), früher auch aus dem Fran..."
7,290,Argentinien (spanisch [aɾxenˈtina]) ist eine ...
8,29938,Die Republik Südafrika (RSA) ist ein Staat im ...
9,2391,Indien [ˈɪndi̯ən] (Eigennamen unter anderem Hi...


## preprocess data

In [10]:
def preprocess_data(text, idx):
    text = re.sub(r"\n", "", text)
    #t = re.sub(r"\{{2}.*?\}{2}", "", t)
    #t = re.sub(r"\}{2}", "", t)
    #t = re.sub(r"\[\[[0-9A-Za-z\s()]*?\|", "", t)
    #t = re.sub(r"\[\[", "", t)
    #t = re.sub(r"\]\]", "", t)
    #t = re.sub(r"\[.*?\]", "", t)
    #t = re.sub(r"<ref.*?/ref>", "", t)
    #t = re.sub(r"<.*?>", "", t)
    #t = re.sub(r"\|", " ", t)
    #t = re.sub(r"\'", "", t)

    # source: url_extract_pattern from https://uibakery.io/regex-library/url-regex-python
    #url_extract_pattern = "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)"
    #t = re.sub(url_extract_pattern, '', t)

    text = re.sub(r"\(=.*?\)", "", text)
    text = re.sub(r"--+", "", text)
    text = re.sub(r"==.*?==", "", text)
    #t = re.sub(r"==+", "", t)
    #t = re.sub(r"\&nbsp;", "", t)
    # Dateinamen
    #t = re.sub(r"[^ ]*\..{4}|[^ ]*\..{3}", "", t)

    #source: https://stackoverflow.com/questions/2077897/substitute-multiple-whitespace-with-single-whitespace-in-python
    _RE_COMBINE_WHITESPACE = re.compile(r"\s+")
    text = _RE_COMBINE_WHITESPACE.sub(" ", text).strip()
    
    preprocessed_content.update({idx: text})
    
    if len(preprocessed_content) % 10 == 0:
        update_progress(len(preprocessed_content) / len(ids))

In [11]:
preprocessed_content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(preprocess_data, data["content"], data["pageid"])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

DUR: 5.79799222946167


In [12]:
len(preprocessed_content)

2796

In [13]:
preprocessed_df = pd.DataFrame(preprocessed_content.items())
preprocessed_df = preprocessed_df.rename({0: "pageid", 1:"content"}, axis = 1)

preprocessed_df.head()

Unnamed: 0,pageid,content
0,2677,Kanada (englisch und französisch Canada) ist e...
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."
3,1200964,Australien (amtlicher deutscher Name; englisch...
4,16565,Frankfurt am Main () ist mit 759.224 Einwohner...


In [14]:
# save preproessed wiki pages to filesystem
preprocessed_df.to_csv("./data/preprocessed_wiki_pages.csv")

backup
```python
#!python3 -m spacy download de_core_news_md
nlp = spacy.load('de_core_news_md')
```

```python
t_word_list = t.split(" ")

# lemmatization
text_lemma = []

for ix, word in enumerate(t_word_list):
    doc = nlp(word)
    result = ' '.join([x.lemma_ for x in doc]) 
    text_lemma.append(result)

lemma_word_list = [gensim.utils.simple_preprocess(word, deacc = True) for word in text_lemma]

# remove stopwords
# source: https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
#final = [word for word in lemma_word_list if word not in german_stopwords]

final = []
for word in lemma_word_list:
    if len(word) > 0 and word[0] not in german_stopwords:
        final.append(word[0])

text_without_stopwords = " ".join(final)
```

## extract keywords

In [15]:
preprocessed_df = pd.read_csv("./data/preprocessed_wiki_pages.csv")
preprocessed_df = preprocessed_df[["pageid", "content"]]

In [16]:
preprocessed_df.head()

Unnamed: 0,pageid,content
0,2677,Kanada (englisch und französisch Canada) ist e...
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."
3,1200964,Australien (amtlicher deutscher Name; englisch...
4,16565,Frankfurt am Main () ist mit 759.224 Einwohner...


### YAKE

using yakes keyword extractor  
(https://liaad.github.io/yake/)

yake hat eine eigene stopwords liste (https://github.com/LIAAD/yake/blob/master/yake/StopwordsList/stopwords_de.txt)  
--> Keine weitere Stopwords suche benötigt

YAKE ist sehr schnell (und leichter) Ansatz (https://towardsdatascience.com/unsupervised-keyphrase-extraction-with-patternrank-28ec3ca737f0)

Lemma wird auch nicht benötigt, weil Yake ohne entsprechend trainiert wurde  

Je geringer der Wahrscheinlichkeitswert, desto relevanter das Keyword (https://liaad.github.io/yake/docs/getting_started.html#output)

In [17]:
#german_stopwords = stopwords.words("german")
german_stopwords = pd.read_csv("./yake_de-stopwords.csv")
german_stopwords = list(german_stopwords["stopwords"])

```python
language = "de"
max_ngram_size = 20
deduplication_threshold = 0.9
numOfKeywords = 10
custom_keyword_extractor = yake.KeywordExtractor(
    lan = language, 
    n = max_ngram_size, 
    dedupLim = deduplication_threshold, 
    top = numOfKeywords, 
    features = None
)
```

In [18]:
kw_extractor = yake.KeywordExtractor(
    lan = "de",
    dedupLim = 0.99,
    top = 10,
    dedupFunc = "seqm", # default: seqm, alternative: jaro
    n = 2,
    windowsSize = 1
)

keyword extraction for the first 10 pages

```python
start = time.time()
for wiki_page in preprocessed_df["content"][:5]:
    keywords = kw_extractor.extract_keywords(wiki_page)
    prop = 0

    for kw in keywords:
    #    if kw[0] not in german_stopwords:
        print(kw)
        prop += kw[1]
            
    print("MEAN: %s" % (np.mean(prop)))
    print("#" * 30 + "\n")
end = time.time()

print("DUR: %s" % (end - start))
```

In [19]:
wikipage_keywords = {}
start = time.time()

for wiki_page, idx in zip(preprocessed_df["content"], preprocessed_df["pageid"]):
    keywords = kw_extractor.extract_keywords(wiki_page)
    wikipage_keywords.update({idx: keywords})
    
    update_progress(len(wikipage_keywords) / preprocessed_df.shape[0])
    
end = time.time()

print("DUR: %s" % (end - start))

Progress: [##################################################] 100.0%
DUR: 809.5952770709991


BACKUP
```python
def extract_keywords(text, idx):
    keywords = []
    kw_extractor = yake.KeywordExtractor(
        lan = "de",
        dedupLim = 0.99,
        top = 10,
        dedupFunc = "seqm", # default: seqm, alternative: jaro
        n = 2,
        windowsSize = 1
    )
    
    extracted_keywords = kw_extractor.extract_keywords(text)
    wikipage_keywords.update({idx: extracted_keywords})
    #if len(wikipage_keywords) % 10 == 0:
    update_progress(len(wikipage_keywords) / len(ids))

    
wikipage_keywords = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(extract_keywords, preprocessed_df["content"], preprocessed_df["pageid"])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))
```

In [20]:
len(wikipage_keywords)

2796

In [21]:
keywords_df = pd.DataFrame(wikipage_keywords.items())
keywords_df = keywords_df.rename({0: "pageid", 1:"keywords"}, axis = 1)

In [22]:
keywords_df.head()

Unnamed: 0,pageid,keywords
0,2677,"[(Kanada, 0.0005067610926873967), (Kanadas, 0...."
1,490244,"[(Philosophie, 0.0002460226827235382), (prakti..."
2,3221050,"[(Moskau, 0.00022634304800017076), (Stadt Mosk..."
3,1200964,"[(Australien, 0.0006621153549226484), (Austral..."
4,16565,"[(Stadt Frankfurt, 2.584346599547873e-05), (Fr..."


In [None]:
# save keywords with coresponding page id
keywords_df.to_csv("./data/wikipage_keywords.csv")

#### results

In [23]:
# join wikipages with (predicted) keywords on pageid
results = pd.concat(
    [preprocessed_df.set_index("pageid"), keywords_df.set_index("pageid")], axis = 1, join = "inner"
).reset_index()

In [24]:
results.head()

Unnamed: 0,pageid,content,keywords
0,2677,Kanada (englisch und französisch Canada) ist e...,"[(Kanada, 0.0005067610926873967), (Kanadas, 0...."
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...,"[(Philosophie, 0.0002460226827235382), (prakti..."
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i...","[(Moskau, 0.00022634304800017076), (Stadt Mosk..."
3,1200964,Australien (amtlicher deutscher Name; englisch...,"[(Australien, 0.0006621153549226484), (Austral..."
4,16565,Frankfurt am Main () ist mit 759.224 Einwohner...,"[(Stadt Frankfurt, 2.584346599547873e-05), (Fr..."


## evaluation

In [102]:
counter_keywords = {}
start = time.time()
for idx, content, keywords in zip(results["pageid"], results["content"], results["keywords"]):
    counter = {}
    text = content.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.split(" ")
    
    for word in text:
        if word not in german_stopwords and len(word) != 0:
            if word in counter.keys():
                counter.update({word: counter.get(word) + 1})
            else:
                counter.update({word: 1})
    
    sorted_counter = dict(sorted(counter.items(), key = lambda item: item[1], reverse = True))
    
    #prop = {}
    base = list(sorted_counter.items())[0][1]
    base = base * 1.05 # TODO: explain why "* 1.05" (5% addition to max occurance of most occuring word)
    counts = []
    for key, val in sorted_counter.items():
        #prop.update({key: {"count": val, "prop": (1 - (val / base)) / 100}})
        counts.append((key, (1 - (val / base)) / 100))
    
    counter_keywords.update({idx: counts})
    
    update_progress(len(counter_keywords) / results.shape[0])
    
end = time.time()
dur = end - start
print("DUR: %s" % (dur))

Progress: [##################################################] 100.0%
DUR: 96.678386926651


In [104]:
results_counter = pd.DataFrame(counter_keywords.items())
results_counter = results_counter.rename({0: "pageid", 1: "counter_keywords"}, axis = 1)
results_counter.head(10)

Unnamed: 0,pageid,counter_keywords
0,2677,"[(kanada, 0.00047619047619047673), (québec, 0...."
1,490244,"[(philosophie, 0.00047619047619047673), (–, 0...."
2,3221050,"[(moskau, 0.00047619047619047673), (stadt, 0.0..."
3,1200964,"[(australien, 0.00047619047619047673), (austra..."
4,16565,"[(frankfurt, 0.00047619047619047673), (stadt, ..."
5,880316,"[(finnland, 0.00047619047619047673), (finnland..."
6,290,"[(argentinien, 0.00047619047619047673), (de, 0..."
7,29938,"[(südafrika, 0.00047619047619047673), (land, 0..."
8,2391,"[(indien, 0.00047619047619047673), (indiens, 0..."
9,1428,"[(enzyklopädie, 0.00047619047619047673), (enzy..."


In [107]:
eval = {} 
start = time.time()
for idx, kw_yake, kw_count in zip(results["pageid"], results["keywords"], results_counter["counter_keywords"]):
    comparison = {}

    for keyword in kw_yake:
        comparison.update({
            keyword[0].lower(): {
                "yake": keyword[1],
                "counter": 1.0
            }
        })

    for keyword in kw_count:
        if keyword[0] in comparison.keys():
            comparison.update({
                keyword[0]: {
                    "yake": comparison[keyword[0]].get("yake"),
                    "counter": keyword[1]
                }
            })
            
    pred = []
    true = []
    for key, val in comparison.items():
        pred.append(val.get("yake"))
        true.append(val.get("counter"))
    
    mse = mean_squared_error(true, pred)
    eval.update({idx: mse})
    update_progress(len(eval) / results.shape[0])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

Progress: [##################################################] 100.0%
DUR: 4.271485805511475


In [108]:
eval_df = pd.DataFrame(eval.items())
eval_df = eval_df.rename({0: "pageid", 1:"MSE"}, axis = 1)

eval_df.head()

Unnamed: 0,pageid,MSE
0,2677,0.297487
1,490244,0.495126
2,3221050,0.596327
3,1200964,0.397678
4,16565,0.498944


In [109]:
results_eval = pd.concat([
    results.set_index("pageid"), 
    results_counter.set_index("pageid"),
    eval_df.set_index("pageid")
], axis = 1).reset_index()

In [110]:
results_eval.head(10)

Unnamed: 0,pageid,content,keywords,counter_keywords,MSE
0,2677,Kanada (englisch und französisch Canada) ist e...,"[(Kanada, 0.0005067610926873967), (Kanadas, 0....","[(kanada, 0.00047619047619047673), (québec, 0....",0.297487
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...,"[(Philosophie, 0.0002460226827235382), (prakti...","[(philosophie, 0.00047619047619047673), (–, 0....",0.495126
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i...","[(Moskau, 0.00022634304800017076), (Stadt Mosk...","[(moskau, 0.00047619047619047673), (stadt, 0.0...",0.596327
3,1200964,Australien (amtlicher deutscher Name; englisch...,"[(Australien, 0.0006621153549226484), (Austral...","[(australien, 0.00047619047619047673), (austra...",0.397678
4,16565,Frankfurt am Main () ist mit 759.224 Einwohner...,"[(Stadt Frankfurt, 2.584346599547873e-05), (Fr...","[(frankfurt, 0.00047619047619047673), (stadt, ...",0.498944
5,880316,"Finnland (finnisch [ˈsuɔmi], schwedisch Finlan...","[(Finnland, 0.0006628069903539394), (Finnlands...","[(finnland, 0.00047619047619047673), (finnland...",0.393205
6,290,Argentinien (spanisch [aɾxenˈtina]) ist eine R...,"[(Buenos Aires, 0.0003009614302623394), (Argen...","[(argentinien, 0.00047619047619047673), (de, 0...",0.298368
7,29938,Die Republik Südafrika (RSA) ist ein Staat im ...,"[(Südafrika, 0.0003601999013826174), (South Af...","[(südafrika, 0.00047619047619047673), (land, 0...",0.299209
8,2391,Indien [ˈɪndi̯ən] (Eigennamen unter anderem Hi...,"[(Indien, 0.00048282594130112937), (Indiens, 0...","[(indien, 0.00047619047619047673), (indiens, 0...",0.395971
9,1428,"Eine Enzyklopädie (), früher auch aus dem Fran...","[(Encyclopaedia Britannica, 0.0002825556238659...","[(enzyklopädie, 0.00047619047619047673), (enzy...",0.199652


In [111]:
print("MAX: %s\nMIN: %s\nMEAN: %s" % (
    np.max(results_eval["MSE"]), np.min(results_eval["MSE"]), np.mean(results_eval["MSE"])
))

MAX: 0.8972913949780681
MIN: 8.389024509411331e-07
MEAN: 0.37429738489376824


In [113]:
#yake
results_eval["keywords"][0]

[('Kanada', 0.0005067610926873967),
 ('Kanadas', 0.001809861045312131),
 ('Québec', 0.0027778966414005444),
 ('Canada', 0.0033584718003683797),
 ('Provinz Kanada', 0.0038343013787126064),
 ('Vereinigten Staaten', 0.003935178926941326),
 ('Toronto', 0.004710976358477465),
 ('Vancouver Island', 0.004855747172258857),
 ('Ontario', 0.005459060069660775),
 ('Canadian', 0.006147449520330219)]

In [115]:
#counter
results_eval["counter_keywords"][0][:15]

[('kanada', 0.00047619047619047673),
 ('québec', 0.0061697722567287784),
 ('of', 0.0066873706004140785),
 ('canada', 0.00679089026915114),
 ('toronto', 0.0069979296066252595),
 ('kanadische', 0.007308488612836439),
 ('kanadas', 0.007308488612836439),
 ('ontario', 0.007412008281573499),
 ('canadian', 0.007412008281573499),
 ('kanadischen', 0.007722567287784679),
 ('chr', 0.007722567287784679),
 ('land', 0.00782608695652174),
 ('vancouver', 0.00782608695652174),
 ('bevölkerung', 0.008033126293995859),
 ('act', 0.008136645962732919)]