# Abgabe zur Vorlesung "Forschungsthemen Informatik" von Jasmin Noll

In diesem Notebook wird der **2. Shared Task: Top Modelling auf Artikel aus DE-Wikipedia** bearbeitet.  
<Kurze Erläuterung der Aufgabe und was in diesem Notebook erwartet wird>

In [1]:
import pandas as pd
import numpy as np
import time
import requests
import concurrent.futures
import re
import string
import spacy
import gensim
import yake
from gensim import corpora
from nltk.corpus import stopwords
from IPython.display import clear_output
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from rake_nltk import Rake, Metric
from keybert import KeyBERT
#nltk.download("stopwords")

import warnings
warnings.filterwarnings("ignore")

Konfigurationsvariablen/-werte setzten (API-Params, Magicnumbers, Parameter der Algorithmen, etc.)

In [2]:
# Configuration values (adjust as needed)

# search criteria for wikipedia articles to process
SR_SEARCH_VALUE = "incategory:Wikipedia:Exzellent"

In [3]:
# source: IBM internal
def update_progress(progress):
    bar_length = 50
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1

    block = int(round(bar_length * progress))

    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

## request data

In [4]:
URL = "https://de.wikipedia.org/w/api.php"

```python
S = requests.Session()
```

```python
# request excellent arictles from german wikipedia via wiki api (10 at a time)
params_pageid = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "rvslots": "*",
    "format": "json",
    "formatversion": 2,
    #"srsearch": "incategory:Wikipedia:Exzellent",
    "srsearch": SR_SEARCH_VALUE,
    "list": "search",
    "sroffset": 0
}

response = S.get(url = URL, params = params_pageid)
data = response.json()

# get ids from excellent articles
ids = []

for entry in data["query"]["search"]:
    ids.append(entry["pageid"])

while data.get("continue"):
    params_pageid.update({"sroffset": data["continue"]["sroffset"]})
    
    #print("\n%s" % (PARAMS))
    response = S.get(url = URL, params = params_pageid)
    data = response.json()
    
    for entry in data["query"]["search"]:
        ids.append(entry["pageid"])

print("Anzahl gesammelter Exzellenter Artikel: %s" %(len(ids)))
```

In [5]:
def get_pages_by_id(id):
    request_params = params_content.copy()
    request_params.update({"pageids": id})
    response = S.get(url = URL, params = request_params)
    page = response.json()
    content.update({id: page["query"]["pages"][0]["extract"]})
    if len(content) % 10 == 0:
        update_progress(len(content) / len(ids))

In [6]:
# Source: https://stackoverflow.com/questions/4452102/how-to-get-plain-text-out-of-wikipedia
# Source: https://www.mediawiki.org/wiki/API:Parsing_wikitext
params_content = {
    "action": "query",
    "prop": "extracts",
    "format": "json",
    "formatversion": 2,
    "pageids": 0,
    "explaintext": True
}

```python
content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(get_pages_by_id, ids)

end = time.time()
dur = end - start
print("DUR: %s" % (dur))
```

In [7]:
#len(content)

```python
# save requested data (wikipage content) in df
df = pd.DataFrame(content.items())
df = df.rename({0: "pageid", 1:"content"}, axis = 1)
```

```python
# save data to csv for faster loading
# TODO: Den Dateinamen dynamisch anpassen
# --> Die Zahl vorne dynamisch durch die Anzahl an Seiten in content ersetzten
df.to_csv("./2783_excellent_article_extract.csv")
```

TODO: Einige Statistiken der Daten aufzeigen (Wie viele Artikel?, Wie lang sind die Artikel (im Durchschnitt)?, Duplikate, etc.)

Die angefragten Daten können auch vom filesystem geladen werden

In [8]:
# load data from file
data = pd.read_csv("./2783_excellent_article_extract.csv")

data = data[["pageid", "content"]]

In [9]:
data.head(15)

Unnamed: 0,pageid,content
0,2677,Kanada (englisch und französisch Canada) ist e...
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."
3,16565,Frankfurt am Main () ist mit 759.224 Einwohner...
4,1200964,Australien (amtlicher deutscher Name; englisch...
5,880316,"Finnland (finnisch [ˈsuɔmi], schwedisch Finla..."
6,1428,"Eine Enzyklopädie (), früher auch aus dem Fran..."
7,290,Argentinien (spanisch [aɾxenˈtina]) ist eine ...
8,29938,Die Republik Südafrika (RSA) ist ein Staat im ...
9,2391,Indien [ˈɪndi̯ən] (Eigennamen unter anderem Hi...


## preprocess data

In [96]:
def preprocess_data(text, idx):
    text = re.sub(r"\n", "", text)
    #t = re.sub(r"\{{2}.*?\}{2}", "", t)
    #t = re.sub(r"\}{2}", "", t)
    #t = re.sub(r"\[\[[0-9A-Za-z\s()]*?\|", "", t)
    #t = re.sub(r"\[\[", "", t)
    #t = re.sub(r"\]\]", "", t)
    #t = re.sub(r"\[.*?\]", "", t)
    #t = re.sub(r"<ref.*?/ref>", "", t)
    #t = re.sub(r"<.*?>", "", t)
    #t = re.sub(r"\|", " ", t)
    #t = re.sub(r"\'", "", t)

    # source: url_extract_pattern from https://uibakery.io/regex-library/url-regex-python
    #url_extract_pattern = "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)"
    #t = re.sub(url_extract_pattern, '', t)

    text = re.sub(r"\(=.*?\)", "", text)
    text = re.sub(r"--+", "", text)
    text = re.sub(r"==.*?==", "", text)
    text = re.sub(r"=", "", text)
    #t = re.sub(r"==+", "", t)
    #t = re.sub(r"\&nbsp;", "", t)
    # Dateinamen
    #t = re.sub(r"[^ ]*\..{4}|[^ ]*\..{3}", "", t)

    #source: https://stackoverflow.com/questions/2077897/substitute-multiple-whitespace-with-single-whitespace-in-python
    _RE_COMBINE_WHITESPACE = re.compile(r"\s+")
    text = _RE_COMBINE_WHITESPACE.sub(" ", text).strip()
    
    preprocessed_content.update({idx: text})
    
    if len(preprocessed_content) % 10 == 0:
        update_progress(len(preprocessed_content) / len(ids))

In [97]:
preprocessed_content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(preprocess_data, data["content"], data["pageid"])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

DUR: 6.236737012863159


In [98]:
len(preprocessed_content)

2796

In [99]:
preprocessed_df = pd.DataFrame(preprocessed_content.items())
preprocessed_df = preprocessed_df.rename({0: "pageid", 1:"content"}, axis = 1)

preprocessed_df.head()

Unnamed: 0,pageid,content
0,2677,Kanada (englisch und französisch Canada) ist e...
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."
3,1200964,Australien (amtlicher deutscher Name; englisch...
4,16565,Frankfurt am Main () ist mit 759.224 Einwohner...


In [100]:
# save preproessed wiki pages to filesystem
preprocessed_df.to_csv("./data/preprocessed_wiki_pages.csv")

backup
```python
#!python3 -m spacy download de_core_news_md
nlp = spacy.load('de_core_news_md')
```

```python
t_word_list = t.split(" ")

# lemmatization
text_lemma = []

for ix, word in enumerate(t_word_list):
    doc = nlp(word)
    result = ' '.join([x.lemma_ for x in doc]) 
    text_lemma.append(result)

lemma_word_list = [gensim.utils.simple_preprocess(word, deacc = True) for word in text_lemma]

# remove stopwords
# source: https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
#final = [word for word in lemma_word_list if word not in german_stopwords]

final = []
for word in lemma_word_list:
    if len(word) > 0 and word[0] not in german_stopwords:
        final.append(word[0])

text_without_stopwords = " ".join(final)
```

## extract keywords

In [101]:
preprocessed_df = pd.read_csv("./data/preprocessed_wiki_pages.csv")
preprocessed_df = preprocessed_df[["pageid", "content"]]

In [102]:
preprocessed_df.head()

Unnamed: 0,pageid,content
0,2677,Kanada (englisch und französisch Canada) ist e...
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."
3,1200964,Australien (amtlicher deutscher Name; englisch...
4,16565,Frankfurt am Main () ist mit 759.224 Einwohner...


### YAKE

using yakes keyword extractor  
(https://liaad.github.io/yake/)

yake hat eine eigene stopwords liste (https://github.com/LIAAD/yake/blob/master/yake/StopwordsList/stopwords_de.txt)  
--> Keine weitere Stopwords suche benötigt

YAKE ist sehr schnell (und leichter) Ansatz (https://towardsdatascience.com/unsupervised-keyphrase-extraction-with-patternrank-28ec3ca737f0)

Lemma wird auch nicht benötigt, weil Yake ohne entsprechend trainiert wurde  

Je geringer der Wahrscheinlichkeitswert, desto relevanter das Keyword (https://liaad.github.io/yake/docs/getting_started.html#output)

In [103]:
#german_stopwords = stopwords.words("german")
german_stopwords = pd.read_csv("./yake_de-stopwords.csv")
german_stopwords = list(german_stopwords["stopwords"])

```python
language = "de"
max_ngram_size = 20
deduplication_threshold = 0.9
numOfKeywords = 10
custom_keyword_extractor = yake.KeywordExtractor(
    lan = language, 
    n = max_ngram_size, 
    dedupLim = deduplication_threshold, 
    top = numOfKeywords, 
    features = None
)
```

In [104]:
kw_extractor = yake.KeywordExtractor(
    lan = "de",
    dedupLim = 0.99,
    top = 10,
    dedupFunc = "seqm", # default: seqm, alternative: jaro
    n = 2,
    windowsSize = 1
)

keyword extraction for the first 10 pages

```python
start = time.time()
for wiki_page in preprocessed_df["content"][:5]:
    keywords = kw_extractor.extract_keywords(wiki_page)
    prop = 0

    for kw in keywords:
    #    if kw[0] not in german_stopwords:
        print(kw)
        prop += kw[1]
            
    print("MEAN: %s" % (np.mean(prop)))
    print("#" * 30 + "\n")
end = time.time()

print("DUR: %s" % (end - start))
```

In [105]:
wikipage_keywords = {}
start = time.time()

for wiki_page, idx in zip(preprocessed_df["content"], preprocessed_df["pageid"]):
    keywords = kw_extractor.extract_keywords(wiki_page)
    wikipage_keywords.update({idx: keywords})
    
    update_progress(len(wikipage_keywords) / preprocessed_df.shape[0])
    
end = time.time()

print("DUR: %s" % (end - start))

Progress: [##################################################] 100.0%
DUR: 838.1237599849701


BACKUP
```python
def extract_keywords(text, idx):
    keywords = []
    kw_extractor = yake.KeywordExtractor(
        lan = "de",
        dedupLim = 0.99,
        top = 10,
        dedupFunc = "seqm", # default: seqm, alternative: jaro
        n = 2,
        windowsSize = 1
    )
    
    extracted_keywords = kw_extractor.extract_keywords(text)
    wikipage_keywords.update({idx: extracted_keywords})
    #if len(wikipage_keywords) % 10 == 0:
    update_progress(len(wikipage_keywords) / len(ids))

    
wikipage_keywords = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(extract_keywords, preprocessed_df["content"], preprocessed_df["pageid"])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))
```

In [106]:
len(wikipage_keywords)

2796

In [107]:
keywords_df = pd.DataFrame(wikipage_keywords.items())
keywords_df = keywords_df.rename({0: "pageid", 1:"keywords"}, axis = 1)

In [108]:
keywords_df.head()

Unnamed: 0,pageid,keywords
0,2677,"[(Kanada, 0.0005525449347212504), (Kanadas, 0...."
1,490244,"[(Philosophie, 0.00024914998570791535), (ISBN,..."
2,3221050,"[(Moskau, 0.00024187077106055336), (Stadt Mosk..."
3,1200964,"[(Australien, 0.0006737985774128164), (Austral..."
4,16565,"[(Stadt Frankfurt, 2.626906686090427e-05), (Fr..."


In [109]:
# save keywords with coresponding page id
keywords_df.to_csv("./data/wikipage_keywords-YAKE.csv")

#### results

In [110]:
# join wikipages with (predicted) keywords on pageid
results = pd.concat(
    [preprocessed_df.set_index("pageid"), keywords_df.set_index("pageid")], axis = 1, join = "inner"
).reset_index()

In [111]:
results.head()

Unnamed: 0,pageid,content,keywords
0,2677,Kanada (englisch und französisch Canada) ist e...,"[(Kanada, 0.0005525449347212504), (Kanadas, 0...."
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...,"[(Philosophie, 0.00024914998570791535), (ISBN,..."
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i...","[(Moskau, 0.00024187077106055336), (Stadt Mosk..."
3,1200964,Australien (amtlicher deutscher Name; englisch...,"[(Australien, 0.0006737985774128164), (Austral..."
4,16565,Frankfurt am Main () ist mit 759.224 Einwohner...,"[(Stadt Frankfurt, 2.626906686090427e-05), (Fr..."


### RAKE

In [306]:
# initialize RAKE model
rake_model = Rake(
    language = "german",
    stopwords = german_stopwords,
    ranking_metric = Metric.WORD_FREQUENCY,
    max_length = 2,
    include_repeated_phrases = False,
    punctuations = ".,-;)"
)

In [307]:
rake_keywords = {}
start = time.time()

for wiki_page, idx in zip(preprocessed_df["content"], preprocessed_df["pageid"]):
    rake_model.extract_keywords_from_text(wiki_page)
    # get top 10 keyowords for wiki page
    ranking = rake_model.get_ranked_phrases_with_scores()[:10]
    # recalculate propability
    keyword_ranking = []
    base = ranking[0][0] * 1.05
    for rank in ranking:
        kw = rank[1]
        prop = rank[0]
        keyword_ranking.append((kw, (1 - (prop / base)) / 100))
        
    rake_keywords.update({idx: keyword_ranking})
    update_progress(len(rake_keywords) / preprocessed_df.shape[0])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

Progress: [##################################################] 100.0%
DUR: 46.47527194023132


In [308]:
rake_keywords_df = pd.DataFrame(rake_keywords.items())
rake_keywords_df = rake_keywords_df.rename({0: "pageid", 1:"keywords"}, axis = 1)

In [309]:
rake_keywords_df

Unnamed: 0,pageid,keywords
0,2677,"[(provinz kanada, 0.0004761904761904778), (kan..."
1,490244,"[(philosophie –, 0.00047619047619047673), (phi..."
2,3221050,"[(stadt moskau, 0.00047619047619047673), (mosk..."
3,1200964,"[(australischen regierung, 0.00047619047619047..."
4,16565,"[(stadt frankfurt, 0.00047619047619047673), (f..."
...,...,...
2791,12593625,"[(gemälde susanna, 0.00047619047619047673), (c..."
2792,10278319,"[(ikhwān al, 0.00047619047619047673), (veröffe..."
2793,12508588,"[(lokomotiven bewährt, 0.00047619047619047673)..."
2794,12587627,"[(leipzig 1990, 0.00047619047619047673), (leip..."


In [327]:
# save keywords with coresponding page id
rake_keywords_df.to_csv("./data/wikipage_keywords-RAKE.csv")

### KeyBERT

In [322]:
kw_model = KeyBERT()

In [323]:
bert_keywords = {}
start = time.time()

for wiki_page, idx in zip(preprocessed_df["content"], preprocessed_df["pageid"]): 
    # get top 10 keywords from wiki page
    ranking = kw_model.extract_keywords(
        wiki_page, 
        keyphrase_ngram_range = (1, 1), 
        stop_words = german_stopwords,
        top_n = 10,
        use_mmr = True,
        diversity = 0.3
    )
    # recalculate propability
    keyword_ranking = []
    base = ranking[0][1] * 1.05
    for rank in ranking:
        kw = rank[0]
        prop = rank[1]
        keyword_ranking.append((kw, (1 - (prop / base)) / 100))
    
    bert_keywords.update({idx: keyword_ranking})
    update_progress(len(bert_keywords) / preprocessed_df.shape[0])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

Progress: [##################################################] 100.0%
DUR: 7350.769551038742


In [324]:
bert_keywords_df = pd.DataFrame(bert_keywords.items())
bert_keywords_df = bert_keywords_df.rename({0: "pageid", 1:"keywords"}, axis = 1)

In [325]:
bert_keywords_df

Unnamed: 0,pageid,keywords
0,2677,"[(kanada, 0.0004761904761904756), (kanadiern, ..."
1,490244,"[(philosophisches, 0.00047619047619047673), (p..."
2,3221050,"[(moskau, 0.0004761904761904756), (moskausmosk..."
3,1200964,"[(australiens, 0.00047619047619047673), (austr..."
4,16565,"[(einwohnergleichwerten, 0.0004761904761904778..."
...,...,...
2791,12593625,"[(susanna, 0.00047619047619047673), (tintorett..."
2792,10278319,"[(ʿabdallāh, 0.0004761904761904756), (islamisc..."
2793,12508588,"[(coronations, 0.00047619047619047673), (bahng..."
2794,12587627,"[(bezirksleitung, 0.00047619047619047673), (le..."


In [326]:
# save keywords with coresponding page id
bert_keywords_df.to_csv("./data/wikipage_keywords-BERT.csv")

## evaluation

In [26]:
# count words (excluding german stopwords) in text
counter_keywords = {}
start = time.time()
for idx, content, keywords in zip(results["pageid"], results["content"], results["keywords"]):
    counter = {}
    text = content.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.split(" ")
    
    for word in text:
        if word not in german_stopwords and len(word) != 0:
            if word in counter.keys():
                counter.update({word: counter.get(word) + 1})
            else:
                counter.update({word: 1})
    
    sorted_counter = dict(sorted(counter.items(), key = lambda item: item[1], reverse = True))
    
    # calculate propability that these words are keywords
    #prop = {}
    # base for propability calculations -- occurance of most occuring word with 5% addition to it, 
    #   so propability isn't 1 for most occuring word
    base = list(sorted_counter.items())[0][1]
    base = base * 1.05 # TODO: rework explaination why "* 1.05" (5% addition to max occurance of most occuring word)
    counts = []
    for key, val in sorted_counter.items():
        #prop.update({key: {"count": val, "prop": (1 - (val / base)) / 100}})
        # append word with coresponding propability -- the lower the prop-value the bigger the propabiliy to be
        #   an important keyword
        counts.append((key, (1 - (val / base)) / 100))
    
    counter_keywords.update({idx: counts})
    
    update_progress(len(counter_keywords) / results.shape[0])
    
end = time.time()
dur = end - start
print("DUR: %s" % (dur))

Progress: [##################################################] 100.0%
DUR: 97.63415288925171


In [27]:
# df for keyword extraction using counting-method
results_counter = pd.DataFrame(counter_keywords.items())
results_counter = results_counter.rename({0: "pageid", 1: "counter_keywords"}, axis = 1)
results_counter.head(10)

Unnamed: 0,pageid,counter_keywords
0,2677,"[(kanada, 0.00047619047619047673), (québec, 0...."
1,490244,"[(philosophie, 0.00047619047619047673), (–, 0...."
2,3221050,"[(moskau, 0.00047619047619047673), (stadt, 0.0..."
3,1200964,"[(australien, 0.00047619047619047673), (austra..."
4,16565,"[(frankfurt, 0.00047619047619047673), (stadt, ..."
5,880316,"[(finnland, 0.00047619047619047673), (finnland..."
6,290,"[(argentinien, 0.00047619047619047673), (de, 0..."
7,29938,"[(südafrika, 0.00047619047619047673), (land, 0..."
8,2391,"[(indien, 0.00047619047619047673), (indiens, 0..."
9,18559,"[(–, 0.00047619047619047673), (kanji, 0.000915..."


In [28]:
# evaluation
eval = {} 
start = time.time()
for idx, kw_yake, kw_count in zip(results["pageid"], results["keywords"], results_counter["counter_keywords"]):
    # prep comparison of keywords from YAKE and counting
    comparison = {}
    selected_keywords = []
    """TODO: REMOVE
    # if YAKE keywords are build from single keywords, than check both separatly
    #while len(selected_keywords) < 5:
    for keyword in kw_yake:
        if len(keyword[0].split(" ")) == 1:
            selected_keywords.append(keyword[0])"""

    # set YAKE keywords propability in order of occurance
    for keyword in kw_yake:
        # if YAKE keywords are build from single keywords, than check both separatly
        for kw in keyword[0].split(" "):
            """TODO: REMOVE
            #if keyword[0] in selected_keywords:"""
            comparison.update({
                #keyword[0].lower(): {
                kw.lower(): {
                    "yake": keyword[1],
                    "counter": 1.0 # propability gets updated later on
                }
            })

    # set counter keywords propability in order of occurance (same as YAKE keywords)
    for keyword in kw_count:
        if keyword[0] in comparison.keys():
            comparison.update({
                keyword[0]: {
                    "yake": comparison[keyword[0]].get("yake"),
                    "counter": keyword[1]
                }
            })
    
    # get the prediction (YAKE) and "true keywords" (counter) for calculating metrics
    pred = []
    true = []
    for key, val in comparison.items():
        pred.append(val.get("yake"))
        true.append(val.get("counter"))
        
    # calc MSE
    mse = mean_squared_error(true, pred)
    mae = mean_absolute_error(true, pred)
    mape = mean_absolute_percentage_error(true, pred)
    eval.update({idx: [mse, mae, mape]})
    
    update_progress(len(eval) / results.shape[0])

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

Progress: [##################################################] 100.0%
DUR: 5.05671501159668


In [29]:
test_df = pd.DataFrame(eval.items())
test_df = test_df.rename({0: "pageid", 1:"metrics"}, axis = 1)

mse = []
mae = []
mape = []
for metrics in test_df["metrics"]:
    mse.append(metrics[0])
    mae.append(metrics[1])
    mape.append(metrics[2])

test_df["MSE"] = mse
test_df["MAE"] = mae
test_df["MAPE"] = mape

test_df = test_df.drop(["metrics"], axis = 1)
test_df.head()

Unnamed: 0,pageid,MSE,MAE,MAPE
0,2677,1.4e-05,0.003509,1.006618
1,490244,2.3e-05,0.004377,1.880475
2,3221050,2.9e-05,0.005274,1.319461
3,1200964,1.5e-05,0.003312,0.492364
4,16565,4.4e-05,0.006184,0.787576


In [30]:
# df with eval metrics
eval_df = pd.DataFrame(eval.items())
eval_df = eval_df.rename({0: "pageid", 1:"metrics"}, axis = 1)

# move metrics to own columns
mse = []
mae = []
mape = []
for metrics in eval_df["metrics"]:
    mse.append(metrics[0])
    mae.append(metrics[1])
    mape.append(metrics[2])

eval_df["MSE"] = mse
eval_df["MAE"] = mae
eval_df["MAPE"] = mape

eval_df = eval_df.drop(["metrics"], axis = 1)
eval_df.head()

Unnamed: 0,pageid,MSE,MAE,MAPE
0,2677,1.4e-05,0.003509,1.006618
1,490244,2.3e-05,0.004377,1.880475
2,3221050,2.9e-05,0.005274,1.319461
3,1200964,1.5e-05,0.003312,0.492364
4,16565,4.4e-05,0.006184,0.787576


In [31]:
# join all dfs to one df with evaluated results
results_eval = pd.concat([
    results.set_index("pageid"), 
    results_counter.set_index("pageid"),
    eval_df.set_index("pageid")
], axis = 1).reset_index()

In [32]:
results_eval.head(10)

Unnamed: 0,pageid,content,keywords,counter_keywords,MSE,MAE,MAPE
0,2677,Kanada (englisch und französisch Canada) ist e...,"[(Kanada, 0.0005067610926873967), (Kanadas, 0....","[(kanada, 0.00047619047619047673), (québec, 0....",1.4e-05,0.003509,1.006618
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...,"[(Philosophie, 0.0002460226827235382), (prakti...","[(philosophie, 0.00047619047619047673), (–, 0....",2.3e-05,0.004377,1.880475
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i...","[(Moskau, 0.00022634304800017076), (Stadt Mosk...","[(moskau, 0.00047619047619047673), (stadt, 0.0...",2.9e-05,0.005274,1.319461
3,1200964,Australien (amtlicher deutscher Name; englisch...,"[(Australien, 0.0006621153549226484), (Austral...","[(australien, 0.00047619047619047673), (austra...",1.5e-05,0.003312,0.492364
4,16565,Frankfurt am Main () ist mit 759.224 Einwohner...,"[(Stadt Frankfurt, 2.584346599547873e-05), (Fr...","[(frankfurt, 0.00047619047619047673), (stadt, ...",4.4e-05,0.006184,0.787576
5,880316,"Finnland (finnisch [ˈsuɔmi], schwedisch Finlan...","[(Finnland, 0.0006628069903539394), (Finnlands...","[(finnland, 0.00047619047619047673), (finnland...",1.2e-05,0.002032,2.224992
6,290,Argentinien (spanisch [aɾxenˈtina]) ist eine R...,"[(Buenos Aires, 0.0003009614302623394), (Argen...","[(argentinien, 0.00047619047619047673), (de, 0...",3e-06,0.00139,0.469328
7,29938,Die Republik Südafrika (RSA) ist ein Staat im ...,"[(Südafrika, 0.0003601999013826174), (South Af...","[(südafrika, 0.00047619047619047673), (land, 0...",2.5e-05,0.004438,0.757473
8,2391,Indien [ˈɪndi̯ən] (Eigennamen unter anderem Hi...,"[(Indien, 0.00048282594130112937), (Indiens, 0...","[(indien, 0.00047619047619047673), (indiens, 0...",5e-06,0.001782,0.244738
9,18559,Die japanische Schrift besteht aus mehreren Sc...,"[(Kanji, 0.0015474253863809284), (Zeichen, 0.0...","[(–, 0.00047619047619047673), (kanji, 0.000915...",7e-06,0.001912,0.387824


In [33]:
# worst prediction (according to MSE)
results_eval[results_eval["MSE"] == np.max(results_eval["MSE"])]

Unnamed: 0,pageid,content,keywords,counter_keywords,MSE,MAE,MAPE
1484,1491190,Der Moto(r)cortex (von lateinisch motor „Beweg...,"[(primär-motorischen Rinde, 0.0022677363385406...","[(rinde, 0.00047619047619047673), (–, 0.004745...",0.429653,0.443018,5.528702


In [34]:
# best prediction (according to MSE)
results_eval[results_eval["MSE"] == np.min(results_eval["MSE"])]

Unnamed: 0,pageid,content,keywords,counter_keywords,MSE,MAE,MAPE
1463,1242449,"Das Eine (altgriechisch τὸ ἕν to hen, lateinis...","[(Einheit, 0.0012148567686220398), (Vielheit, ...","[(einheit, 0.0004761904761904756), (seienden, ...",7.640119e-07,0.000778,0.25089


In [35]:
# simple overview of quality of prediction
print("##### MSE #####")
print("max: %s\nmin: %s\nmean: %s" % (
    round(np.max(results_eval["MSE"]), 4), 
    round(np.min(results_eval["MSE"]), 4), 
    round(np.mean(results_eval["MSE"]), 4)
))

print("\n##### MAE #####")
print("max: %s\nmin: %s\nmean: %s" % (
    round(np.max(results_eval["MAE"]), 4),
    round(np.min(results_eval["MAE"]), 4),
    round(np.mean(results_eval["MAE"]), 4)
))

print("\n##### MAPE #####")
print("max: %s\nmin: %s\nmean: %s" % (
    round(np.max(results_eval["MAPE"]), 4), 
    round(np.min(results_eval["MAPE"]), 4), 
    round(np.mean(results_eval["MAPE"]), 4)
))

##### MSE #####
max: 0.4297
min: 0.0
mean: 0.0106

##### MAE #####
max: 0.443
min: 0.0007
mean: 0.0166

##### MAPE #####
max: 29.1118
min: 0.1492
mean: 2.2609
