# Abgabe zur Vorlesung "Forschungsthemen Informatik" von Jasmin Noll

In diesem Notebook wird der **2. Shared Task: Topic Modelling auf Artikeln aus der DE-Wikipedia** bearbeitet.  
<Kurze Erläuterung der Aufgabe und was in diesem Notebook erwartet wird>

In [60]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import time
import requests
import concurrent.futures
import re
import string
import spacy
import gensim
import yake
from gensim import corpora
from nltk.corpus import stopwords
from IPython.display import clear_output
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from rake_nltk import Rake, Metric
from keybert import KeyBERT

#nltk.download("stopwords")

import warnings
warnings.filterwarnings("ignore")

Konfigurationsvariablen/-werte setzen (API-Parameter, Magic Numbers, Parameter der Algorithmen etc.)

In [93]:
# Feature switches: set a flag to True to skip the expensive step and load
# results from disk instead of recomputing them.
skip_request = True
skip_kw_extraction = False
skip_counter = False

# Endpoint of the MediaWiki API of the German Wikipedia.
URL = "https://de.wikipedia.org/w/api.php"

# Search expression that selects the Wikipedia articles to process.
SR_SEARCH_VALUE = "incategory:Wikipedia:Exzellent"

# Colour palette used for the plot traces.
colors = ["#3d405b", "#81b29a", "#f2cc8f", "#e07a5f", "#f4f1de"]

In [24]:
# source: IBM internal
def update_progress(progress):
    """Redraw a 50-character text progress bar in the cell output.

    Args:
        progress: Completed fraction. Integers are converted to float, values
            that are neither int nor float count as 0, and the result is
            clamped to the range 0..1.
    """
    bar_length = 50

    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0

    if fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(bar_length * fraction))
    bar = "#" * filled + "-" * (bar_length - filled)

    # wait=True defers clearing until the new bar is printed (avoids flicker)
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

In [25]:
# wall-clock start of the notebook run; the final cell subtracts it from the
# end time to report the total duration
global_start = time.time()

## 1. Exzellente Artikel vom deutschen Wikipedia abfragen

In [26]:
def get_pages_by_id(id):
    """Download the plain-text extract of one page and store it in ``content``.

    Reads the module-level ``params_content``, ``S``, ``URL`` and ``ids`` and
    writes the extract into the module-level dict ``content`` under the page
    id. Every time ``content`` reaches a multiple of 10 entries the progress
    bar is redrawn.

    Args:
        id: Page id that is sent as ``pageids`` to the API.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        KeyError: If the response does not contain an extract for the page.
    """
    request_params = params_content.copy()
    request_params.update({"pageids": id})
    response = S.get(url = URL, params = request_params)
    # fail with the real HTTP error instead of a confusing JSON/Key error below
    response.raise_for_status()
    page = response.json()

    try:
        extract = page["query"]["pages"][0]["extract"]
    except (KeyError, IndexError) as err:
        # name the affected page and pass on the API's own error details, if any
        raise KeyError(
            "no extract for page id %s (API error: %s)"
            % (id, page.get("error", "none reported"))
        ) from err

    content[id] = extract
    if len(content) % 10 == 0:
        update_progress(len(content) / len(ids))

In [27]:
# one shared HTTP session, reused for every request to the Wikipedia API
S = requests.Session()

In [28]:
if not skip_request:
    # Request the excellent articles from the German Wikipedia via the search
    # API. No "srlimit" is sent, so the API's default page size is used.
    # NOTE(review): "prop"/"rvprop"/"rvslots" look unused by a pure
    # list=search query - confirm before removing them.
    params_pageid = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "*",
        "format": "json",
        "formatversion": 2,
        "srsearch": SR_SEARCH_VALUE,
        "list": "search",
        "sroffset": 0
    }

    # collect the page ids of all result pages
    ids = []
    while True:
        response = S.get(url = URL, params = params_pageid)
        # stop on HTTP errors instead of failing later on a missing "query" key
        response.raise_for_status()
        data = response.json()

        ids.extend(entry["pageid"] for entry in data["query"]["search"])

        continuation = data.get("continue")
        if not continuation:
            break
        # Send back every continuation value the API returned (not only
        # "sroffset"), as the API's continuation protocol asks for.
        params_pageid.update(continuation)

    print("Anzahl gesammelter Exzellenter Artikel: %s" %(len(ids)))
else:
    print("SKIPPED REQUEST FOR IDS")

SKIPPED REQUEST FOR IDS


In [29]:
# Source: https://stackoverflow.com/questions/4452102/how-to-get-plain-text-out-of-wikipedia
# Source: https://www.mediawiki.org/wiki/API:Parsing_wikitext
# Base parameters for fetching one page as plain text; "pageids" is only a
# placeholder that is overwritten per request.
params_content = dict(
    action = "query",
    prop = "extracts",
    format = "json",
    formatversion = 2,
    pageids = 0,
    explaintext = True,
)

In [30]:
if not skip_request:
    content = {}
    start = time.time()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # submit() keeps one future per page id, so errors raised inside the
        # worker threads can be inspected afterwards instead of being lost.
        futures = {
            executor.submit(get_pages_by_id, page_id): page_id
            for page_id in ids
        }

    # Leaving the with-block waits for all workers, so every future is done.
    failed = {
        page_id: future.exception()
        for future, page_id in futures.items()
        if future.exception() is not None
    }
    if failed:
        first_id, first_error = next(iter(failed.items()))
        print("WARNING: %s of %s page requests failed, e.g. page id %s: %r" % (
            len(failed), len(futures), first_id, first_error
        ))

    end = time.time()
    dur = end - start
    print("DUR: %s" % (dur))

    # save requested data (wikipage content) in df
    data = pd.DataFrame(content.items())
    data = data.rename({0: "pageid", 1:"content"}, axis = 1)

    # save data to csv for faster loading
    data.to_csv("./data/excellent_article_extract.csv")
else:
    print("SKIPPED REQUEST FOR ARTICLES")

SKIPPED REQUEST FOR ARTICLES


TODO: Einige Statistiken der Daten aufzeigen (Wie viele Artikel?, Wie lang sind die Artikel (im Durchschnitt)?, Duplikate, etc.)

Die angefragten Daten können auch vom filesystem geladen werden

In [31]:
if skip_request:
    # reuse article extracts stored on disk instead of querying the API
    # (alternative file: ./data/excellent_article_extract.csv, as written by
    # the request cell)
    data = pd.read_csv("./2783_excellent_article_extract.csv").loc[:, ["pageid", "content"]]
    data.head()

## 2. Preprocessing

In [32]:
# Patterns are compiled once instead of on every call.
_RE_PAREN_EQUALS = re.compile(r"\(=.*?\)")
_RE_DASH_RUN = re.compile(r"--+")
_RE_HEADING = re.compile(r"==.*?==")
#source: https://stackoverflow.com/questions/2077897/substitute-multiple-whitespace-with-single-whitespace-in-python
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")


def preprocess_data(text, idx):
    """Clean one article text and store it in ``preprocessed_content``.

    Removes "(=...)" groups, runs of dashes, "==...==" heading markup and
    leftover "=" characters, then collapses all whitespace to single spaces.
    The result is written to the module-level dict ``preprocessed_content``
    under ``idx``; every time the dict reaches a multiple of 10 entries the
    progress bar is redrawn.

    Args:
        text: Raw article text.
        idx: Page id used as key for the cleaned text.
    """
    # Turn line breaks into spaces. Deleting them outright glued the last word
    # of a line to the first word of the next one.
    text = text.replace("\n", " ")
    text = _RE_PAREN_EQUALS.sub("", text)
    text = _RE_DASH_RUN.sub("", text)
    text = _RE_HEADING.sub("", text)
    text = text.replace("=", "")
    text = _RE_COMBINE_WHITESPACE.sub(" ", text).strip()

    preprocessed_content[idx] = text

    if len(preprocessed_content) % 10 == 0:
        # `data` exists on both paths (requested or loaded from file), whereas
        # `ids` is only defined when the articles were requested.
        update_progress(len(preprocessed_content) / len(data))

In [33]:
# German stopword list, read from the "stopwords" column of the CSV file
german_stopwords = pd.read_csv("./yake_de-stopwords.csv")["stopwords"].tolist()

In [34]:
preprocessed_content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    # submit() keeps the futures, so exceptions raised inside the worker
    # threads can be reported afterwards instead of being silently dropped.
    futures = [
        executor.submit(preprocess_data, text, page_id)
        for text, page_id in zip(data["content"], data["pageid"])
    ]

# Leaving the with-block waits for all workers, so every future is done here.
errors = [
    future.exception() for future in futures
    if future.exception() is not None
]
if errors:
    print("WARNING: %s of %s preprocessing tasks raised an error, first one: %r" % (
        len(errors), len(futures), errors[0]
    ))

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

DUR: 6.135132074356079


In [35]:
# turn the {pageid: cleaned text} mapping into a two-column table
column_names = {0: "pageid", 1: "content"}
preprocessed_df = pd.DataFrame(preprocessed_content.items()).rename(columns = column_names)

preprocessed_df.head()

Unnamed: 0,pageid,content
0,2677,Kanada (englisch und französisch Canada) ist e...
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...
2,16565,Frankfurt am Main () ist mit 759.224 Einwohner...
3,1200964,Australien (amtlicher deutscher Name; englisch...
4,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."


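Wird das Speichern noch benötigt? (Die folgende Zelle liest `./data/preprocessed_wiki_pages.csv` wieder ein; die Datei muss also vorhanden sein.)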
```python
# save preproessed wiki pages to filesystem
preprocessed_df.to_csv("./data/preprocessed_wiki_pages.csv")
```

## 3. Extract Keywords

In [36]:
# reload the preprocessed pages from disk, keeping only page id and text
preprocessed_df = pd.read_csv("./data/preprocessed_wiki_pages.csv").loc[:, ["pageid", "content"]]

In [37]:
# preview the first rows of the reloaded, preprocessed pages
preprocessed_df.head()

Unnamed: 0,pageid,content
0,2677,Kanada (englisch und französisch Canada) ist e...
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i..."
3,1200964,Australien (amtlicher deutscher Name; englisch...
4,880316,"Finnland (finnisch [ˈsuɔmi], schwedisch Finlan..."


### 3.1. YAKE

Using YAKE's keyword extractor  
(https://liaad.github.io/yake/)

YAKE hat eine eigene Stopword-Liste (https://github.com/LIAAD/yake/blob/master/yake/StopwordsList/stopwords_de.txt)  
--> Keine weitere Stopword-Suche benötigt

YAKE ist ein sehr schneller (und leichtgewichtiger) Ansatz (https://towardsdatascience.com/unsupervised-keyphrase-extraction-with-patternrank-28ec3ca737f0)

Eine Lemmatisierung wird ebenfalls nicht benötigt, weil YAKE ohne eine solche trainiert wurde  

Je geringer der Wahrscheinlichkeitswert, desto relevanter das Keyword (https://liaad.github.io/yake/docs/getting_started.html#output)

In [38]:
# YAKE keyword extractor; the settings are collected first and then passed on
yake_settings = {
    "lan": "de",
    "dedupLim": 0.99,
    "top": 10,
    "dedupFunc": "seqm", # default: seqm, alternative: jaro
    "n": 2,
    "windowsSize": 1,
}
yake_model = yake.KeywordExtractor(**yake_settings)

In [39]:
if skip_kw_extraction:
    print("SKIPPED KEYWORD EXTRACTION WITH YAKE")
else:
    yake_keywords = {}
    page_count = preprocessed_df.shape[0]
    start = time.time()

    # run YAKE on every page and remember the result under the page id
    for page_text, page_id in zip(preprocessed_df["content"], preprocessed_df["pageid"]):
        yake_keywords[page_id] = yake_model.extract_keywords(page_text)
        update_progress(len(yake_keywords) / page_count)

    end = time.time()

    print("DUR: %s" % (end - start))
    yake_df = pd.DataFrame(yake_keywords.items()).rename(
        columns = {0: "pageid", 1: "YAKE_keywords"}
    )

    # store the keywords together with their page id
    yake_df.to_csv("./data/wikipage_keywords-YAKE.csv")

Progress: [##################################################] 100.0%
DUR: 837.2417452335358


In [40]:
if skip_kw_extraction:
    # only this branch needs the literal parser, hence the local import
    import ast

    yake_df = pd.read_csv("./data/wikipage_keywords-YAKE.csv")
    # The extraction cell writes the column as "YAKE_keywords"; this loader
    # previously expected "keywords". Accept both layouts.
    source_column = "keywords" if "keywords" in yake_df.columns else "YAKE_keywords"
    yake_df = yake_df[["pageid", source_column]].rename({source_column: "YAKE_keywords"}, axis = 1)

    # The CSV holds each keyword list as text; the evaluation needs real
    # (keyword, score) tuples again.
    # NOTE(review): the regex unwraps "np.float64(...)" in case the scores were
    # written as NumPy scalars under NumPy >= 2 - confirm whether that occurs.
    yake_df["YAKE_keywords"] = yake_df["YAKE_keywords"].apply(
        lambda raw: ast.literal_eval(re.sub(r"np\.float64\(([^()]*)\)", r"\1", raw))
    )

### 3.2. RAKE

In [41]:
# RAKE model; the settings are collected first and then passed on
rake_settings = dict(
    language = "german",
    stopwords = german_stopwords,
    ranking_metric = Metric.WORD_FREQUENCY,
    max_length = 2,
    include_repeated_phrases = False,
    punctuations = ".,-;)",
)
rake_model = Rake(**rake_settings)

In [42]:
if not skip_kw_extraction:
    rake_keywords = {}
    start = time.time()

    for wiki_page, idx in zip(preprocessed_df["content"], preprocessed_df["pageid"]):
        rake_model.extract_keywords_from_text(wiki_page)
        # get top 10 keywords for wiki page as (score, phrase) pairs
        ranking = rake_model.get_ranked_phrases_with_scores()[:10]

        # Rescale the scores relative to the first-ranked phrase so that a
        # lower value means a more relevant keyword. The factor 1.05 keeps the
        # value of the top phrase slightly above 0.
        keyword_ranking = []
        if ranking:  # a page without any candidate phrase yields an empty list
            base = ranking[0][0] * 1.05
            keyword_ranking = [
                (kw, (1 - (score / base)) / 100) for score, kw in ranking
            ]

        rake_keywords[idx] = keyword_ranking
        update_progress(len(rake_keywords) / preprocessed_df.shape[0])

    end = time.time()
    dur = end - start
    print("DUR: %s" % (dur))

    rake_df = pd.DataFrame(rake_keywords.items())
    rake_df = rake_df.rename({0: "pageid", 1:"RAKE_keywords"}, axis = 1)

    # save keywords with corresponding page id
    rake_df.to_csv("./data/wikipage_keywords-RAKE.csv")
else:
    print("SKIPPED KEYWORD EXTRACTION WITH RAKE")

Progress: [##################################################] 100.0%
DUR: 46.937079191207886


In [43]:
if skip_kw_extraction:
    # only this branch needs the literal parser, hence the local import
    import ast

    # read rake keywords
    rake_df = pd.read_csv("./data/wikipage_keywords-RAKE.csv")
    # The extraction cell writes the column as "RAKE_keywords"; this loader
    # previously expected "keywords". Accept both layouts.
    source_column = "keywords" if "keywords" in rake_df.columns else "RAKE_keywords"
    rake_df = rake_df[["pageid", source_column]].rename({source_column: "RAKE_keywords"}, axis = 1)

    # The CSV holds each keyword list as text; the evaluation needs real
    # (keyword, score) tuples again.
    rake_df["RAKE_keywords"] = rake_df["RAKE_keywords"].apply(ast.literal_eval)

### 3.3. KeyBERT

In [44]:
# initialize KeyBERT model
# NOTE(review): no model argument is passed, so KeyBERT uses its default
# embedding model. The articles are German - presumably a multilingual model
# would be the better fit; TODO confirm against the KeyBERT documentation.
kw_model = KeyBERT()

In [45]:
if not skip_kw_extraction:
    bert_keywords = {}
    start = time.time()

    for wiki_page, idx in zip(preprocessed_df["content"], preprocessed_df["pageid"]):
        # get top 10 keywords from wiki page as (keyword, score) pairs
        ranking = kw_model.extract_keywords(
            wiki_page,
            keyphrase_ngram_range = (1, 1),
            stop_words = german_stopwords,
            top_n = 10,
            use_mmr = True,
            diversity = 0.3
        )

        # Rescale the scores relative to the first-ranked keyword so that a
        # lower value means a more relevant keyword. The factor 1.05 keeps the
        # value of the top keyword slightly above 0.
        keyword_ranking = []
        if ranking:  # a page without any candidate keyword yields an empty list
            base = ranking[0][1] * 1.05
            keyword_ranking = [
                (kw, (1 - (score / base)) / 100) for kw, score in ranking
            ]

        bert_keywords[idx] = keyword_ranking
        update_progress(len(bert_keywords) / preprocessed_df.shape[0])

    end = time.time()
    dur = end - start
    print("DUR: %s" % (dur))

    bert_df = pd.DataFrame(bert_keywords.items())
    bert_df = bert_df.rename({0: "pageid", 1:"KeyBERT_keywords"}, axis = 1)

    # save keywords with corresponding page id
    bert_df.to_csv("./data/wikipage_keywords-BERT.csv")
else:
    print("SKIPPED KEYWORD EXTRACTION WITH KeyBERT")

Progress: [##################################################] 100.0%
DUR: 7680.418473005295


In [46]:
if skip_kw_extraction:
    # only this branch needs the literal parser, hence the local import
    import ast

    bert_df = pd.read_csv("./data/wikipage_keywords-BERT.csv")
    # The extraction cell writes the column as "KeyBERT_keywords"; this loader
    # previously expected "keywords". Accept both layouts.
    source_column = "keywords" if "keywords" in bert_df.columns else "KeyBERT_keywords"
    bert_df = bert_df[["pageid", source_column]].rename({source_column: "KeyBERT_keywords"}, axis = 1)

    # The CSV holds each keyword list as text; the evaluation needs real
    # (keyword, score) tuples again.
    bert_df["KeyBERT_keywords"] = bert_df["KeyBERT_keywords"].apply(ast.literal_eval)

### 3.4. Results from Keyword Extraction

In [47]:
# join wikipages with (predicted) keywords on pageid
# index every table by page id so that concat lines the rows up on it;
# join = "inner" keeps only pages present in all four tables
indexed_frames = [
    frame.set_index("pageid")
    for frame in (preprocessed_df, yake_df, rake_df, bert_df)
]
results_kw_extraction = pd.concat(indexed_frames, axis = 1, join = "inner").reset_index()
results_kw_extraction.head()

Unnamed: 0,pageid,content,YAKE_keywords,RAKE_keywords,KeyBERT_keywords
0,2677,Kanada (englisch und französisch Canada) ist e...,"[(Kanada, 0.0005525449347212504), (Kanadas, 0....","[(provinz kanada, 0.0004761904761904778), (kan...","[(kanada, 0.0004761904761904756), (kanadiern, ..."
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...,"[(Philosophie, 0.00024914998570791535), (ISBN,...","[(philosophie –, 0.00047619047619047673), (phi...","[(philosophisches, 0.00047619047619047673), (p..."
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i...","[(Moskau, 0.00024187077106055336), (Stadt Mosk...","[(stadt moskau, 0.00047619047619047673), (mosk...","[(moskau, 0.0004761904761904756), (moskausmosk..."
3,1200964,Australien (amtlicher deutscher Name; englisch...,"[(Australien, 0.0006737985774128164), (Austral...","[(australischen regierung, 0.00047619047619047...","[(australiens, 0.00047619047619047673), (austr..."
4,880316,"Finnland (finnisch [ˈsuɔmi], schwedisch Finlan...","[(Finnland, 0.000695009528347298), (Finnlands,...","[(sowjetunion finnland, 0.00047619047619047673...","[(finnland, 0.0004761904761904756), (finnlandi..."


## 4. Evaluation

### 4.1. Ground Truth

In [49]:
if not skip_counter:
    # only this branch needs Counter, hence the local import
    from collections import Counter

    # A set gives constant-time membership tests. The stopword list itself is
    # left untouched because it is also passed to RAKE and KeyBERT.
    stopword_set = set(german_stopwords)
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # count words (excluding german stopwords) in text
    counter_keywords = {}
    start = time.time()
    for idx, page_text in zip(preprocessed_df["pageid"], preprocessed_df["content"]):
        words = page_text.lower().translate(strip_punctuation).split(" ")
        counter = Counter(
            word for word in words if word and word not in stopword_set
        )

        counts = []
        if counter:  # a page without any countable word yields an empty list
            # the 50 most frequent words, most frequent first
            top_words = counter.most_common(50)
            # Base of the score: the highest word count plus 5 %, so that the
            # most frequent word gets a value slightly above 0.
            base = top_words[0][1] * 1.05
            # the lower the value, the more likely the word is an important keyword
            counts = [
                (word, (1 - (count / base)) / 100) for word, count in top_words
            ]

        counter_keywords[idx] = counts

        update_progress(len(counter_keywords) / preprocessed_df.shape[0])

    end = time.time()
    dur = end - start
    print("DUR: %s" % (dur))

    results_counter = pd.DataFrame(counter_keywords.items())
    results_counter = results_counter.rename({0: "pageid", 1: "counter_keywords"}, axis = 1)

    results_counter.to_csv("./data/counter_keywords.csv")
else:
    print("SKIPPED COUNTER")

Progress: [##################################################] 100.0%
DUR: 99.65982794761658


In [50]:
if skip_counter:
    # only this branch needs the literal parser, hence the local import
    import ast

    results_counter = pd.read_csv("./data/counter_keywords.csv")
    results_counter = results_counter[["pageid", "counter_keywords"]]

    # The CSV holds each keyword list as text; the evaluation needs real
    # (word, score) tuples again.
    results_counter["counter_keywords"] = results_counter["counter_keywords"].apply(ast.literal_eval)

In [51]:
# attach the word-count keywords to the extraction results (matched on page
# id) and label them as the ground truth
indexed_frames = [
    frame.set_index("pageid")
    for frame in (results_kw_extraction, results_counter)
]
results = (
    pd.concat(indexed_frames, axis = 1, join = "inner")
    .reset_index()
    .rename(columns = {"counter_keywords": "GROUND_TRUTH"})
)
results.head()

Unnamed: 0,pageid,content,YAKE_keywords,RAKE_keywords,KeyBERT_keywords,GROUND_TRUTH
0,2677,Kanada (englisch und französisch Canada) ist e...,"[(Kanada, 0.0005525449347212504), (Kanadas, 0....","[(provinz kanada, 0.0004761904761904778), (kan...","[(kanada, 0.0004761904761904756), (kanadiern, ...","[(kanada, 0.00047619047619047673), (québec, 0...."
1,490244,In der Philosophie (altgriechisch φιλοσοφία ph...,"[(Philosophie, 0.00024914998570791535), (ISBN,...","[(philosophie –, 0.00047619047619047673), (phi...","[(philosophisches, 0.00047619047619047673), (p...","[(philosophie, 0.00047619047619047673), (–, 0...."
2,3221050,"Moskau (russisch Москва́ [mɐskˈva] , Moskwa) i...","[(Moskau, 0.00024187077106055336), (Stadt Mosk...","[(stadt moskau, 0.00047619047619047673), (mosk...","[(moskau, 0.0004761904761904756), (moskausmosk...","[(moskau, 0.00047619047619047673), (stadt, 0.0..."
3,1200964,Australien (amtlicher deutscher Name; englisch...,"[(Australien, 0.0006737985774128164), (Austral...","[(australischen regierung, 0.00047619047619047...","[(australiens, 0.00047619047619047673), (austr...","[(australien, 0.00047619047619047673), (austra..."
4,880316,"Finnland (finnisch [ˈsuɔmi], schwedisch Finlan...","[(Finnland, 0.000695009528347298), (Finnlands,...","[(sowjetunion finnland, 0.00047619047619047673...","[(finnland, 0.0004761904761904756), (finnlandi...","[(finnland, 0.00047619047619047673), (finnland..."


### 4.2. Calc Metrics

In [52]:
# Per model: {pageid: [MSE, MAE, MAPE]}.
# NOTE: the name `eval` shadows the builtin; it is kept because the following
# cells read it.
eval = {}
dataframes = [yake_df, rake_df, bert_df]
start = time.time()
for df in dataframes:
    kw_column = df.columns[1]
    # "<model>_keywords" -> "<model>"
    model_name = kw_column[:-len("_keywords")]
    print(model_name)
    eval.update({model_name: {}})

    # Iterate over `results`, where the keywords of a model and the ground
    # truth of a page sit in the same row. Zipping columns of different frames
    # would pair them by position only.
    for idx, kw_results, ground_truth in zip(results["pageid"], results[kw_column], results["GROUND_TRUTH"]):
        # word -> model score; a multi-word keyword passes its score on to each
        # of its words (a later keyword overwrites an earlier one)
        predicted = {}
        for phrase, score in kw_results:
            for part in phrase.split(" "):
                predicted[part.lower()] = score

        if not predicted:
            # no predicted keywords: there is nothing to compare for this page
            eval[model_name][idx] = [np.nan, np.nan, np.nan]
        else:
            # words that are missing from the ground truth get the value 1.0
            truth_scores = dict(ground_truth)
            pred = list(predicted.values())
            true = [truth_scores.get(word, 1.0) for word in predicted]

            # calc metrics
            mse = mean_squared_error(true, pred)
            mae = mean_absolute_error(true, pred)
            mape = mean_absolute_percentage_error(true, pred)

            # save metrics results
            eval[model_name][idx] = [mse, mae, mape]

        update_progress(len(eval.get(model_name)) / results.shape[0])
    print("%s done" % (model_name))

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

Progress: [##################################################] 100.0%
KeyBERT done
DUR: 14.078214883804321


In [53]:
# One row per page in `results`, three metric columns per model.
eval_df = pd.DataFrame({"pageid": results["pageid"]})
for model_results, per_page in eval.items():
    metric_columns = [
        "%s_MSE" % (model_results),
        "%s_MAE" % (model_results),
        "%s_MAPE" % (model_results),
    ]
    # rows are indexed by page id, one column per metric
    metrics = pd.DataFrame.from_dict(per_page, orient = "index", columns = metric_columns)
    # Match on the page id instead of relying on identical row order; pages
    # without metrics for a model get NaN.
    eval_df = eval_df.join(metrics, on = "pageid")

In [54]:
# preview the first rows of the per-page metric table
eval_df.head()

Unnamed: 0,pageid,YAKE_MSE,YAKE_MAE,YAKE_MAPE,RAKE_MSE,RAKE_MAE,RAKE_MAPE,KeyBERT_MSE,KeyBERT_MAE,KeyBERT_MAPE
0,2677,1.4e-05,0.003521,1.028242,0.724973,0.727668,1.184364,0.794376,0.797679,0.85619
1,490244,0.1805,0.183986,1.20809,0.49832,0.502233,1.165619,0.994648,0.997319,0.997319
2,3221050,0.297929,0.302346,1.442702,0.632491,0.636324,1.364198,0.895196,0.897595,0.897595
3,1200964,1.5e-05,0.003316,0.495245,0.536364,0.539496,1.100422,0.79612,0.798508,0.918838
4,880316,0.109024,0.112092,2.361748,0.581319,0.584775,1.092797,0.698049,0.700266,0.849942


In [55]:
# persist the per-page metrics of all models
eval_df.to_csv("./data/results_eval_metrics.csv")

In [59]:
# simple overview of quality of prediction
# (header, column name template, summary template) per metric
metric_sections = (
    ("##### MSE #####", "%s_MSE", "max: %s\nmin: %s\nmean: %s"),
    ("\n##### MAE #####", "%s_MAE", "max: %s\nmin: %s\nmean: %s"),
    ("\n##### MAPE #####", "%s_MAPE", "max: %s\nmin: %s\nmean: %s\n"),
)

for model_name in eval:
    print("####### %s #######" % (model_name))
    for header, column_template, summary_template in metric_sections:
        values = eval_df[column_template % (model_name)]
        print(header)
        print(summary_template % (
            round(np.max(values), 4),
            round(np.min(values), 4),
            round(np.mean(values), 4)
        ))

####### YAKE #######
##### MSE #####
max: 0.7369
min: 0.0
mean: 0.0897

##### MAE #####
max: 0.7549
min: 0.0008
mean: 0.0961

##### MAPE #####
max: 28.3641
min: 0.146
mean: 2.2854

####### RAKE #######
##### MSE #####
max: 0.9985
min: 0.0
mean: 0.58

##### MAE #####
max: 0.9993
min: 0.0015
mean: 0.5832

##### MAPE #####
max: 2.1823
min: 0.5334
mean: 1.1132

####### KeyBERT #######
##### MSE #####
max: 0.9983
min: 0.0
mean: 0.8304

##### MAE #####
max: 0.9992
min: 0.0045
mean: 0.8335

##### MAPE #####
max: 2.0884
min: 0.6111
mean: 0.983



In [57]:
# total runtime of the notebook in minutes, rounded to three decimals
global_end = time.time()
elapsed_minutes = (global_end - global_start) / 60
global_dur = round(elapsed_minutes, 3)
print("GLOBALE DUR: %s Minuten" % (global_dur))

GLOBALE DUR: 151.016 Minuten


Gesamtlaufzeit: ca. 2 bis 2 ½ Stunden

In [78]:
# display the complete per-page metric table
eval_df

Unnamed: 0,pageid,YAKE_MSE,YAKE_MAE,YAKE_MAPE,RAKE_MSE,RAKE_MAE,RAKE_MAPE,KeyBERT_MSE,KeyBERT_MAE,KeyBERT_MAPE
0,2677,0.000014,0.003521,1.028242,0.724973,0.727668,1.184364,0.794376,0.797679,0.856190
1,490244,0.180500,0.183986,1.208090,0.498320,0.502233,1.165619,0.994648,0.997319,0.997319
2,3221050,0.297929,0.302346,1.442702,0.632491,0.636324,1.364198,0.895196,0.897595,0.897595
3,1200964,0.000015,0.003316,0.495245,0.536364,0.539496,1.100422,0.796120,0.798508,0.918838
4,880316,0.109024,0.112092,2.361748,0.581319,0.584775,1.092797,0.698049,0.700266,0.849942
...,...,...,...,...,...,...,...,...,...,...
2791,12593625,0.096674,0.104372,2.002789,0.497954,0.501412,1.157922,0.790596,0.795630,0.889358
2792,10278319,0.110819,0.115583,1.244733,0.815528,0.818077,0.968102,0.896968,0.899174,0.977237
2793,12508588,0.000017,0.003170,0.676339,0.907596,0.908383,0.990988,0.894728,0.897809,0.955136
2794,12587627,0.220030,0.224495,1.655242,0.632812,0.636532,1.369102,0.893308,0.896733,1.075977


In [87]:
# Aggregate the per-page metrics: one row per model, with the max, min and
# mean of MSE, MAE and MAPE (in this column order).
aggregations = (np.max, np.min, np.mean)
column_templates = ("%s_MSE", "%s_MAE", "%s_MAPE")

metrics_df = pd.DataFrame(columns = [
    "max_MSE", "max_MAE", "max_MAPE",
    "min_MSE", "min_MAE", "min_MAPE",
    "mean_MSE", "mean_MAE", "mean_MAPE"
])
for model_name in eval:
    metrics_df.loc[model_name] = [
        round(aggregate(eval_df[template % (model_name)]), 4)
        for aggregate in aggregations
        for template in column_templates
    ]

In [88]:
# display the aggregated (max/min/mean) metrics per model
metrics_df

Unnamed: 0,max_MSE,max_MAE,max_MAPE,min_MSE,min_MAE,min_MAPE,mean_MSE,mean_MAE,mean_MAPE
YAKE,0.7369,0.7549,28.3641,0.0,0.0008,0.146,0.0897,0.0961,2.2854
RAKE,0.9985,0.9993,2.1823,0.0,0.0015,0.5334,0.58,0.5832,1.1132
KeyBERT,0.9983,0.9992,2.0884,0.0,0.0045,0.6111,0.8304,0.8335,0.983


In [89]:
# row labels of metrics_df (the model names), used as x values in the plot
metrics_df.index

Index(['YAKE', 'RAKE', 'KeyBERT'], dtype='object')

In [101]:
# one line per mean metric over the models, coloured with the first three
# palette entries
fig = go.Figure()

for metric_column, trace_color in zip(("mean_MSE", "mean_MAE", "mean_MAPE"), colors):
    trace = go.Scatter(
        x = metrics_df.index, y = metrics_df[metric_column],
        name = metric_column, line = dict(color = trace_color, width = 3)
    )
    fig.add_trace(trace)

fig.show()

YAKE ist schon ziemlich gut, weil MSE und MAE sehr gut sind. MAPE ist ebenfalls sehr gut, aber am schlechtesten von allen drei Modellen.

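RAKE ist auch okay bis gut, da hier der MAPE-Wert nahe 1 liegt (1,11; sklearn gibt MAPE als Anteil zurück, das entspricht also ca. 111 %). MAE und MSE sind schlechter als bei YAKE, und da sie bereits über 0,5 liegen, ist das Ergebnis nur mittelmäßig. 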

Beim KeyBERT ist MAPE noch etwas besser und damit der beste Wert. MAE und MSE sind allerdings noch schlechter.

Auch unter Betrachtung der vorhergesagten Schlagwörter der einzelnen Modelle, würde ich YAKE als bestes Modell einstufen.  
Lässt man die vorhergesagten Schlagwörter außer Acht, so würde ich sagen, dass RAKE am besten geeignet ist, da es hier den besten Kompromiss zwischen MSE, MAE und MAPE gibt.