# A Quantitative Analysis of Unsupervised Keyword Extraction Methods


### This code is part of a Bachelor thesis at Humboldt University Berlin. 

This code compares commonly used unsupervised keyphrase extraction methods, grouped into three families: statistical methods (TF-IDF, YAKE, RAKE), graph-based methods (SingleRank, TextRank), and deep learning methods (KeyBERT). The latter uses language-specific pre-trained models, in this case for German, whereas the statistical and graph-based methods need no training and are language-independent.

## Import Packages

This cell connects Google Drive, where the dataset is stored, to Google Colab

In [None]:
# Mount Google Drive at /content/drive so the files referenced by the
# later cells (all paths start with /content/drive/MyDrive/...) are reachable.
import os, sys
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


This call installs all necessary packages and language models

In [None]:
# Install the Python packages imported by the cells below.
!pip3 install scipy
#!pip3 install pytextrank
# pke is installed straight from its GitHub repository.
!pip3 install git+https://github.com/boudinfl/pke.git
!pip3 install python-rake
!pip3 install yake
!pip3 install sentence_transformers
!pip3 install keybert
!pip3 install keyphrase_vectorizers
!pip3 install flair
# NOTE(review): `futures` looks like the Python 2 backport of
# concurrent.futures - confirm it is needed (or installable) on Python 3.
!pip3 install futures
!pip3 install spacy-transformers
!pip3 install swifter

# Download the German spaCy pipelines that are loaded by name later on.
!python -m spacy download de_dep_news_trf
!python -m spacy download de_core_news_sm
!python -m spacy download de_core_news_lg

This cell loads the packages

In [None]:
import json
import os
import numpy as np
import pandas as pd
import time
from IPython import display
import scipy
import sys
import spacy

import platform
import functools
from string import printable
from statistics import mean
from operator import itemgetter
from itertools import islice, combinations
import glob
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
import itertools

from sentence_transformers import SentenceTransformer, util
from keybert import KeyBERT
#import pytextrank
import pke
from RAKE import Rake, NLTKStopList
from yake import KeywordExtractor
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keyphrase_vectorizers import KeyphraseTfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from flair.embeddings import TransformerDocumentEmbeddings
import spacy_transformers

from string import punctuation
from nltk.stem import SnowballStemmer
from nltk.stem.snowball import GermanStemmer
from nltk.corpus import stopwords
import tqdm as notebook_tqdm
from nltk import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
from tqdm.notebook import tqdm
tqdm.pandas()

# Collect the thrown exception
exception_texts = []

## Load Data from Excel

This function reads the Excel files with all articles per month and concatenates them. It also separates the keywords by comma and filters out observations with no text or no keywords.

In [None]:
def load_data(path):
    """Load every ``.xlsx`` workbook in *path* into one DataFrame.

    All sheets of all workbooks are concatenated. The ``keywords`` column is
    split on commas into lists, and rows without keywords or without text
    are dropped. The index of the returned frame is not reset.

    Args:
        path: Directory that contains the monthly Excel workbooks.

    Returns:
        DataFrame with at least the columns ``keywords`` (list of str) and
        ``text``.

    Raises:
        FileNotFoundError: If *path* contains no ``.xlsx`` file.
    """
    # os.listdir returns entries in arbitrary order; sort for a stable row order.
    workbooks = sorted(f for f in os.listdir(path) if f.endswith(".xlsx"))
    print(workbooks)

    if not workbooks:
        raise FileNotFoundError(f"No .xlsx workbooks found in {path!r}")

    # DataFrame.append was removed in pandas 2.0, so collect the frames and
    # concatenate them once. sheet_name=None yields a dict of all sheets.
    frames = [
        pd.concat(
            pd.read_excel(os.path.join(path, file), sheet_name=None),
            ignore_index=True,
            sort=False,
        )
        for file in workbooks
    ]
    outputxlsx = pd.concat(frames, ignore_index=True, sort=False)

    # Separate keywords by comma (missing values stay null and are dropped below)
    outputxlsx['keywords'] = outputxlsx['keywords'].str.split(',')

    has_keywords = outputxlsx['keywords'].notnull()
    has_text = outputxlsx['text'].notnull()
    return outputxlsx[has_keywords & has_text]

This cell calls the function to concatenate the data and split it into a test-set (20%) and a tune-set (80%). Both datasets are then saved as excel files to be able to restore the same data split.

In [None]:
# Build the combined dataset and split it into 80 % tune / 20 % test.
df = load_data('/content/drive/MyDrive/Bachelorarbeit/Colab_data/Datasets_test')
# A fixed seed keeps the partition identical if this cell is ever re-run;
# without it every run would overwrite the saved files with a new random split.
tune_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Persist both parts so later cells always work on the same split.
tune_df.to_excel('/content/drive/MyDrive/Bachelorarbeit/Colab_data/final_datasets/tune_df.xlsx')
test_df.to_excel('/content/drive/MyDrive/Bachelorarbeit/Colab_data/final_datasets/test_df.xlsx')

This cell loads the data, converts the keyword column to list format, removes all keywords that are not in the text, and removes observations with fewer than 2 keywords.

In [None]:
from ast import literal_eval

# Keep only the keywords of one observation that literally occur in its text
def remove_keywords_not_in_text(text, keywords):
    """Return the keywords that are substrings of *text*, in original order.

    Args:
        text: The article text.
        keywords: Iterable of keyword strings assigned to the article.

    Returns:
        A new list with the keywords found in *text* (case-sensitive
        substring test).
    """
    present = []
    for candidate in keywords:
        if candidate in text:
            present.append(candidate)
    return present

def _load_keyword_split(path):
    """Read one saved split and clean its ``keywords`` column.

    The column is parsed from its string representation back into a list,
    each keyword is stripped of surrounding whitespace, keywords that do not
    occur in the row's text are removed, and rows left with fewer than two
    keywords are dropped. The returned frame has a fresh 0..n-1 index.
    """
    frame = pd.read_excel(path)
    parsed = frame['keywords'].apply(literal_eval)
    frame['keywords'] = parsed.apply(lambda tags: [tag.strip() for tag in tags])
    frame['keywords'] = frame.apply(
        lambda row: remove_keywords_not_in_text(row.text, row.keywords), axis=1
    )
    enough_keywords = frame['keywords'].map(len) >= 2
    return frame[enough_keywords].reset_index(drop=True)


tune_df = _load_keyword_split('/content/drive/MyDrive/Bachelorarbeit/Colab_data/final_datasets/tune_df.xlsx')

test_df = _load_keyword_split('/content/drive/MyDrive/Bachelorarbeit/Colab_data/final_datasets/test_df.xlsx')

### Stopwords

This cell imports a stopword list. The stopword list comes from: https://countwordsfree.com/stopwords/german. The cell also imports `printable` from the `string` module, which is a string of printable characters, and adds the German umlauts and 'ß' ('öäüÖÄÜß').

In [None]:
import json
# Load the German stop-word list. The context manager closes the file handle
# again, and the explicit encoding keeps umlauts intact on every platform.
with open('/content/drive/MyDrive/Bachelorarbeit/Colab_data/stop_words_german.json', encoding='utf-8') as f:
    stopwords_german = json.load(f)

# Add the German umlauts and 'ß' to the string of printable characters.
# The extraction functions use this string to strip all other characters.
from string import printable
printable = printable + 'öäüÖÄÜß'

# Models

This section defines the functions for the six keyphrase extraction methods: Statistical-methods (TF-IDF, RAKE, YAKE), graph-based (TextRank, SingleRank), deep learning (KeyBERT).

## Statistical based methods

### TF-IDF 

This function represents the TF-IDF method. TF-IDF determines the relative frequency of a term in a specific text in contrast to the inverse frequency of that term's overall occurrence in all texts.

The first function, tfidf_matrix(), creates the TF-IDF matrix for the whole dataset - a TF-IDF score is calculated for every term of the dataset.

The second function, tfidfvectorizer(), returns the highest-scoring terms of a single document.

**Hyperparameters:**

*   **n-gram:** Minimum and maximum number of words per keyphrase (default = (1, 2))

**Source:** https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [None]:
def tfidf_matrix(df, ngram_range):
    """Fit a TF-IDF vectorizer on all texts of *df*.

    Args:
        df: DataFrame with a ``text`` column; one document per row.
        ngram_range: ``(min_n, max_n)`` tuple passed to ``TfidfVectorizer``.

    Returns:
        Tuple ``(vectors, vectorizer, feature_names)``: the document-term
        matrix returned by ``fit_transform``, the fitted vectorizer, and the
        list of feature names in column order.
    """
    tfidf_vectorizer = TfidfVectorizer(use_idf = True, stop_words = stopwords_german, ngram_range = ngram_range)
    tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(df['text'].tolist())

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old installations.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        tfidf_vectorizer_feature_names = list(tfidf_vectorizer.get_feature_names_out())
    else:
        tfidf_vectorizer_feature_names = tfidf_vectorizer.get_feature_names()

    return tfidf_vectorizer_vectors, tfidf_vectorizer, tfidf_vectorizer_feature_names

def tfidfvectorizer(number_text, tfidf_vectorizer_vectors, tfidf_vectorizer, tfidf_vectorizer_feature_names, top_n = 10):
    """Return the highest-scoring TF-IDF terms of one document.

    Only terms with a non-zero score are considered, so a document with
    fewer than *top_n* distinct terms yields a shorter list instead of
    being padded with terms that do not occur in it.

    Args:
        number_text: Row number of the document in the TF-IDF matrix.
        tfidf_vectorizer_vectors: Sparse document-term matrix.
        tfidf_vectorizer: Unused; kept for interface compatibility.
        tfidf_vectorizer_feature_names: Feature names in column order.
        top_n: Maximum number of terms to return.

    Returns:
        List of at most *top_n* terms, sorted by descending score; ties keep
        the column order of the matrix.
    """
    # Dense 1-D score vector of the requested document.
    scores = np.asarray(tfidf_vectorizer_vectors[number_text].todense()).ravel()

    # Rank only the terms that actually occur in the document instead of
    # building and sorting a DataFrame over the whole vocabulary.
    present = np.flatnonzero(scores)
    ranked = present[np.argsort(-scores[present], kind='stable')]

    return [tfidf_vectorizer_feature_names[i] for i in ranked[:top_n]]

### RAKE

This function executes the RAKE method. 

**hyperparameters:**
*    **minCharacters:** Minimum characters allowed in a keyword (default = 1)
*    **maxWords:** Maximum number of words allowed in a phrase considered as a keyword (default = 5)
*    **minFrequency:** Minimum number of occurrences of a keyword in the text to be considered as a keyword (default = 1)

**Source:** https://github.com/fabianvf/python-rake


In [None]:
def rake(text, minCharacters = 1, maxWords = 5, minFrequency = 1, top_n = 10):
    """Extract keyphrases from *text* with RAKE.

    Args:
        text: Document text.
        minCharacters: Passed to ``Rake.run`` as minimum characters per keyword.
        maxWords: Passed to ``Rake.run`` as maximum words per phrase.
        minFrequency: Passed to ``Rake.run`` as minimum phrase frequency.
        top_n: Number of leading results to keep.

    Returns:
        List of at most *top_n* keyphrases, or ``['EXCEPTION']`` if the
        extraction failed (the text is then recorded in ``exception_texts``).
    """
    try:
        # Clean the text from non-printable characters.
        text = ''.join(char for char in text if char in printable)

        # Uses the german stopword list.
        r = Rake(stopwords_german)
        ranked = r.run(text, minCharacters, maxWords, minFrequency)
        return [keyphrase for (keyphrase, score) in ranked[:top_n]]

    # Exception instead of a bare except: KeyboardInterrupt/SystemExit must
    # still be able to stop a long-running extraction.
    except Exception:
        exception_texts.append('RAKE ' + str(text))
        return ['EXCEPTION']

### YAKE

This function executes the YAKE method

**Hyperparameters:**

*   **n: (max_ngram_size)** Maximum number of words per keyword (default = 3)
*   **dedupLim:** (deduplication_thresold) Threshold for the value of the similarity measure for deduplication (default = 0.9)
*   **deduplication:** Algorithm to measure the similarity of candidate keywords: levs, jaro or seqm (default = seqm)
*   **windowsSize:** Distance (in number of tokens) considered when computing co-occurrences of tokens (default = 1)

**Source:** https://github.com/LIAAD/yake

In [None]:
def yake(text, top_n = 10, n = 3, dedupLim = 0.9, dedupFunc = 'seqm', windowsSize = 1):
    """Extract keywords from *text* with YAKE (language fixed to German).

    Args:
        text: Document text.
        top_n: Number of keywords requested from the extractor.
        n: Maximum n-gram size.
        dedupLim: Deduplication threshold.
        dedupFunc: Name of the deduplication similarity function.
        windowsSize: Co-occurrence window size.

    Returns:
        List of keywords, or ``['EXCEPTION']`` if the extraction failed (the
        text is then recorded in ``exception_texts``).
    """
    try:
        # Initialize the keyword extractor object and its parameters.
        kw_extractor = KeywordExtractor(
            lan = "de",
            top = top_n,
            n = n,
            dedupLim = dedupLim,
            dedupFunc = dedupFunc,
            windowsSize = windowsSize
        )
        # Return the extracted keywords, in a list.
        return [keyword for (keyword, score) in kw_extractor.extract_keywords(text)]

    # Exception instead of a bare except: KeyboardInterrupt/SystemExit must
    # still be able to stop a long-running extraction.
    except Exception:
        exception_texts.append('YAKE ' + str(text))
        return ['EXCEPTION']

## Graph-based methods

### TextRank

This function executes the TextRank method.

**Hyperparameter:**

*   **window:** Window for connecting two words in the graph (default = 2)
*   **pos:** Set of valid pos for words to be considered as a node in the graph (default = {'NOUN', 'PROPN', 'ADJ'})

*   **top_percent:** Percentage of top vertices to keep for phrase generation (default = 0.33)

**Source:** https://github.com/boudinfl/pke/blob/master/pke/unsupervised/graph_based/textrank.py 

In [None]:
def textrank(text, top_percent = 0.33, top_n = 10, pos = {'NOUN', 'PROPN', 'ADJ'}, window = 2):
    """Extract keyphrases from *text* with pke's TextRank.

    Args:
        text: Document text.
        top_percent: Passed to ``candidate_weighting`` as ``top_percent``.
        top_n: Number of best keyphrases to return.
        pos: Set of part-of-speech tags passed to ``candidate_weighting``.
            The default set is shared between calls but never modified here.
        window: Passed to ``candidate_weighting`` as ``window``.

    Returns:
        List of at most *top_n* keyphrases, or ``['EXCEPTION']`` if the
        extraction failed (the text is then recorded in ``exception_texts``).
    """
    try:
        # Clean the text from non-printable characters.
        text = ''.join(char for char in text if char in printable)
        # 1. create a TextRank extractor.
        extractor_textrank = pke.unsupervised.TextRank()
        # 2. load the content of the document.
        extractor_textrank.load_document(input=text, language='de', normalization = False)
        # 3. build the graph representation of the document and rank the words.
        extractor_textrank.candidate_weighting(window = window, pos = pos, top_percent=top_percent)

        return [keyphrase for (keyphrase, score) in extractor_textrank.get_n_best(n = top_n)]

    # Exception instead of a bare except: KeyboardInterrupt/SystemExit must
    # still be able to stop a long-running extraction.
    except Exception:
        exception_texts.append('textrank ' + str(text))
        return ['EXCEPTION']

### SingleRank

This function calls the SingleRank method.

**Hyperparameters:**

*   **window:** Window within the sentence for connecting words in the graph (default = 10)
*   **redundancy_removal:** Boolean variable whether redundant keyphrases are filtered out from the n-best list using levenshtein distance (default = True)
*   **pos:** Set of valid pos for words to be considered as nodes in the graph (default = {'NOUN', 'PROPN', 'ADJ'})

**Source:** 
*   https://github.com/boudinfl/pke/blob/master/pke/unsupervised/graph_based/singlerank.py


In [None]:
def singlerank(text, top_n = 10, window=10, pos=None, normalized = False, redundancy_removal = True):
    """Extract keyphrases from *text* with pke's SingleRank.

    Args:
        text: Document text.
        top_n: Number of best keyphrases to return.
        window: Passed to ``candidate_weighting`` as ``window``.
        pos: Part-of-speech tags passed to ``candidate_weighting``.
        normalized: Passed both to ``load_document`` (as ``normalization``)
            and to ``candidate_weighting`` (as ``normalized``).
        redundancy_removal: Passed to ``get_n_best``.

    Returns:
        List of at most *top_n* keyphrases, or ``['EXCEPTION']`` if the
        extraction failed (the text is then recorded in ``exception_texts``).
    """
    try:
        # Clean the text from non-printable characters.
        text = ''.join(char for char in text if char in printable)

        # Initialize the keyphrase extraction model.
        extractor = pke.unsupervised.SingleRank()

        # Load the document, select the keyphrase candidates and weight them.
        extractor.load_document(input = text, language = 'de', normalization = normalized)
        extractor.candidate_selection()
        extractor.candidate_weighting(window = window, pos = pos, normalized = normalized)

        # Return the n-highest scored candidates.
        best = extractor.get_n_best(n = top_n, redundancy_removal = redundancy_removal)
        return [keyphrase for (keyphrase, score) in best]

    # Exception instead of a bare except: KeyboardInterrupt/SystemExit must
    # still be able to stop a long-running extraction.
    except Exception:
        exception_texts.append('singlerank ' + str(text))
        return ['EXCEPTION']

## Deep learning methods

### KeyBERT

This part loads all the language models used for the hyperparameter tuning of the KeyBERT method. It also loads the vectorizers.

**Source:** 

language models: 
*   https://spacy.io/models/de
*   https://huggingface.co/bert-base-german-cased
*   https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2

python packages:
*   https://github.com/flairNLP/flair
*   https://github.com/explosion/spaCy
*   https://github.com/TimSchopf/KeyphraseVectorizers



In [None]:
# Load the language models and wrap each one in a KeyBERT instance.
# The numeric suffixes are the notebook's own model identifiers.

# Small German spaCy pipeline, loaded without the listed components.
nlp_kw_model_2 = spacy.load("de_core_news_sm", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
kw_model_2 = KeyBERT(model=nlp_kw_model_2)

# Large German spaCy pipeline, loaded without the listed components.
nlp_kw_model_5 = spacy.load("de_core_news_lg", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
kw_model_5 = KeyBERT(model=nlp_kw_model_5)

# German BERT, wrapped as a flair document embedding.
bert_german = TransformerDocumentEmbeddings('bert-base-german-cased')
kw_model_3 = KeyBERT(model=bert_german)

# Two multilingual sentence-transformers models.
nlp_kw_model_6 = SentenceTransformer("distiluse-base-multilingual-cased-v2")
kw_model_6 = KeyBERT(model=nlp_kw_model_6)

nlp_kw_model_8 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
kw_model_8 = KeyBERT(model=nlp_kw_model_8)


# Candidate vectorizer for KeyBERT, configured with the small German spaCy pipeline.
vectorizer_keybert = KeyphraseCountVectorizer(spacy_pipeline='de_core_news_sm')

#### KeyBERT multi document extraction

This function uses the built-in multi-document processing of KeyBERT. It takes a list of texts as input and returns a list of keywords for every text.

**Hyperparameters:**

*   **keyphrase_ngram_range:** Minimum and maximum number of words per keyphrase (default = (1, 1))
*   **min_df:** Minimum frequency of words (default = 1)
*   **model:** BERT model (default = all-MiniLM-L6-v2)

**Source:** https://github.com/MaartenGr/KeyBERT

In [None]:
def keybert_bulk(df, keyphrase_ngram_range = (1, 2), top_n = 10, min_df = 1, vectorizer = vectorizer_keybert, model = kw_model_2, stopwords = stopwords_german):
    """Extract keywords for all texts of *df* in one KeyBERT call.

    The default vectorizer is ``vectorizer_keybert``, the only vectorizer
    this notebook creates; the previously referenced ``vectorizer_1`` is not
    defined anywhere, so the ``def`` statement itself raised a NameError.

    NOTE(review): KeyBERT's documentation states that a custom vectorizer
    takes precedence over ``keyphrase_ngram_range`` and ``stop_words`` -
    confirm whether these two arguments have any effect here.

    Args:
        df: DataFrame with a ``text`` column. It is modified in place.
        keyphrase_ngram_range: Passed through to ``extract_keywords``.
        top_n: Number of keywords per document.
        min_df: Passed through to ``extract_keywords``.
        vectorizer: Candidate vectorizer passed to ``extract_keywords``.
        model: KeyBERT instance to use.
        stopwords: Stop-word list passed as ``stop_words``.

    Returns:
        The same *df* with a new ``extracted`` column holding one keyword
        list per row.
    """
    listText_keybert = df['text'].tolist()

    keywords_list = model.extract_keywords(
            listText_keybert,
            keyphrase_ngram_range = keyphrase_ngram_range,
            stop_words = stopwords,
            top_n = top_n,
            min_df = min_df,
            vectorizer = vectorizer
    )

    # Each result is a list of (keyword, score) pairs; keep the keyword only.
    df['extracted'] = [[pair[0] for pair in document_keywords] for document_keywords in keywords_list]

    return df


#### KeyBERT single document extraction

This function is used when running KeyBERT in single document mode. It takes one text as input and returns keywords.

**Hyperparameters:**

*   **keyphrase_ngram_range:** Minimum and maximum number of words per keyphrase (default = (1, 1))
*   **min_df:** Minimum frequency of words (default = 1)
*   **model:** BERT model (default = all-MiniLM-L6-v2)

**Source:** https://github.com/MaartenGr/KeyBERT

In [None]:
def keybert(text, keyphrase_ngram_range = (1, 2), top_n = 10, nr_candidates = 20, measure = 'maxsum', diversity = 0.7, vectorizer = vectorizer_keybert, model = kw_model_2, stopwords = stopwords_german):
    """Extract keywords from a single text with KeyBERT.

    The default vectorizer is ``vectorizer_keybert``, the only vectorizer
    this notebook creates; the previously referenced ``vectorizer_1`` is not
    defined anywhere, so the ``def`` statement itself raised a NameError.

    Args:
        text: Document text.
        keyphrase_ngram_range: Passed through to ``extract_keywords``.
        top_n: Number of keywords to return.
        nr_candidates: Passed through to ``extract_keywords``.
        measure: ``'maxsum'`` enables ``use_maxsum``, ``'mmr'`` enables
            ``use_mmr``; any other value disables both.
        diversity: Passed through to ``extract_keywords``.
        vectorizer: Candidate vectorizer passed to ``extract_keywords``.
        model: KeyBERT instance to use.
        stopwords: Stop-word list passed as ``stop_words``.

    Returns:
        List of keywords, or ``['EXCEPTION']`` if the extraction failed (the
        text is then recorded in ``exception_texts``).
    """
    try:
        scored_keywords = model.extract_keywords(
            text,
            keyphrase_ngram_range = keyphrase_ngram_range,
            stop_words = stopwords,
            top_n = top_n,
            nr_candidates = nr_candidates,
            use_maxsum = measure == 'maxsum',
            use_mmr = measure == 'mmr',
            diversity = diversity,
            vectorizer = vectorizer
        )
        return [keyphrase for (keyphrase, _) in scored_keywords]

    # Exception instead of a bare except: KeyboardInterrupt/SystemExit must
    # still be able to stop a long-running extraction.
    except Exception:
        exception_texts.append('keybert ' + str(text))
        return ['EXCEPTION']

# F1 Score

This part calculates the F1 score to evaluate the performance of the chosen models. It can calculate either the partial F1 score or an exact match F1 score. For our experiments, the partial match framework was used.

**Source:** https://github.com/NC0DER/KeyphraseExtraction 

### Partial Precision K

Computes the average partial precision at k, between two lists of keywords.
The partial precision is defined as the fraction between the number of correctly partially matched tokens, 
over the total number of extracted (k) tokens.

Assigned should always contain the shorter list, while extracted the longest, as to avoid counting partial matches more times than necessary.

In [None]:
def partial_precision_k(assigned, extracted, k):
    """Compute the partial-match precision at *k*.

    A keyword of the shorter list counts as matched if it shares at least
    one whitespace-separated token with any keyword of the longer list.

    Args:
        assigned: List of gold keywords.
        extracted: List of extracted keywords; only the first *k* are used.
        k: Cut-off and denominator of the precision.

    Returns:
        Number of matched keywords of the shorter list divided by *k*.

    Raises:
        ZeroDivisionError: If *k* is 0.
    """
    top_k = extracted[:k]

    # Iterate over the shorter list so partial matches are not counted more
    # often than necessary. min()/max() over the same tuple both return the
    # first element on equal lengths, which compared `assigned` with itself.
    if len(assigned) <= len(top_k):
        shorter, longer = assigned, top_k
    else:
        shorter, longer = top_k, assigned

    shorter_sets = [set(keyword.split()) for keyword in shorter]
    longer_sets = [set(keyword.split()) for keyword in longer]

    matched = sum(
        1 for tokens in shorter_sets
        if any(tokens & other for other in longer_sets)
    )
    return matched / k

### Partial Recall K

Computes the average partial recall at k, between two lists of keywords.
The partial recall is defined as the fraction between the number of correctly partially matched tokens, over the total number of assigned tokens.

In [None]:
def partial_recall_k(assigned, extracted, k):
    """Compute the partial-match recall at *k*.

    A keyword of the shorter list counts as matched if it shares at least
    one whitespace-separated token with any keyword of the longer list.

    Args:
        assigned: List of gold keywords.
        extracted: List of extracted keywords; only the first *k* are used.
        k: Cut-off applied to *extracted*.

    Returns:
        Number of matched keywords of the shorter list divided by the number
        of assigned keywords.

    Raises:
        ZeroDivisionError: If *assigned* is empty.
    """
    assigned_length = len(assigned)
    top_k = extracted[:k]

    # Iterate over the shorter list so partial matches are not counted more
    # often than necessary. min()/max() over the same tuple both return the
    # first element on equal lengths, which compared `assigned` with itself.
    if len(assigned) <= len(top_k):
        shorter, longer = assigned, top_k
    else:
        shorter, longer = top_k, assigned

    shorter_sets = [set(keyword.split()) for keyword in shorter]
    longer_sets = [set(keyword.split()) for keyword in longer]

    matched = sum(
        1 for tokens in shorter_sets
        if any(tokens & other for other in longer_sets)
    )
    return matched / assigned_length

### Recall K

Computes the exact match recall at k, between two lists of keywords.
The recall is defined as the fraction between the number of correctly matched tokens (the intersection of assigned and extracted sets) over the number of assigned tokens.

In [None]:
def recall_k(assigned, extracted, k):
    """Return the exact-match recall at *k*.

    This is the number of distinct assigned keywords found among the first
    *k* extracted keywords, divided by ``len(assigned)``.
    """
    gold = set(assigned)
    hits = gold.intersection(extracted[:k])
    return len(hits) / len(assigned)

### Precision K

Computes the exact match precision at k, between two lists of keywords. 
The precision is defined as the fraction between the number of correctly matched tokens (the intersection of assigned and extracted sets) over the number of extracted (k) tokens. 

In [None]:
def precision_k(assigned, extracted, k):
    """Return the exact-match precision at *k*.

    This is the number of distinct assigned keywords found among the first
    *k* extracted keywords, divided by *k*.
    """
    gold = set(assigned)
    hits = gold.intersection(extracted[:k])
    return len(hits) / k

### F1 

Computes the f1 measure at k.
The f1 measure at k is defined as the harmonic mean of the precision at k and recall at k.

In [None]:
def f1_measure_k(assigned, extracted, k, partial = True):
    """Compute the F1 measure at *k* from precision at *k* and recall at *k*.

    If fewer keywords were extracted than assigned, only the first
    ``len(extracted)`` assigned keywords are scored. The caller's list is
    left untouched.

    Args:
        assigned: List of gold keywords.
        extracted: List of extracted keywords.
        k: Cut-off for precision and recall.
        partial: Use the partial-match metrics if True, exact match otherwise.

    Returns:
        The harmonic mean of precision and recall, or 0.0 if both are zero
        or if the computation fails (e.g. empty or non-list input).
    """
    try:
        # Slice instead of pop(): popping shortened the caller's own list.
        assigned = assigned[:len(extracted)]

        if partial:
            precision = partial_precision_k(assigned, extracted, k)
            recall = partial_recall_k(assigned, extracted, k)
        else:
            precision = precision_k(assigned, extracted, k)
            recall = recall_k(assigned, extracted, k)

        if precision == 0.0 and recall == 0.0:
            return 0.0
        return 2 * precision * recall / (precision + recall)

    # Best-effort scoring: unscorable rows count as 0.0. Exception (not a bare
    # except) keeps KeyboardInterrupt/SystemExit working.
    except Exception:
        return 0.0

# Stemming
Function which applies stemming to a lowercase version of each string of the list, which has all punctuation removed.

**Source:** https://pypi.org/project/snowballstemmer/

In [None]:
stemmers = {'german': SnowballStemmer('german')}

# Translation table that deletes every ASCII punctuation character.
# Built once instead of once per processed string.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)

def preprocess(lis, language = 'german'):
    """Lowercase, strip punctuation from, and stem every string of *lis*.

    Args:
        lis: Iterable of strings (keywords or keyphrases).
        language: Key into ``stemmers`` selecting the stemmer.

    Returns:
        A new list with the processed strings. If processing fails (unknown
        language, non-string element, non-iterable input) the input object
        itself is returned unchanged.
    """
    try:
        stem = stemmers[language].stem
        return [stem(s.lower().translate(_PUNCTUATION_TABLE)) for s in lis]
    # Best-effort fallback; Exception (not a bare except) keeps
    # KeyboardInterrupt/SystemExit working.
    except Exception:
        return lis

# Hyperparameter Tuning 

### TF IDF

**Parameters to tune:**
*   ngram_range: Number of terms per keyphrase (default = (1, 2))

**Top hyperparameters:**
*   ngram_range: (1, 2)


In [None]:
# Hyperparameter grid for TF-IDF; every combination becomes one dict
params = dict(
    top_n=[10],
    ngram_range=[(1, 1), (1, 2), (1, 3)],
)
keys = tuple(params)
values = tuple(params.values())
# Cartesian product of all value lists, re-labelled with the parameter names
parameter_list = [dict(zip(keys, combination)) for combination in itertools.product(*values)]

# TF-IDF matrix of the tune set, built once with n-grams of length 1 to 2.
tfidf_vectorizer_vectors, tfidf_vectorizer, tfidf_vectorizer_feature_names = tfidf_matrix(tune_df, ngram_range = (1,2))

def tfidf_tuning(parameter_list,
                 df,
                 tfidf_vectorizer_vectors=tfidf_vectorizer_vectors,
                 tfidf_vectorizer=tfidf_vectorizer,
                 tfidf_vectorizer_feature_names=tfidf_vectorizer_feature_names):
    """Run TF-IDF extraction on *df*, score it and save the keywords.

    NOTE(review): *parameter_list* is only passed through to the result. The
    matrix used for extraction is the precomputed default, built with
    ``ngram_range=(1, 2)``, so the reported parameters may not match the
    configuration that produced the score.

    Args:
        parameter_list: Parameter dict reported in the result tuple.
        df: DataFrame with ``text`` and ``keywords`` columns. Its index
            labels are used as row numbers of the TF-IDF matrix, so it must
            be the frame the matrix was built from. Modified in place
            (columns ``extracted`` and ``f_score`` are added).
        tfidf_vectorizer_vectors: Precomputed document-term matrix.
        tfidf_vectorizer: Fitted vectorizer.
        tfidf_vectorizer_feature_names: Feature names in column order.

    Returns:
        Tuple ``(mean f-score, parameter_list, number of rows, seconds)``.
    """
    tic = time.perf_counter()
    df['extracted'] = df.progress_apply(
        lambda row: extract_per_row(row.name, tfidf_vectorizer_vectors, tfidf_vectorizer, tfidf_vectorizer_feature_names),
        axis=1,
    )
    df['f_score'] = df.apply(bulk_evaluate, axis=1)
    time_loop = time.perf_counter() - tic

    # The path is relative to the working directory; create the folder so the
    # finished run is not lost to a missing-directory error.
    output_path = 'hyperparameter_tuning/extracted_keywords/tfidf_keywords.xlsx'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_excel(output_path)

    return df['f_score'].mean(), parameter_list, len(df), time_loop

def extract_per_row(number_text, tfidf_vectorizer_vectors, tfidf_vectorizer, tfidf_vectorizer_feature_names, parameter_list = parameter_list):
    """Return the TF-IDF keywords of document *number_text* and echo them.

    The keywords are printed and the cell output is cleared afterwards with
    ``wait=True``. *parameter_list* is accepted but not used.
    """
    top_terms = tfidfvectorizer(
        number_text,
        tfidf_vectorizer_vectors,
        tfidf_vectorizer,
        tfidf_vectorizer_feature_names,
    )
    print(top_terms)
    display.clear_output(wait=True)
    return top_terms

# Scores one row: exact-match F1 between the gold and the extracted keywords
def bulk_evaluate(row, top_n = 10):
    """Return the exact-match F1 at *top_n* for one DataFrame row.

    Both ``row['keywords']`` and ``row['extracted']`` are preprocessed with
    the German stemmer before they are compared.
    """
    return f1_measure_k(
        assigned = preprocess(row['keywords'], 'german'),
        extracted = preprocess(row['extracted'], 'german'),
        k = top_n,
        partial = False,
    )


def run_tuning(parameter_list, df = None):
    """Run ``tfidf_tuning`` for every parameter combination in a process pool.

    Args:
        parameter_list: List of parameter dicts.
        df: DataFrame to tune on. Defaults to ``tune_df``, the frame the
            precomputed TF-IDF matrix was built from. ``tfidf_tuning``
            addresses matrix rows by DataFrame index, so passing a different
            frame (the test set was hard-coded before) would pair documents
            with the keywords of unrelated rows.

    Returns:
        List of result tuples in completion order.
    """
    if df is None:
        df = tune_df

    # The with-block waits for all submitted jobs before it exits.
    with ProcessPoolExecutor(max_workers=1) as executor:
        results = [executor.submit(tfidf_tuning, parameters, df) for parameters in parameter_list]

    return [f.result() for f in concurrent.futures.as_completed(results)]

# Run TF-IDF once on the tune set, passing the first parameter combination,
# and write the single result row to an Excel file (path is relative to the
# working directory). Note that this rebinds the notebook-level name `df`.
results_list = [tfidf_tuning(parameter_list[0], tune_df)]
df = pd.DataFrame(results_list)
df.to_excel('hyperparameter_tuning/final_run/tfidf_results.xlsx')

### YAKE

**Parameters to tune:**

*   **n: (max_ngram_size)** Maximum number of words per keyword (default = 3)
*   **dedupLim:** (deduplication_thresold) Threshold for the value of the similarity measure for deduplication (default = 0.9)
*   **deduplication:** Algorithm to measure the similarity of candidate keywords: levs, jaro or seqm (default = seqm)
*   **windowsSize:** Distance (in number of tokens) considered when computing co-occurrences of tokens (default = 1)


**Top hyperparameters:** 
*   **n:** 1
*   **dedupLim:** 0.8
*   **dedupFunc:** 'leve'
*   **windowsSize:** 3







In [None]:
# Hyperparameter grid for YAKE; every combination becomes one dict
params = dict(
    top_n=[10],
    n=[1, 2],
    dedupLim=[0.8, 0.9],
    dedupFunc=['leve', 'jaro', 'seqm'],
    windowsSize=[1, 2, 3, 4, 5],
)
keys = tuple(params)
values = tuple(params.values())
# Cartesian product of all value lists, re-labelled with the parameter names
parameter_list = [dict(zip(keys, combination)) for combination in itertools.product(*values)]

# Extracts keywords with one YAKE configuration, scores every row, times the
# run, saves the DataFrame with the extracted keywords and returns the mean
# f1 score, the used parameters, the length of the df and the elapsed time.
def yake_tuning(parameter_list, df):
    """Evaluate one YAKE parameter combination on *df*.

    NOTE(review): every call writes to the same Excel path; when several
    calls run in parallel they presumably overwrite each other - confirm.

    Args:
        parameter_list: Dict with the keys ``top_n``, ``n``, ``dedupLim``,
            ``dedupFunc`` and ``windowsSize``.
        df: DataFrame with ``text`` and ``keywords`` columns; the columns
            ``extracted`` and ``f_score`` are added in place.

    Returns:
        Tuple ``(mean f-score, parameter_list, number of rows, seconds)``.
    """
    def extract_per_row(text, parameter_list = parameter_list):
        # Forward the configuration of this run to the YAKE wrapper.
        return yake(text,
                    top_n = parameter_list["top_n"],
                    n = parameter_list["n"],
                    dedupLim = parameter_list["dedupLim"],
                    dedupFunc = parameter_list["dedupFunc"],
                    windowsSize = parameter_list["windowsSize"])

    def bulk_evaluate(row, top_n = 10):
        # Exact-match F1 between stemmed gold and stemmed extracted keywords.
        gold = preprocess(row['keywords'], 'german')
        predicted = preprocess(row['extracted'], 'german')
        return f1_measure_k(assigned = gold, extracted = predicted, k = top_n, partial = False)

    started = time.perf_counter()
    df['extracted'] = df['text'].progress_apply(extract_per_row)
    df['f_score'] = df.apply(bulk_evaluate, axis=1)
    time_loop = time.perf_counter() - started

    df.to_excel('/content/drive/MyDrive/Bachelorarbeit/hyperparameter_tuning/extracted_keywords/yake_keywords.xlsx')

    mean_score = df['f_score'].mean()
    return mean_score, parameter_list, len(df), time_loop

def run_tuning(parameter_list, df):
    """Evaluate every YAKE parameter combination in a pool of 30 processes.

    Returns the result tuples of ``yake_tuning`` in completion order.
    """
    # Leaving the with-block waits until all submitted jobs are done.
    with ProcessPoolExecutor(max_workers=30) as pool:
        pending = [pool.submit(yake_tuning, combination, df) for combination in parameter_list]

    return [job.result() for job in concurrent.futures.as_completed(pending)]

# Run the YAKE grid on the tune set and write one result row per parameter
# combination to Excel. Note that this rebinds the notebook-level name `df`.
results_list = run_tuning(parameter_list, tune_df)
df = pd.DataFrame(results_list)
df.to_excel('/content/drive/MyDrive/Bachelorarbeit/hyperparameter_tuning/final_run/yake_results.xlsx')

### RAKE

**Parameters to tune:**

*    **minCharacters:** Minimum characters allowed in a keyword (default = 1)
*    **maxWords:** Maximum number of words allowed in a phrase considered as a keyword (default = 5)
*    **minFrequency:** Minimum number of occurrences of a keyword in the text to be considered as a keyword (default = 1)

**Top hyperparameters:** 

*   **minCharacters:** 2
*   **maxWords:** 1
*   **minFrequency:** 1

In [None]:
# Hyperparameter grid for RAKE; every combination becomes one dict
params = dict(
    top_n=[10],
    minCharacters=[1, 2, 4],
    maxWords=[2, 3, 5],
    minFrequency=[1, 2, 3],
)
keys = tuple(params)
values = tuple(params.values())
# Cartesian product of all value lists, re-labelled with the parameter names
parameter_list = [dict(zip(keys, combination)) for combination in itertools.product(*values)]

# Extracts keywords with one RAKE configuration, scores every row, times the
# run, saves the DataFrame with the extracted keywords and returns the mean
# f1 score, the used parameters, the length of the df and the elapsed time.
def rake_tuning(parameter_list, df):
    """Evaluate one RAKE parameter combination on *df*.

    NOTE(review): every call writes to the same Excel path; when several
    calls run in parallel they presumably overwrite each other - confirm.
    The ``top_n`` entry of *parameter_list* is not forwarded to ``rake``.

    Args:
        parameter_list: Dict with the keys ``minCharacters``, ``maxWords``
            and ``minFrequency``.
        df: DataFrame with ``text`` and ``keywords`` columns; the columns
            ``extracted`` and ``f_score`` are added in place.

    Returns:
        Tuple ``(mean f-score, parameter_list, number of rows, seconds)``.
    """
    def extract_per_row(text, parameter_list = parameter_list):
        # Forward the configuration of this run to the RAKE wrapper.
        return rake(text,
                    minCharacters = parameter_list["minCharacters"],
                    maxWords = parameter_list["maxWords"],
                    minFrequency= parameter_list["minFrequency"])

    def bulk_evaluate(row, top_n = 10):
        # Exact-match F1 between stemmed gold and stemmed extracted keywords.
        gold = preprocess(row['keywords'], 'german')
        predicted = preprocess(row['extracted'], 'german')
        return f1_measure_k(assigned = gold, extracted = predicted, k = top_n, partial = False)

    started = time.perf_counter()
    df['extracted'] = df['text'].progress_apply(extract_per_row)
    df['f_score'] = df.apply(bulk_evaluate, axis=1)
    time_loop = time.perf_counter() - started

    df.to_excel('/content/drive/MyDrive/Bachelorarbeit/hyperparameter_tuning/extracted_keywords/rake_keywords.xlsx')

    mean_score = df['f_score'].mean()
    return mean_score, parameter_list, len(df), time_loop

def run_tuning(parameter_list, df):
    """Evaluate every RAKE parameter combination in a pool of 30 processes.

    Returns the result tuples of ``rake_tuning`` in completion order.
    """
    # Leaving the with-block waits until all submitted jobs are done.
    with ProcessPoolExecutor(max_workers=30) as pool:
        pending = [pool.submit(rake_tuning, combination, df) for combination in parameter_list]

    return [job.result() for job in concurrent.futures.as_completed(pending)]

# Run the RAKE grid on the tune set and write one result row per parameter
# combination to Excel. Note that this rebinds the notebook-level name `df`.
results_list = run_tuning(parameter_list, tune_df)
df = pd.DataFrame(results_list)
df.to_excel('/content/drive/MyDrive/Bachelorarbeit/hyperparameter_tuning/final_run/rake_results.xlsx')

### TextRank

**Parameters to tune:**

*   **window:** Window for connecting two words in the graph (default = 2)
*   **pos:** Set of valid pos for words to be considered as a node in the graph (default = {'NOUN', 'PROPN', 'ADJ'})

*   **top_percent:** Percentage of top vertices to keep for phrase generation (default = 0.33)

**Top hyperparameters:** 
*   **window:** 2
*   **pos:** {'NOUN', 'PROPN'}
*   **top_percent:** 0.33

In [None]:
# Hyperparameter grid for TextRank; every combination becomes one dict
params = dict(
    top_n=[10],
    window=[2, 3, 5],
    pos=[{'NOUN', 'PROPN'}, {'NOUN', 'PROPN', 'ADJ'}],
    top_percent=[0.1, 0.3, 0.5],
)
keys = tuple(params)
values = tuple(params.values())
# Cartesian product of all value lists, re-labelled with the parameter names
parameter_list = [dict(zip(keys, combination)) for combination in itertools.product(*values)]

def extract_all(df):
    """Extract keywords for every text of *df* in a process pool.

    ``extract_per_row`` is mapped over the ``text`` column in chunks of 4.
    Each result is printed together with its running number, after which the
    cell output is cleared. The results are stored in a new ``extracted``
    column of *df*, which is modified in place and returned.
    """
    texts = df['text'].tolist()
    keyword_list = []
    # executor.map yields the results in input order.
    with ProcessPoolExecutor() as executor:
        for res in executor.map(extract_per_row, texts, chunksize = 4):
            print(len(keyword_list), res)
            display.clear_output(wait=True)
            keyword_list.append(res)
    df['extracted'] = keyword_list
    return df


def extract_per_row(text, parameter_list = parameter_list[0]):
    """Run TextRank on *text* with one parameter combination.

    The default combination is the first entry of the grid as it exists
    when this function is defined.
    """
    settings = parameter_list
    return textrank(
        text,
        top_n = settings['top_n'],
        window = settings['window'],
        pos = settings['pos'],
        top_percent = settings['top_percent'],
    )

# Scores one row: exact-match F1 between the gold and the extracted keywords
def bulk_evaluate(row, top_n = 10):
    """Return the exact-match F1 at *top_n* for one DataFrame row.

    Both ``row['keywords']`` and ``row['extracted']`` are preprocessed with
    the German stemmer before they are compared.
    """
    return f1_measure_k(
        assigned = preprocess(row['keywords'], 'german'),
        extracted = preprocess(row['extracted'], 'german'),
        k = top_n,
        partial = False,
    )

# Extracts keywords with TextRank, scores every row, times the run, saves the
# DataFrame with the extracted keywords and returns the mean f1 score, the
# used parameters, the length of the df and the elapsed time.
def evaluate_textrank(df):
    """Evaluate TextRank on *df* with the first parameter combination.

    Args:
        df: DataFrame with ``text`` and ``keywords`` columns; the columns
            ``extracted`` and ``f_score`` are added in place.

    Returns:
        Tuple ``(mean f-score, used parameters, number of rows, seconds)``.
    """
    tic = time.perf_counter()
    df = extract_all(df)
    df['f_score'] = df.apply(bulk_evaluate, axis=1)
    time_loop = time.perf_counter() - tic

    df.to_excel('/content/drive/MyDrive/Bachelorarbeit/hyperparameter_tuning/extracted_keywords/textrank_keywords.xlsx')

    # extract_per_row is bound to the first grid entry, so report exactly that
    # configuration. Returning the whole grid made the results file useless
    # for telling which parameters produced the score.
    return df['f_score'].mean(), parameter_list[0], len(df), time_loop

# Run TextRank on the tuning set and save the result tuple
# (mean F1 score, parameters, number of rows, runtime) as a one-row Excel file.
results = evaluate_textrank(tune_df)
df = pd.DataFrame([results])
df.to_excel('/content/drive/MyDrive/Bachelorarbeit/hyperparameter_tuning/final_run/textrank_results.xlsx')

### SingleRank 

**Parameters to tune:**

*   **window:** Window within the sentence for connecting words in the graph (default = 10)
*   **redundancy_removal:** Boolean variable whether redundant keyphrases are filtered out from the n-best list using levenshtein distance (default = True)
*   **pos:** Set of valid POS tags for words to be considered as nodes in the graph (default = {'NOUN', 'PROPN', 'ADJ'})

**Top hyperparameters:** 
*   **window:** 12 
*   **redundancy_removal:** True
*   **pos:** {'NOUN', 'ADJ', 'PROPN'}


In [None]:
# SingleRank search space: every key maps to the candidate values to try.
params = {
    'top_n': [10],
    'window': [8, 10, 12],
    'normalized': [False],
    'redundancy_removal': [True, False],
    'pos': [{'NOUN', 'PROPN', 'ADJ'}],
}
keys = tuple(params)
values = tuple(params.values())
# Cartesian product of all candidate values -> one dict per combination.
parameter_list = [
    dict(zip(keys, combination))
    for combination in itertools.product(*values)
]

def extract_all(df):
    """Extract keywords for every row of ``df`` and store them in ``df['extracted']``.

    The texts in ``df['text']`` are handed to ``extract_per_row`` through a
    pool of four worker processes; results arrive in input order. The frame
    is modified in place and also returned.
    """
    texts = df['text'].tolist()
    row_labels = list(df.index.values)
    collected = []
    # Worker processes run the extraction in parallel to speed it up.
    with ProcessPoolExecutor(max_workers=4) as pool:
        outcomes = pool.map(extract_per_row, texts, chunksize=3)
        for _label, keywords in zip(row_labels, outcomes):
            # Progress display: number of finished rows and the latest result.
            print(len(collected), keywords)
            display.clear_output(wait=True)
            collected.append(keywords)
    df['extracted'] = collected
    return df


def extract_per_row(text, parameter_list = parameter_list[0]):
    """Run ``singlerank`` on one text.

    ``parameter_list`` is a single parameter dict; its default is the first
    combination of the grid, bound when this function is defined.
    """
    option_names = ("top_n", "window", "normalized", "redundancy_removal", "pos")
    options = {name: parameter_list[name] for name in option_names}
    return singlerank(text, **options)

def bulk_evaluate(row, top_n = 10):
    """Score one DataFrame row and return the value computed by ``f1_measure_k``.

    Reference keywords come from ``row['keywords']``, predictions from
    ``row['extracted']``; both are passed through ``preprocess(..., 'german')``.
    ``top_n`` is forwarded as ``k``; ``partial`` is fixed to False.
    """
    reference_tags = preprocess(row['keywords'], 'german')
    extracted_tags = preprocess(row['extracted'], 'german')
    score_args = dict(assigned=reference_tags, extracted=extracted_tags, k=top_n, partial=False)
    return f1_measure_k(**score_args)

def evaluate_singlerank(df):
    """Run SingleRank on ``df``, score every row and save the extracted keywords.

    Adds the columns ``extracted`` and ``f_score`` to ``df``, writes the frame
    to an Excel file, and measures the wall-clock time of extraction plus
    scoring.

    Returns:
        tuple: (mean f_score, parameter dict that was used, number of rows,
        elapsed seconds).
    """
    tic = time.perf_counter()
    df = extract_all(df)
    df['f_score'] = df.apply(bulk_evaluate, axis=1)
    time_loop = time.perf_counter() - tic

    df.to_excel('/content/drive/MyDrive/Bachelorarbeit/hyperparameter_tuning/extracted_keywords/singlerank_keywords.xlsx')
    # extract_per_row only ever uses parameter_list[0]; reporting the whole
    # grid would attribute this score to combinations that were never run.
    return df['f_score'].mean(), parameter_list[0], len(df), time_loop

# Run SingleRank on the tuning set and save the result tuple
# (mean F1 score, parameters, number of rows, runtime) as a one-row Excel file.
results = evaluate_singlerank(tune_df)
df = pd.DataFrame([results])
df.to_excel('/content/drive/MyDrive/Bachelorarbeit/hyperparameter_tuning/final_run/singlerank_results.xlsx')

### KeyBERT

**Parameters to tune:**

*   **keyphrase_ngram_range:** Minimum and maximum number of words per keyphrase (default = (1, 1))
*   **min_df:** Minimum frequency of words (default = 1)
*   **model:** BERT model (default = all-MiniLM-L6-v2)

**Top hyperparameters:**

*   **keyphrase_ngram_range:** (1, 1)
*   **min_df:** 1
*   **model:** paraphrase-multilingual-MiniLM-L12-v2

In [None]:
# KeyBERT search space: every key maps to the candidate values to try.
# Models are listed by the name of the global variable that holds them.
keybert_params = {
    'keyphrase_ngram_range': [(1, 2)],
    'top_n': [10],
    'min_df': [1, 2, 3],
    'model': ['kw_model_2', 'kw_model_3', 'kw_model_5', 'kw_model_6', 'kw_model_8'],
    'vectorizer': [vectorizer_keybert],
}
keys = tuple(keybert_params)
values = tuple(keybert_params.values())
# Cartesian product of all candidate values -> one dict per combination.
parameter_list = [
    dict(zip(keys, combination))
    for combination in itertools.product(*values)
]

def keybert_bulk_tuning(parameters, df):
    """Unpack one parameter dict and forward it to ``keybert_bulk``.

    ``parameters['model']`` is the *name* of a module-level variable; the
    object stored under that name is looked up and passed on as the model.
    Returns whatever ``keybert_bulk`` returns.
    """
    ngram_range, top_n, min_df = (
        parameters[key] for key in ("keyphrase_ngram_range", "top_n", "min_df")
    )
    # Resolve the model name to the global object of the same name.
    this_module = sys.modules[__name__]
    kw_model = getattr(this_module, parameters["model"])
    vectorizer = parameters["vectorizer"]

    return keybert_bulk(df, ngram_range, top_n, min_df, vectorizer, kw_model)

def keybert_bulk_evaluate(row, top_n = 10):
    """Score one DataFrame row and return the value computed by ``f1_measure_k``.

    Reference keywords (``row['keywords']``) and extracted keywords
    (``row['extracted']``) are passed through ``preprocess(..., 'german')``.
    ``top_n`` is forwarded as ``k``; unlike the other methods' evaluators
    this one calls ``f1_measure_k`` with ``partial=True``.
    """
    gold = preprocess(row['keywords'], 'german')
    predicted = preprocess(row['extracted'], 'german')
    return f1_measure_k(assigned=gold, extracted=predicted, k=top_n, partial=True)

def keybert_bulk_run(parameters, df):
    """Run KeyBERT with one parameter combination, score it and save the keywords.

    The extraction works on a private copy of ``df`` and the keywords are
    written to a file named after the model and ``min_df``: several of these
    runs execute concurrently in threads (see ``keybert_bulk_multi``), so a
    shared frame or a shared output file would be overwritten by other runs.

    Returns:
        tuple: (mean f_score, the parameter dict, number of rows,
        elapsed seconds for extraction plus scoring).
    """
    print('Started to extract keywords for ', str(parameters['model']), str(parameters['vectorizer']))
    tic = time.perf_counter()
    df_extracted = keybert_bulk_tuning(parameters, df.copy())
    df_extracted['f_score'] = df_extracted.apply(keybert_bulk_evaluate, axis=1)
    time_loop = time.perf_counter() - tic
    path = ('/content/drive/MyDrive/Bachelorarbeit/hyperparameter_tuning/extracted_keywords/'
            f"keybert_keywords_{parameters['model']}_min_df_{parameters['min_df']}.xlsx")
    df_extracted.to_excel(path)

    return df_extracted["f_score"].mean(), parameters, len(df), time_loop

def keybert_bulk_multi(df, parameter_list):
    """Run ``keybert_bulk_run`` once per parameter combination in a thread pool.

    Returns the list of result tuples, in the order in which
    ``as_completed`` yields the finished futures.
    """
    # Up to eight combinations run concurrently (threads, not processes).
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = [pool.submit(keybert_bulk_run, combo, df) for combo in parameter_list]

    # Leaving the with-block waits for all submitted runs to finish.
    return [finished.result() for finished in concurrent.futures.as_completed(pending)]

# Run every KeyBERT parameter combination on the tuning set and save one row
# per combination (mean F1 score, parameters, number of rows, runtime) to Excel.
results_list = keybert_bulk_multi(tune_df, parameter_list)
df_keybert = pd.DataFrame(results_list)
df_keybert.to_excel('/content/drive/MyDrive/Bachelorarbeit/hyperparameter_tuning/final_run/keybert_results.xlsx')

# Final Run

## TF-IDF

**Top hyperparameters:**
*   ngram_range: (1, 2)

In [None]:
# top hyperparameters after tuning
params = {
    'top_n': [10],
    'ngram_range': [(1, 2)]}

keys, values = zip(*params.items())
parameter_list = [dict(zip(keys, v)) for v in itertools.product(*values)]

# Build the TF-IDF matrix from test_df, the frame the final run scores:
# extract_per_row looks documents up by their row label, so a matrix built
# from tune_df would return the keywords of unrelated documents.
# The n-gram range is taken from the parameter grid instead of being repeated here.
tfidf_vectorizer_vectors, tfidf_vectorizer, tfidf_vectorizer_feature_names = tfidf_matrix(test_df, ngram_range = parameter_list[0]['ngram_range'])

def tfidf_tuning(parameter_list, 
                 df, 
                 tfidf_vectorizer_vectors=tfidf_vectorizer_vectors, 
                 tfidf_vectorizer=tfidf_vectorizer, 
                 tfidf_vectorizer_feature_names=tfidf_vectorizer_feature_names):
    """Extract TF-IDF keywords for every row of ``df``, score them and save them.

    Each row is looked up in the precomputed TF-IDF data by its row label
    (``row.name``). Adds the columns ``extracted`` and ``f_score`` to ``df``
    in place and writes the frame to the Drive folder used by the other
    final-run cells.

    Args:
        parameter_list: Parameter dict of this run; only reported back.
        df: Frame with the documents to process.
        tfidf_vectorizer_vectors, tfidf_vectorizer,
        tfidf_vectorizer_feature_names: Outputs of ``tfidf_matrix``; the
            defaults are the module-level objects at definition time.

    Returns:
        tuple: (mean f_score, ``parameter_list``, number of rows,
        elapsed seconds for extraction plus scoring).
    """
    tic = time.perf_counter()
    df['extracted'] = df.progress_apply(lambda x: extract_per_row(x.name, tfidf_vectorizer_vectors, tfidf_vectorizer, tfidf_vectorizer_feature_names), axis=1)
    df['f_score'] = df.apply(bulk_evaluate, axis=1)

    time_loop = time.perf_counter() - tic
    # Same absolute target folder as the YAKE/RAKE/TextRank/SingleRank final
    # runs; the former relative 'hyperparameter_tuning/...' path pointed at
    # the tuning outputs and depended on the working directory.
    df.to_excel('/content/drive/MyDrive/Bachelorarbeit/extracted_keywords/tfidf_keywords.xlsx')
    return df['f_score'].mean(), parameter_list, len(df), time_loop 

def extract_per_row(number_text, tfidf_vectorizer_vectors, tfidf_vectorizer, tfidf_vectorizer_feature_names, parameter_list = parameter_list):
    """Get the TF-IDF keywords of one document via ``tfidfvectorizer``.

    ``number_text`` identifies the document; the remaining positional
    arguments are forwarded unchanged. ``parameter_list`` is accepted but
    not used. The result is printed as progress output and returned.
    """
    call_args = {
        'number_text': number_text,
        'tfidf_vectorizer_vectors': tfidf_vectorizer_vectors,
        'tfidf_vectorizer': tfidf_vectorizer,
        'tfidf_vectorizer_feature_names': tfidf_vectorizer_feature_names,
    }
    found = tfidfvectorizer(**call_args)
    # Show the latest result; it is replaced as soon as new output arrives.
    print(found)
    display.clear_output(wait=True)
    return found

def bulk_evaluate(row, top_n = 10):
    """Score one DataFrame row and return the value computed by ``f1_measure_k``.

    ``row['keywords']`` (reference) and ``row['extracted']`` (prediction) are
    both passed through ``preprocess(..., 'german')`` before scoring.
    ``top_n`` is forwarded as ``k``; ``partial`` is fixed to False.
    """
    expected = preprocess(row['keywords'], 'german')
    produced = preprocess(row['extracted'], 'german')
    return f1_measure_k(assigned=expected, extracted=produced, k=top_n, partial=False)

def run_tuning(parameter_list):
    """Run ``tfidf_tuning`` on ``test_df`` once per parameter combination.

    The runs are submitted to a process pool with a single worker. Returns
    the list of result tuples, in the order in which ``as_completed`` yields
    the finished futures.
    """
    with ProcessPoolExecutor(max_workers=1) as pool:
        pending = [pool.submit(tfidf_tuning, combo, test_df) for combo in parameter_list]

    # Leaving the with-block waits for all submitted runs to finish.
    return [finished.result() for finished in concurrent.futures.as_completed(pending)]

# Run TF-IDF with the tuned parameters on the test set and save the result
# tuple (mean F1 score, parameters, number of rows, runtime) to Excel.
results_list = [tfidf_tuning(parameter_list[0], test_df)]
df = pd.DataFrame(results_list)
# Written next to the other methods' final-run results; the former relative
# 'hyperparameter_tuning/final_run/...' path pointed at the tuning outputs.
df.to_excel('/content/drive/MyDrive/Bachelorarbeit/final_run/tfidf_results.xlsx')

## YAKE

**Top hyperparameters:** 
*   **n:** 1
*   **dedupLim:** 0.8
*   **dedupFunc:** 'leve'
*   **windowsSize:** 3

In [None]:
# YAKE: best hyperparameters found during tuning (one value per key).
params = {
    'top_n': [10],
    'n': [1],
    'dedupLim': [0.8],
    'dedupFunc': ['leve'],
    'windowsSize': [3],
}
keys = tuple(params)
values = tuple(params.values())
# Cartesian product of all values -> list of parameter dicts (a single one here).
parameter_list = [
    dict(zip(keys, combination))
    for combination in itertools.product(*values)
]

def yake_tuning(parameter_list, df):
    """Run YAKE with one parameter dict on ``df``, score it and save the keywords.

    Adds the columns ``extracted`` and ``f_score`` to ``df`` and writes the
    frame to an Excel file.

    Returns:
        tuple: (mean f_score, ``parameter_list``, number of rows,
        elapsed seconds for extraction plus scoring).
    """
    def extract_per_row(text, parameter_list = parameter_list):
        # Forward exactly the options that the yake wrapper is called with.
        option_names = ("top_n", "n", "dedupLim", "dedupFunc", "windowsSize")
        options = {name: parameter_list[name] for name in option_names}
        return yake(text, **options)

    def bulk_evaluate(row, top_n = 10):
        # Score one row with f1_measure_k after preprocessing both keyword lists.
        gold = preprocess(row['keywords'], 'german')
        predicted = preprocess(row['extracted'], 'german')
        return f1_measure_k(assigned=gold, extracted=predicted, k=top_n, partial=False)

    started = time.perf_counter()
    df['extracted'] = df['text'].progress_apply(lambda document: extract_per_row(text=document))
    df['f_score'] = df.apply(bulk_evaluate, axis=1)
    elapsed = time.perf_counter() - started
    df.to_excel('/content/drive/MyDrive/Bachelorarbeit/extracted_keywords/yake_keywords.xlsx')

    return df['f_score'].mean(), parameter_list, len(df), elapsed

def run_tuning(parameter_list, df):
    """Run ``yake_tuning`` on ``df`` once per parameter combination.

    The runs are submitted to a process pool (at most 30 workers). Returns
    the list of result tuples, in the order in which ``as_completed`` yields
    the finished futures.
    """
    with ProcessPoolExecutor(max_workers=30) as pool:
        pending = [pool.submit(yake_tuning, combo, df) for combo in parameter_list]

    # Leaving the with-block waits for all submitted runs to finish.
    return [finished.result() for finished in concurrent.futures.as_completed(pending)]

# Run YAKE with the tuned parameters on the test set and save one row per
# parameter combination (mean F1 score, parameters, number of rows, runtime) to Excel.
results_list = run_tuning(parameter_list, test_df)
df = pd.DataFrame(results_list)
df.to_excel('/content/drive/MyDrive/Bachelorarbeit/final_run/yake_results.xlsx')

## RAKE

**Top hyperparameters:** 

*   **minCharacters:** 2
*   **maxWords:** 1
*   **minFrequency:** 1

In [None]:
# RAKE: best hyperparameters found during tuning (one value per key).
params = {
    'top_n': [10],
    'minCharacters': [2],
    'maxWords': [1],
    'minFrequency': [1],
}
keys = tuple(params)
values = tuple(params.values())
# Cartesian product of all values -> list of parameter dicts (a single one here).
parameter_list = [
    dict(zip(keys, combination))
    for combination in itertools.product(*values)
]

def rake_tuning(parameter_list, df):
    """Run RAKE with one parameter dict on ``df``, score it and save the keywords.

    Adds the columns ``extracted`` and ``f_score`` to ``df`` and writes the
    frame to an Excel file. Note that ``top_n`` from the parameter dict is
    not passed to ``rake``.

    Returns:
        tuple: (mean f_score, ``parameter_list``, number of rows,
        elapsed seconds for extraction plus scoring).
    """
    def extract_per_row(text, parameter_list = parameter_list):
        # Forward exactly the options that the rake wrapper is called with.
        option_names = ("minCharacters", "maxWords", "minFrequency")
        options = {name: parameter_list[name] for name in option_names}
        return rake(text, **options)

    def bulk_evaluate(row, top_n = 10):
        # Score one row with f1_measure_k after preprocessing both keyword lists.
        gold = preprocess(row['keywords'], 'german')
        predicted = preprocess(row['extracted'], 'german')
        return f1_measure_k(assigned=gold, extracted=predicted, k=top_n, partial=False)

    started = time.perf_counter()
    df['extracted'] = df['text'].progress_apply(lambda document: extract_per_row(text=document))
    df['f_score'] = df.apply(bulk_evaluate, axis=1)
    elapsed = time.perf_counter() - started
    df.to_excel('/content/drive/MyDrive/Bachelorarbeit/extracted_keywords/rake_keywords.xlsx')

    return df['f_score'].mean(), parameter_list, len(df), elapsed

def run_tuning(parameter_list, df):
    """Run ``rake_tuning`` on ``df`` once per parameter combination.

    The runs are submitted to a process pool (at most 30 workers). Returns
    the list of result tuples, in the order in which ``as_completed`` yields
    the finished futures.
    """
    with ProcessPoolExecutor(max_workers=30) as pool:
        pending = [pool.submit(rake_tuning, combo, df) for combo in parameter_list]

    # Leaving the with-block waits for all submitted runs to finish.
    return [finished.result() for finished in concurrent.futures.as_completed(pending)]

# Run RAKE with the tuned parameters on the test set and save one row per
# parameter combination (mean F1 score, parameters, number of rows, runtime) to Excel.
results_list = run_tuning(parameter_list, test_df)
df = pd.DataFrame(results_list)
df.to_excel('/content/drive/MyDrive/Bachelorarbeit/final_run/rake_results.xlsx')

## TextRank

**Top hyperparameters:** 
*   **window:** 2
*   **pos:** {'NOUN', 'PROPN'}
*   **top_percent:** 0.33

In [None]:
# TextRank: best hyperparameters found during tuning (one value per key).
params = {
    'top_n': [10],
    'window': [2],
    'pos': [{'NOUN', 'PROPN'}],
    'top_percent': [0.33],
}
keys = tuple(params)
values = tuple(params.values())
# Cartesian product of all values -> list of parameter dicts (a single one here).
parameter_list = [
    dict(zip(keys, combination))
    for combination in itertools.product(*values)
]

def extract_all(df):
    """Extract keywords for every row of ``df`` and store them in ``df['extracted']``.

    The texts in ``df['text']`` are sent to ``extract_per_row`` through a
    process pool; results come back in input order. The frame is modified
    in place and also returned.
    """
    texts = df['text'].tolist()
    row_labels = list(df.index.values)
    extracted = []
    # Worker processes run the extraction in parallel to speed it up.
    with ProcessPoolExecutor() as pool:
        per_row = pool.map(extract_per_row, texts, chunksize=4)
        for done, (_label, keywords) in enumerate(zip(row_labels, per_row)):
            # Progress display: number of finished rows and the latest result.
            print(done, keywords)
            display.clear_output(wait=True)
            extracted.append(keywords)
    df['extracted'] = extracted
    return df


def extract_per_row(text, parameter_list = parameter_list[0]):
    """Run ``textrank`` on one text.

    ``parameter_list`` is a single parameter dict; its default is the tuned
    combination, bound when this function is defined.
    """
    option_names = ('top_n', 'window', 'pos', 'top_percent')
    options = {name: parameter_list[name] for name in option_names}
    return textrank(text, **options)

def bulk_evaluate(row, top_n = 10):
    """Score one DataFrame row and return the value computed by ``f1_measure_k``.

    Both the reference keywords (``row['keywords']``) and the extracted ones
    (``row['extracted']``) go through ``preprocess(..., 'german')`` first.
    ``top_n`` is forwarded as ``k``; ``partial`` is fixed to False.
    """
    gold = preprocess(row['keywords'], 'german')
    predicted = preprocess(row['extracted'], 'german')
    return f1_measure_k(assigned=gold, extracted=predicted, k=top_n, partial=False)


def evaluate_textrank(df):
    """Run TextRank on ``df``, score every row and save the extracted keywords.

    Adds the columns ``extracted`` and ``f_score`` to ``df``, writes the frame
    to an Excel file, and measures the wall-clock time of extraction plus
    scoring.

    Returns:
        tuple: (mean f_score, parameter dict that was used, number of rows,
        elapsed seconds).
    """
    tic = time.perf_counter()
    df = extract_all(df)
    df['f_score'] = df.apply(bulk_evaluate, axis=1)
    time_loop = time.perf_counter() - tic

    df.to_excel('/content/drive/MyDrive/Bachelorarbeit/extracted_keywords/textrank_keywords.xlsx')
    # Report the single dict that extract_per_row uses (parameter_list[0]),
    # matching the YAKE/RAKE/KeyBERT result tuples, not the enclosing list.
    return df['f_score'].mean(), parameter_list[0], len(df), time_loop

# Run TextRank with the tuned parameters on the test set and save the result
# tuple (mean F1 score, parameters, number of rows, runtime) as a one-row Excel file.
results = evaluate_textrank(test_df)
df = pd.DataFrame([results])
df.to_excel('/content/drive/MyDrive/Bachelorarbeit/final_run/textrank_results.xlsx')

## SingleRank

**Top hyperparameters:** 
*   **window:** 12 
*   **redundancy_removal:** True
*   **pos:** {'NOUN', 'ADJ', 'PROPN'}

In [None]:
# SingleRank: best hyperparameters found during tuning (one value per key).
params = {
    'top_n': [10],
    'window': [12],
    'normalized': [False],
    'redundancy_removal': [True],
    'pos': [{'NOUN', 'PROPN', 'ADJ'}],
}
keys = tuple(params)
values = tuple(params.values())
# Cartesian product of all values -> list of parameter dicts (a single one here).
parameter_list = [
    dict(zip(keys, combination))
    for combination in itertools.product(*values)
]

def extract_all(df):
    """Extract keywords for every row of ``df`` and store them in ``df['extracted']``.

    The texts in ``df['text']`` are handed to ``extract_per_row`` through a
    pool of four worker processes; results arrive in input order. The frame
    is modified in place and also returned.
    """
    texts = df['text'].tolist()
    row_labels = list(df.index.values)
    collected = []
    # Worker processes run the extraction in parallel to speed it up.
    with ProcessPoolExecutor(max_workers=4) as pool:
        outcomes = pool.map(extract_per_row, texts, chunksize=3)
        for _label, keywords in zip(row_labels, outcomes):
            # Progress display: number of finished rows and the latest result.
            print(len(collected), keywords)
            display.clear_output(wait=True)
            collected.append(keywords)
    df['extracted'] = collected
    return df


def extract_per_row(text, parameter_list = parameter_list[0]):
    """Run ``singlerank`` on one text.

    ``parameter_list`` is a single parameter dict; its default is the tuned
    combination, bound when this function is defined.
    """
    option_names = ("top_n", "window", "normalized", "redundancy_removal", "pos")
    options = {name: parameter_list[name] for name in option_names}
    return singlerank(text, **options)

def bulk_evaluate(row, top_n = 10):
    """Score one DataFrame row and return the value computed by ``f1_measure_k``.

    Reference keywords come from ``row['keywords']``, predictions from
    ``row['extracted']``; both are passed through ``preprocess(..., 'german')``.
    ``top_n`` is forwarded as ``k``; ``partial`` is fixed to False.
    """
    reference_tags = preprocess(row['keywords'], 'german')
    extracted_tags = preprocess(row['extracted'], 'german')
    score_args = dict(assigned=reference_tags, extracted=extracted_tags, k=top_n, partial=False)
    return f1_measure_k(**score_args)

def evaluate_singlerank(df):
    """Run SingleRank on ``df``, score every row and save the extracted keywords.

    Adds the columns ``extracted`` and ``f_score`` to ``df``, writes the frame
    to an Excel file, and measures the wall-clock time of extraction plus
    scoring.

    Returns:
        tuple: (mean f_score, parameter dict that was used, number of rows,
        elapsed seconds).
    """
    tic = time.perf_counter()
    df = extract_all(df)
    df['f_score'] = df.apply(bulk_evaluate, axis=1)
    time_loop = time.perf_counter() - tic

    df.to_excel('/content/drive/MyDrive/Bachelorarbeit/extracted_keywords/singlerank_keywords.xlsx')
    # Report the single dict that extract_per_row uses (parameter_list[0]),
    # matching the YAKE/RAKE/KeyBERT result tuples, not the enclosing list.
    return df['f_score'].mean(), parameter_list[0], len(df), time_loop

# Run SingleRank with the tuned parameters on the test set and save the result
# tuple (mean F1 score, parameters, number of rows, runtime) as a one-row Excel file.
results = evaluate_singlerank(test_df)
df = pd.DataFrame([results])
df.to_excel('/content/drive/MyDrive/Bachelorarbeit/final_run/singlerank_results.xlsx')

## KeyBERT

**Top hyperparameters:**

*   **keyphrase_ngram_range:** (1, 1)
*   **min_df:** 1
*   **model:** paraphrase-multilingual-MiniLM-L12-v2 (kw_model_8); distiluse-base-multilingual-cased-v2 (kw_model_6)
*   **vectorizer:** KeyphraseCountVectorizer(spacy_pipeline='de_core_news_sm') (vectorizer_keybert)

In [None]:
# KeyBERT: best hyperparameters found during tuning.
# Models are listed by the name of the global variable that holds them.
keybert_params = {
    'keyphrase_ngram_range': [(1, 1)],
    'top_n': [10],
    'min_df': [1],
    'model': ['kw_model_6', 'kw_model_8'],  # We take the top 2 best performing language models.
    'vectorizer': [vectorizer_keybert],
}
keys = tuple(keybert_params)
values = tuple(keybert_params.values())
# Cartesian product of all values -> one parameter dict per model.
parameter_list = [
    dict(zip(keys, combination))
    for combination in itertools.product(*values)
]

def keybert_bulk_tuning(parameters, df):
    """Unpack one parameter dict and forward it to ``keybert_bulk``.

    ``parameters['model']`` is the *name* of a module-level variable; the
    object stored under that name is looked up and passed on as the model.
    Returns whatever ``keybert_bulk`` returns.
    """
    ngram_range, top_n, min_df = (
        parameters[key] for key in ("keyphrase_ngram_range", "top_n", "min_df")
    )
    # Resolve the model name to the global object of the same name.
    this_module = sys.modules[__name__]
    kw_model = getattr(this_module, parameters["model"])
    vectorizer = parameters["vectorizer"]

    return keybert_bulk(df, ngram_range, top_n, min_df, vectorizer, kw_model)

def keybert_bulk_evaluate(row, top_n = 10):
    """Score one DataFrame row and return the value computed by ``f1_measure_k``.

    Reference keywords (``row['keywords']``) and extracted keywords
    (``row['extracted']``) are passed through ``preprocess(..., 'german')``.
    ``top_n`` is forwarded as ``k``; unlike the other methods' evaluators
    this one calls ``f1_measure_k`` with ``partial=True``.
    """
    gold = preprocess(row['keywords'], 'german')
    predicted = preprocess(row['extracted'], 'german')
    return f1_measure_k(assigned=gold, extracted=predicted, k=top_n, partial=True)

def keybert_bulk_run(parameters, df):
    """Run KeyBERT with one parameter combination, score it and save the keywords.

    The extraction works on a private copy of ``df`` and the keywords are
    written to a file named after the model and ``min_df``: the runs for the
    different models execute concurrently in threads (see
    ``keybert_bulk_multi``), so a shared frame or a shared output file would
    be overwritten by whichever run finishes last.

    Returns:
        tuple: (mean f_score, the parameter dict, number of rows,
        elapsed seconds for extraction plus scoring).
    """
    print('Started to extract keywords for ', str(parameters['model']), str(parameters['vectorizer']))
    tic = time.perf_counter()
    df_extracted = keybert_bulk_tuning(parameters, df.copy())
    df_extracted['f_score'] = df_extracted.apply(keybert_bulk_evaluate, axis=1)
    time_loop = time.perf_counter() - tic
    path = ('/content/drive/MyDrive/Bachelorarbeit/extracted_keywords/'
            f"keybert_keywords_{parameters['model']}_min_df_{parameters['min_df']}.xlsx")
    df_extracted.to_excel(path)

    return df_extracted["f_score"].mean(), parameters, len(df), time_loop

def keybert_bulk_multi(df, parameter_list):
    """Run ``keybert_bulk_run`` once per parameter combination in a thread pool.

    Returns the list of result tuples, in the order in which
    ``as_completed`` yields the finished futures.
    """
    # Up to eight combinations run concurrently (threads, not processes).
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = [pool.submit(keybert_bulk_run, combo, df) for combo in parameter_list]

    # Leaving the with-block waits for all submitted runs to finish.
    return [finished.result() for finished in concurrent.futures.as_completed(pending)]

# Run KeyBERT with the tuned parameters (one run per model) on the test set and
# save one row per run (mean F1 score, parameters, number of rows, runtime) to Excel.
results_list = keybert_bulk_multi(test_df, parameter_list)
df_keybert = pd.DataFrame(results_list)
df_keybert.to_excel('/content/drive/MyDrive/Bachelorarbeit/final_run/keybert_results.xlsx')