In [1]:
import pandas as pd
import numpy as np
from typing import *
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data

In [2]:
df = pd.read_csv('../../dataset/movies_complete.csv')

# Features

We use different features:
1. Tfidf-Features (different params)
    1. on raw text
    2. on lemmas
2. Fasttext-Embedding (mean, max, min pooled)
3. Bert-Embeddings (mean, max, min pooled)
4. DistilBert-Embeddings (Head and Tail)
5. All other embeddings concatenated

In [3]:
from pathlib import Path
import numpy as np

def read_embeddings(embedding_file: str):
    data = Path(embedding_file).read_text()
    lines = data.split('\n')
    filenames = []
    embeddings = []
    for line in lines:
        line_data = line.split(' ')
        if len(line_data) >= 2:
            filenames.append(line_data[0])
            embeddings.append(list(map(float, line_data[1:])))
    return np.asarray(filenames), np.asarray(embeddings)

In [4]:
mean_filenames, embeddings_mean = read_embeddings('../../dataset/embeddings_mean.txt')
max_filenames, embeddings_max = read_embeddings('../../dataset/embeddings_max.txt')
min_filenames, embeddings_min = read_embeddings('../../dataset/embeddings_min.txt')

bert_mean_filenames, bert_mean = read_embeddings('../../dataset/embeddings_bert_mean.txt')
bert_max_filenames, bert_max = read_embeddings('../../dataset/embeddings_bert_max.txt')
bert_min_filenames, bert_min = read_embeddings('../../dataset/embeddings_bert_min.txt')

bert_headtail_filenames, bert_headtail = read_embeddings('../../dataset/embeddings_bert_headtail.txt')

In [5]:
bert_headtail[bert_headtail.shape == (0,)] = np.zeros((768,))

In [6]:
mean_embeddings = []
max_embeddings = []
min_embeddings = []

mean_bert = []
max_bert = []
min_bert = []
ht_bert = []
for index, row in df.iterrows():
    mean_embeddings.append(embeddings_mean[np.where(mean_filenames == row.filename)].ravel())
    max_embeddings.append(embeddings_max[np.where(max_filenames == row.filename)].ravel())
    min_embeddings.append(embeddings_min[np.where(min_filenames == row.filename)].ravel())
    
    mean_bert.append((bert_mean[np.where(bert_mean_filenames == row.filename)].ravel()))
    max_bert.append((bert_max[np.where(bert_max_filenames == row.filename)].ravel()))
    min_bert.append((bert_min[np.where(bert_min_filenames == row.filename)].ravel()))
    
    ht_bert.append((bert_headtail[np.where(bert_headtail_filenames == row.filename)].ravel()))
    
df['fasttext_mean'] = mean_embeddings
df['fasttext_max'] = max_embeddings
df['fasttext_min'] = min_embeddings

df['bert_mean'] = mean_bert
df['bert_max'] = max_bert
df['bert_min'] = min_bert

mean_embeddings = np.asarray(mean_embeddings)
max_embeddings = np.asarray(max_embeddings)
min_embeddings = np.asarray(min_embeddings)

mean_bert = np.asarray(mean_bert)
max_bert = np.asarray(max_bert)
min_bert = np.asarray(min_bert)

ht_bert = np.asarray(ht_bert)


del embeddings_mean, embeddings_max, embeddings_min, bert_mean, bert_max, bert_min, bert_headtail

In [7]:
embedding_comb = np.hstack([
    mean_embeddings,
    max_embeddings,
    min_embeddings,
    mean_bert,
    max_bert,
    min_bert,
])

In [8]:
from stop_words import get_stop_words
from itertools import repeat

tfidf_params = {
    'max_features': [500, 1000, 2500, 5000, 10000, 15000, 20000],
    'max_df': np.linspace(0, 1, 10),
    'stop_words': [get_stop_words('de'), None],
    'ngram_range': [(i, j) for i, j in zip(repeat(1), range(1, 6))]
}

tfidf_params

{'max_features': [500, 1000, 2500, 5000, 10000, 15000, 20000],
 'max_df': array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
        0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ]),
 'stop_words': [['aber',
   'alle',
   'allem',
   'allen',
   'aller',
   'alles',
   'als',
   'also',
   'am',
   'an',
   'ander',
   'andere',
   'anderem',
   'anderen',
   'anderer',
   'anderes',
   'anderm',
   'andern',
   'anders',
   'auch',
   'auf',
   'aus',
   'bei',
   'bin',
   'bis',
   'bist',
   'da',
   'damit',
   'dann',
   'das',
   'dass',
   'dasselbe',
   'dazu',
   'daß',
   'dein',
   'deine',
   'deinem',
   'deinen',
   'deiner',
   'deines',
   'dem',
   'demselben',
   'den',
   'denn',
   'denselben',
   'der',
   'derer',
   'derselbe',
   'derselben',
   'des',
   'desselben',
   'dessen',
   'dich',
   'die',
   'dies',
   'diese',
   'dieselbe',
   'dieselben',
   'diesem',
   'diesen',
   'dieser',
   'dieses',
   'dir',
   'doch',
   'dor

# Define custom evaluation scores

In [9]:
def cluster_size_std(labels):
    cluster, cluster_sizes = np.unique(labels, return_counts=True)
    return cluster_sizes.std()


# Define tuning function

We do not use sklearns GridsearchCV class since we do not want to use cross-validation.
Rather we use metrics which are computed on the clusters found while training.

In [10]:
from sklearn.base import ClusterMixin
from tqdm import tqdm
from sklearn.base import clone as clone_estimator
from time import time

def _call_scoring(entry, score, data):
    try:
        entry[score.__name__] = score(data)
    except Exception as e:
        entry['Exception'] = e
    return entry

def tune(clu: ClusterMixin,
         X: np.ndarray,
         params: ParameterGrid,
         scoring: callable = None,
         verbose: bool = True):
    
    results = []
    params = tqdm(params) if verbose else params
    for param_comb in params:
        entry = {**param_comb}
        clu_instance = clone_estimator(clu)
        
        fit_start = time()
        clu_instance.fit(X)
        fit_time = time() - fit_start
        entry['fit_time'] = fit_time
        
        labels = clu_instance.predict(X)
        
        if scoring:
            if isinstance(scoring, list):
                for score in scoring:
                    _call_scoring(entry=entry, score=score, data=labels)
            if callable(scoring):
                _call_scoring(entry=entry, score=scoring, data=labels)
            else:
                raise Exception('Invalid scoring parameter passed')
        else:
            if hasattr(clu_instance, 'score') and callable(clu_instance.score):
                _call_scoring(entry=entry, score=clu_instance.score, data=X)
        results.append(entry)
        
    return pd.DataFrame.from_records(results)

# Define some helpers

In [11]:
def merge_params_dicts(*args, sep='__'):
    merged = {}
    for entry in args:
        if isinstance(entry, tuple):
            prefix = entry[0]
            merged.update({prefix + sep + key: value for key, value in entry[1].items()})
        if isinstance(entry, dict):
            merged.update(entry)
    return merged

In [12]:
merge_params_dicts(('pca', {'n_components': 2}), ('kmeans', {'n_cluster': 2}))

{'pca__n_components': 2, 'kmeans__n_cluster': 2}

# 1. Clustering-Algorithm: KMeans

In [13]:
kmeans_params = {
    'max_iter': [300, 500, 1000],
    'n_cluster': list(range(1, 21, 3)),  
}

In [14]:
from sklearn.cluster import KMeans


kmeans = KMeans()
# monkey patch kmeans score method to get inertia
def get_intertia(self):
    return self.inertia_
kmeans.score = get_intertia

kmeans_tdidf = make_pipeline(TfidfVectorizer(), kmeans)
kmeans_tdidf

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()), ('kmeans', KMeans())])

In [15]:
kmean_tfidf_params = ParameterGrid(
    merge_params_dicts(('tfidfvectorizer', tfidf_params), ('kmeans', kmeans_params))
)

In [17]:
kmean_tfidf_rawtext = tune(clu=kmeans_tdidf,
                           X=df.text,
                           params=kmean_tfidf_params,
                           verbose=True)
kmean_tfidf_rawtext.to_csv('../../Results/KMeans_Tfidf_Rawtext.csv')

100%|██████████| 1/1 [01:17<00:00, 77.34s/it]


In [None]:
kmean_tfidf_lemma = tune(clu=kmeans_tdidf,
                           X=df.lemma,
                           params=kmean_tfidf_params,
                           verbose=True)
kmean_tfidf_lemma.to_csv('../../Results/KMeans_Tfidf_Lemma.csv')