In [46]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
from sklearn.cluster import DBSCAN
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words
from sklearn.base import clone as clone_estimator
from tqdm.notebook import tqdm
from collections import Counter

In [47]:
df = pd.read_csv('../dataset/movies_complete.csv')

# Tuning DBScan

We want to tune the hyperparamter of DBScan.
Since choosing a clustering perfomance metric is not a trivial task our first objective is to get balanced sized clusters.
To achieve this we consider the std-deviation of the number of instances per cluster. Since DBScan marks points which could not be assigned to one cluster as noise we also want to reduce the number of points marked as noise. Also we want to get more than one Cluster (ignoring the noise cluster)

$perf_{dbscan}(labels) = (\frac{N_{noise}}{N})*std(labels) * log(N_{cluster})$

In [146]:
def get_noise_ratio(labels):
    noise = labels[labels == -1]
    return noise.shape[0] / labels.shape[0]

def dbscan_objective(labels):
    noise_ratio = get_noise_ratio(labels) +  0.0001
    # get clusters
    cluster = labels[labels != -1]
    n_cluster = len(cluster)
    # get cluster distribution
    bins = np.array(list(Counter(cluster).values()))
    cluster_size_std = bins.std() if len(bins) > 2 else 0.0001
    return (1- noise_ratio) * n_cluster * np.abs(np.log(cluster_size_std))

In [149]:
import random

labels = np.array([random.randint(-1, 5) for _ in range(100)])
#labels = np.array([-1 for _ in range(100)])
dbscan_objective(labels)

76.17753448333968

# DBScan-Parameter:

In [150]:
params = ParameterGrid({
    'metric': ['cosine', 'euclidean', 'manhattan'],
    'eps': np.linspace(0, 5, 10)[1:],
    'min_samples': list(range(1,20)) 
})

In [164]:
def run(clu: DBSCAN, params: ParameterGrid, X: np.ndarray, verbose=True) -> pd.DataFrame:
    results = []
    params = tqdm(params) if verbose else params
    for param_comb in params:
        labels = clone_estimator(clu).fit_predict(X)
        score = dbscan_objective(labels)
        noise_ratio = get_noise_ratio(labels)
        param_comb['score'] = score
        param_comb['noise_ration'] = noise_ratio
        params.set_description(f'{score}|{noise_ratio}')
        results.append(param_comb)
    return pd.DataFrame.from_records(results)

# 1. Run
We use only Tfidf-Features

In [158]:
tfidf = TfidfVectorizer(max_df=0.5, max_features=10000, stop_words=get_stop_words('de'))
Xtfidf = tfidf.fit_transform(df.text).todense()

In [None]:
tifidf_simple = run(clu=DBSCAN(n_jobs=12), params=params, X=Xtfidf, verbose=True)
tfidf_simple.to_csv('../Results/DBScan_Tfidf_Simple.csv')

HBox(children=(FloatProgress(value=0.0, max=513.0), HTML(value='')))