# Unsupervised Classification

In [1]:
# PARAMS
# Language code and embedding type; both are used further down to build the
# dataset path and the name of the results CSV.
lang = 'en'
embeddings = "doc2vec"

# IPython tab-completion settings for this session: greedy completer on, jedi off.
%config IPCompleter.greedy=True
%config IPCompleter.use_jedi=False

# Helper used to wipe a cell's output (e.g. after noisy install logs).
from IPython.display import clear_output

In [2]:
!pip install scikit-learn

clear_output

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


<function IPython.core.display.clear_output(wait=False)>

In [3]:
# Imports the OS library
import os

# Imports the time library
from time import time

# Imports numpy
import numpy as np

import pandas as pd

# Imports the document class
from document import Document

# Import TQDM for time measurements
from tqdm import tqdm

# Imports scikit learn
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

from sklearn.preprocessing import LabelEncoder

# Imports matplotlib
import matplotlib.pyplot as plt
from matplotlib import ticker

# Imports wordcloud
from wordcloud import WordCloud

## Load Data

In [4]:
# Directory holding the pre-computed embedding matrices for the chosen
# language / embedding combination.
path_to_data = f"../../data/datasets/{lang}/{embeddings}/"

# Load the three splits in the order train, test, val.
X_train, X_test, X_val = (
    np.load(f"{path_to_data}X_{split}_{embeddings}_{lang}.npy")
    for split in ("train", "test", "val")
)



## Model Selection

In [5]:
# Candidate numbers of clusters to evaluate.
clusters_list = [2, 3, 4, 5, 10, 15, 20, 25, 50, 100]

# Fixed seed for the KMeans centroid initialisation so that the scores below
# are reproducible across kernel restarts.
random_state = 42

# One entry per element of clusters_list: silhouette, Calinski-Harabasz and
# Davies-Bouldin scores respectively.
ss_list, ch_list, db_list = [], [], []

for n_clusters in clusters_list:

    print(f"Training model with {n_clusters} clusters")

    # Creates the model and fits it on the training split
    k_model = KMeans(n_clusters=n_clusters, random_state=random_state)
    k_model.fit(X_train)

    # Infers cluster assignments for the held-out test split
    y_pred = k_model.predict(X_test)

    # Calculates the internal clustering metrics on the test split
    ss_score = silhouette_score(X_test, y_pred)
    ch_index = calinski_harabasz_score(X_test, y_pred)
    db_score = davies_bouldin_score(X_test, y_pred)

    # Prints metrics for model
    print("\t Got an silhouette score of {:4.2f}".format(ss_score))
    print("\t Got an Calinski-Harabasz index of {:4.2f}".format(ch_index))
    print("\t Got an Davies-Bouldin index of {:4.2f}".format(db_score))

    # Keeps the scores so they can be tabulated afterwards
    ss_list.append(ss_score)
    ch_list.append(ch_index)
    db_list.append(db_score)
    

Training model with 2 clusters
	 Got an silhouette score of 0.76
	 Got an Calinski-Harabasz index of 30.19
	 Got an Davies-Bouldin index of 0.17
Training model with 3 clusters
	 Got an silhouette score of 0.17
	 Got an Calinski-Harabasz index of 54.23
	 Got an Davies-Bouldin index of 1.77
Training model with 4 clusters
	 Got an silhouette score of 0.16
	 Got an Calinski-Harabasz index of 54.75
	 Got an Davies-Bouldin index of 1.79
Training model with 5 clusters
	 Got an silhouette score of 0.08
	 Got an Calinski-Harabasz index of 50.25
	 Got an Davies-Bouldin index of 3.16
Training model with 10 clusters
	 Got an silhouette score of 0.02
	 Got an Calinski-Harabasz index of 31.76
	 Got an Davies-Bouldin index of 4.09
Training model with 15 clusters
	 Got an silhouette score of 0.02
	 Got an Calinski-Harabasz index of 31.83
	 Got an Davies-Bouldin index of 4.01
Training model with 20 clusters
	 Got an silhouette score of -0.00
	 Got an Calinski-Harabasz index of 23.60
	 Got an Davies-Bou

In [6]:
# Gather the per-model metric lists into a single mapping, one column per
# metric, aligned with the list of cluster counts.
results_dict = dict(
    clusters=clusters_list,
    sh_score=ss_list,
    ch_score=ch_list,
    db_score=db_list,
)

# Tabulate the results: one row per evaluated number of clusters.
results_df = pd.DataFrame(data=results_dict)

In [7]:
# Persist the metrics table next to the notebook, named after the language
# and embedding type used for this run.
results_df.to_csv(f"kmeans_{lang}_{embeddings}.csv")