# Preamble

In [None]:
# Locations of the ipw package sources and of the input data on the local machine.
CODE_PATH = r'C:/Git/HonoursProject/ipw-clusterer/ipw_clusterer/'
DATA_PATH = r'C:/Git/HonoursProject/ipw-clusterer/data/'

EDA = True #Show Exploratory Data Analysis
SHOW_MODEL_PLOTS = True #Show model plots

In [None]:
# add source python files to project
import sys
sys.path.insert(0, CODE_PATH) 
import ipw

In [None]:
#import io
import logging
import numpy as np
import pandas as pd
#import re
#import math

In [None]:
# Write log records of level DEBUG and above to log.txt, each prefixed with timestamp and level.
logging.basicConfig(filename = 'log.txt', level = logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')

# Load the data

In [None]:
# Read the raw records from DATA_PATH; the cells below use the result as a
# pandas DataFrame with (at least) a 'status' column.
df = ipw.parse.read(DATA_PATH)

## Number of records

In [None]:
if EDA:
    # Overall size first, then the breakdown per status (missing statuses included)
    status_counts = df['status'].value_counts(dropna=False)
    print(f'Total records: {len(df)}')
    print('Records per status:')
    print(status_counts)

In [None]:
# Select closed records. The status column is constant afterwards, so it is
# dropped. Filtering and dropping in one chain yields a new frame instead of
# mutating a slice of the original in place (avoids SettingWithCopyWarning).
df = df[df['status'] == 'closed'].drop(columns='status')

## Clean the text fields

In [None]:
# Clean the text fields of the selected records (implementation lives in ipw.parse).
df = ipw.parse.clean(df)

# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt

## Summary statistics

In [None]:
if EDA:
    # Number of whitespace-separated words per cell; missing values count as 0 words
    df_stats = df.apply(lambda x: x.fillna('').str.split().apply(len))
    summary_stats = df_stats.describe()
    # Per column: how many cells contain no words, and how many contain at least one
    summary_stats.loc['empty'] = (df_stats == 0).sum()
    summary_stats.loc['not_empty'] = summary_stats.loc['count'] - summary_stats.loc['empty']
    # Sort the columns by the 'not_empty' row in descending order
    df_stats = df_stats[summary_stats.loc['not_empty'].sort_values(ascending=False).index]
    display(summary_stats)

    # Create a box and whisker plot of the word counts, one box per column
    plt.boxplot(df_stats,
                flierprops={'marker': '.'},
                notch = True)
    # Set positions, labels and rotation in a single call so the rotation is
    # applied to exactly the labels that are shown
    plt.xticks(range(1, len(df_stats.columns) + 1), df_stats.columns, rotation=45)
    plt.show()


## Unique words and occurrences

In [None]:
# All columns
if EDA:
    words_dict = {}

    # Walk the values of each column directly. The previous nested loop ran
    # df.iterrows() over the whole frame once per column, building a full row
    # object for every cell it visited.
    for column in df:
        for text in df[column]:
            words_dict = ipw.text.add_word_count_to_dict(text, words_dict)

    total_words = sum(words_dict.values())
    print(f'Total words: {total_words}')
    print(f'Unique words: {len(words_dict)}')
    print(f'Average occurance: {total_words / len(words_dict):.2f}')
    
    ipw.plots.wordcloud(words_dict).show()
    ipw.plots.histogram_wordfreq(words_dict).show()

In [None]:
# Description column
if EDA:
    # Accumulate word counts over every description text
    words_dict = {}
    for text in df['description']:
        words_dict = ipw.text.add_word_count_to_dict(text, words_dict)

    total_words = sum(words_dict.values())
    unique_words = len(words_dict)
    print(f'Total words: {total_words}')
    print(f'Unique words: {unique_words}')
    print(f'Average occurance: {total_words / unique_words:.2f}')

    ipw.plots.wordcloud(words_dict).show()
    ipw.plots.histogram_wordfreq(words_dict).show()

# Transform the data

## Nouns in description column

In [None]:
# Model input: the filtered description text next to the original description
df_model_1 = (
    df['description']
    .apply(ipw.text.filter)
    .to_frame('text')
    .assign(description=df['description'])
)

# Word counts over the filtered texts
obs_dict = {}
for text in df_model_1['text']:
    obs_dict = ipw.text.add_word_count_to_dict(text, obs_dict)

total_words = sum(obs_dict.values())
print(f'Total words for model: {total_words}')
print(f'Unique words: {len(obs_dict)}')
print(f'Average occurance: {total_words / len(obs_dict):.2f}')

In [None]:
# Word cloud (second argument passed as an empty string) and word-frequency
# histogram for the vocabulary that goes into the model.
ipw.plots.wordcloud(obs_dict, '').show()
ipw.plots.histogram_wordfreq(obs_dict).show()

## spaCy vector

In [None]:
import spacy
import nl_core_news_lg

In [None]:
# Load the Dutch spaCy pipeline nl_core_news_lg; its document vectors are used
# below to embed the filtered texts.
nlp = nl_core_news_lg.load()

In [None]:
# Embed every filtered description with spaCy: one row per record, one column
# (V0, V1, ...) per vector component.
vector_names = [f"V{i}" for i in range(nlp.vocab.vectors_length)]

# One document vector per text, in the row order of df_model_1
doc_vectors = [nlp(text).vector for text in df_model_1['text']]

# The reshape keeps the matrix two-dimensional even when there are no rows
vector_matrix = np.asarray(doc_vectors, dtype=float).reshape(len(doc_vectors), len(vector_names))

# Build the frame row-wise and attach the index afterwards, so every vector
# stays with its own record. Writing single values with `series[i] = value`
# used the positional counter i as an index *label*, which addresses the wrong
# row (or appends a new one) once the index is no longer 0..n-1, e.g. after
# filtering on status.
df_vector_1 = pd.DataFrame(vector_matrix, index=df_model_1.index, columns=vector_names)

## Remove samples that have empty vectors

In [None]:
# Euclidean norm of every document vector; a norm of zero means the vector is all zeros
norms = df_vector_1.apply(np.linalg.norm, axis=1)

print(f'Number of samples before selection: {len(df)}')
# Filter out the rows where the norm is zero
df_vector_1 = df_vector_1[norms != 0]

print(f'Number of samples for clustering: {len(df_vector_1)}')

# Apply the same selection to the text frame. The explicit copy makes
# df_model_1 an independent frame, so the label columns assigned to it in the
# model cells do not trigger SettingWithCopyWarning.
df_model_1 = df_model_1[df_model_1.index.isin(df_vector_1.index)].copy()

## Correlation

In [None]:
import seaborn as sns

In [None]:
if EDA:
    plt.figure(figsize=(10,10))
    corr_matrix = df_vector_1.corr().to_numpy()
    # Keep only the upper triangle of the correlation matrix for the heatmap
    correlation = np.triu(corr_matrix, k=1)

    sns.heatmap(abs(correlation), center = 0, cmap="RdBu", vmax = 1.0, vmin = 0.0)

    # Summary statistics over each distinct pair of components exactly once.
    # Taking them from the triu matrix would also include the zeros that
    # replace the lower triangle and the diagonal, which pulls the mean towards
    # 0 and can make the minimum read 0.
    pair_corr = corr_matrix[np.triu_indices_from(corr_matrix, k=1)]
    print(f'Max correlation: {pair_corr.max() :.3f}')
    print(f'Min correlation: {pair_corr.min() :.3f}')
    print(f'Mean absolute correlation: {(abs(pair_corr)).mean():.3f}')

# Models

## Set Random State

In [None]:
# Fixed seed passed to the MDS embeddings and clustering calls below so that
# results are repeatable between runs.
RANDOM_STATE = 42

## Cosine Distance matrix

In [None]:
# Pairwise distances between the document vectors (cosine distance according to
# the section heading; the implementation lives in ipw.models). Passed to the
# models below as a precomputed dissimilarity matrix.
dist_matrix = ipw.models.distance_matrix(df_vector_1)

In [None]:
from sklearn.manifold import MDS

# Colour map shared by the MDS scatter plots
cmap = 'viridis'  # standard
# cmap = 'hot'  # oranges and reds

# Embed the precomputed distances in four dimensions:
# three are drawn as spatial axes, the fourth as colour.
mds = MDS(
    n_components=4,
    dissimilarity='precomputed',
    normalized_stress='auto',
    random_state=RANDOM_STATE,
)
coords = mds.fit_transform(dist_matrix)

# 3D scatterplot of the embedded coordinates
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(*coords[:, :3].T, c=coords[:, 3], cmap=cmap)
plt.show()

In [None]:
# Embed the precomputed distances in three dimensions:
# two are drawn as plot axes, the third as colour.
mds = MDS(
    n_components=3,
    dissimilarity='precomputed',
    normalized_stress='auto',
    random_state=RANDOM_STATE,
)
coords = mds.fit_transform(dist_matrix)

# 2D scatterplot of the embedded coordinates
fig = plt.figure()
plt.scatter(*coords[:, :2].T, c=coords[:, 2], cmap=cmap)
plt.show()

## Affinity Propagation

In [None]:
# Affinity Propagation on the precomputed distance matrix, trying damping
# values from 0.5 up to (but excluding) 1.0 in steps of 0.05.
model = ipw.enums.Model.AFFINITY_PROPAGATION
dampings = np.arange(0.5, 1.0, 0.05) 
# NOTE(review): presumably af_labels are the labels of the best-scoring damping — confirm in ipw.models.
af_results, af_labels, af_centers = ipw.models.affinity_propagation(dist_matrix, dampings, random_state = RANDOM_STATE)
# Store the labels in this model's round-1 column of the text frame
df_model_1[model.col(1)] = af_labels
ipw.models.output(dist_matrix, af_results, af_labels)

In [None]:
# Optional diagnostics for the Affinity Propagation labels: label bar chart,
# silhouette plot and word clouds for round 1.
if SHOW_MODEL_PLOTS:
    ipw.plots.bar_labels(af_labels)
    ipw.plots.silhouette(dist_matrix, af_labels)
    ipw.plots.model_wordclouds(df_model_1, model, 1)

## Agglomerative Clustering

In [None]:
# Agglomerative Clustering on the precomputed distance matrix, trying 2 to 19 clusters.
model = ipw.enums.Model.AGGLOMERATIVE_CLUSTERING
n_clusters = range(2, 20)
# NOTE(review): presumably ac_labels are the labels of the best-scoring cluster count — confirm in ipw.models.
ac_results, ac_labels = ipw.models.agglomerative_clustering(dist_matrix, n_clusters, RANDOM_STATE)
# Store the labels in this model's round-1 column of the text frame
df_model_1[model.col(1)] = ac_labels
ipw.models.output(dist_matrix, ac_results, ac_labels)

In [None]:
# Optional diagnostics for the Agglomerative Clustering labels: label bar chart,
# silhouette plot and word clouds for round 1.
if SHOW_MODEL_PLOTS:
    ipw.plots.bar_labels(ac_labels)
    ipw.plots.silhouette(dist_matrix, ac_labels)
    ipw.plots.model_wordclouds(df_model_1, model, 1)

## DBSCAN

In [None]:
# DBSCAN on the precomputed distance matrix over a grid of eps values
# (0.01 up to, but excluding, 0.2 in steps of 0.01) and min_samples 2 to 9.
model = ipw.enums.Model.DBSCAN
eps_arr = np.arange(0.01, 0.2, 0.01) 
min_samples = range(2, 10)
# NOTE(review): presumably db_labels are the labels of the best-scoring parameter pair — confirm in ipw.models.
db_results, db_labels = ipw.models.dbscan(dist_matrix, eps_arr, min_samples)
# Store the labels in this model's round-1 column of the text frame
df_model_1[model.col(1)] = db_labels
ipw.models.output(dist_matrix, db_results, db_labels)

In [None]:
# Optional diagnostics for the DBSCAN labels: label bar chart, silhouette plot
# and word clouds for round 1.
if SHOW_MODEL_PLOTS:
    ipw.plots.bar_labels(db_labels)
    ipw.plots.silhouette(dist_matrix, db_labels)
    ipw.plots.model_wordclouds(df_model_1, model, 1)

## Spectral Clustering

In [None]:
# Spectral Clustering on the precomputed distance matrix, trying 2 to 9 clusters.
model = ipw.enums.Model.SPECTRAL_CLUSTERING
n_clusters = range(2, 10)
# NOTE(review): presumably sc_labels are the labels of the best-scoring cluster count — confirm in ipw.models.
sc_results, sc_labels = ipw.models.spectral_clustering(dist_matrix, n_clusters, RANDOM_STATE)
# Store the labels in this model's round-1 column; the second-round cells read this column
df_model_1[model.col(1)] = sc_labels
ipw.models.output(dist_matrix, sc_results, sc_labels)

In [None]:
# Optional diagnostics for the Spectral Clustering labels: label bar chart,
# silhouette plot and word clouds for round 1.
if SHOW_MODEL_PLOTS:
    ipw.plots.bar_labels(sc_labels)
    ipw.plots.silhouette(dist_matrix, sc_labels)
    ipw.plots.model_wordclouds(df_model_1, model, 1)

In [None]:
# Print a summary (second argument 5) for the descriptions of labels 0 and 1,
# separated by a divider line
descriptions = ipw.text.description_per_label(df_model_1, model, 1)
for position, label in enumerate((0, 1)):
    if position > 0:
        print('-----')
    print(ipw.text.summary(descriptions[label], 5))

# Second Round on column description with Spectral Clustering

In [None]:
from sklearn import metrics

In [None]:
# Second-round settings: the round-1 Spectral Clustering labels are refined,
# again trying 2 to 9 clusters per sub-cluster.
model1 = ipw.enums.Model.SPECTRAL_CLUSTERING
n_clusters = range(2, 10)

In [None]:
# Second round, part a: re-cluster the records of first-round cluster 0
df_model_2a = df_model_1[df_model_1[model1.col(1)] == 0].copy()
df_vector_2a = df_vector_1[df_vector_1.index.isin(df_model_2a.index)]
# Use the same ipw.models.distance_matrix helper as the first-round distance matrix
dist_matrix_2a = ipw.models.distance_matrix(df_vector_2a)

sc_results2a, sc_labels2a = ipw.models.spectral_clustering(dist_matrix_2a, n_clusters, RANDOM_STATE)
# Shift the labels by one, leaving 0 free for records that are not re-clustered
df_model_2a[model1.col(2)] = sc_labels2a + 1

In [None]:
# Second round, part b: re-cluster the records of first-round cluster 1
df_model_2b = df_model_1[df_model_1[model1.col(1)] == 1].copy()
df_vector_2b = df_vector_1[df_vector_1.index.isin(df_model_2b.index)]
# Use the same ipw.models.distance_matrix helper as the first-round distance matrix
dist_matrix_2b = ipw.models.distance_matrix(df_vector_2b)

sc_results2b, sc_labels2b = ipw.models.spectral_clustering(dist_matrix_2b, n_clusters, RANDOM_STATE)
# Continue numbering directly after the highest (shifted) label of part a
df_model_2b[model1.col(2)] = sc_labels2b + 2 + sc_labels2a.max()

In [None]:
# Combine the second-round labels of both parts and attach them by index
merged = pd.concat([df_model_2a, df_model_2b], axis = 0)[[model1.col(2)]]
df_model_2 = df_model_1.join(merged, how = 'left')
# Records that were not re-clustered have no second-round label (NaN after the
# left join) and are collected in cluster 0. A comparison `== None` never
# matches NaN, and assigning through a boolean row mask would overwrite every
# column of the matching rows, so only the label column is filled here.
df_model_2[model1.col(2)] = df_model_2[model1.col(2)].fillna(0).astype(int)
labels_2 = df_model_2[model1.col(2)]

print('Cluster size per cluster')
for i in range(labels_2.min(), labels_2.max() + 1):
    print(f'Cluster {i}: {(labels_2 == i).sum()}')
print('----')
sc = metrics.silhouette_score(dist_matrix, labels_2, metric="precomputed")
# NOTE(review): scikit-learn documents the next two scores as taking a feature
# matrix, with no precomputed-distance option; they receive the distance matrix
# here — confirm this is intended.
vrc = metrics.calinski_harabasz_score(dist_matrix, labels_2)
dbi = metrics.davies_bouldin_score(dist_matrix, labels_2)

print(f'Number of clusters: {labels_2.max() - labels_2.min() + 1}')
print(f'Silhouette Coefficient: {sc:0.3f}')
print(f'Calinski-Harabasz Index / Variance Ratio Criterion: {vrc:0.3f}')
print(f'Davies-Bouldin Index: {dbi:0.3f}')

In [None]:
# Optional diagnostics for the combined second-round labels, evaluated against
# the description-based distance matrix.
if SHOW_MODEL_PLOTS:
    ipw.plots.bar_labels(labels_2)
    ipw.plots.silhouette(dist_matrix, labels_2)

# Second Round on column solution with Spectral Clustering

In [None]:
# Filtered solution text next to the original solution, restricted to the
# records that are still present in df_model_1
df_model_3 = (
    df['solution']
    .apply(ipw.text.filter)
    .to_frame('text')
    .assign(solution=df['solution'])
    .loc[lambda frame: frame.index.isin(df_model_1.index)]
    .copy()
)

In [None]:
# Embed every filtered solution text with spaCy: one row per record, one column
# (V0, V1, ...) per vector component.
vector_names = [f"V{i}" for i in range(nlp.vocab.vectors_length)]

# One document vector per text, in the row order of df_model_3
doc_vectors = [nlp(text).vector for text in df_model_3['text']]

# The reshape keeps the matrix two-dimensional even when there are no rows
vector_matrix = np.asarray(doc_vectors, dtype=float).reshape(len(doc_vectors), len(vector_names))

# Build the frame row-wise and attach the index afterwards, so every vector
# stays with its own record. Writing single values with `series[i] = value`
# used the positional counter i as an index *label*; df_model_3's index has
# gaps wherever records were removed earlier, so values would land on the wrong
# rows or be appended as new ones.
df_vector_3 = pd.DataFrame(vector_matrix, index=df_model_3.index, columns=vector_names)

In [None]:
# Euclidean norm of every solution vector; a norm of zero means the vector is all zeros
norms = df_vector_3.apply(np.linalg.norm, axis=1)

# Report the size of the frame that is actually being filtered, not of the
# full record set in df
print(f'Number of samples before selection: {len(df_vector_3)}')
# Filter out the rows where the norm is zero
df_vector_3 = df_vector_3[norms != 0]

print(f'Number of samples for clustering: {len(df_vector_3)}')

# From here on df_model_3 is a copy of df_model_1, which carries the
# first-round label columns the following cells read. No rows are removed
# here; the cells below select rows via df_vector_3's index.
df_model_3 = df_model_1.copy()

In [None]:
# Solution-based round, part a: records of df_model_2a that have a non-zero solution vector
df_vector_3a = df_vector_3[df_vector_3.index.isin(df_model_2a.index)].copy()
df_model_3a = df_model_3[df_model_3.index.isin(df_vector_3a.index)].copy()
# Use the same ipw.models.distance_matrix helper as the other distance matrices
dist_matrix_3a = ipw.models.distance_matrix(df_vector_3a)

sc_results3a, sc_labels3a = ipw.models.spectral_clustering(dist_matrix_3a, n_clusters, RANDOM_STATE)
# Shift the labels by two before storing them in the round-3 column
df_model_3a[model1.col(3)] = (sc_labels3a + 2)
ipw.models.output(dist_matrix_3a, sc_results3a, sc_labels3a)

In [None]:
# Solution-based round, part b: records of df_model_2b that have a non-zero solution vector
df_vector_3b = df_vector_3[df_vector_3.index.isin(df_model_2b.index)].copy()
df_model_3b = df_model_3[df_model_3.index.isin(df_vector_3b.index)].copy()
# Use the same ipw.models.distance_matrix helper as the other distance matrices
dist_matrix_3b = ipw.models.distance_matrix(df_vector_3b)

sc_results3b, sc_labels3b = ipw.models.spectral_clustering(dist_matrix_3b, n_clusters, RANDOM_STATE)
# Continue numbering directly after the highest (shifted) label of part a
df_model_3b[model1.col(3)] = (sc_labels3a.max() + 3 + sc_labels3b)
ipw.models.output(dist_matrix_3b, sc_results3b, sc_labels3b)

## Output with distance matrix based on description column

In [None]:
# Stack the round-3 labels of both parts and attach them by index
merged = pd.concat([df_model_3a, df_model_3b], axis=0).loc[:, [model1.col(3)]]
df_model_3 = df_model_3.join(merged, how='left')

# Records without a round-3 label fall back to their first-round label.
# NOTE(review): first-round labels of 2 or higher would share values with the
# round-3 labels, which appear to start at 2 — confirm this cannot occur.
filled_labels = df_model_3[model1.col(3)].fillna(df_model_3[model1.col(1)])
df_model_3[model1.col(3)] = filled_labels.astype(int)
labels_3 = df_model_3[model1.col(3)]

In [None]:
# Same assignment as the last line of the previous cell; kept so this cell can be run on its own.
labels_3 = df_model_3[model1.col(3)]

In [None]:
# Cluster sizes for labels 0..n, evaluated against the description-based distance matrix
n = int(labels_3.max())
print('Cluster size per cluster')
for i in range(n + 1):
    print(f'Cluster {i}: {(labels_3 == i).sum()}')
print('----')

sc = metrics.silhouette_score(dist_matrix, labels_3, metric="precomputed")
vrc = metrics.calinski_harabasz_score(dist_matrix, labels_3)
dbi = metrics.davies_bouldin_score(dist_matrix, labels_3)

print(f'Number of clusters: {labels_3.max() - labels_3.min() + 1}')
for caption, value in (
    ('Silhouette Coefficient', sc),
    ('Calinski-Harabasz Index / Variance Ratio Criterion', vrc),
    ('Davies-Bouldin Index', dbi),
):
    print(f'{caption}: {value:0.3f}')

In [None]:
# Optional diagnostics for the round-3 labels, evaluated against the
# description-based distance matrix.
if SHOW_MODEL_PLOTS:
    ipw.plots.bar_labels(labels_3)
    ipw.plots.silhouette(dist_matrix, labels_3)

## Output with distance matrix based on solution column

In [None]:
# Distance matrix over the solution vectors, plus the round-3 labels of exactly
# those records that have a solution vector
distance_sol = ipw.models.distance_matrix(df_vector_3)
has_solution_vector = df_model_3.index.isin(df_vector_3.index)
df_output = df_model_3.loc[has_solution_vector].copy()
labels_4 = df_output[model1.col(3)]

In [None]:
# Cluster sizes and quality scores for labels_4, evaluated against the
# solution-based distance matrix
n = int(labels_4.max())
print('Cluster size per cluster')
for i in range(n+1):
    print(f'Cluster {i}: {sum(labels_4 == i)}')
print('----')
sc = metrics.silhouette_score(distance_sol, labels_4, metric="precomputed")
vrc = metrics.calinski_harabasz_score(distance_sol, labels_4)
dbi = metrics.davies_bouldin_score(distance_sol, labels_4)

# Count the clusters of the labels evaluated in this cell (labels_4); the
# previous version reported the count of labels_3 here
print(f'Number of clusters: {labels_4.max() - labels_4.min() + 1}')
print(f'Silhouette Coefficient: {sc:0.3f}')
print(f'Calinski-Harabasz Index / Variance Ratio Criterion: {vrc:0.3f}')
print(f'Davies-Bouldin Index: {dbi:0.3f}')

In [None]:
# Optional diagnostics for labels_4, evaluated against the solution-based
# distance matrix.
if SHOW_MODEL_PLOTS:
    ipw.plots.bar_labels(labels_4)
    ipw.plots.silhouette(distance_sol, labels_4)

In [None]:
from collections import Counter

import matplotlib.pyplot as plt


def histogram_labels(labels):
    """Show a bar chart with the number of samples per cluster label.

    Args:
        labels: Iterable of hashable, mutually sortable cluster labels,
            one entry per sample.
    """
    # Count the samples per label and order the bars by label
    sorted_counts = sorted(Counter(labels).items())
    names = [f'cluster {label}' for label, _ in sorted_counts]
    sizes = [count for _, count in sorted_counts]

    plt.bar(names, sizes)

    # The x axis lists the clusters and the bar height is the cluster size;
    # the captions describe that layout
    plt.xlabel("Cluster")
    plt.ylabel("Cluster size")
    plt.title("Cluster size per cluster")
    plt.show()

