# Preamble

In [None]:
# Locations of the project's Python sources and of the input data.
# NOTE(review): both are absolute, machine-specific paths; adjust them before
# running this notebook on another machine.
CODE_PATH = r'C:/Git/HonoursProject/ipw-classifier/ipw_classifier/src'
DATA_PATH = r'C:/Git/HonoursProject/ipw-classifier/data/'

In [None]:
# add source python files to project
# CODE_PATH is inserted at the front of sys.path, so modules in that directory
# take precedence when imported later in the notebook.
import sys
sys.path.insert(0, CODE_PATH) 

In [None]:
import io
import logging
import numpy as np
import pandas as pd
import re
import math
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import nl_core_news_lg
from typing import Dict, List
from sklearn.model_selection import train_test_split, KFold
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering, SpectralClustering
from sklearn import metrics
from wordcloud import WordCloud
import matplotlib.cm as cm
from matplotlib.ticker import MaxNLocator  
from spacy.lang.nl.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [None]:
# Write all log records (DEBUG and above) to 'log.txt', each prefixed with a
# timestamp and the level name.
logging.basicConfig(filename = 'log.txt', level = logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')

# Load the data

In [None]:
# parse_input is not part of this notebook; presumably it is a project module
# found through the CODE_PATH entry added to sys.path.
import parse_input
# Load the records from DATA_PATH; the result is used as a pandas DataFrame below.
df = parse_input.load(DATA_PATH)

## Nr of records

In [None]:
# Report the total number of records and the number per 'status' value;
# dropna=False makes records without a status show up as their own group.
print(f'Total records: {len(df)}')
print('Records per status:')
print(df['status'].value_counts(dropna = False))

In [None]:
# Select closed records and drop the now-constant 'status' column.
# The filter and the drop are chained so that `df` becomes a fresh DataFrame
# instead of a filtered slice that is modified in place; modifying such a slice
# in place is what triggers pandas' SettingWithCopyWarning.
df = df[df['status'] == 'closed'].drop(columns='status')

## Clean the text fields

In [None]:
# Clean the text fields with the project helper (implementation not shown here).
df = parse_input.clean(df)

# Summary statistics

In [None]:
# Number of whitespace-separated words in every field of every record;
# missing values are treated as empty strings and therefore count as 0 words.
df_stats = df.apply(lambda field: field.fillna('').str.split().apply(len))
summary_stats = df_stats.describe()
# Extend the describe() table with the number of empty and non-empty fields
for col in df_stats.columns:
    n_empty = df_stats[col].value_counts(sort = False).get(0, 0)
    summary_stats.loc['empty', col] = n_empty
    summary_stats.loc['not_empty', col] = summary_stats.loc['count', col] - n_empty
display(summary_stats)

In [None]:
# Create a box and whisker plot
# One box per column of df_stats (the word counts per text field).
plt.boxplot(df_stats)
plt.xticks(rotation=45)
# Boxes are drawn at positions 1..n, so label those positions with the column names
plt.xticks(range(1, len(df_stats.columns) + 1), df_stats.columns)
plt.show()

## Unique words and occurrences

In [None]:
def word_count(string:str, returnvalue: Dict[str, int]) -> Dict[str, int]:
    """Add the word frequencies of ``string`` to ``returnvalue``.

    The string is split on whitespace. Every word is lower-cased and stripped
    of all characters other than a-z before it is counted. Words that contain
    no such letters at all (numbers, stand-alone punctuation) are skipped, so
    they are not counted under the empty-string key.

    Args:
        string: the text to count words in.
        returnvalue: running word -> count mapping; it is updated in place.

    Returns:
        The same ``returnvalue`` object, with the counts of ``string`` added.
    """
    for word in string.split():
        key = re.sub(r'[^a-z]', '', word.lower())
        if not key:
            # nothing left after stripping: not a word
            continue
        returnvalue[key] = returnvalue.get(key, 0) + 1
    return returnvalue

In [None]:
def histogram(dict):
    """Plot a log-log histogram of the values of a word-frequency mapping.

    The bin edges are equally spaced in log space. The bin width follows the
    Freedman-Diaconis rule (2 * IQR / n^(1/3)) applied to the logarithm of the
    values.

    Args:
        dict: mapping of word -> occurrence count; counts must be positive
            because their logarithm is taken. (The parameter name shadows the
            builtin inside this function; it is kept for compatibility with
            existing callers.)

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can chain ``.show()``.
    """
    data = list(dict.values())

    # Without data there is nothing to bin; only the labelled axes are drawn.
    if data:
        log_data = np.log(data)
        low, high = log_data.min(), log_data.max()

        q75, q25 = np.percentile(log_data, [75, 25])
        bin_width = 2 * (q75 - q25) / len(log_data) ** (1 / 3)

        if bin_width > 0:
            # np.arange excludes its stop value, so go one bin width past the
            # maximum; otherwise the last edge lies below the maximum and the
            # most frequent words fall outside the histogram.
            log_bins = np.arange(low, high + bin_width, bin_width)
        else:
            # An IQR of zero would give a zero step; fall back to Sturges' rule.
            log_bins = np.histogram_bin_edges(log_data, bins='sturges')

        bins = np.exp(log_bins)
        # Guard against rounding in exp(log(x)) pushing the extremes just
        # outside the outer edges.
        bins[0] = min(bins[0], min(data))
        bins[-1] = max(bins[-1], max(data))

        plt.hist(data, bins=bins, align='left', color='blue', alpha=1)

    # add labels and title to the chart
    plt.xlabel("Frequency (log scale)")
    plt.ylabel("Occurrences (log scale)")
    plt.title("Word Frequency Histogram")

    # set the axis to a logarithmic scale
    plt.xscale('log')
    plt.yscale('log')

    return plt

In [None]:
def wordcloud_from_dict(dict: Dict[str, int], w:int = 12, h:int = 8)-> WordCloud:
    """Build a word cloud from a word -> frequency mapping.

    Args:
        dict: word frequencies; an empty mapping is replaced by the single
            placeholder entry "NO WORDS".
        w: width; multiplied by 100 to get the ``width`` passed to WordCloud.
        h: height; multiplied by 100 to get the ``height`` passed to WordCloud.

    Returns:
        The result of ``WordCloud.generate_from_frequencies``.
    """
    frequencies = dict if dict else {"NO WORDS": 1}

    cloud = WordCloud(
        width=w * 100,
        height=h * 100,
        background_color="white",
        prefer_horizontal=0.8,
        min_font_size=10,
        max_font_size=400,
    )
    return cloud.generate_from_frequencies(frequencies)

In [None]:
def plot_wordcloud_from_dict(dict: Dict[str, int], w:int = 12, h:int = 8)-> plt:
    """Draw the word cloud of a word -> frequency mapping on a new figure.

    Args:
        dict: word frequencies, passed on to ``wordcloud_from_dict``.
        w: figure width; also passed on to ``wordcloud_from_dict``.
        h: figure height; also passed on to ``wordcloud_from_dict``.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can chain ``.show()``.
    """
    cloud_image = wordcloud_from_dict(dict, w, h)

    # New figure of w x h, showing only the image (axes hidden)
    plt.figure(figsize=(w, h))
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.axis("off")
    return plt

In [None]:
# Count the words over all columns of the cleaned records
words_dict = {}

for column in df:
    # Iterate over the column values directly instead of calling
    # df.iterrows() once per column, which rebuilt every row for every column.
    for text in df[column]:
        words_dict = word_count(text, words_dict)

total_words = sum(words_dict.values())
print(f'Total words: {total_words}')
print(f'Unique words: {len(words_dict)}')
# An empty vocabulary has no average; skip it rather than divide by zero
if words_dict:
    print(f'Average occurance: {total_words / len(words_dict):.2f}')

In [None]:
# Word cloud (12 x 8 figure) and log-log frequency histogram of all counted words
plot_wordcloud_from_dict(words_dict, 12, 8).show()
histogram(words_dict).show()

# Spacy

In [None]:
# load natural language model for dutch
# `nlp` is used as a module-level pipeline by the functions defined below.
nlp = nl_core_news_lg.load()

# Text summary
Source: https://www.kaggle.com/code/itsmohammadshahid/nlp-text-summarizer-using-spacy

In [None]:
def textSummarizer(text, sentences):
    """Return an extractive summary of ``text``.

    Every token that is neither a stop word nor punctuation gets a score: its
    case-insensitive frequency divided by the highest frequency in the text.
    A sentence scores the sum of the scores of its tokens. The ``sentences``
    highest-scoring sentences are joined with spaces, best sentence first.

    Args:
        text: the text to summarise; it is parsed with the module-level
            ``nlp`` pipeline.
        sentences: maximum number of sentences in the summary.

    Returns:
        The summary as a single string, or '' when the text contains no
        token that can be scored.
    """
    doc = nlp(text)

    # Frequency table keyed on the lower-cased token text. The lookup while
    # scoring sentences is lower-cased as well, so the keys must be too;
    # otherwise capitalised words would never contribute to a sentence score.
    # Dict literals are used instead of dict() because other cells of the
    # notebook rebind the name `dict`.
    freq_of_word = {}
    for token in doc:
        word = token.text.lower()
        if word in STOP_WORDS or word in punctuation:
            continue
        freq_of_word[word] = freq_of_word.get(word, 0) + 1

    if not freq_of_word:
        return ''

    # Normalise the frequencies so the most frequent word scores 1
    max_freq = max(freq_of_word.values())
    for word in freq_of_word:
        freq_of_word[word] /= max_freq

    # Score each sentence; sentences without any scored token get no entry
    sent_scores = {}
    for sent in doc.sents:
        for token in sent:
            score = freq_of_word.get(token.text.lower())
            if score is not None:
                sent_scores[sent] = sent_scores.get(sent, 0) + score

    # The keys of sent_scores are spaCy spans; pick the best-scoring ones
    best_sentences = nlargest(n = sentences, iterable = sent_scores, key = sent_scores.get)

    return " ".join(sent.text for sent in best_sentences)

In [None]:
# Summarise every description in at most 5 sentences
df['summary'] = df['description'].apply(textSummarizer, sentences = 5)

## Transform the data

In [None]:
def filter_tokens(string: str) -> str:
    """Reduce ``string`` to its nouns of more than two characters.

    The text is parsed with the module-level ``nlp`` pipeline. Tokens tagged
    as NOUN whose text is longer than two characters are kept, in their
    original order, and joined with single spaces.
    """
    noun_texts = (token.text for token in nlp(string) if token.pos_ == 'NOUN')
    return ' '.join(text for text in noun_texts if len(text) > 2)

In [None]:
# Alternative kept for reference: build 'obs' from the summary instead of the
# full description.
#df['obs'] = df['summary'].apply(filter_tokens)
# 'obs' holds the filtered nouns of each description
df['obs'] = df['description'].apply(filter_tokens)

In [None]:
# Count the nouns over the 'obs' column
obs_dict = {}

# Iterate over the column values directly; df.iterrows() would build a full
# row Series per record just to read one field.
for text in df['obs']:
    obs_dict = word_count(text, obs_dict)

total_words = sum(obs_dict.values())
print(f'Total nouns: {total_words}')
print(f'Unique nouns: {len(obs_dict)}')
# Without any nouns there is no average; skip it rather than divide by zero
if obs_dict:
    print(f'Average occurance: {total_words / len(obs_dict):.2f}')


In [None]:
# Word cloud (12 x 8 figure) and log-log frequency histogram of the nouns
plot_wordcloud_from_dict(obs_dict, 12, 8).show()
histogram(obs_dict).show()

# Transform the column we want to consider to a SpaCy vector

In [None]:
# Build a DataFrame with one row per record and one column per dimension of
# the spaCy document vector of its 'obs' text.
vector = nlp('tekst for vector').vector # create an arbitrary vector to be certain that we have the correct length
vector_names = [f"V{i}" for i in range(len(vector))]

# Compute all document vectors first and create the frame in one go, reusing
# the index of df. Filling pre-allocated Series element by element with the
# enumerate() position used that position as an index *label*, which only
# matches when df has a 0..n-1 index (not guaranteed after filtering rows).
doc_vectors = [nlp(text).vector for text in df['obs']]
df_vec = pd.DataFrame(
    # the reshape keeps the (rows, dimensions) shape valid for an empty df
    np.asarray(doc_vectors, dtype=float).reshape(len(doc_vectors), len(vector_names)),
    index=df.index,
    columns=vector_names,
)

## Remove samples that have empty vectors

In [None]:
# Euclidean length of every row vector; a length of 0 means the vector is all zeros
norms = df_vec.apply(np.linalg.norm, axis=1)
has_vector = norms != 0

print(f'Number of samples before selection: {len(df)}')
# Keep only the rows with a non-zero vector
df_vec = df_vec[has_vector]

print(f'Number of samples for clustering: {len(df_vec)}')

## Set Random State

In [None]:
# Seed passed as random_state to the clustering estimators below.
# The first assignment is immediately overridden by the second; presumably it
# is kept as a quick toggle between unseeded (None) and seeded (42) runs.
RANDOM_STATE = None
RANDOM_STATE = 42

## Correlation

In [None]:
# Lets explore the correlations in our data set
plt.figure(figsize=(10,10))
# The correlation measure used here is Pearson's correlation (the pandas default).
corr_matrix = df_vec.corr().to_numpy()
# Keep only the upper triangle of the correlation matrix; the diagonal and the
# lower triangle are set to 0 so every pair is drawn once.
correlation = np.triu(corr_matrix, k=1)

# The absolute values are plotted on a 0..1 scale; with this colour map the
# stronger correlations should show up as the more saturated squares.
sns.heatmap(abs(correlation), center = 0, cmap="RdBu", vmax = 1.0, vmin = 0.0)

# Compute the statistics over the distinct pairs only. Taking them over the
# triangular matrix would include the zeroed-out diagonal and lower triangle,
# which clamps max/min at 0 and roughly halves the mean.
pair_corr = corr_matrix[np.triu_indices_from(corr_matrix, k=1)]

# Print the maximum and minimum correlations
max_corr = pair_corr.max()
min_corr = pair_corr.min()
avg_corr = abs(pair_corr).mean()
print(f'Max correlation: {max_corr:.3f}')
print(f'Min correlation: {min_corr:.3f}')
print(f'Mean absolute correlation: {avg_corr:.3f}')

# Affinity Propagation

## Distance matrix

In [None]:
# Pairwise cosine distance (1 - cosine similarity) between all sample vectors
n = len(df_vec)

vectors = df_vec.to_numpy(dtype=float)
# Scale every row to unit length so that the dot product of two rows is their
# cosine similarity; this replaces a Python loop over all pairs.
unit_vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

similarity = unit_vectors @ unit_vectors.T
# Average with the transpose so the matrix is exactly symmetric
similarity = (similarity + similarity.T) / 2
similarity = np.clip(similarity, -1.0, 1.0) # to remove corner cases from rounding
# to create a distance that is 0 for equal cases (similarity = 1) and 2 for cases that are very far apart
dist_matrix = 1 - similarity
# The distance of a sample to itself is exactly 0
np.fill_diagonal(dist_matrix, 0.0)

## Hyperparameter space

In [None]:
# Dataframe for hyperparameter damping and results
# The candidate damping values run from 0.5 up to (not including) 1.0 in steps
# of `stepsize` and form the index; the result columns start out empty.
stepsize = 0.025

damping = np.arange(0.5, 1.0, stepsize)
af_results = pd.DataFrame(index=damping, columns=['Clusters', 'SC', 'VRC', 'DBI'])  

In [None]:
# Fit affinity propagation for every candidate damping factor, record the
# cluster-quality metrics and keep the labels of the run with the highest VRC.
best_damping = 0.5
best_VRC = 0
best_aflabels = None  # stays None when no run yields more than one cluster
n_clusters = 0

# With affinity='precomputed' the estimator expects *similarities* (larger =
# more alike), so the distances are negated before fitting.
af_similarity = -dist_matrix

for index in af_results.index:
    af = AffinityPropagation(
        damping = index,
        max_iter = 500,
        affinity = 'precomputed',
        verbose = False,
        random_state = RANDOM_STATE).fit(af_similarity)

    n_clust = len(af.cluster_centers_indices_)
    # Assign with a single .loc[row, column] call; the chained form
    # .loc[row][column] = ... may write to a temporary copy.
    af_results.loc[index, 'Clusters'] = n_clust
    if n_clusters == 0:
        n_clusters = n_clust

    # The metrics are only defined for two or more clusters
    if n_clust > 1:
        # NOTE(review): calinski_harabasz_score and davies_bouldin_score expect
        # a feature matrix; here the rows of the distance matrix are used as
        # features. Confirm this is intended.
        VRC = metrics.calinski_harabasz_score(dist_matrix, af.labels_)
        if VRC > best_VRC:
            best_damping = index
            best_VRC = VRC
            best_aflabels = af.labels_
            centers = af.cluster_centers_indices_
            n_clusters = len(centers)

        af_results.loc[index, 'SC'] = metrics.silhouette_score(dist_matrix, af.labels_, metric="precomputed")
        af_results.loc[index, 'VRC'] = VRC
        af_results.loc[index, 'DBI'] = metrics.davies_bouldin_score(dist_matrix, af.labels_)

In [None]:
# Report the selected damping factor and the metrics recorded for it
print(f'Selected damping factor: {best_damping:0.3f}')

SC = af_results.loc[best_damping]['SC']
DBI = af_results.loc[best_damping]['DBI']

print(f'Estimated number of clusters: {n_clusters}')
# The metrics are only recorded for runs with more than one cluster
if n_clusters > 1:

    print(f'Silhouette Coefficient: {SC:0.3f}')
    print(f'Calinski-Harabasz Index / Variance Ratio Criterion: {best_VRC:0.3f}')
    print(f'Davies-Bouldin Index: {DBI:0.3f}')

In [None]:
# Dotplot for nr of cluster per dampening factor
# The damping factor (index of af_results) is shown as a percentage on the x-axis.
fig, ax = plt.subplots()  
ax.plot(af_results.index * 100, af_results['Clusters'], marker='.', linestyle='', markersize=10)  
ax.set_xlabel('Dampening factor (%)')  
ax.set_ylabel('Nr of clusters')  
ax.set_title('Nr of clusters per dampening factor') 
ax.set_ylim(bottom=0, top = af_results['Clusters'].max() + 1)  
# Cluster counts are whole numbers, so only use integer ticks
ax.yaxis.set_major_locator(MaxNLocator(integer=True))  

# Set x-axis labels for values between 50 to 100  
ax.set_xticks(range(50, 101, 10))  
ax.set_xticklabels([str(i) for i in range(50, 101, 10)])  

plt.show()  

In [None]:
# Dotplot for silhouette score per dampening factor
fig, ax = plt.subplots()
# Convert once to a numeric series in percent; entries of runs without a
# recorded score are NaN and are simply not drawn.
sc_percent = af_results['SC'].astype(float) * 100
ax.plot(af_results.index * 100, sc_percent, marker='.', linestyle='', markersize=10)
ax.set_xlabel('Dampening factor (%)')
ax.set_ylabel('Silhouette score (%)')
ax.set_title('Silhouette score per dampening factor')
# Silhouette scores range from -1 to 1; lower the bottom limit when there are
# negative scores so those points are not cut off.
ax.set_ylim(bottom=min(0, sc_percent.min() - 1), top = sc_percent.max() + 1)
ax.yaxis.set_major_locator(MaxNLocator(integer=True))

# Set x-axis labels for values between 50 to 100
ax.set_xticks(range(50, 101, 10))
ax.set_xticklabels([str(i) for i in range(50, 101, 10)])

plt.show()

In [None]:
# Dotplot for VRC per dampening factor
# The damping factor (index of af_results) is shown as a percentage on the x-axis.
fig, ax = plt.subplots()  
ax.plot(af_results.index * 100, af_results['VRC'], marker='.', linestyle='', markersize=10)  
ax.set_xlabel('Dampening factor (%)')  
ax.set_ylabel('VRC')  
ax.set_title('VRC per dampening factor') 
ax.set_ylim(bottom=0, top = af_results['VRC'].max() + 2)  
ax.yaxis.set_major_locator(MaxNLocator(integer=True))  

# Set x-axis labels for values between 50 to 100  
ax.set_xticks(range(50, 101, 10))  
ax.set_xticklabels([str(i) for i in range(50, 101, 10)])  

plt.show()  

In [None]:
# Dotplot for DBI per dampening factor
# The damping factor (index of af_results) is shown as a percentage on the x-axis.
fig, ax = plt.subplots()  
ax.plot(af_results.index * 100, af_results['DBI'], marker='.', linestyle='', markersize=10)  
ax.set_xlabel('Dampening factor (%)')  
ax.set_ylabel('DBI')  
ax.set_title('DBI per dampening factor') 
ax.set_ylim(bottom=0, top = af_results['DBI'].max() + 2)  
ax.yaxis.set_major_locator(MaxNLocator(integer=True))  

# Set x-axis labels for values between 50 to 100  
ax.set_xticks(range(50, 101, 10))  
ax.set_xticklabels([str(i) for i in range(50, 101, 10)])  

plt.show()  

In [None]:
# Word frequencies of the 'obs' text per affinity-propagation cluster label
label_dicts = {}

for i, label in enumerate(best_aflabels):
    # The labels are positional over the rows of df_vec; look the text up in
    # df through the index label of that row.
    words = df.loc[df_vec.index[i], 'obs']

    # setdefault creates the per-label counter on first use. A module-level
    # temporary named `dict` would shadow the builtin for the rest of the
    # notebook session.
    label_dicts[label] = word_count(words, label_dicts.setdefault(label, {}))

In [None]:
def plot_wordclouds(labels):
    """Show one word cloud per cluster in a grid of at most five columns.

    The 'obs' text of every sample is looked up in the module-level ``df``
    through the index of the module-level ``df_vec``; ``labels`` must
    therefore hold one label per row of ``df_vec``, in row order.

    Args:
        labels: cluster label per sample.
    """
    # Collect the word frequencies per cluster label
    label_dicts = {}
    for i, label in enumerate(labels):
        words = df.loc[df_vec.index[i], 'obs']
        label_dicts[label] = word_count(words, label_dicts.setdefault(label, {}))

    if not label_dicts:
        # no samples, nothing to draw
        return

    # Work from the labels that actually occur, in ascending order. This also
    # works when labels are skipped or negative, where a lookup of subplot
    # number i among the labels would fail.
    keys = sorted(label_dicts)

    # Create a grid of subplots, ncols wide or less if there are less clusters
    ncols = min(5, len(keys))
    nrows = int(np.ceil(len(keys) / ncols))

    # squeeze=False always gives a 2-D array of axes, also for a single row
    # or a single subplot.
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 20), squeeze=False)
    # A single-row grid gets extra padding above the titles
    title_kwargs = {'pad': 15} if nrows == 1 else {}

    # Plot each individual wordcloud in a separate subplot, row by row
    for position, ax in enumerate(axs.flat):
        if position < len(keys):
            label = keys[position]
            cloud = wordcloud_from_dict(label_dicts[label], 5, 5)
            ax.imshow(cloud.to_array(), interpolation='bilinear')
            ax.set_title(f'Cluster {label}', **title_kwargs)
        # unused subplots at the end of the grid are left blank
        ax.axis('off')

    # Show the grid of subplots
    plt.show()

In [None]:
# Word clouds of the clusters found by affinity propagation
plot_wordclouds(best_aflabels)

In [None]:
def plot_cluster_size_hist(labels):
    """Show a histogram of the cluster sizes.

    Args:
        labels: cluster label per sample; the size of a cluster is the number
            of times its label occurs.
    """
    # Tally the number of samples per label
    sizes = {}
    for label in labels:
        sizes[label] = sizes.get(label, 0) + 1

    plt.hist(sizes.values())

    # add labels and title to the chart
    plt.xlabel("Cluster size")
    plt.ylabel("Frequency")
    plt.title("Cluster size Frequency Histogram")
    plt.show()

In [None]:
def silhouette_plot(dist_matrix, labels):
    """Draw a silhouette plot for a clustering based on precomputed distances.

    For every cluster the silhouette values of its samples are drawn as a
    sorted, filled band; a dashed red line marks the average over all samples.

    Args:
        dist_matrix: square matrix of pairwise distances between the samples.
        labels: cluster label per sample; the clusters are assumed to be
            numbered 0 .. max(labels).
    """
    labels = np.asarray(labels)
    n_clusters = labels.max() + 1
    # Compute the silhouette scores for each sample
    sample_silhouette_values = metrics.silhouette_samples(dist_matrix, labels, metric = 'precomputed')
    # Round the lower x-limit down to a multiple of 0.2
    min_sil = math.floor(min(sample_silhouette_values) / 0.2) * 0.2

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    ax.set_xlim([min_sil, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette plots of individual clusters,
    # to demarcate them clearly. The number of samples is taken from the
    # labels argument rather than from a module-level frame.
    ax.set_ylim([0, len(labels) + (n_clusters + 1) * 10])

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = metrics.silhouette_score(dist_matrix, labels, metric = 'precomputed')

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = np.sort(sample_silhouette_values[labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax.text(min_sil, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    # The decorations below are set once, after all clusters are drawn,
    # instead of being repeated for every cluster.
    ax.set_title("Silhouette plot for each cluster")
    ax.set_xlabel("Silhouette coefficient")
    ax.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax.set_yticks([])  # Clear the yaxis labels / ticks
    xticks = np.arange(min_sil, 1.1, 0.2)
    ax.set_xticks(xticks)

In [None]:
# Cluster-size histogram and silhouette plot for the affinity-propagation result
plot_cluster_size_hist(best_aflabels)
silhouette_plot(dist_matrix, best_aflabels)

In [None]:
# Print the number of samples in every affinity-propagation cluster
n_af_clusters = best_aflabels.max() + 1
for cluster in range(n_af_clusters):
    cluster_size = (best_aflabels == cluster).sum()
    print(f'Cluster {cluster} size: {cluster_size}')

# Spectral Clustering

## Affinity matrix

In [None]:
# Turn the distances (0 = identical, at most 2) into non-negative affinities
# (2 = identical, 0 = furthest apart).
aff_matrix = 2 - dist_matrix

## Hyperparameter tuning

In [None]:
# Dataframe for hyperparameter n_clusters and results
# NOTE: n_clusters is rebound here to the candidate range 2..9; it no longer
# holds the cluster count of the affinity-propagation section.
n_clusters = range(2, 10)
sc_results = pd.DataFrame(index=n_clusters, columns=['SC', 'VRC', 'DBI'])  

In [None]:
# Fit spectral clustering for every candidate number of clusters, record the
# cluster-quality metrics and keep the labels of the run with the highest VRC.
best_VRC = 0
best_sclabels = None  # stays None when no run improves on best_VRC
n = 0

for index in sc_results.index:
    sc = SpectralClustering(
        n_clusters = index,
        random_state = RANDOM_STATE,
        affinity = 'precomputed',
        verbose = False,
        assign_labels='discretize').fit(aff_matrix)

    silhouette = metrics.silhouette_score(dist_matrix, sc.labels_, metric="precomputed")
    # NOTE(review): calinski_harabasz_score and davies_bouldin_score expect a
    # feature matrix; here the rows of the distance matrix are used as
    # features. Confirm this is intended.
    VRC = metrics.calinski_harabasz_score(dist_matrix, sc.labels_)
    if VRC > best_VRC:
        n = index
        best_VRC = VRC
        # Store the silhouette of *this* run; reading the module-level SC here
        # would pick up a stale value from an earlier section.
        best_SC = silhouette
        best_sclabels = sc.labels_

    # Assign with a single .loc[row, column] call; the chained form
    # .loc[row][column] = ... may write to a temporary copy.
    sc_results.loc[index, 'SC'] = silhouette
    sc_results.loc[index, 'VRC'] = VRC
    sc_results.loc[index, 'DBI'] = metrics.davies_bouldin_score(dist_matrix, sc.labels_)

In [None]:
# Report the selected number of clusters and the metrics recorded for it
print(f'Selected number of clusters: {n}')

SC = sc_results.loc[n]['SC']
DBI = sc_results.loc[n]['DBI']
print(f'Silhouette Coefficient: {SC:0.3f}')
print(f'Calinski-Harabasz Index / Variance Ratio Criterion: {best_VRC:0.3f}')
print(f'Davies-Bouldin Index: {DBI:0.3f}')

In [None]:
# Word clouds of the clusters found by spectral clustering
plot_wordclouds(best_sclabels)

In [None]:
# Cluster-size histogram and silhouette plot for the spectral-clustering result
plot_cluster_size_hist(best_sclabels)
silhouette_plot(dist_matrix, best_sclabels)

In [None]:
# Print the number of samples in every spectral cluster
n_sc_clusters = best_sclabels.max() + 1
for cluster in range(n_sc_clusters):
    cluster_size = (best_sclabels == cluster).sum()
    print(f'Cluster {cluster} size: {cluster_size}')

# Agglomerative Clustering

In [None]:
# Dataframe for hyperparameter n_clusters and results
# Candidate numbers of clusters 2..19 form the index; the result columns start
# out empty.
n_clusters = range(2, 20)
ac_results = pd.DataFrame(index=n_clusters, columns=['SC', 'VRC', 'DBI'])  

In [None]:
# Fit average-linkage agglomerative clustering on the precomputed distances for
# every candidate number of clusters, record the cluster-quality metrics and
# keep the labels of the run with the highest VRC.
best_VRC = 0
best_aclabels = None  # stays None when no run improves on best_VRC
n = 0

for index in ac_results.index:
    ac = AgglomerativeClustering(
        n_clusters = index,
        metric = 'precomputed',
        linkage = 'average').fit(dist_matrix)

    # NOTE(review): calinski_harabasz_score and davies_bouldin_score expect a
    # feature matrix; here the rows of the distance matrix are used as
    # features. Confirm this is intended.
    VRC = metrics.calinski_harabasz_score(dist_matrix, ac.labels_)
    if VRC > best_VRC:
        n = index
        best_VRC = VRC
        best_aclabels = ac.labels_

    # Assign with a single .loc[row, column] call; the chained form
    # .loc[row][column] = ... may write to a temporary copy.
    ac_results.loc[index, 'SC'] = metrics.silhouette_score(dist_matrix, ac.labels_, metric="precomputed")
    ac_results.loc[index, 'VRC'] = VRC
    ac_results.loc[index, 'DBI'] = metrics.davies_bouldin_score(dist_matrix, ac.labels_)

In [None]:
# Report the selected number of clusters and the metrics recorded for it
print(f'Selected number of clusters: {n}')

SC = ac_results.loc[n]['SC']
DBI = ac_results.loc[n]['DBI']
print(f'Silhouette Coefficient: {SC:0.3f}')
print(f'Calinski-Harabasz Index / Variance Ratio Criterion: {best_VRC:0.3f}')
print(f'Davies-Bouldin Index: {DBI:0.3f}')

In [None]:
# Cluster-size histogram and word clouds for the agglomerative-clustering result
plot_cluster_size_hist(best_aclabels)
plot_wordclouds(best_aclabels)

In [None]:
# Silhouette plot for the agglomerative-clustering result
silhouette_plot(dist_matrix, best_aclabels)

In [None]:
# Print the number of samples in every agglomerative cluster
n_ac_clusters = best_aclabels.max() + 1
for cluster in range(n_ac_clusters):
    cluster_size = (best_aclabels == cluster).sum()
    print(f'Cluster {cluster} size: {cluster_size}')

In [None]:
# Display the full metrics table of the agglomerative-clustering runs
ac_results

In [None]:
# Spot check: description of the record selected with key 100
df['description'][100]

In [None]:
# Spot check: filtered nouns of the record selected with key 100
df['obs'][100]

In [None]:
# Spot check: spectral cluster of the record with index label 100.
# The label array is positional over the rows of df_vec, so the index label is
# first translated to its row position. This raises KeyError when that record
# is not part of df_vec.
best_sclabels[df_vec.index.get_loc(100)]