In [None]:
import io
import logging
import math
import os
import re
from pathlib import Path
from typing import Dict, List

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import nl_core_news_lg
import numpy as np
import pandas as pd
import pyarrow.dataset as ds
import seaborn as sns
import spacy
from bs4 import BeautifulSoup
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud

In [None]:
# Write all log records from DEBUG upwards to log.txt, one timestamped line each.
logging.basicConfig(
    filename='log.txt',
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(message)s',
)

# Load the data

In [None]:
# Directory that holds the parquet exports. It can be overridden with the
# IPW_DATA_DIR environment variable so the notebook also runs on machines with a
# different checkout location; without the variable the original path is used.
PATH_DATA = Path(os.environ.get(
    'IPW_DATA_DIR',
    r'C:/Git/HonoursProject/ipw-classifier/ipw_classifier/data/',
))
# Extension appended to a table name to form its file name.
PARQUET_SUFFIX = '.parquet'

In [None]:
def _parse(table_name: str) -> pd.DataFrame:
    """Load one parquet table from PATH_DATA and remove its bookkeeping columns.

    The file ``<table_name><PARQUET_SUFFIX>`` is read into a DataFrame and its
    record count is logged. When the table has a ``case_id`` column, that column
    must be unique and the table's ``id`` column is renamed to
    ``<table_name>_id``. Any of the columns author, rights, created, updated,
    deleted, owners, source and closed that are present are dropped.

    Args:
        table_name: file name of the table without the parquet suffix.

    Returns:
        The loaded and trimmed DataFrame.

    Raises:
        ValueError: if a ``case_id`` column exists and contains duplicates.
    """
    parquet_path = PATH_DATA / f"{table_name}{PARQUET_SUFFIX}"
    df = ds.dataset(parquet_path).to_table().to_pandas()

    logging.info(f'Number of records in table {table_name}: {len(df)}')

    if 'case_id' in df.columns:
        if not df['case_id'].is_unique:
            raise ValueError(f'Duplicate values found in "case_id" column in table {table_name}.')

        # Only tables that reference a case get their own id renamed.
        df = df.rename(columns={'id': f'{table_name}_id'})

    unwanted = {
        'author',
        'rights',
        'created',
        'updated',
        'deleted',
        'owners',
        'source',
        'closed',
    }
    present = unwanted.intersection(df.columns)
    return df.drop(columns=list(present))

In [None]:
def main():
    """Load the case, situation and plan tables and combine the closed cases.

    Plan and situation are outer-joined on ``case_id`` (overlapping column names
    get the suffixes ``_pln`` and ``_sit``), the case table is then left-joined
    on ``case_id`` == ``id``. Only rows whose ``status`` equals ``'closed'`` are
    kept and the ``case_id`` column is removed from the result.

    Returns:
        A new DataFrame with one row per combined closed record.
    """
    case = _parse('case')
    situation = _parse('situation')
    plan = _parse('plan')

    df_sit_pln = plan.merge(situation, left_on = 'case_id', right_on = 'case_id', how = 'outer', suffixes = ('_pln', '_sit'))
    df = df_sit_pln.merge(case, left_on = 'case_id', right_on = 'id', how = 'left')
    logging.info(f'Number of records in combined table: {len(df)}')
    closed = df[df['status'] == 'closed']
    logging.info(f'Number of closed records in combined table: {len(closed)}')

    # Return a new frame instead of dropping in place: an in-place drop on a
    # boolean-filtered slice is the pattern pandas warns about (SettingWithCopy).
    return closed.drop(columns = 'case_id')

In [None]:
# Build the combined frame of closed records from the parquet tables.
df_in = main()

In [None]:
# Columns that are left out of the analysis frame.
to_drop = ['plan_id', 'situation_id', 'title', 'status', 'collection_id', 'author_id']

# Remove those columns and use the case id as the index.
df = df_in.drop(columns = to_drop).set_index('id')

## Clean the text fields

In [None]:
def clean_string(string:str) -> str:
    """Return the plain text contained in an HTML fragment.

    ``None`` and float values (for example NaN from a missing cell) give an
    empty string. Any other value is parsed with BeautifulSoup's
    ``html.parser`` and reduced to its text content.

    Note: the final replacement removes the literal two-character sequence
    backslash + "n" from the text; real newline characters are left in place.
    """
    if string is None or isinstance(string, float):
        return ''

    text = BeautifulSoup(string, 'html.parser').getText()
    return text.replace('\\n', '')

In [None]:
# Replace every column by its cleaned, plain-text version.
for column in df.columns:
    df[column] = df[column].map(clean_string)

# Summary statistics

In [None]:
# Number of whitespace-separated words in every text field of every record.
df_stats = df.apply(lambda col: col.fillna('').str.split().map(len))

# Standard describe() table, extended with the number of empty (0 words) and
# non-empty fields per column.
summary_stats = df_stats.describe()
for col in df_stats.columns:
    n_empty = (df_stats[col] == 0).sum()
    summary_stats.loc['empty', col] = n_empty
    summary_stats.loc['not_empty', col] = summary_stats.loc['count', col] - n_empty
display(summary_stats)

In [None]:
# Box-and-whisker plot of the word counts, one box per text column.
plt.boxplot(df_stats)
# Tick positions, column labels and the label rotation are set in one call.
plt.xticks(range(1, len(df_stats.columns) + 1), df_stats.columns, rotation=45)
plt.show()

## Unique words and occurrences

In [None]:
def word_count(string:str, returnvalue: Dict[str, int]) -> Dict[str, int]:
    """Add the word frequencies of ``string`` to ``returnvalue``.

    The text is split on whitespace. Each token is lower-cased and every
    character outside ``a-z`` is removed (this includes digits, punctuation and
    accented letters). Tokens that are empty after this normalisation are
    skipped, so they do not show up as an empty-string "word".

    Args:
        string: text to count the words of.
        returnvalue: running frequency dictionary; it is updated in place.

    Returns:
        The same dictionary object that was passed in as ``returnvalue``.
    """
    for word in string.split():
        key = re.sub(r'[^a-z]', '', word.lower())
        if not key:
            # e.g. "123" or "--": nothing word-like is left, do not count it.
            continue
        returnvalue[key] = returnvalue.get(key, 0) + 1
    return returnvalue

In [None]:
def histogram(dict):
    """Plot a log-log histogram of the values in ``dict``.

    The bin edges are evenly spaced in log space. Their width follows the
    Freedman-Diaconis rule (2 * IQR / n^(1/3)) applied to the log-transformed
    values; when the IQR is zero the bin count falls back to Sturges' rule.
    The outer edges are pinned to the smallest and largest value so that no
    value (in particular the largest one) falls outside the bins.

    Args:
        dict: mapping whose values are strictly positive counts, e.g.
            word -> number of occurrences. The parameter name shadows the
            builtin inside this function; it is kept for existing callers.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can chain ``.show()``.

    Raises:
        ValueError: if ``dict`` is empty.
    """
    data = list(dict.values())
    if not data:
        raise ValueError('histogram() needs at least one value to plot')

    log_data = np.log(data)
    log_min, log_max = log_data.min(), log_data.max()

    if log_max > log_min:
        iqr = np.percentile(log_data, 75) - np.percentile(log_data, 25)
        bin_width = (2 * iqr) / (len(log_data) ** (1 / 3))
        if bin_width > 0:
            n_bins = int(np.ceil((log_max - log_min) / bin_width))
        else:
            # Zero IQR (e.g. most values identical): Freedman-Diaconis is
            # undefined, use Sturges' rule for the number of bins instead.
            n_bins = int(np.ceil(np.log2(len(log_data)))) + 1
        bins = np.exp(np.linspace(log_min, log_max, max(n_bins, 1) + 1))
        # exp(log(x)) can be off by a rounding error; pin the outer edges so the
        # extreme values are guaranteed to lie inside the first and last bin.
        bins[0], bins[-1] = min(data), max(data)
    else:
        # All values are identical: a single bin around that value.
        bins = np.array([min(data) / 2, max(data) * 2])

    # create the histogram
    plt.hist(data, bins=bins, align='left', color = 'blue', alpha = 1)

    # add labels and title to the chart
    plt.xlabel("Frequency (log scale)")
    plt.ylabel("Occurrences (log scale)")
    plt.title("Word Frequency Histogram")

    # set the axis to a logarithmic scale
    plt.xscale('log')
    plt.yscale('log')

    return plt

In [None]:
def wordcloud_from_dict(dict: Dict[str, int], w:int = 12, h:int = 8)-> WordCloud:
    """Create a word cloud from a word -> frequency mapping.

    An empty mapping is replaced by the single placeholder entry "NO WORDS".
    The image is ``w * 100`` by ``h * 100`` pixels on a white background.
    """
    frequencies = dict if dict else {"NO WORDS": 1}

    cloud = WordCloud(
        width = w * 100,
        height = h * 100,
        background_color = "white",
        prefer_horizontal = 0.8,
        min_font_size = 10,
        max_font_size = 400,
    )
    return cloud.generate_from_frequencies(frequencies)
  


In [None]:
def plot_wordcloud_from_dict(dict: Dict[str, int], w:int = 12, h:int = 8)-> plt:
    """Draw the word cloud for ``dict`` on a new ``w`` x ``h`` inch figure.

    Returns the ``matplotlib.pyplot`` module so the caller can chain ``.show()``.
    """
    cloud = wordcloud_from_dict(dict, w, h)

    # Render the generated image without axes.
    plt.figure(figsize=(w, h))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    return plt

In [None]:
# Count every word over all text columns of all records.
words_dict = {}

for column in df:
    # Iterate the column values directly; a full iterrows() pass per column
    # rebuilds every row as a Series once for each column.
    for text in df[column]:
        words_dict = word_count(text, words_dict)

total_words = sum(words_dict.values())
# Guard against a frame without any words (would divide by zero).
average_occurrence = total_words / len(words_dict) if words_dict else 0.0
print(f'Total words: {total_words}')
print(f'Unique words: {len(words_dict)}')
print(f'Average occurance: {average_occurrence:.2f}')

In [None]:
# Show the word cloud and the log-log frequency histogram for all counted words.
plot_wordcloud_from_dict(words_dict, 12, 8).show()
histogram(words_dict).show()

# Spacy

In [None]:
# Load the Dutch spaCy pipeline nl_core_news_lg; it is used below for
# part-of-speech tags (noun filtering) and for document vectors.
nlp = nl_core_news_lg.load()

## Transform the data to only consider nouns

In [None]:
def filter_nouns(string: str) -> str:
    """Return the tokens of ``string`` tagged as NOUN, joined by single spaces.

    The text is processed with the module-level spaCy pipeline ``nlp``; the
    original token text (not the lemma) is kept, in document order.
    """
    return ' '.join(token.text for token in nlp(string) if token.pos_ == "NOUN")

In [None]:
# One Series per column of df, holding only the nouns of each text field.
nouns = [df[column].apply(filter_nouns) for column in df.columns]

In [None]:
# Build one observation string per record by joining the nouns of all columns.
# zip() walks the Series by position, so the result does not depend on the index
# labels. Integer indexing such as row[i] is label-based when the index holds
# integers and only falls back to positions (deprecated) for other indexes.
n = len(nouns[0])
strings = [' '.join(parts) for parts in zip(*nouns)]
df['obs'] = strings

In [None]:
# Count every noun over all observation strings.
nouns_dict = {}

for text in df['obs']:
    nouns_dict = word_count(text, nouns_dict)

total_words = sum(nouns_dict.values())
# Guard against the case that no nouns were found at all (would divide by zero).
average_occurrence = total_words / len(nouns_dict) if nouns_dict else 0.0
print(f'Total nouns: {total_words}')
print(f'Unique nouns: {len(nouns_dict)}')
print(f'Average occurance: {average_occurrence:.2f}')


In [None]:
# Show the word cloud and the log-log frequency histogram for the nouns only.
plot_wordcloud_from_dict(nouns_dict, 12, 8).show()
histogram(nouns_dict).show()

# Transform the column we want to consider to a SpaCy vector

In [None]:
# Embed every observation string with spaCy and collect the document vectors in a
# DataFrame: one row per record (same index as df), one column V0, V1, ... per
# vector dimension.
vector = nlp('tekst for vector').vector # arbitrary text, only used to learn the vector length
vector_names = [f"V{i}" for i in range(len(vector))]

# Build the whole matrix first and wrap it once. Writing single cells with
# series[i] = value is ambiguous (an integer key is a label when the index holds
# integers, and the positional fallback is deprecated) and needs rows x dims
# separate assignments.
doc_vectors = [nlp(text).vector for text in df['obs']]
df_vec = pd.DataFrame(
    # reshape keeps the (rows, dims) shape valid even when there are no rows
    np.asarray(doc_vectors, dtype=float).reshape(len(doc_vectors), len(vector_names)),
    index=df.index,
    columns=vector_names,
)

## Remove samples that have empty vectors

In [None]:
# Euclidean length of every document vector (one value per row).
norms = df_vec.apply(np.linalg.norm, axis=1)

# Keep only the rows whose vector is not all zeros.
df_vec = df_vec.loc[norms != 0]

print(f'Number of samples for clustering: {len(df_vec)}')

## Set Random State

In [None]:
# Seed passed to the clustering estimators below (replace by None to run unseeded).
RANDOM_STATE = 42

## Correlation

In [None]:
# Explore the Pearson correlations between the vector dimensions.
plt.figure(figsize=(10,10))
corr_full = df_vec.corr().to_numpy()
# Keep only the upper triangle (without the diagonal) for the heatmap; the lower
# triangle and the diagonal are set to 0.
correlation = np.triu(corr_full, k=1)

# The heatmap shows absolute values: squares near white are close to 0, more
# saturated squares indicate a stronger correlation between two dimensions.
sns.heatmap(abs(correlation), center = 0, cmap="RdBu", vmax = 1.0, vmin = 0.0)

# Summary statistics over the distinct column pairs only. Taking them over the
# triu matrix would include the zeroed lower triangle and diagonal, which pulls
# max/min towards 0 and more than halves the mean absolute correlation.
pair_corr = corr_full[np.triu_indices_from(corr_full, k=1)]
max_corr = pair_corr.max()
min_corr = pair_corr.min()
avg_corr = np.abs(pair_corr).mean()
print(f'Max correlation: {max_corr:.3f}')
print(f'Min correlation: {min_corr:.3f}')
print(f'Mean absolute correlation: {avg_corr:.3f}')

# Affinity Propagation

In [None]:
n = len(df_vec)

# Pairwise cosine similarity of all document vectors, computed in one matrix
# product on the length-normalised rows (rows with zero norm were removed above).
vectors = df_vec.to_numpy(dtype=float)
unit_vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
similarity = np.clip(unit_vectors @ unit_vectors.T, -1.0, 1.0)  # clip rounding overshoot

# AffinityPropagation(affinity='precomputed') expects similarities: larger means
# more alike. Store the negative cosine distance, -(1 - similarity): 0 for
# identical directions, -2 for opposite ones. Passing the plain distance
# (0 = equal, 2 = far apart) would make the most dissimilar cases look the most
# similar to the clustering.
aff_matrix = similarity - 1.0
np.fill_diagonal(aff_matrix, 0.0)

In [None]:
# Affinity propagation on the raw vectors (default affinity of the estimator).
af1 = AffinityPropagation(
    random_state = RANDOM_STATE,
    verbose = False,
    max_iter = 500,
)
af1.fit(df_vec)

cluster_centers_indices1 = af1.cluster_centers_indices_
labels1 = af1.labels_
n_clusters1 = len(cluster_centers_indices1)

# Cluster count and three internal quality scores computed on df_vec.
print(f'Estimated number of clusters: {n_clusters1}')
print(f'Silhouette Coefficient: {metrics.silhouette_score(df_vec, labels1, metric="sqeuclidean"):0.3f}')
print(f'Calinski-Harabasz Index / Variance Ratio Criterion: {metrics.calinski_harabasz_score(df_vec, labels1):0.3f}')
print(f'Davies-Bouldin Index: {metrics.davies_bouldin_score(df_vec, labels1):0.3f}')

In [None]:
# Affinity propagation on the precomputed pairwise matrix.
af = AffinityPropagation(
    random_state = RANDOM_STATE,
    verbose = False,
    max_iter = 500,
    affinity = 'precomputed',
)
af.fit(aff_matrix)

cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters = len(cluster_centers_indices)

# NOTE(review): the scores below treat each row of aff_matrix as the feature
# vector of a sample - confirm that this is the intended evaluation.
print(f'Estimated number of clusters: {n_clusters}')
print(f'Silhouette Coefficient: {metrics.silhouette_score(aff_matrix, labels, metric="sqeuclidean"):0.3f}')
print(f'Calinski-Harabasz Index / Variance Ratio Criterion: {metrics.calinski_harabasz_score(aff_matrix, labels):0.3f}')
print(f'Davies-Bouldin Index: {metrics.davies_bouldin_score(aff_matrix, labels):0.3f}')

In [None]:
# Collect one word-frequency dictionary per cluster label.
label_dicts = {}

for i, label in enumerate(labels):
    # labels[i] belongs to the i-th row of df_vec; fetch that record's text from
    # df with a single .loc lookup instead of chained indexing.
    words = df.loc[df_vec.index[i], 'obs']

    # .get() supplies a fresh dictionary for a label seen for the first time, so
    # no cell-level variable named `dict` is needed (it would shadow the builtin
    # for the rest of the notebook).
    label_dicts[label] = word_count(words, label_dicts.get(label, {}))

In [None]:
# One word cloud per cluster.
wordclouds = [wordcloud_from_dict(value, 7, 7) for value in label_dicts.values()]

# Grid of subplots, ncols wide, with enough rows to hold every cloud.
ncols = 3
nrows = int(np.ceil(len(wordclouds) / ncols))
nplots = ncols * nrows

fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 20))
keys = list(label_dicts.keys())

# plt.subplots returns a 1-D array for a single row and a 2-D array otherwise;
# np.ravel gives the axes in row-major order in both cases.
# Titles get extra padding only in the single-row layout.
title_kwargs = {'pad': 15} if nrows == 1 else {}

for i, ax in enumerate(np.ravel(axs)):
    if i < len(wordclouds):
        ax.imshow(wordclouds[i].to_array(), interpolation='bilinear')
        ax.set_title(f'Cluster {keys[i]}', **title_kwargs)
    # Unused cells of the grid are blanked as well.
    ax.axis('off')

# Show the grid of subplots
plt.show()

In [None]:
# Number of samples per cluster label.
freq = {}
for label in labels:
    freq[label] = freq.get(label, 0) + 1

# Distribution of the cluster sizes.
data = freq.values()
plt.hist(data)

# add labels and title to the chart
plt.xlabel("Cluster size")
plt.ylabel("Frequency")
plt.title("Cluster size Frequency Histogram")
plt.show()

In [None]:
# Compute the silhouette scores for each sample
sample_silhouette_values = metrics.silhouette_samples(df_vec, labels)
# Left axis limit: the smallest score rounded down to a multiple of 0.2
min_sil = math.floor(min(sample_silhouette_values) / 0.2) * 0.2

fig, ax = plt.subplots(1, 1)
fig.set_size_inches(18, 7)

ax.set_xlim([min_sil, 1])
# The (n_clusters+1)*10 is for inserting blank space between silhouette plots of individual clusters,
# to demarcate them clearly.
ax.set_ylim([0, len(df_vec) + (n_clusters + 1) * 10])

# The average silhouette value gives a perspective into the density and
# separation of the formed clusters. It is the mean of the per-sample scores
# (which is what silhouette_score returns), so the values computed above are
# reused instead of computing all pairwise distances a second time.
silhouette_avg = float(np.mean(sample_silhouette_values))

y_lower = 10
for i in range(n_clusters):
    # Silhouette scores of the samples in cluster i, sorted ascending
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[labels == i])

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = cm.nipy_spectral(float(i) / n_clusters)
    ax.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_values,
        facecolor=color,
        edgecolor=color,
        alpha=0.7,
    )

    # Label the silhouette plots with their cluster numbers at the middle
    ax.text(min_sil, y_lower + 0.5 * size_cluster_i, str(i))

    # Compute the new y_lower for next plot
    y_lower = y_upper + 10  # 10 for the 0 samples

# Figure-level decoration is applied once, after all clusters are drawn; inside
# the loop it would be repeated (and the average line redrawn) for every cluster.
ax.set_title("The silhouette plot for the various clusters.")
ax.set_xlabel("The silhouette coefficient values")
ax.set_ylabel("Cluster label")

# The vertical line for average silhouette score of all the values
ax.axvline(x=silhouette_avg, color="red", linestyle="--")

ax.set_yticks([])  # Clear the yaxis labels / ticks
xticks = np.arange(min_sil, 1.1, 0.2)
ax.set_xticks(xticks)

plt.show()