## Import Libraries

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from string import punctuation
from collections import defaultdict
from heapq import nlargest

## Load Data

In [None]:
# Read the source CSV into a DataFrame. Later cells use its 'nl_text' column
# as the document text and add the 'class' and 'key_words' columns to it.
jc_number = pd.read_csv(
    filepath_or_buffer=r"C:\Users\Olga\Becode_Olga\KPMG-Team-3\data_csv\C\jc_119_0.csv",
)

## Define Vectorizer to Extract Features for Text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokens to ignore: NLTK's Dutch stop words, every ASCII punctuation character,
# and a handful of extra tokens (mostly French function words and quote marks).
# The same list is reused further down when counting words per cluster.
_stopwords = [
    *stopwords.words('dutch'),
    *punctuation,
    "les", "'s", "''", "``", "du", "la", "par", "et", "à", "aux", "«", "le", "des",
]

# TF-IDF features: drop terms found in more than half of the documents
# (max_df=0.5) or in fewer than two documents (min_df=2).
vectorizer = TfidfVectorizer(
    stop_words=_stopwords,
    min_df=2,
    max_df=0.5,
)

## Build a Corpus of Texts

In [None]:
# One document per row of the 'nl_text' column, in row order.
# Missing cells are replaced by empty strings (not dropped) so that a NaN
# never reaches the vectorizer or the later string handling, and so that
# posts[i] still lines up with row i of jc_number and with km.labels_[i].
posts = jc_number['nl_text'].fillna('').astype(str).to_list()

# Learn the vocabulary and build the TF-IDF document-term matrix;
# row i of X is the feature vector of posts[i].
X = vectorizer.fit_transform(posts)

## Define a Clustering Algorithm

In [None]:
from sklearn.cluster import KMeans

# Group the TF-IDF rows into 6 clusters. A single k-means++ initialisation
# (n_init=1) with a fixed random_state is used, each run is capped at
# 100 iterations, and verbose=True makes the fit report its progress.
km = KMeans(
    n_clusters=6,
    init='k-means++',
    n_init=1,
    max_iter=100,
    random_state=42,
    verbose=True,
)
km.fit(X)

## Store the Cluster Labels (`km.labels_`) in a New 'class' Column

In [None]:
import numpy as np

# Show the distinct cluster ids and how many documents fell into each one.
# Printed explicitly: a bare expression that is not the last statement of a
# notebook cell is evaluated but never displayed.
print(np.unique(km.labels_, return_counts=True))

# km.labels_[i] is the cluster assigned to row i, so the labels can be
# attached to the DataFrame positionally as a new 'class' column.
jc_number['class'] = km.labels_.tolist()

## Find Out the Keywords for Each Cluster

In [None]:
### For the complete text of each cluster, count word frequencies with NLTK
### and keep the most frequent words of each cluster.

# Collect the documents of every cluster. Cluster ids are converted to plain
# ints so the dictionaries below are keyed the same way as range(...).
docs_by_cluster = defaultdict(list)
for cluster, document in zip(km.labels_, posts):
    docs_by_cluster[int(cluster)].append(document)

# text: cluster id -> all documents of that cluster as one string.
# The documents are joined with a newline; concatenating them back to back
# would fuse the last word of one document with the first word of the next
# and distort the word counts.
text = {cluster: "\n".join(docs) for cluster, docs in docs_by_cluster.items()}

# Set membership is O(1); _stopwords itself is a list.
stop_set = set(_stopwords)

# keywords: cluster id -> up to 100 most frequent non-stop-word tokens,
#           most frequent first.
# counts:   cluster id -> FreqDist with the count of every kept token.
keywords = {}
counts = {}
for cluster in range(km.n_clusters):
    # A cluster that received no documents yields empty results instead of
    # raising KeyError.
    tokens = word_tokenize(text.get(cluster, "").lower())
    freq = FreqDist(token for token in tokens if token not in stop_set)
    keywords[cluster] = [word for word, _ in freq.most_common(100)]
    counts[cluster] = freq

## Find up to 15 Keywords that are Unique to Each Cluster and Add Them to the Column "key_words"

In [None]:
# unique_keys: cluster id -> up to 15 of that cluster's keywords that do not
# appear among the keywords of ANY other cluster, most frequent first.
unique_keys = {}
for cluster, cluster_keywords in keywords.items():
    # Keywords of every other cluster (all of them, not just the first two).
    keys_other_clusters = set().union(
        *(words for other, words in keywords.items() if other != cluster)
    )
    # Filter the list rather than subtracting sets: the list keeps a fixed
    # order, so ties in frequency are resolved the same way on every run.
    unique = [word for word in cluster_keywords if word not in keys_other_clusters]
    unique_keys[cluster] = nlargest(15, unique, key=counts[cluster].get)

# Give every row the unique keywords of the cluster it belongs to. The lookup
# only needs the 'class' column, so it is applied to that column directly
# instead of row-wise over the whole DataFrame.
jc_number['key_words'] = jc_number['class'].apply(lambda label: unique_keys[label])

## Save Dataframe to CSV for Summarization

In [None]:
# Write the DataFrame, including the added 'class' and 'key_words' columns,
# to the CSV used by the summarization step.
# NOTE(review): index is not disabled, so the row index is written as an
# extra first column — confirm the reader of this file expects that.
jc_number.to_csv(
    path_or_buf=r"C:\Users\Olga\Becode_Olga\KPMG-Team-3\data_csv\C\clustered\jc_119_0_cluster.csv",
)