In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from gensim.models import KeyedVectors
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from collections import Counter
from tslearn.clustering import KernelKMeans

# One-Hot-Encoding

In [59]:
mails = pd.read_json('data/mails_ohe_bigrams.json')
print(mails.shape)
mails.head()

(8564, 12624)


Unnamed: 0,_questionmark_count_,_AJD_count_,_ADP_count_,_ADV_count_,_AUX_count_,_CCONJ_count_,_DET_count_,_INTJ_count_,_NOUN_count_,_NUM_count_,...,pack,articulation,ronde,nail,ecol,immobilier_lieu,lanrivain,batard,leudet,_label_
0,2,0,7,1,1,0,3,0,18,2,...,0,0,0,0,0,0,0,0,0,1
1,2,0,3,3,0,2,5,0,12,2,...,0,0,0,0,0,0,0,0,0,1
2,1,0,6,3,0,2,6,0,23,1,...,0,0,0,0,0,0,0,0,0,1
3,1,0,2,1,0,1,2,0,11,1,...,0,0,0,0,0,0,0,0,0,1
4,1,0,38,6,6,2,31,0,56,3,...,0,0,0,0,0,0,0,0,0,1


In [None]:
X = mails.drop(['_label_', '_X_count_'], axis = 1).values
X_not_scaled = X.copy()
y = mails._label_.values
X_labeled = mails[mails._label_!=2].drop(['_label_', '_X_count_'], axis = 1).values
y_labeled = mails[mails._label_!=2]._label_.values
X_unlabeled = mails[mails._label_==2].drop(['_label_', '_X_count_'], axis = 1).values
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_labeled = scaler.transform(X_labeled)
X_unlabeled = scaler.transform(X_unlabeled)

## LDA

In [19]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

class_weigth = compute_class_weight('balanced', classes = [0,1], y = y_labeled)

reduction = LinearDiscriminantAnalysis(priors=class_weigth)
reduction.fit(X_labeled, y_labeled)
reduction_result = reduction.transform(X_labeled)

fig = px.scatter(
    reduction_result, x=0,
    color=y_labeled,
    title = 'Actual labels'
)
fig.show()


The priors do not sum to 1. Renormalizing



In [21]:
reduction_result = reduction.transform(X)

fig = px.scatter(
    reduction_result, x=0,
    color = y,
    title = 'Actual labels'
)
fig.show()

fig = px.scatter(
    reduction_result, x=0,
    log_x = True,
    color = y,
    title = 'Actual labels log-scaled'
)
fig.show()

fig = px.scatter(
    reduction_result, x=0,
    log_x = True,
    color = reduction.predict(X),
    title = 'Predicted labels log-scaled'
)
fig.show()

# TF-IDF Lemmas

In [41]:
mails = pd.read_json('data/mails_preprocessing.json')
print(mails.shape)
mails.head()

(8564, 22)


Unnamed: 0,from,label,text,_questionmark_count_,text_lem,_AJD_count_,_ADP_count_,_ADV_count_,_AUX_count_,_CCONJ_count_,...,_NOUN_count_,_NUM_count_,_PRON_count_,_PROPN_count_,_PUNCT_count_,_SCONJ_count_,_SYM_count_,_VERB_count_,_X_count_,unique_words_count
0,=?iso-8859-1?q?guillaume_v=e9ronique?=\r\n\t<v...,1,acces decibel bonjour pouvez vous donner les a...,2,acce decibel bonjour pouvoir donner acces deci...,0,7,1,1,0,...,18,2,3,0,2,0,0,3,1,233
1,levisse xavier <xavier.levisse@harmonie-mutuel...,1,actes indemnités hospitalières tu sais ce que ...,2,acte indemnite hospitalier savoir acte frais r...,0,3,3,0,2,...,12,2,5,0,2,1,0,6,0,177
2,courtais yohan <yohan.courtais@harmonie-mutuel...,1,analyse des obsèques naissances appareils audi...,1,analyse obseque naissance appareil auditif bon...,0,6,3,0,2,...,23,1,8,0,4,3,0,8,0,274
3,levisse xavier <xavier.levisse@harmonie-mutuel...,1,ano ihm bonjour j ai un multivalue filtres eta...,1,ano ihm bonjour multivalu filtre etablissement...,0,2,1,0,1,...,11,1,1,0,1,0,0,2,3,172
4,=?iso-8859-1?q?pernot_val=e9rie?= <valerie.per...,1,ano alimentation réseau sur dcb bonjour je m i...,1,ano alimentation reseau dob bonjour metre inte...,0,38,6,6,2,...,56,3,14,1,10,1,0,20,0,659


In [51]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X_labeled = vectorizer.fit_transform(mails[mails.label!=2].text_lem.values)
y_labeled = mails[mails.label!=2].label.values
X_unlabeled = vectorizer.transform(mails[mails.label==2].text_lem.values)

In [52]:
df_features = mails.iloc[:, 3::].drop('text_lem', axis = 1)
X_labeled = np.concatenate([X_labeled.toarray(), df_features[mails.label!=2].values], axis = 1)
X_unlabeled = np.concatenate([X_unlabeled.toarray(), df_features[mails.label==2].values], axis = 1)

In [53]:
scaler = StandardScaler()
X_labeled = scaler.fit_transform(X_labeled)
X_unlabeled = scaler.transform(X_unlabeled)

## K-means

In [55]:
clustering = KMeans(n_clusters=2)
y_pred = clustering.fit_predict(X_labeled)
print(classification_report(y_labeled, y_pred))

              precision    recall  f1-score   support

           0       0.11      1.00      0.20       590
           1       0.00      0.00      0.00      4759

    accuracy                           0.11      5349
   macro avg       0.06      0.50      0.10      5349
weighted avg       0.01      0.11      0.02      5349



In [56]:
order_centroids = clustering.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(2):
    print("Cluster", i),
    for ind in order_centroids[i, :5]:
        print(terms[ind])

Cluster 0
re
harmonie
direction
technique
cordialement
Cluster 1
contrat slide
vyv onglet
mutuelle possibilite
nb devis
rouge nb


## LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

reduction = LinearDiscriminantAnalysis()
reduction.fit(X_labeled, y_labeled)
reduction_result = reduction.transform(X_labeled)

fig = px.scatter(
    reduction_result, x=0,
    color=y_labeled,
    title = 'Actual labels'
)
fig.show()

In [518]:
reduction_result = reduction.transform(X_unlabeled)
y_pred = reduction.predict(X_unlabeled)
print(Counter(y_pred))

fig = px.scatter(
    reduction_result, x=0,
    color=y_pred,
    title = 'Predicted labels'
)
fig.show()

Counter({0: 6019, 1: 2068})


# Embeddings

In [2]:
mails = pd.read_pickle('data/mails_embedded_doc2vec_bigrams.pkl')
print(mails.shape)
mails.head()

(8564, 519)


Unnamed: 0,_questionmark_count_,_AJD_count_,_ADP_count_,_ADV_count_,_AUX_count_,_CCONJ_count_,_DET_count_,_INTJ_count_,_NOUN_count_,_NUM_count_,...,491,492,493,494,495,496,497,498,499,_label_
0,2,0,7,1,1,0,3,0,18,2,...,7.552959,-7.75819,-17.432704,7.485336,-6.11666,5.172562,7.584496,5.250462,-1.320692,1
1,2,0,3,3,0,2,5,0,12,2,...,-7.110094,-5.106038,-6.896137,-6.444591,4.211719,10.020246,-0.814034,-3.377685,1.531344,1
2,1,0,6,3,0,2,6,0,23,1,...,-12.163507,5.605024,-18.668275,-20.377143,-3.610264,0.414532,-11.114808,-3.477304,-4.977881,1
3,1,0,2,1,0,1,2,0,11,1,...,2.105754,-1.29743,-9.423036,-1.073635,8.888151,6.058524,4.917495,7.490066,-1.712665,1
4,1,0,38,6,6,2,31,0,56,3,...,-19.045707,-35.856213,-24.074799,-23.920559,16.694527,10.680893,-16.323111,40.139483,-10.349179,1


In [3]:
X = mails.drop('_label_', axis = 1).values
y = mails._label_.values
X_labeled = mails[mails._label_!=2].drop('_label_', axis = 1).values
y_labeled = mails[mails._label_!=2]._label_.values
X_unlabeled = mails[mails._label_==2].drop('_label_', axis = 1).values

In [6]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_labeled = scaler.fit_transform(X_labeled)
X_unlabeled = scaler.transform(X_unlabeled)

## K-means

In [10]:
clustering = KMeans(n_clusters=2, max_iter=50, n_init=10, n_jobs = -1)
y_pred = clustering.fit_predict(X_labeled)
print(classification_report(y_labeled, y_pred))



              precision    recall  f1-score   support

           0       0.16      0.13      0.15       590
           1       0.89      0.91      0.90      4759

    accuracy                           0.83      5349
   macro avg       0.53      0.52      0.52      5349
weighted avg       0.81      0.83      0.82      5349



In [7]:
tsne = TSNE(n_components=2, random_state=42)
tsne_emb = tsne.fit_transform(X)

fig = px.scatter(
    tsne_emb, x=0, y=1
)
fig.show()

In [8]:
fig = px.scatter(
    tsne_emb, x=0, y=1,
    color_discrete_sequence=y,
    symbol=y,
)
fig.show()

In [4]:
tsne = TSNE(n_components=2, random_state=42)
tsne_emb = tsne.fit_transform(X_labeled)

fig = px.scatter(
    tsne_emb, x=0, y=1,
    color_discrete_sequence=y_pred,
    symbol=y_pred,
    title = 'Predicted labels'
)
fig.show()

fig = px.scatter(
    tsne_emb, x=0, y=1,
    color_discrete_sequence=y_labeled,
    symbol=y_labeled,
    title = 'Actual labels'
)
fig.show()

NameError: name 'y_pred' is not defined

## Kernel K-means

In [13]:
#kernels : ['gak', ‘additive_chi2’, ‘chi2’, ‘linear’, ‘poly’, ‘polynomial’, ‘rbf’, ‘laplacian’, ‘sigmoid’, ‘cosine’]
kmeans_emb = KernelKMeans(n_clusters=2, kernel = 'gak', max_iter=50, n_init=10, n_jobs = -1)
y_pred = kmeans_emb.fit_predict(X_labeled)
print(classification_report(y_labeled, y_pred))



              precision    recall  f1-score   support

           0       0.09      0.56      0.16       590
           1       0.86      0.34      0.49      4759

    accuracy                           0.36      5349
   macro avg       0.48      0.45      0.32      5349
weighted avg       0.78      0.36      0.45      5349



In [19]:
KernelKMeans(n_clusters=2, kernel = 'sigmoid', max_iter=50, n_init=10, n_jobs = -1)
y_pred = kmeans_emb.fit_predict(X_labeled)



KeyboardInterrupt: 

In [None]:
res_df = pd.DataFrame()
kernels = ['gak', 'linear', 'poly', 'polynomial', 'rbf', 'laplacian', 'sigmoid', 'cosine']
for kernel in kernels:
    kmeans_emb = KernelKMeans(n_clusters=2, kernel = 'gak', max_iter=50, n_init=10, n_jobs = -1)
    y_pred = kmeans_emb.fit_predict(X_labeled)
    res_df[kernel] = [f1_score(y_labeled, y_pred)]

In [None]:
res_df

In [None]:
tsne = TSNE(n_components=2, random_state=42)
tsne_emb = tsne.fit_transform(X_labeled)

fig = px.scatter(
    tsne_emb, x=0, y=1,
    color_discrete_sequence=y_pred,
    symbol=y_pred,
    title = 'Predicted labels'
)
fig.show()

fig = px.scatter(
    tsne_emb, x=0, y=1,
    color_discrete_sequence=y_labeled,
    symbol=y_labeled,
    title = 'Actual labels'
)
fig.show()

## Hierarchical Clustering

In [320]:
from sklearn.cluster import AgglomerativeClustering

clustering = AgglomerativeClustering(n_clusters=2, linkage="single")
y_pred = clustering.fit_predict(X_labeled)
print(classification_report(y_labeled, y_pred))

fig = px.scatter(
    tsne_emb, x=0, y=1,
    color=y_pred,
    title='Predicted labels'
)
fig.show()

fig = px.scatter(
    tsne_emb, x=0, y=1,
    color=y_labeled,
    title = 'Actual Labels'
)
fig.show()

              precision    recall  f1-score   support

           0       0.65      1.00      0.79       116
           1       1.00      0.02      0.03        64

    accuracy                           0.65       180
   macro avg       0.82      0.51      0.41       180
weighted avg       0.77      0.65      0.52       180



## SpectralClustering

In [351]:
from sklearn.cluster import SpectralClustering

clustering = SpectralClustering(n_clusters=2, eigen_solver = 'arpack', affinity = 'laplacian', assign_labels = 'discretize')
y_pred = clustering.fit_predict(X_labeled)
print(classification_report(y_labeled, y_pred))

fig = px.scatter(
    tsne_emb, x=0, y=1,
    color=y_pred,
    title='Predicted labels'
)
fig.show()

fig = px.scatter(
    tsne_emb, x=0, y=1,
    color=y_labeled,
    title = 'Actual Labels'
)
fig.show()

              precision    recall  f1-score   support

           0       0.68      0.53      0.60       116
           1       0.39      0.55      0.46        64

    accuracy                           0.54       180
   macro avg       0.54      0.54      0.53       180
weighted avg       0.58      0.54      0.55       180




Graph is not fully connected, spectral embedding may not work as expected.



## LDA

In [653]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

reduction = LinearDiscriminantAnalysis()
reduction.fit(X_labeled, y_labeled)
reduction_result = reduction.transform(X_labeled)

fig = px.scatter(
    reduction_result, x=0,
    color_discrete_sequence=y_labeled,
    symbol = y_labeled,
    labels = {'symbol':'class'},
    title = 'Actual labels'
)
fig.show()

y_pred = reduction.predict(X_labeled)
fig = px.scatter(
    reduction_result, x=0,
    color_discrete_sequence=y_pred,
    symbol = y_pred,
    labels = {'symbol':'class'},
    title = 'Predicted labels'
)
fig.show()

In [654]:
reduction_result = reduction.transform(X_unlabeled)
y_pred = reduction.predict(X_unlabeled)
print(Counter(y_pred))

fig = px.scatter(
    reduction_result, x=0,
    color_discrete_sequence=y_pred,
    symbol = y_pred,
    labels = {'symbol':'class'},
    title = 'Predicted labels'
)
fig.show()

Counter({0: 5208, 1: 2879})


## QDA

In [685]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda = QuadraticDiscriminantAnalysis()
qda.fit(X_labeled, y_labeled)
reduction = LinearDiscriminantAnalysis()
reduction_result = reduction.fit_transform(X_labeled, y_labeled)
y_pred = qda.predict(X_labeled)

fig = px.scatter(
    reduction_result, x=0,
    color_discrete_sequence=y_labeled,
    symbol = y_labeled,
    labels = {'symbol':'class'},
    title = 'Actual labels'
)
fig.show()

fig = px.scatter(
    reduction_result, x=0,
    color_discrete_sequence=y_pred,
    symbol = y_pred,
    labels = {'symbol':'class'},
    title = 'Predicted labels'
)
fig.show()


Variables are collinear



In [686]:
reduction_result = reduction.transform(X_unlabeled)
y_pred = qda.predict(X_unlabeled)
print(Counter(y_pred))

fig = px.scatter(
    reduction_result, x=0,
    color_discrete_sequence=y_pred,
    symbol = y_pred,
    labels = {'symbol':'class'},
    title = 'Predicted labels'
)
fig.show()

Counter({1: 7849, 0: 238})


## Isomap

In [710]:
from sklearn.manifold import Isomap

reduction = Isomap(n_components=2, metric = 'cosine')
reduction_result = reduction.fit_transform(X_labeled)

fig = px.scatter(
    reduction_result, x=0,
    color_discrete_sequence=y_labeled,
    symbol = y_labeled,
    labels = {'symbol':'class'},
    title = 'Actual labels'
)
fig.show()

## Locally Linear Embedding

In [713]:
from sklearn.manifold import LocallyLinearEmbedding

reduction = LocallyLinearEmbedding()
reduction_result = reduction.fit_transform(X_labeled)

fig = px.scatter(
    reduction_result, x=0,
    color_discrete_sequence=y_labeled,
    symbol = y_labeled,
    labels = {'symbol':'class'},
    title = 'Actual labels'
)
fig.show()

## Spectral Embedding

In [719]:
from sklearn.manifold import SpectralEmbedding

reduction = SpectralEmbedding()
reduction_result = reduction.fit_transform(X_labeled)

fig = px.scatter(
    reduction_result, x=0,
    color_discrete_sequence=y_labeled,
    symbol = y_labeled,
    labels = {'symbol':'class'},
    title = 'Actual labels'
)
fig.show()

## UMAP + HDBSCAN

In [679]:
import umap
import hdbscan

ModuleNotFoundError: No module named 'hdbscan'

In [None]:
def generate_clusters(message_embeddings,
                      n_neighbors,
                      n_components, 
                      min_cluster_size,
                      random_state = None):
    """
    Generate HDBSCAN cluster object after reducing embedding dimensionality with UMAP
    """
    
    umap_embeddings = (umap.UMAP(n_neighbors=n_neighbors, 
                                n_components=n_components, 
                                metric='cosine', 
                                random_state=random_state)
                            .fit_transform(message_embeddings))

    clusters = hdbscan.HDBSCAN(min_cluster_size = min_cluster_size,
                               metric='euclidean', 
                               cluster_selection_method='eom').fit(umap_embeddings)

    return clusters