In [1]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
from sklearn.mixture import GaussianMixture
from nltk.corpus import stopwords  # Import the stop word list
from sklearn import datasets
import pandas as pd
import os
import sys
import re
import nltk

import numpy as np
import collections
import matplotlib.pyplot as plt
import numpy as np

In [2]:
nltk.download("stopwords")  # Download text data sets, including stop words
from nltk.corpus import stopwords # Import the stop word list
print(stopwords.words("french"))
import nltk
nltk.download('punkt')

['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aur

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def word_tokenizer(text):
            # tokens = RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')
            # tokens.tokenize(str(text))
            #tokenizes and stems the text
            tokens = word_tokenize(text)
            stemmer = PorterStemmer()
            tokens = [stemmer.stem(t) for t in tokens if t not in stopwords.words('french')]
            return tokens

In [4]:
def cluster_sentences(sentences,nb_of_clusters=5):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       stop_words=stopwords.words('french'),
                                       max_df=0.9,
                                       min_df=0.1,
                                       lowercase=True)
    #builds a tf-idf matrix for the sentences
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    kmeans = KMeans(n_clusters=nb_of_clusters)
    kmeans.fit(tfidf_matrix)
    clusters = collections.defaultdict(list)
    for i, label in enumerate(kmeans.labels_):
            clusters[label].append(i)
    return dict(clusters)

In [5]:
def body_to_words(raw_body):
    # Convert to lower case, split into individual words
    words = raw_body.lower().split()                             
    #
    # In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("french"))     
    # 
    # Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join(meaningful_words))

In [6]:
def k_means_model(train_data_features):
    # KMeans
    km = KMeans(n_clusters=5)
    km.fit(train_data_features)
    km.predict(train_data_features)
    labels = km.labels_

    # Plotting
    fig = plt.figure(1, figsize=(7, 7))
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
    ax.scatter(train_data_features[:, 3], train_data_features[:, 0], train_data_features[:, 2],
               c=labels.astype(np.float), edgecolor="k", s=50)
    ax.set_xlabel("ham width")
    ax.set_ylabel("spam length")
    ax.set_zlabel("ham length")
    plt.title("K Means", fontsize=14)

In [7]:
def gaussian_mixture_model(train_data_features):
    # Gaussian Mixture Model
    gmm = GaussianMixture(n_components=3)
    gmm.fit(train_data_features)
    proba_lists = gmm.predict_proba(train_data_features)

    # Plotting
    colored_arrays = np.matrix(proba_lists)
    colored_tuples = [tuple(i.tolist()[0]) for i in colored_arrays]
    fig = plt.figure(1, figsize=(7, 7))
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
    ax.scatter(train_data_features[:, 3], train_data_features[:, 0], train_data_features[:, 2],
               c=colored_tuples, edgecolor="k", s=50)
    ax.set_xlabel("ham width")
    ax.set_ylabel("spam length")
    ax.set_zlabel("hamp length")
    plt.title("Gaussian Mixture Model", fontsize=14)

# MAIN

In [8]:
train = pd.read_csv('./dataset/manitra_mails.csv', encoding='utf-8')

In [9]:
# Obtenez le nombre de mail en fonction de la taille de la colonne dataframe
num_reviews = train["body"].size

# Initialise une liste vide pour contenir les mails
clean_train_reviews = []

# Boucle sur chaque mail; créer un index i qui va de 0 à la longueur du Nombre de mail
for i in range(0, num_reviews):
    # Appelez notre fonction pour chacun et ajoutez le résultat à la liste des mails prétraité
    clean_train_reviews.append(body_to_words(str(train["body"][i])))

print("Cleaning and parsing the training set mail body...\n")
clean_train_reviews = []
for i in range(0, num_reviews):

    # Si l’index est divisible par 1000, affiche un message.
    if ((i + 1) % 1000 == 0):
        print("body %d of %d\n" % (i + 1, num_reviews))
    clean_train_reviews.append(body_to_words(str(train["body"][i])))

Cleaning and parsing the training set mail body...

body 1000 of 2664

body 2000 of 2664



In [10]:
nclusters= 5

In [11]:
clusters = cluster_sentences(clean_train_reviews, nclusters)

In [12]:
 for cluster in range(nclusters):
        print("cluster ", cluster, ":")
        for i, sentence in enumerate(clusters[cluster]):
            print("\tsentence ", i, ": ", clean_train_reviews[sentence])

cluster  0 :
	sentence  0 :  kraken greetings have hours remaining join 1500 people around world have invested combined million kraken exceeding initial goal days minimum investment
	sentence  1 :  kraken greetings pleased announce that kraken will listing cosmos atom atom trading starts 22nd april funding trading start time cosmos plans enable atom transfers block
	sentence  2 :  kraken greetings when first listed bitcoin warned clients blog post that meet usual listing requirements among many flags noted threatening openly hostile
	sentence  3 :  kraken greetings pleased announce that kraken will list digital assets cardano ada quantum qtum trading starts friday september details blog
	sentence  4 :  adresse introuvable message parvenu adresse introuvable peut recevoir messages reponse serveur distant
	sentence  5 :  adresse introuvable message parvenu hugo link outlook fr adresse introuvable peut recevoir messages reponse serveur distant 5 5 0 requested
	sentence  6 :  adresse intro

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 :  dear ranaivoharison manitra keep learning join readers more than countries most important global public health news free easy-to-scan enewsletter save time latest news your fingertips look smart share important articles with your boss friends connect meet players global public health subscribe here free global public health newsletter global health essential weekday reading tens thousands subscribers features up-to-the-minute news well exclusive commentaries news articles q amp give risk click unsubscribes already subscriber help spread news please share this email with friend best brian simpson dayna kerecman myers editors global health johns hopkins bloomberg school public health johns hopkins university recevez e-mail inscrit e promotional emails johns hopkins university coursera veuillez repondre directement e-mail questions commentaires rendez-vous notresite assistance facebook twitter blog blog copyright 2018 coursera evelyn avenue mountain view 94041 etats-unis plus recevoir

In [None]:
gaussian_mixture_model(train_data_features)

In [None]:
k_means_model(train_data_features)