In [26]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords  # Import the stop word list
import re
import nltk
import collections
from sklearn.metrics import silhouette_samples, silhouette_score
from metrics import *
import random

In [2]:
# Output file that show_details_cluster() writes the per-cluster mail listing to.
# NOTE(review): opened in "w" mode (any previous content is truncated) and never
# closed in the visible cells -- confirm it is closed elsewhere.
FILE = open("./resultat_clustering.txt", "w")

In [3]:
# Extra stop words added to the NLTK French/English lists, both when cleaning
# mail bodies (body_to_words / body_to_words_lda) and when building the TF-IDF
# vectorizer (build_tfidf_matrix_vector).
# The list mixes full words and truncated stem-like forms; several entries are
# repeated (e.g. 'px', 'font', 'pad', 'decor', 'merc'), which is harmless once
# the list is turned into a set.
# NOTE(review): 'body,' carries a trailing comma inside the string, and entries
# containing a space ('size line', 'border border', 'manitra ranaivoharison')
# can never equal a single whitespace-split word in body_to_words -- confirm
# they are intended for the n-gram vectorizer only.
TOKENIZED_WORDS = ['import', 'px', 'width', 'class', 'pad', 'none', 'td', 'height', 'tabl', 'font', 'px', 'font',
                   'pad', 'margin', 'serif', 'helvetica', 'text', 'color', 'san', 'arial', 'read', 'your', 'thi',
                   'with', 'data', 'learn', 'from', 'email', 'that', 'have', 'bonjour', 'plu', 'compt', 'cordial',
                   'pari', 'mail', 'bien', 'tout', 'screen', 'content', 'don', 'imag', 'med', 'util', 'page',
                   'aur', 'aurion', 'auron', 'avi', 'avon', 'ayon', 'dan', 'e', 'euss', 'eussion', 'eûm',
                   'fuss', 'fussion', 'fûm', 'mêm', 'notr', 'ser', 'serion', 'seron', 'soi', 'somm', 'soyon',
                   'votr', 'éti', 'étion', 'ête', 'dat', 'hav', 'helvetic', 'non', 'decor', 'lin', 'underlin', 'bord',
                   'left', 'top', 'auto', 'display', 'padding', 'bottom', 'this', 'learning', 'about', 'mor', 'will',
                   'siz', 'decor', 'max', 'right', 'a', 'externalclass', 'img', 'block', 'align', 'body,', 'remerc',
                   'bon', 'merc', 'envoy', 'souhait', 'messag', 'fair', 'unsubscrib', 'mak', 'sent', 'help',
                   'vis', 'part', 'amp', 'com', 'cod', 'plus', 'messag', 'cet', 'servic', 'tous',
                   'inform', 'merc', 'pass', 'adress', 'souh', 'body', 'background', 'hov', 'span', 'styl', 'solid',
                   'family', 'cliqu', 'lien', 'fr', 'consult', 'pouv', 'lign', 'jour', 'utilis', 'demand', 'veuill',
                   'appliqu', 'don', 'chang', 'social', 'manitr', 'imag', 'min', 'only', 'equip', 'only', 'medi',
                   'plac', 'question', 'recevoir', 'relat', 'repondr', 'repons', 'reserv', 'googl', 'view', 'tim',
                   'aide', 'autre', 'bientot', 'cedex', 'cent', 'detail', 'direct', 'droit', 'espace', 'ete',
                   'etre', 'foot', 'hide', 'inlin', 'link', 'mobil', 'outlook', 'site', 'sous', 'suivie',
                   'trouve', 'weight', 'aid', 'autr', 'espac', 'hid', 'pag', 'repon', 'sit', 'style', 'time', 'use',
                   'veuillez', 'visit', 'merci', 'more', 'outlin', 'place', 'pouvez', 'rel', 'securit', 'size',
                   'size line', 'important', 'mcntextcontent',
                   'suiv', 'trouv', 'what', 'you', 'non', ',', 'bodi', 'capit', 'etr', 'famili', 'helvet',
                   'onli', 'sou', 'suivi', 'tou', 'utili', 'vi', 'applic', 'bonn', 'border', 'border border',
                   'center', 'cett', 'cliquez', 'code', 'commun', 'concern', 'consultez', 'date',
                   'float', 'footer', 'hover', 'line', 'make', 'media', 'souhaitez', 'ranaivoharison', 'manitra',
                   'manitra ranaivoharison', 'francisco']

In [4]:
def body_to_words(raw_body):
    """Clean one raw mail body into a space-separated string of kept words.

    Processing steps, in order:
      1. remove HTML tags;
      2. replace every character that is not an ASCII letter by a space
         (digits, punctuation and accented letters all act as separators);
      3. keep only words of 4 to 22 characters and lowercase them;
      4. drop NLTK French/English stop words and the ``TOKENIZED_WORDS`` entries.

    Args:
        raw_body: mail body as a string, possibly containing HTML markup.

    Returns:
        The surviving words joined by single spaces ("" when nothing is left).
    """
    # Tags must be stripped *before* the letters-only pass: that pass turns
    # "<" and ">" into spaces, so the tag pattern could never match afterwards
    # and tag names/attributes leaked into the vocabulary. A space (not "") is
    # substituted so that words on either side of a tag do not get glued.
    without_tags = re.sub('<[^<]+?>', ' ', raw_body)
    letters_only = re.sub("[^a-zA-Z]", " ", without_tags)
    words = [w.lower() for w in letters_only.split() if 3 < len(w) < 23]
    stop_words = set(stopwords.words('french') + stopwords.words('english') + TOKENIZED_WORDS)
    return " ".join(w for w in words if w not in stop_words)

In [5]:
def word_tokenizer(text):
    """Tokenize ``text`` (French tokenizer) and Porter-stem the non-stop-word tokens.

    Args:
        text: the string to tokenize.

    Returns:
        List of stemmed tokens, in order of appearance, excluding tokens found
        in the NLTK French or English stop-word lists.
    """
    # Build the stop-word set once. The original concatenated both NLTK lists
    # again for every single token inside the comprehension.
    stop_words = set(stopwords.words('french') + stopwords.words('english'))
    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in word_tokenize(text, language='french') if t not in stop_words]

In [6]:
def tokenize_and_stem(text):
    """Split ``text`` into French sentences, then words, and return Snowball stems.

    Tokens that contain no ASCII letter (numbers, bare punctuation, ...) are
    skipped; every remaining token is stemmed with the French Snowball stemmer.

    Args:
        text: the string to tokenize.

    Returns:
        List of stems, in order of appearance.
    """
    sentences = nltk.sent_tokenize(text, language='french')
    raw_tokens = []
    for sentence in sentences:
        raw_tokens.extend(nltk.word_tokenize(sentence, language='french'))

    snowball = SnowballStemmer(language='french')
    # Only tokens with at least one ASCII letter are worth stemming.
    return [snowball.stem(token) for token in raw_tokens if re.search('[a-zA-Z]', token)]

In [7]:
def tokenize_only(text):
    """Split ``text`` into lowercased word tokens without stemming.

    The text is tokenized by sentence first and then by word (French models),
    so punctuation ends up as separate tokens; tokens containing no ASCII
    letter are then discarded.

    Args:
        text: the string to tokenize.

    Returns:
        List of lowercased tokens, in order of appearance.
    """
    kept_tokens = []
    for sentence in nltk.sent_tokenize(text, language='french'):
        for word in nltk.word_tokenize(sentence, language='french'):
            lowered = word.lower()
            # Skip numeric tokens and raw punctuation.
            if re.search('[a-zA-Z]', lowered):
                kept_tokens.append(lowered)
    return kept_tokens

In [8]:
def pre_processing_dataset(dataset_train):
    """Clean every mail body of ``dataset_train`` with ``body_to_words``.

    Args:
        dataset_train: table-like object exposing a "body" and an "idMail"
            column (a pandas DataFrame in this notebook).

    Returns:
        A list with one dict per row, in row order:
        ``{'body': <cleaned body string>, 'idMail': <value of the idMail column>}``.
    """
    bodies = dataset_train["body"]
    mail_ids = dataset_train["idMail"]
    # Number of mails = length of the "body" column.
    num_reviews = bodies.size

    clean_train_reviews = []
    for i in range(num_reviews):
        # Progress message every 1000 mails.
        if (i + 1) % 1000 == 0:
            print("body %d of %d\n" % (i + 1, num_reviews))
        # .iloc is positional. The original `column[i]` was a *label* lookup,
        # which fails (or picks the wrong row) as soon as the frame's index is
        # not the default 0..n-1 range, e.g. after filtering or shuffling.
        clean_train_reviews.append({'body': body_to_words(str(bodies.iloc[i])),
                                    'idMail': mail_ids.iloc[i]})

    print("Creating the bag of words...\n")
    return clean_train_reviews

In [9]:
def predict_clustering_group(k_means_model, tfidf_matrix):
    """Assign each row of ``tfidf_matrix`` to a cluster and score the clustering.

    Args:
        k_means_model: a KMeans-like estimator, fitted or not.
        tfidf_matrix: feature matrix with one row per document.

    Returns:
        Tuple ``(cluster_labels, silhouette_avg, sample_silhouette_values)``:
        the label of each row, the mean silhouette score, and the per-row
        silhouette values.
    """
    # An already fitted model must NOT be fitted again: the original called
    # fit_predict() unconditionally, and a KMeans built without a fixed
    # random_state may converge to different centroids/labels on a second
    # fit. That would silently invalidate any cluster mapping computed from
    # the first fit (as build_cluster_from_model does).
    if hasattr(k_means_model, "cluster_centers_"):
        cluster_labels = k_means_model.predict(tfidf_matrix)
    else:
        cluster_labels = k_means_model.fit_predict(tfidf_matrix)

    silhouette_avg = silhouette_score(tfidf_matrix, cluster_labels)
    sample_silhouette_values = silhouette_samples(tfidf_matrix, cluster_labels)
    return cluster_labels, silhouette_avg, sample_silhouette_values

In [10]:
def show_details_cluster(vocab_frame, k_means_model, tfidf_matrix, tfidf_vectorizer, clusters, clean_train_reviews, n_clusters):
    """Print cluster diagnostics, label every mail and dump the result to ``FILE``.

    For each cluster the top ``n_clusters`` centroid terms are looked up in
    ``vocab_frame`` to build a human-readable label. Every entry of
    ``clean_train_reviews`` referenced by ``clusters`` is updated in place
    with a 'cluster_group' and a 'label' key, and written to the module-level
    ``FILE`` handle.

    Args:
        vocab_frame: DataFrame indexed by stem, whose first column holds the
            corresponding un-stemmed word.
        k_means_model: fitted KMeans model.
        tfidf_matrix: matrix the model was fitted on.
        tfidf_vectorizer: fitted vectorizer providing the feature names.
        clusters: mapping ``cluster id -> list of row positions``.
        clean_train_reviews: list of per-mail dicts (mutated in place).
        n_clusters: number of clusters to report; also the number of terms
            used per label.

    Returns:
        Tuple ``(order_centroids, centers)``: feature indices sorted by
        decreasing centroid weight for each cluster, and the raw centroids.
    """
    print("___________________________")
    order_centroids = k_means_model.cluster_centers_.argsort()[:, ::-1]
    # get_feature_names() was removed from recent scikit-learn releases; use
    # its replacement when available and fall back for old installations.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        terms = list(tfidf_vectorizer.get_feature_names_out())
    else:
        terms = tfidf_vectorizer.get_feature_names()
    dist = 1 - cosine_similarity(tfidf_matrix)
    print(dist)

    print('ORDER_CENTROIDS')
    label = []
    for cluster in range(n_clusters):
        print("Cluster %d:" % cluster)
        cluster_label = []
        print("TERMS", terms)
        for ind in order_centroids[cluster, :n_clusters]:
            print("%s" % terms[ind])
            # A term may be an n-gram: look up each of its stems and keep the
            # word of the first matching row. `.loc` replaces the removed
            # `.ix` indexer (label-based lookup in both cases).
            label_name = vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0]
            # append() keeps the terms in decreasing centroid-weight order;
            # the original insert(cluster, ...) shuffled them depending on
            # the cluster number.
            cluster_label.append(label_name)
        label.append(cluster_label)

    print(label)

    for cluster in range(n_clusters):
        FILE.write("cluster " + str(cluster) + ":" + "\n")
        FILE.write("centroid" + str(cluster) + "\n")

        for i, sentence in enumerate(clusters[cluster]):
            clean_train_reviews[sentence]['cluster_group'] = str(cluster)
            clean_train_reviews[sentence]['label'] = str(label[cluster])
            FILE.write("mail :" + str(i) + ": " + str(clean_train_reviews[sentence]) + "\n")
    # Push the buffered lines to disk: the handle stays open for the whole session.
    FILE.flush()

    print("___________________________ \n")
    centers = k_means_model.cluster_centers_

    # The original passed the value as a second print() argument, so the
    # "%s" placeholder was printed literally instead of being substituted.
    print('SCORE %s \n' % (k_means_model.score(tfidf_matrix),))
    print('INERTIA %s \n' % (k_means_model.inertia_,))
    print("CENTRE : %s" % (centers,))
    print("TERMS", terms)
    return order_centroids, centers

In [11]:
def build_cluster_from_model(n_clusters, tfidf_matrix, random_state=None):
    """Fit a k-means++ KMeans model and group row positions by cluster.

    Args:
        n_clusters: number of clusters to form.
        tfidf_matrix: feature matrix with one row per document.
        random_state: optional seed forwarded to KMeans. With a single
            initialisation (``n_init=1``) the result otherwise changes from
            run to run; pass an int to make the clustering reproducible.
            The default (None) keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(clusters, k_means_model)`` where ``clusters`` is a plain dict
        mapping each cluster label to the list of row positions assigned to
        it, and ``k_means_model`` is the fitted estimator.
    """
    k_means_model = KMeans(n_clusters=n_clusters,
                           init='k-means++',
                           max_iter=300,
                           n_init=1,
                           random_state=random_state)

    k_means_model.fit(tfidf_matrix)

    clusters = collections.defaultdict(list)
    for i, label in enumerate(k_means_model.labels_):
        clusters[label].append(i)

    # Return a regular dict so that a missing cluster raises instead of being
    # silently created on lookup.
    return dict(clusters), k_means_model

In [12]:
def build_tfidf_matrix_vector(dataset):
    """Vectorize the 'body' of every entry of ``dataset`` with TF-IDF.

    Tokens come from ``tokenize_and_stem``; 1- to 3-grams are kept when they
    appear in at least 15% and at most 80% of the documents. Stop words are
    the NLTK French/English lists plus ``TOKENIZED_WORDS``.

    Args:
        dataset: integer-indexable sequence of dicts holding a 'body' string.

    Returns:
        Tuple ``(tfidf_matrix, tfidf_vectorizer)``: the fitted document-term
        matrix and the vectorizer that produced it.
    """
    train_body = [dataset[position]['body'] for position in range(len(dataset))]

    all_stop_words = stopwords.words('french') + TOKENIZED_WORDS + stopwords.words('english')
    vectorizer_options = {
        'tokenizer': tokenize_and_stem,
        'analyzer': 'word',
        'stop_words': all_stop_words,
        'max_df': 0.8,
        'min_df': 0.15,
        'lowercase': False,
        'use_idf': True,
        'max_features': 200000,
        'ngram_range': (1, 3),
    }
    tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)

    tfidf_matrix = tfidf_vectorizer.fit_transform(train_body)
    print(tfidf_matrix.shape)
    return tfidf_matrix, tfidf_vectorizer

In [13]:
import pandas as pd
import nltk

In [14]:
# Fetch the NLTK data packages "punkt" and "stopwords" (no-op when already
# up to date, as the output below shows).
# NOTE(review): get_lemma / get_lemma2 rely on WordNet, but no download of a
# wordnet package is visible in this notebook -- confirm it is installed.
nltk.download('punkt')
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return ``wn.morphy(word)``, or ``word`` unchanged when morphy yields None."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [198]:
def body_to_words_lda(raw_body):
    """Clean one raw mail body into a token list wrapped in a one-element list.

    Processing steps, in order:
      1. remove HTML tags;
      2. replace every character that is not an ASCII letter by a space
         (digits, punctuation and accented letters all act as separators);
      3. keep only words of 4 to 22 characters and lowercase them;
      4. drop NLTK French/English stop words and the ``TOKENIZED_WORDS`` entries.

    Args:
        raw_body: mail body as a string, possibly containing HTML markup.

    Returns:
        ``[tokens]`` -- a list containing a single list of the surviving
        words, in order of appearance.
    """
    # Tags must be stripped *before* the letters-only pass: that pass turns
    # "<" and ">" into spaces, so the tag pattern could never match afterwards
    # and tag names/attributes leaked into the vocabulary. A space (not "") is
    # substituted so that words on either side of a tag do not get glued.
    without_tags = re.sub('<[^<]+?>', ' ', raw_body)
    letters_only = re.sub("[^a-zA-Z]", " ", without_tags)
    words = [w.lower() for w in letters_only.split() if 3 < len(w) < 23]
    stop_words = set(stopwords.words('french') + stopwords.words('english') + TOKENIZED_WORDS)
    meaningful_words = [w for w in words if w not in stop_words]
    return [meaningful_words]

In [199]:
def pre_processing_dataset_lda(dataset_train):
    """Clean every mail body of ``dataset_train`` with ``body_to_words_lda``.

    Args:
        dataset_train: table-like object exposing a "body" and an "idMail"
            column (a pandas DataFrame in this notebook).

    Returns:
        A list with one dict per row, in row order:
        ``{'body': <result of body_to_words_lda>, 'idMail': <value of the idMail column>}``.
    """
    bodies = dataset_train["body"]
    mail_ids = dataset_train["idMail"]
    # Number of mails = length of the "body" column.
    num_reviews = bodies.size

    clean_train_reviews = []
    for i in range(num_reviews):
        # Progress message every 1000 mails.
        if (i + 1) % 1000 == 0:
            print("body %d of %d\n" % (i + 1, num_reviews))
        # .iloc is positional. The original `column[i]` was a *label* lookup,
        # which fails (or picks the wrong row) as soon as the frame's index is
        # not the default 0..n-1 range, e.g. after filtering or shuffling.
        clean_train_reviews.append({'body': body_to_words_lda(str(bodies.iloc[i])),
                                    'idMail': mail_ids.iloc[i]})

    print("Creating the bag of words...\n")
    return clean_train_reviews

In [200]:
# Directory holding the input CSV, relative to the notebook's working directory.
PATH ='./dataset/'

In [201]:
# File name of the mail export to load; it is appended to PATH when read.
data = 'mada.apps.creation@gmail.com1563099050.csv'

In [202]:
# Load the mails as a DataFrame; the preprocessing functions read its
# "body" and "idMail" columns.
train = pd.read_csv(PATH + data, encoding='utf-8')

In [203]:
# Token-list version of the cleaned mails: one dict per mail whose 'body' is
# the nested list returned by body_to_words_lda.
clean_train_reviews_lda = pre_processing_dataset_lda(train)

Creating the bag of words...



In [205]:
# Show the tokens of the first mail ('body' is a one-element list wrapping the token list).
clean_train_reviews_lda[0]['body'][0]

['souhaitons',
 'mettre',
 'plateforme',
 'mobile',
 'desktop',
 'client',
 'interne',
 'souhaite',
 'mettre',
 'contact',
 'parler',
 'besoins',
 'projet',
 'mamisoa']

In [218]:
# `clean_train_reviews` is not assigned in any visible cell (only the *_lda
# variant is), so the cell failed on a fresh kernel. Build the string version
# here: the tokenizers and the TF-IDF vectorizer used below expect each body
# to be a plain string, which is what pre_processing_dataset produces.
clean_train_reviews = pre_processing_dataset(train)
body = [d['body'] for d in clean_train_reviews]

In [220]:
import pickle

# Persist the cleaned bodies; the context manager guarantees the file is
# flushed and closed (the original left the handle open).
with open('body.pkl', 'wb') as body_file:
    pickle.dump(body, body_file)

# NOTE(review): the original cell also called
#     dictionary.save('dictionary.gensim')
# but no `dictionary` object is created in any visible cell, so it raised
# NameError. Restore the call once the dictionary is actually built.

In [None]:
# Build two flat, parallel vocabularies over all mail bodies: the stems and
# the matching lowercased tokens. extend() keeps each one a single flat list.
totalvocab_stemmed = []
totalvocab_tokenized = []
for mail_body in body:
    totalvocab_stemmed.extend(tokenize_and_stem(mail_body))
    totalvocab_tokenized.extend(tokenize_only(mail_body))

In [None]:
# Total number of stems collected over all bodies.
print(len(totalvocab_stemmed))

In [None]:
# Total number of tokens collected; the DataFrame built from both lists in
# the next cell needs the two lengths to match.
print(len(totalvocab_tokenized))

In [None]:
# Lookup table from stem (index) to token ('words' column); it relies on the
# two lists being aligned element by element.
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [None]:
# Preview the first rows of the stem -> token table.
print(vocab_frame.head())

In [None]:
# Fit the TF-IDF vectorizer on the cleaned bodies and keep both the matrix
# and the fitted vectorizer.
tfidf_matrix, tfidf_vectorizer = build_tfidf_matrix_vector(clean_train_reviews)

In [None]:
# get_feature_names() was removed from recent scikit-learn releases; use its
# replacement when the vectorizer provides it, and fall back otherwise.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()

In [None]:
# List the features (stemmed n-grams) retained by the vectorizer.
print(terms)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between all mails (1 - cosine similarity).
# The two bare `print` lines of the original were Python 2 leftovers: in
# Python 3 they only evaluate the function object and print nothing.
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
# Within-cluster sum of squares for a range of candidate cluster counts.
# calculate_wcss is not defined in this notebook; it presumably comes from the
# star-import of the local `metrics` module, which also decides how many
# cluster counts are evaluated (19 according to the original note -- TODO confirm).
sum_of_squares = calculate_wcss(tfidf_matrix)

In [None]:
# Pick the number of clusters from the WCSS values.
# optimal_number_of_clusters is not defined in this notebook; presumably it is
# provided by the star-import of `metrics` -- TODO confirm its selection rule.
n_clusters = optimal_number_of_clusters(sum_of_squares)

In [None]:
# Display the selected number of clusters.
n_clusters

In [None]:
# Fit KMeans with the selected cluster count; `clusters` maps each cluster
# label to the positions of the mails assigned to it.
clusters, k_means_model = build_cluster_from_model(n_clusters, tfidf_matrix)

In [None]:
# Per-mail cluster labels plus the average and per-sample silhouette scores.
cluster_labels, silhouette_avg, sample_silhouette_values = predict_clustering_group(k_means_model, tfidf_matrix)

In [None]:
# Print the cluster diagnostics, tag every mail of clean_train_reviews with
# its cluster and label, and write the listing to the results file.
order_centroids, centers = show_details_cluster(vocab_frame,k_means_model, tfidf_matrix, tfidf_vectorizer, clusters,
                                                clean_train_reviews, n_clusters)

In [None]:
#keyValList = ['0']
#expectedResult = [d['body'] for d in clean_train_reviews if d['cluster_group'] in keyValList]