In [305]:
import pandas as pd
import unidecode
import re

import nltk
from nltk.corpus import stopwords

import spacy

In [306]:
gmail = pd.read_csv("data/gmail_oscar.CSV")

zimbra = pd.read_csv("data/zimbra_oscar.CSV")

In [307]:
def text_cleaning(text):
    text = unidecode.unidecode(text)
    text = text.lower()
    text = re.sub(r"""[.,"'-:!;]""", '', text)
    text = re.sub('\s+', ' ', text)
    text = text.split(' ')
    return text

In [308]:
def stop_words_french(text):
    stop_words = stopwords.words('french')
    text = [word for word in text if word not in stopwords.words('french') and len(word)>1]
    return text

def stop_words_english(text):
    stop_words = stopwords.words('english')
    text = [word for word in text if word not in stopwords.words('english') and len(word)>1]
    return text

In [309]:
def lemmatization(nlp, texte):
    i = 0
    # On regarde chaque mot dans le texte
    # Chaque mot a le numéro i
    for mot in texte:
        # on va lemmatizer
        doc = nlp(mot)
        for token in doc:
            texte[i] = token.lemma_
        i += 1
            
    return texte

In [310]:
def remove_https(corps):
    
    text_file = open(r'corps.txt', 'w',  encoding="utf-8")
    text_file.write(corps)
    text_file.close()

        
    final_text_file = open(r'final_corps.txt', 'w',  encoding="utf-8")
    reading_text_file = open(r'corps.txt', 'r',  encoding="utf-8")
    for line in reading_text_file:
        if "https" not in line:
            final_text_file.write(line)
            
    final_text_file.close()
    reading_text_file.close()

    with open('final_corps.txt', 'r', encoding="utf-8") as file:
        return file.read()

In [311]:
def columns_treatment(df):
    df = df.iloc[:,[0,1,2,3]]
    df = df.rename(columns={'Objet':'objet','Corps':'corps','De: (nom)':'nom', 'De: (adresse)':'adresse'})
    return df

In [312]:
def data_cleaning(df):
    
    df = columns_treatment(df)
    
    nlp_fr = spacy.load('fr_core_news_md')
    nlp_en = spacy.load('en_core_web_md')
    
    is_english = False

    for i in df.index:
        
        corps = remove_https(df['corps'][i])

        corps = text_cleaning(corps)
        
        objet = text_cleaning(df['objet'][i])

        #stop words cleaning for object
        objet = stop_words_english(objet)
        objet = stop_words_french(objet)
        
        # Lemmatization
        objet_fr = lemmatization(nlp_fr, objet)
        
        objet_en = lemmatization(nlp_en, objet)
        
        # on vérifie si l'objet a été lemmatizé en anglais ou non
        if objet_en != objet:
            is_english = True
        if is_english:
            objet = objet_en
            corps = stop_words_english(corps)
            corps = lemmatization(nlp_en, corps)
        else:
            objet = objet_fr
            corps = stop_words_french(corps)
            corps = lemmatization(nlp_fr, corps)

        df['objet'][i] = objet
        df['corps'][i] = corps

    return df

In [313]:
print(len(gmail),len(zimbra))

10 120


In [314]:
gmail = data_cleaning(gmail)

In [315]:
gmail.head()

Unnamed: 0,objet,corps,nom,adresse
0,"[course, lundi, matin, uber]","[octobre, bientot, oscar, voici, facture, cour...",Reçu Uber,noreply@uber.com
1,"[verifier, frais, dannulation]","[vouloir, repondre, audessus, ce, ligne, repon...",Uber Support,contact_b92ffc2b-70bb-4116-ab15-6363af4dc0ce@e...
2,"[course, lundi, matin, uber]","[octobre, oscar, merci, davoir, utilise, uber,...",Reçu Uber,noreply@uber.com
3,"[vente, flash, biere, blonde]","[decouvrer, vite, selection, jusqua, demain, s...",Saveur Bière By PerfectDraft,saveur-biere@m.perfectdraft.com
4,"[alerte, mail, appartement, piece, pari]","[annoncer, correspondre, recherche, location, ...",PAP.fr,users-alertes@pap.fr
