# Data preparation for NLP algorithms

### Import libraries 

In [None]:
import pandas as pd
import numpy as np
import glob
import unidecode
import re

import nltk
from nltk.corpus import stopwords
import os
import spacy
from happytransformer import HappyTextToText
from happytransformer import TTSettings
from langdetect import detect

### Recovering the emails from different email boxes

For this first step, we use Outlook to export all the emails from the different mailboxes to CSV files.

### CSVs fusion
This step merges the CSV files exported from the different Outlook mailboxes, so that we get a single dataset containing all the emails.

In [None]:
def fusion_csv(liste_csv):
    """Read every CSV file in ``liste_csv`` and stack them into one DataFrame.

    Args:
        liste_csv: Paths of the CSV files to merge, in the order their rows
            should appear in the result.

    Returns:
        A single DataFrame holding the rows of all files with a fresh
        ``0..n-1`` index. An empty DataFrame is returned when ``liste_csv``
        is empty.
    """
    frames = [pd.read_csv(path) for path in liste_csv]
    if not frames:
        # pd.concat refuses an empty list; give callers an empty frame instead.
        return pd.DataFrame()
    # One concat over all frames avoids re-copying the accumulated data on
    # every iteration, which the incremental version did.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Collect every Outlook export stored in the local "data" folder.
liste_csv = glob.glob(os.path.join('data', '*.CSV'))

# Merge all exports into one DataFrame.
dataset = fusion_csv(liste_csv)

# Shuffle the rows (sampling 100% without replacement), then rebuild a clean
# 0..n-1 index so the mailboxes are no longer grouped together.
dataset = dataset.sample(frac=1)
dataset = dataset.reset_index(drop=True)

### Columns treatment
The aim here is simply to select the columns we want to keep and to rename them. The three remaining columns are: object (subject), body and address.

In [None]:
def columns_treatment(df):
    """Keep three columns of the raw export and give them short names.

    The columns are picked by position (first, second and fourth), then the
    headers 'Objet', 'Corps' and 'De: (adresse)' are renamed to 'objet',
    'corps' and 'adresse'. Headers that do not match are left untouched.

    Args:
        df: Raw DataFrame with at least four columns.

    Returns:
        A new DataFrame with the three selected columns; ``df`` itself is
        not modified.
    """
    kept_positions = [0, 1, 3]
    new_names = {'Objet': 'objet', 'Corps': 'corps', 'De: (adresse)': 'adresse'}
    return df.iloc[:, kept_positions].rename(columns=new_names)

### Remove https
During our data exploration, we saw that the email bodies contained a lot of links. They would have disturbed our model, so we remove every line that contains a link.

In [None]:
def remove_https(corps):
    """Drop every line of ``corps`` that contains the substring "http".

    The filtering is done in memory; no scratch files are written to the
    working directory.

    Args:
        corps: E-mail body as a string.

    Returns:
        The body without the lines mentioning "http". Line endings are
        normalised to "\\n" ("\\r\\n" and "\\r" are converted), which is what
        the previous text-mode file round trip produced on POSIX systems.
    """
    normalized = corps.replace('\r\n', '\n').replace('\r', '\n')
    pieces = normalized.split('\n')
    # The last piece is whatever follows the final newline; it has no
    # terminator of its own and is empty when the text ends with a newline.
    tail = pieces.pop()
    kept = [piece + '\n' for piece in pieces if "http" not in piece]
    if tail and "http" not in tail:
        kept.append(tail)
    return ''.join(kept)

### Text cleaning
For this part, the goal is to delete all the parasitic characters from the body and the object of an email. We use `re.sub` to do this.

In [None]:
def text_cleaning(text):
    """Strip accents, punctuation, digits and markup-like spans from ``text``.

    Args:
        text: Value to clean; it is converted with ``str()`` first, so
            non-string cells are accepted.

    Returns:
        The cleaned string, with every run of whitespace collapsed to a
        single space.
    """
    text = str(text)
    text = unidecode.unidecode(text)
    text = re.sub(r"""[.,(/"'?:)!;\\]""", '', text)
    text = re.sub(r"[0-9]+", '', text)  # remove numbers
    # Hyphens and underscores become spaces so that compounds such as
    # "allez-vous" are split into separate words.
    text = re.sub(r"[-_]", ' ', text)
    # Remove <...> spans. DOTALL lets a span run across line breaks, which the
    # earlier version achieved by collapsing whitespace before this step.
    text = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)
    # Collapse whitespace last so the spaces introduced above do not pile up.
    text = re.sub(r'\s+', ' ', text)
    return text

### Text translation
During the first data-cleaning attempts, we noticed that dealing with the different languages in our dataset was time-consuming. Moreover, lemmatization was noticeably more effective on English emails than on French ones. So we decided to translate all our French emails into English, using a neural network model from Hugging Face named Helsinki-NLP/opus-mt-fr-en.

In [None]:
# Text-to-text model used by text_translation, created once at module level so
# that every call reuses the same instance. It loads the
# "Helsinki-NLP/opus-mt-fr-en" checkpoint with the "MARIAN" model type.
happy_tt = HappyTextToText("MARIAN", "Helsinki-NLP/opus-mt-fr-en")
# Generation settings passed to every generate_text call.
# NOTE(review): min_length=2 presumably sets a minimum output length - confirm
# against the happytransformer documentation.
args = TTSettings(min_length=2)

In [None]:
def text_translation(text):
    """Return ``text`` in English, translating it when it is detected as French.

    Text detected as any other language, including English, is returned
    unchanged, as is text whose language cannot be detected.

    Args:
        text: Cleaned e-mail subject or body.

    Returns:
        The translated text for French input, otherwise ``text`` itself.
    """
    try:
        lang = detect(text)
    except Exception:
        # Detection can fail (presumably on empty or letter-free text); such
        # text is passed through untouched.
        lang = "error"

    if lang != "fr":
        return text

    chunk_size = 50  # words sent to the model per call
    words = text.split()
    if len(words) <= chunk_size:
        return happy_tt.generate_text(text, args=args).text

    # Step through the words in fixed-size chunks so that the trailing words
    # are always translated; rounding the chunk count used to drop them.
    translated = [
        happy_tt.generate_text(' '.join(words[start:start + chunk_size]), args=args).text
        for start in range(0, len(words), chunk_size)
    ]
    return " ".join(translated)

### Removing stop words
Some words are not useful at all and are only noise in a text for NLP algorithms. These are linking words such as 'and', or other very common words such as 'to'.

In [None]:
def stop_words_english(text):
    """Split ``text`` into words and drop English stop words and 1-letter words.

    The comparison against the stop-word list is exact (case-sensitive).

    Args:
        text: String to tokenise on whitespace.

    Returns:
        List of the remaining words, in their original order.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list for every word.
    stop_words = set(stopwords.words('english'))
    return [
        word
        for word in text.split()
        if word not in stop_words and len(word) > 1
    ]

### Lemmatization
Lemmatization is the process of grouping together the inflected forms of a word so they can be analysed as a single item. For example, it reduces every conjugated verb to its base form.

In [None]:
def lemmatization(nlp, texte):
    """Replace each word of ``texte`` by its lower-cased lemma, in place.

    Args:
        nlp: Callable that turns a string into an iterable of tokens exposing
            a ``lemma_`` attribute (a loaded spaCy pipeline in this file).
        texte: List of words. It is modified in place.

    Returns:
        The same list object as ``texte``, with each entry replaced by the
        lemma of the last token produced for that word. Words for which
        ``nlp`` yields no token are left unchanged.
    """
    for position, mot in enumerate(texte):
        tokens = list(nlp(mot))
        if tokens:
            # When a word is split into several tokens, only the last lemma
            # is kept - the same result as overwriting the slot per token.
            texte[position] = tokens[-1].lemma_.lower()
    return texte

### Clean address
In our dataset, we decided to clean the address column, keeping a list of three elements, here is the pattern : [name before '@', name after '@', the domain which is at the end]

In [None]:
def clean_address(text):
    """Split an e-mail address into [local part, domain name, top-level domain].

    Every '@' is treated as a separator, then the last dot of the final piece
    separates the domain name from its top-level domain. The split works for
    a top-level domain of any length, not only those of up to three letters.

    Args:
        text: E-mail address as a string.

    Returns:
        List of strings, e.g. ``['john.doe', 'gmail', 'com']`` for
        ``'john.doe@gmail.com'``. A final piece without a dot is kept whole.
    """
    *front, domain = text.replace('@', ' ').split(' ')
    # rsplit with maxsplit=1 only cuts at the last dot, so dots inside the
    # local part or in sub-domains are preserved.
    return front + domain.rsplit('.', 1)

### Data cleaning function
This function calls all the functions we created above, in order to return a cleaned dataset at the end.

In [None]:
def data_cleaning(df, nb):
    """Run the whole cleaning pipeline on the raw e-mail DataFrame.

    For every row, the body has its link lines removed, then body and subject
    are cleaned, translated to English when French, stripped of stop words and
    lemmatized. The sender address is split into its components.

    Args:
        df: Raw DataFrame as produced by the CSV merge.
        nb: Starting value of the progress counter. It is printed for each
            row, and a checkpoint CSV is written whenever the incremented
            counter is a multiple of 500.

    Returns:
        A new DataFrame with columns 'objet', 'corps' (lists of lemmas) and
        'adresse' (list of address components).
    """
    # Work on an independent copy with object-typed columns: the per-cell
    # writes below then hit this frame directly (no chained assignment on a
    # slice) and each cell can hold a Python list.
    df = columns_treatment(df).copy()
    for column in ('objet', 'corps', 'adresse'):
        df[column] = df[column].astype(object)

    nlp_en = spacy.load('en_core_web_md')

    for i in df.index:
        corps = remove_https(str(df.at[i, 'corps']))
        corps = text_cleaning(corps)
        corps = text_translation(corps)

        objet = text_cleaning(df.at[i, 'objet'])
        objet = text_translation(objet)

        # Stop-word removal followed by lemmatization, for subject then body.
        objet = lemmatization(nlp_en, stop_words_english(objet))
        corps = lemmatization(nlp_en, stop_words_english(corps))

        df.at[i, 'objet'] = objet
        df.at[i, 'corps'] = corps
        df.at[i, 'adresse'] = clean_address(df.at[i, 'adresse'])

        print("dernier mail traité est le numéro : ", nb)
        nb = nb + 1
        if nb % 500 == 0:
            # Periodic checkpoint so a long run can be inspected or resumed.
            df.to_csv('df_'+str(nb)+'.CSV')
    return df

### Clean the dataset and print it

In [None]:
# Run the full cleaning pipeline on the merged dataset; the progress counter
# starts at 0.
dataset = data_cleaning(dataset, 0)

In [None]:
# Preview the first rows of the cleaned dataset.
dataset.head()