In [23]:
import pandas as pd
import glob
import unidecode
import re

import nltk
from nltk.corpus import stopwords
import os
import spacy
from happytransformer import HappyTextToText
from happytransformer import TTSettings
from langdetect import detect

In [3]:
def fusion_csv(liste_csv):
    """Merge several CSV files into a single DataFrame.

    Parameters
    ----------
    liste_csv : sequence of str
        Paths of the CSV files to read, in the order their rows should appear.

    Returns
    -------
    pandas.DataFrame
        Rows of every file stacked on top of each other with a fresh
        0..n-1 index. An empty DataFrame is returned when ``liste_csv``
        is empty.
    """
    # Read everything first and concatenate once: concatenating inside the
    # loop copies the accumulated frame again for every additional file.
    frames = [pd.read_csv(chemin) for chemin in liste_csv]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [26]:
# glob returns paths in a file-system-dependent order; sorting keeps the row
# order of the merged frame (and therefore dataset['Corps'][0]) reproducible.
liste_csv = sorted(glob.glob(os.path.join('data', '*.CSV')))

dataset = fusion_csv(liste_csv)

dataset

"\n17 octobre 2022\n\n \t\n\n \t\n\n \t \tÀ bientôt, Oscar\t\n\nVoici la facture de votre course annulée.\t\n\n\n \t\n\n \t\n\nTotal\t15,00 €\t\n\n\tPour compenser le désagrément causé aux chauffeurs, des frais vous sont facturés si vous annulez une course 2 minutes après qu'un chauffeur l'a acceptée. Si vous devez annuler une course, faites-le avant expiration du délai afin d'éviter de payer des frais.\t\n\n \t\n\nFrais d'annulation\t15,00 €\t\n\n \t\n\nSous-total\t15,00 €\t\n\n \t\n\nPaiements\t \t\n\n\nRevolut Oscar ••••9968\t\n\n17/10/2022 4:23\t\n\n15,00 €\t\n\n\n\n\n\n\nNotre protocole de sécurité a reçu la certification AENOR.\t\n\n \t\n\nUberX\n\nCourse annulée |\t\n\n\nCommande acceptée\t\n\n\nCommande annulée\t\n\n\n\n\n\n\n \t\n\n\n \t\n\n\n \t\n\n\n\n\nUber B.V.\n\nMr. Treublaan 7\n\n1097 DP Amsterdam\t\n\n\t\n\n \t\n\nSi vous pensez qu'il s'agit d'une erreur, rendez-vous sur help.uber.com ou appuyez sur AIDE dans le menu de l'application Uber. Nous étudierons le problème d

In [24]:
# Translation model shared by text_translation. The checkpoint name
# ("opus-mt-fr-en") indicates a French -> English MarianMT model.
happy_tt = HappyTextToText("MARIAN", "Helsinki-NLP/opus-mt-fr-en")
# Generation settings passed to every generate_text call in text_translation.
# NOTE(review): only min_length is set; the sample output recorded in this
# notebook stops mid-sentence, so an output length cap probably applies —
# confirm against the TTSettings defaults.
args = TTSettings(min_length=2)

def text_translation(text):
    """Print the English translation of ``text`` when it is detected as French.

    The language is detected with ``detect``. French text is translated with
    the module-level ``happy_tt`` model using the module-level ``args``
    settings, and the result object is printed. For every other detected
    language an empty line is printed instead.

    Nothing is returned; the function only writes to standard output.
    """
    detected_language = detect(text)

    if detected_language != "fr":
        # English needs no translation; any other language is meant to be
        # classified as "autre" (not implemented yet). Both cases currently
        # just emit an empty line.
        print("")
        return

    print(happy_tt.generate_text(text, args=args))

In [25]:
# Quick manual check of the translation on the first email body.
# NOTE(review): remove_https is defined in a later cell of this notebook, so a
# fresh top-to-bottom run fails here with NameError — the definition should be
# moved above this cell.
text_translation(remove_https(dataset['Corps'][0])) # just to try

11/17/2022 13:27:36 - INFO - happytransformer.happy_transformer -   Using model: cpu


TextToTextResult(text='17 October 2022 See you soon, Oscar Here is the invoice for your cancelled race. Total €15.00 To compensate drivers for the inconvenience, a fee is charged if you cancel a race 2 minutes after a driver has accepted it. If you')


In [9]:
def text_cleaning(text):
    """Normalise a piece of text and split it into word tokens.

    Steps, in order:
      1. coerce the input to ``str`` and transliterate it to ASCII with
         ``unidecode``;
      2. delete the punctuation characters ``. , / " ' : ! ; \\``;
      3. delete every run of digits;
      4. turn hyphens into spaces (so "allez-vous" becomes two words);
      5. split on any run of whitespace.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str`` first.

    Returns
    -------
    list of str
        The tokens, without empty strings. Text that is empty or contains
        only whitespace yields an empty list.
    """
    text = unidecode.unidecode(str(text))
    text = re.sub(r"""[.,/"':!;\\]""", '', text)
    text = re.sub(r"""[0-9]+""", '', text)  # remove numbers
    text = text.replace('-', ' ')  # only "-", for words such as "allez-vous"
    # str.split() with no argument collapses whitespace runs and ignores
    # leading/trailing whitespace, so no empty tokens are produced.
    return text.split()

In [10]:
def clean_address(text, suffix_len=4):
    """Split an email address into space-separated parts.

    Every "@" becomes a separator, and so does every "." found in the last
    ``suffix_len`` characters (the extension such as ".com" or ".fr"). Dots
    further to the left, e.g. in "first.last", are kept.

    Parameters
    ----------
    text : object
        The address; it is converted with ``str`` first, like
        ``text_cleaning`` does.
    suffix_len : int, optional
        Number of trailing characters in which dots are treated as
        separators. Defaults to 4, which covers extensions of up to three
        letters; pass a larger value for longer ones (e.g. 5 for ".info").

    Returns
    -------
    list of str
        The parts obtained by splitting on single spaces.
    """
    text = str(text).replace('@', ' ')
    # Clamp at 0: with a negative start the two slices would no longer be
    # "head" and "tail" and short inputs would keep their dots.
    cut = max(len(text) - suffix_len, 0)
    text = text[:cut] + text[cut:].replace('.', ' ')
    return text.split(' ')

In [11]:
def stop_words_french(text):
    """Drop French stop words and one-character tokens from a token list.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are longer than one character and are not in NLTK's
        French stop-word list, in their original order. The comparison is
        exact, so a capitalised token is not matched by a lower-case entry.
    """
    # Fetch the list once and use a set: looking it up inside the
    # comprehension re-requested the whole list for every token.
    stop_words = frozenset(stopwords.words('french'))
    return [word for word in text if word not in stop_words and len(word) > 1]

def stop_words_english(text):
    """Drop English stop words and one-character tokens from a token list.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are longer than one character and are not in NLTK's
        English stop-word list, in their original order. The comparison is
        exact, so a capitalised token is not matched by a lower-case entry.
    """
    # Fetch the list once and use a set: looking it up inside the
    # comprehension re-requested the whole list for every token.
    stop_words = frozenset(stopwords.words('english'))
    return [word for word in text if word not in stop_words and len(word) > 1]

In [12]:
def lemmatization(nlp, texte):
    """Return the lower-cased lemma of every word in ``texte``.

    Parameters
    ----------
    nlp : callable
        Pipeline called on each word; it must return an iterable of tokens
        exposing a ``lemma_`` attribute (e.g. a loaded spaCy model).
    texte : list of str
        Words to lemmatize. The list is not modified.

    Returns
    -------
    list of str
        A new list with the same length as ``texte``. When the pipeline
        splits a word into several tokens, the lemma of the last token is
        used; when it produces no token, the word is kept unchanged.
    """
    lemmes = []
    for mot in texte:
        lemme = mot
        # Keep the lemma of the last token the pipeline yields for this word.
        for token in nlp(mot):
            lemme = token.lemma_.lower()
        lemmes.append(lemme)
    return lemmes

In [17]:
def remove_https(corps):
    """Remove every line that contains "http" from a text.

    Parameters
    ----------
    corps : str
        Text to filter (an email body).

    Returns
    -------
    str
        ``corps`` without the lines containing the substring "http". The
        whole line is dropped, not just the URL. Line endings "\\r\\n" and
        "\\r" are returned as "\\n".

    Raises
    ------
    TypeError
        If ``corps`` is not a string.
    """
    import io

    # Filter in memory instead of round-tripping through scratch files, which
    # left email content on disk and leaked handles on error.
    # newline=None gives the same universal-newline line splitting that
    # reading a text file back would.
    with io.StringIO(corps, newline=None) as lignes:
        return ''.join(ligne for ligne in lignes if "http" not in ligne)

In [None]:
def columns_treatment(df):
    """Keep the first four columns of ``df`` and give them short names.

    The columns are selected by position (0 to 3). Among them, those named
    'Objet', 'Corps', 'De: (nom)' and 'De: (adresse)' are renamed to 'objet',
    'corps', 'nom' and 'adresse'; any other name is left as it is.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    nouveaux_noms = {
        'Objet': 'objet',
        'Corps': 'corps',
        'De: (nom)': 'nom',
        'De: (adresse)': 'adresse',
    }
    quatre_premieres = df.iloc[:, list(range(4))]
    return quatre_premieres.rename(columns=nouveaux_noms)

In [None]:
def data_cleaning(df, nlp=None):
    """Clean the subject, body and sender address of every email in ``df``.

    The frame is first reduced and renamed by ``columns_treatment``. Then,
    for each row:
      * 'objet' is tokenised with ``text_cleaning``, filtered with
        ``stop_words_english`` and lemmatized;
      * 'corps' is converted to ``str``, stripped of its "http" lines with
        ``remove_https``, then tokenised, filtered and lemmatized the same way;
      * 'adresse' is split with ``clean_address``.

    Only the English pipeline is applied; French stop words and French
    lemmatization are not handled here.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw emails, in the layout expected by ``columns_treatment``.
    nlp : callable, optional
        Pipeline handed to ``lemmatization``. When omitted, the spaCy model
        'en_core_web_md' is loaded; pass an already loaded pipeline to avoid
        reloading it on every call.

    Returns
    -------
    pandas.DataFrame
        A new frame whose 'objet', 'corps' and 'adresse' cells hold lists of
        strings. The input frame is not modified.
    """
    df = columns_treatment(df)

    if nlp is None:
        nlp = spacy.load('en_core_web_md')

    objets, corps_nettoyes, adresses = [], [], []
    for objet, corps, adresse in zip(df['objet'], df['corps'], df['adresse']):
        objet = stop_words_english(text_cleaning(objet))
        objets.append(lemmatization(nlp, objet))

        corps = text_cleaning(remove_https(str(corps)))
        corps = stop_words_english(corps)
        corps_nettoyes.append(lemmatization(nlp, corps))

        adresses.append(clean_address(adresse))

    # Replace whole columns at once: writing cell by cell with
    # df[col][i] = value is chained assignment, which raises
    # SettingWithCopyWarning and does nothing under pandas Copy-on-Write.
    # dtype=object keeps each list as a single cell value.
    return df.assign(
        objet=pd.Series(objets, index=df.index, dtype=object),
        corps=pd.Series(corps_nettoyes, index=df.index, dtype=object),
        adresse=pd.Series(adresses, index=df.index, dtype=object),
    )

In [None]:
# Run the cleaning pipeline on the merged emails.
# NOTE(review): this rebinds `dataset` to the cleaned frame, so re-running this
# cell alone feeds already-cleaned data back into data_cleaning — re-run the
# loading cell first.
dataset = data_cleaning(dataset)


In [None]:
# Display the cleaned dataset.
dataset

In [None]:
# Preview the first rows of the cleaned dataset.
dataset.head()