In [76]:
import pandas as pd
import os
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
from bs4 import BeautifulSoup
from sklearn.utils import shuffle

<h1>Préparation des données (Preprocessing)</h1>

<h3>Lecture des mails depuis des fichiers</h3>

In [21]:
# Root folder of the corpus; each sub-directory holds the e-mails of one category.
path="./dataset"
# os.listdir returns entries in arbitrary order and also returns plain files
# (e.g. .DS_Store), so keep only real directories and sort them to get the same
# category order on every machine.
directories=sorted(d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d)))
directories

['easy_ham', 'hard_ham', 'spam']

<h2></h2>

<h2>Suppression Des Balises HTML</h2>

In [22]:
def remove_html(email: str) -> str:
    """Strip HTML markup from ``email`` and return only its text content.

    The message is parsed with BeautifulSoup's ``html.parser`` backend and the
    text of the parsed document is returned.
    """
    return BeautifulSoup(email, 'html.parser').get_text()

<h2>Normalisation Des URLS</h2>

In [23]:
def url_normalization(email: str) -> str:
    """Replace every ``http://`` / ``https://`` URL in ``email`` with ``httpaddr``.

    Returns the text with each matched URL substituted by the literal token
    ``httpaddr``; text without URLs is returned unchanged.
    """
    # Scheme followed by one or more accepted characters or %XX escapes.
    url_matcher = re.compile(
        r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'
    )
    return url_matcher.sub('httpaddr', email)

<h2>Normalisation Des Adresses Email</h2>

In [24]:
def address_normalization(email, remplacement="emailaddr"):
    """Replace every e-mail address found in ``email`` with a placeholder token.

    Args:
        email: Text to normalise.
        remplacement: Token substituted for each detected address
            (defaults to ``"emailaddr"``).

    Returns:
        The text with each detected address replaced by ``remplacement``.
    """
    # local-part "@" domain "." top-level-domain (at least two letters).
    # The TLD class is letters only: inside a character class "|" is a literal
    # pipe, not an alternation, so it must not appear there.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    # Substitute every address with the placeholder token
    return re.sub(email_pattern, remplacement, email)

<h2>Normalisation Des Nombres</h2>

In [25]:
def number_normalization(email, Remplacement="number"):
    """Replace each run of consecutive digits in ``email`` with a placeholder.

    Args:
        email: Text to normalise.
        Remplacement: Token substituted for each digit run
            (defaults to ``"number"``).

    Returns:
        The text with every digit run replaced by ``Remplacement``.
    """
    # One or more digits count as a single number
    digit_run = re.compile(r'\d+')
    return digit_run.sub(Remplacement, email)

<h2>Normalisation Des Dollars</h2>

In [26]:
def dollars_normalization(email: str) -> str:
    """Replace each run of one or more ``$`` characters in ``email`` with ``dollar``."""
    dollar_run = re.compile('[$]+')
    return dollar_run.sub('dollar', email)

<h2>Radicalisation Des Mots</h2>

In [27]:
def word_stemming(email: str) -> str:
    """Reduce every word of ``email`` to its Porter stem.

    The text is tokenised with NLTK's ``word_tokenize``; each token is stemmed
    with a ``PorterStemmer`` and the stems are returned joined by single spaces,
    with surrounding whitespace stripped.
    """
    stemmer = PorterStemmer()
    stems = [stemmer.stem(token) for token in word_tokenize(email)]
    return " ".join(stems).strip()
    

<h2>Suppression Des Non-mots Et Ponctuation</h2>

In [28]:
def non_word_cleaning(email: str) -> str:
    """Remove punctuation and English stop words from ``email``.

    Every character of ``string.punctuation`` is deleted first, then the text is
    split on whitespace and the words found in NLTK's English stop-word list are
    dropped. The remaining words are returned joined by single spaces.
    """
    # Translation table that deletes every ASCII punctuation character
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = email.translate(punctuation_table)

    # Filter out stop words (set membership is checked on the exact token)
    english_stop_words = set(stopwords.words('english'))
    kept_words = (
        token for token in without_punctuation.split()
        if token not in english_stop_words
    )
    return ' '.join(kept_words)

In [29]:
def space_cleaning(email : str)->str:
    """Collapse all whitespace in ``email`` into single spaces.

    Newlines, tabs and repeated spaces are all treated as separators, blank
    lines disappear, and the result has no leading or trailing whitespace.

    Args:
        email: Raw text, possibly spanning several lines.

    Returns:
        The words of ``email`` joined by exactly one space (``""`` when the
        input holds only whitespace).
    """
    # str.split() without arguments splits on any whitespace run (newlines
    # included) and never yields empty strings, so a single split/join gives the
    # same result as cleaning line by line, in linear time.
    return ' '.join(email.split())

<h2>Nettoyage D'emails</h2>

In [30]:
def clean_email(email):
    """Run the full preprocessing pipeline on one raw e-mail.

    The text is lower-cased and then passed, in order, through whitespace
    cleaning, HTML removal, URL / e-mail address / number / dollar
    normalisation, punctuation and stop-word removal, and finally stemming.

    Returns:
        The cleaned e-mail text.
    """
    # Order matters: each step consumes the output of the previous one
    pipeline = (
        space_cleaning,
        remove_html,
        url_normalization,
        address_normalization,
        number_normalization,
        dollars_normalization,
        non_word_cleaning,
        word_stemming,
    )
    text = email.lower()
    for step in pipeline:
        text = step(text)
    return text

# Smoke test: run the whole cleaning pipeline on a sample message containing quoted lines, numbers, a dollar amount, a URL and an e-mail address

test_email="""
>Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors youre expecting. This can be anywhere from less than 10 bucks a month to a couple of $100. You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 if youre running something big..


To unsubscribe yourself from this mailing list, send an email to: groupname-unsubscribe@egroups.com
"""
clean_email(test_email)





'anyon know much cost host web portal well depend mani visitor your expect anywher less number buck month coupl dollarnumb checkout httpaddr perhap amazon ecnumb your run someth big unsubscrib mail list send email emailaddr'

In [41]:
import tqdm
# Read and clean every e-mail of every category; ``files`` maps each category
# directory name to the list of its cleaned e-mail texts.
files = {}
for category in directories:
    file_names = os.listdir(f"{path}/{category}")
    cleaned_emails = []
    for file_name in tqdm.tqdm(file_names):
        # Undecodable bytes are skipped instead of raising
        with open(f"{path}/{category}/{file_name}", 'r', errors="ignore") as handle:
            raw_email = handle.read()
        cleaned_emails.append(clean_email(raw_email))
    files[category] = cleaned_emails

100%|██████████| 6451/6451 [00:50<00:00, 127.91it/s]
100%|██████████| 500/500 [00:14<00:00, 33.40it/s]
100%|██████████| 2398/2398 [00:26<00:00, 89.67it/s] 


<h2>Exportation des emails en fichier CSV</h2>

In [74]:
# One DataFrame per category: the cleaned text, a numeric label (column 1:
# 1 for spam, 0 otherwise) and the category name (column 2).
dataframes = {"easy_ham": None, "hard_ham": None, 'spam': None}
for category, emails in files.items():
    frame = pd.DataFrame(emails, columns=['text'])
    frame[1] = 1 if category == 'spam' else 0
    frame[2] = category
    dataframes[category] = frame

In [78]:
# Stack the per-category frames into a single dataset and give the columns
# their final names.
df=pd.concat(list(dataframes.values()),axis=0)
df.columns=['text','label_num','label']
# Shuffle with a fixed seed so the exported row order is reproducible from one
# run to the next, then rebuild a clean 0..n-1 index.
df=shuffle(df,random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,text,label_num,label
0,emailaddr thu aug number numbernumbernumb numb...,0,easy_ham
1,emailaddr mon sep number numbernumbernumb numb...,0,easy_ham
2,emailaddr wed sep number numbernumbernumb numb...,1,spam
3,emailaddr wed aug number numbernumbernumb numb...,0,easy_ham
4,emailaddr wed oct number numbernumbernumb numb...,0,easy_ham


In [59]:
# Export the shuffled dataset to CSV.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; confirm downstream readers expect it.
df.to_csv("./data.csv")