*****Dataprocess.ipynb***** 

- This file converts raw emails, saved in .txt format, into CSV files after completing the preprocessing and text cleaning process. Also, the file converts the Training dataset (Enron), and the testing AI dataset, which contain my inbox email and AI-generated emails, into two CSV file and save new files in the "Data/training_Data" folder.

In [1]:
# Importing libraries
import pandas as pd
import csv
import string
import glob
import re
import os
from nltk.corpus import stopwords
from nltk import word_tokenize
import text_hammer as th
import spacy

In [2]:
# Dowwnload the en_core_web_sm model for spacy
!spacy download en_core_web_sm 1> /dev/null


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Please change the path to the data folder
dirpath = "Data/"

In [4]:
name_of_raw_hams = [name for name in glob.glob(f'{dirpath}Training_Data/ham/*.txt')]
name_of_raw_spams = [name for name in glob.glob(f'{dirpath}Training_Data/spam/*.txt')]
list_of_csv_ham = []
list_of_csv_spam = []

In [5]:
# Combine ham emails to list_of_csv_ham
for ham in name_of_raw_hams:
    ham_content = pd.read_csv(ham, sep='delimiter', header=None,engine='python')
    list_of_csv_ham.append(ham_content)
# Combine spam emails to list_of_csv_spam
for spam in name_of_raw_spams:
    spam_content = pd.read_csv(spam, sep='delimiter', header=None,engine='python')
    list_of_csv_spam.append(spam_content)
# Convert to data frames
df_hams = pd.concat(list_of_csv_ham).reset_index(drop=True)
df_spams = pd.concat(list_of_csv_spam).reset_index(drop=True)
# Add the label column
df_hams['y'] = [0] * len(df_hams)
df_spams['y'] = [1] * len(df_spams)

In [6]:
print(f"Number of HAM mails =   {len(df_hams)}")
print(f"Number of SPAM mails =   {len(df_spams)}")

Number of HAM mails =   403551
Number of SPAM mails =   263650


In [7]:
# Update column names
col_name = {0: 'X', 'y': 'y'}
df_hams = df_hams.rename(columns=col_name)
df_spams = df_spams.rename(columns=col_name)

In [8]:
# Concatenate to a full dataset
df = pd.concat([df_hams, df_spams], axis=0)

In [9]:
# Randomly shuffle rows to mix ham emails with spam ones
df = df.sample(frac = 1).reset_index(drop=True)

In [None]:
display(df.shape)
display(df.head(10))
display(df.tail(10))

(667201, 2)

Unnamed: 0,X,y
0,"from : kitchen , louise",0
1,dualism croquet haley insurmountable reformato...,1
2,some people complain because the roses have th...,1
3,the reaction to the idea that mr lay would pro...,0
4,the juno software . your e - mail messages wil...,0
5,able to continue as a going concern in which c...,1
6,administration involvement,0
7,by simon english .,0
8,please respond to,0
9,your online sales dramatically .,1


Unnamed: 0,X,y
667191,* 0 . 5 rcvd _ in _ njabl _ proxy rbl : njabl ...,1
667192,"hello , and save up to 80 %",1
667193,spamassassin - sightings mailing list,1
667194,subject : tony hamilton,0
667195,stationery and web - sites . under our careful...,1
667196,welcome to rx - pills online discount pharmacy...,1
667197,vince j kaminski,0
667198,approval to writeoff the volumes to unaccounte...,0
667199,02 / 17 / 00 04 : 10 pm,0
667200,we also supply target email list according to ...,1


In [10]:
# Creating a function for text cleaning 

def text_cleaning(df,col_name):
    #-----Remove Stopwords----
    df[col_name] = df[col_name].progress_apply(lambda x: th.remove_stopwords(x))
    #----Remove Special Character----
    df[col_name] = df[col_name].progress_apply(lambda x: th.remove_special_chars(x))
    #---Remove accented characters---
    df[col_name] = df[col_name].progress_apply(lambda x: th.remove_accented_chars(x))
    # ---Removing HTML tags and URL---
    df[col_name] = df[col_name].progress_apply(lambda x: th.remove_html_tags(x))
    df[col_name] = df[col_name].progress_apply(lambda x: th.remove_urls(x))
    #--- Converting into root words--
    df[col_name] = df[col_name].progress_apply(lambda x: th.make_base(x))                                      
    return df

In [12]:

df = text_cleaning(df, 'X')

  0%|          | 0/667201 [00:00<?, ?it/s]

  0%|          | 0/667201 [00:00<?, ?it/s]

  0%|          | 0/667201 [00:00<?, ?it/s]

  0%|          | 0/667201 [00:00<?, ?it/s]



  0%|          | 0/667201 [00:00<?, ?it/s]

  0%|          | 0/667201 [00:00<?, ?it/s]

In [13]:
# Count how many entries exist for each label
df.value_counts('y')

y
0    403551
1    263650
dtype: int64

In [None]:
# Save as cleaned data frame
df.to_csv(f'{dirpath}Training_Data/emails_cleaned_final.csv', index=False)


### Create a Testing Data set from AI generated Sample Mail.

In [15]:
raw_AI_Spam_mails =[]
raw_AI_Spam_mails = [name for name in glob.glob(f'{dirpath}AI_dataset/Spam/*.txt')]
raw_AI_Ham_mails =[]
raw_AI_Ham_mails = [name for name in glob.glob(f'{dirpath}AI_dataset/Ham/*.txt')]


In [16]:
 # Combine Ai generated emails to one csv file
csv_AI_Spam_mails = []
for mails in raw_AI_Spam_mails:
    Ai_content = pd.read_csv(mails, sep='delimiter', header=None, engine='python')
    csv_AI_Spam_mails.append(Ai_content)
print(f"Numer of AI Generated sample mails:  {len(csv_AI_Spam_mails)}")

csv_AI_Ham_mails = []
for mails in raw_AI_Ham_mails:
    Ai_content = pd.read_csv(mails, sep='delimiter', header=None, engine='python')
    csv_AI_Ham_mails.append(Ai_content)

print(f"Numer of real world Ham mails:  {len(csv_AI_Ham_mails)}")

Numer of AI Generated sample mails:  201
Numer of real world Ham mails:  67


In [17]:
# Convert to data frames
df_Ai_Spam_mails = pd.concat(csv_AI_Spam_mails).reset_index(drop=True)
# Add the label column
df_Ai_Spam_mails['y'] = [1] * len(df_Ai_Spam_mails)

df_Ai_Ham_mails = pd.concat(csv_AI_Ham_mails).reset_index(drop=True)
# Add the label column
df_Ai_Ham_mails['y'] = [0] * len(df_Ai_Ham_mails)

# concatinate both data frames real mails and AI generated mails
df_Ai_mails = pd.concat([df_Ai_Spam_mails, df_Ai_Ham_mails], axis=0)

# update column names
col_name = {0: 'XAI', 'y': 'YAI'}
df_Ai_mails = df_Ai_mails.rename(columns=col_name)
df_Ai_mails

Unnamed: 0,XAI,YAI
0,"Hi William,",1
1,I hope this email finds you well. I came acros...,1
2,"At XYZ-Edu, we have a course called Practical ...",1
3,The course provides insights into real-world a...,1
4,"To make it even more appealing, we are offerin...",1
...,...,...
2506,TikTok logo,0
2507,infopoint@uwe.ac.uk,0
2508,+44 (0)117 32 85678,0
2509,"UWE Bristol, Frenchay Campus, Coldharbour Lane...",0


In [18]:
df_Ai_mails = text_cleaning(df_Ai_mails, 'XAI')

  0%|          | 0/4320 [00:00<?, ?it/s]

  0%|          | 0/4320 [00:00<?, ?it/s]

  0%|          | 0/4320 [00:00<?, ?it/s]

  0%|          | 0/4320 [00:00<?, ?it/s]

  0%|          | 0/4320 [00:00<?, ?it/s]

  0%|          | 0/4320 [00:00<?, ?it/s]

In [19]:
# Save as cleaned data frame
df_Ai_mails.to_csv("Data/Training_Data/AI_emails_cleaned_final.csv", index=False)
