## Important libraries

In [None]:
import math
import numpy as np
import pandas as pd
import scipy as sc
import seaborn as sns
import nltk
import pickle
import ast
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
# Downloading NLTK data
#nltk.download('punkt')       # Downloading tokenizer data
#nltk.download('stopwords')   # Downloading stopwords data
#nltk.download('wordnet')

## Read and show the data

In [None]:
df=pd.read_csv("spam_ham_dataset.csv")
df.head()

## Cleaning the data

In [None]:
df.drop(["label"],axis=1,inplace=True) #drop unnecessary column
df.head()

In [None]:
df.isnull().sum() #no missing values in all rows

In [None]:
# Count the rows whose "# sent emails " value is negative (expected: 0).
# Summing the boolean mask counts matching rows directly instead of summing
# every column of the filtered frame.
(df["# sent emails "] < 0).sum()

In [None]:
df["text"].duplicated().sum() #check duplicate email

In [None]:
df.drop_duplicates(subset=["text"],inplace=True) #drop duplicated emails

In [None]:
df["# sent emails "].duplicated().sum() # counts repeated values in the "# sent emails " column (not duplicate email texts)

In [None]:
df['label_num'].unique()

In [None]:
df.isna().sum()

## Show information about the data

In [None]:
df.info()

In [None]:
df.describe()

## Visualization of data

In [None]:
sns.barplot(x='label_num',y='# sent emails ',data=df)

In [None]:
sns.boxplot(x='label_num',y='# sent emails ',data=df)

In [None]:
sns.displot(df['# sent emails '])

In [None]:
sns.jointplot(data=df,x=df['label_num'],y=df['# sent emails '])

In [None]:
sns.countplot(data=df,x=df["label_num"])

## Preprocessing for the data

In [None]:
# Min-max scale the "# sent emails " column in place, using MinMaxScaler with
# its default settings.
# NOTE(review): the scaler is fit on the whole dataset, before the train/test
# split further down; the feature matrix X is later built from the TF-IDF
# vectors only, so this column does not reach the models - confirm that is
# intended.
minmax_scaler=MinMaxScaler()
df['# sent emails ']=minmax_scaler.fit_transform(df['# sent emails '].values.reshape(-1,1))

In [None]:
df.head()

In [None]:
df['# sent emails '].corr(df['label_num'])

In [None]:
def preprocess(email):
    """Normalize a raw email string into lowercase, space-separated words.

    Steps, in order: drop a leading ``Subject: `` prefix, replace every
    character that is not an ASCII letter with a space, trim the ends,
    collapse runs of spaces into one, and lowercase the result.

    Args:
        email: Raw email text.

    Returns:
        The cleaned text, made only of the letters ``a-z`` and single spaces
        (an empty string when the input contains no letters).
    """
    # Raw strings keep regex syntax from being parsed as Python string escapes.
    email = re.sub(r"^Subject: ", "", email)  # remove the (Subject: ) prefix
    email = re.sub(r"[^a-zA-Z]", " ", email)  # remove digits, punctuation, newlines
    # Only letters and spaces are left here, so split/join both strips the
    # ends and collapses repeated spaces in one pass.
    return " ".join(email.split()).lower()
#test

x=preprocess(df.loc[2,"text"])
x


In [None]:
def tokenize(email):
    """Split *email* into tokens with ``nltk.word_tokenize``.

    Returns the token list exactly as NLTK produces it.
    """
    return nltk.word_tokenize(email)
#test

listx=tokenize(x)
print(listx)

In [None]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, built only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def remove_stopwords(email):
    """Drop English stop words from *email*.

    The text is tokenized with ``tokenize`` and every token that appears in
    NLTK's English stop-word list is removed. Matching is an exact string
    comparison, so the caller is expected to lowercase the text first.

    Args:
        email: Text to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # The stop-word list used to be rebuilt for every token and scanned as a
    # list; a cached set gives the same result with O(1) membership tests.
    stop_words = _english_stopwords()
    clean_words = [word for word in tokenize(email) if word not in stop_words]

    # convert the list back to a string with a (' ') separator between words
    return ' '.join(clean_words)
  #test
x=preprocess(df.loc[14,"text"])
xnew=remove_stopwords(x)
print(tokenize(xnew))



In [None]:
def lemmatize_email(email):
    """Lemmatize each token of *email* and join the results with spaces.

    Tokens come from ``tokenize``; each one is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token) for token in tokenize(email))

#test
x=preprocess(df.loc[14,"text"])
xnew=remove_stopwords(x)
xnew2=lemmatize_email(xnew)
xnew2



In [None]:
# apply the functions of preprocessing on the text column
emails=df['text'].apply(preprocess)
emails=emails.apply(remove_stopwords)
emails=emails.apply(lemmatize_email)
df['preprocessed_text']=emails

In [None]:
df.head()

In [None]:
df.drop('text',axis=1,inplace=True)

In [None]:
#convert to vectors
# Initialize TfidfVectorizer with adjusted parameters
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_text'])

In [None]:
vectors=tfidf_matrix.toarray()

In [None]:
df.info()

In [None]:
df.head()

In [None]:
def onehotendoder(df):
    """One-hot encode the binary ``label_num`` column of *df*.

    Adds the float columns ``label_num_0`` and ``label_num_1`` to *df* in
    place and also returns the same encoding as a 2-D array.

    Args:
        df: DataFrame with a ``label_num`` column holding only 0 and 1.

    Returns:
        A ``(df, labels)`` tuple: the DataFrame that was passed in (mutated)
        and a float array of shape ``(len(df), 2)`` whose rows are
        ``[1.0, 0.0]`` for label 0 and ``[0.0, 1.0]`` for label 1.

    Raises:
        ValueError: If ``label_num`` contains any value other than 0 or 1.
    """
    label_values = df['label_num'].to_numpy()
    is_zero = label_values == 0
    is_one = label_values == 1

    # Fail loudly on unexpected labels: skipping such rows would make the
    # encoded arrays shorter than the frame and misalign them with its rows.
    if not np.all(is_zero | is_one):
        raise ValueError("label_num must contain only the values 0 and 1")

    label_0 = is_zero.astype(float)
    label_1 = is_one.astype(float)
    df['label_num_0'] = label_0
    df['label_num_1'] = label_1
    labels = np.column_stack((label_0, label_1))
    return df, labels
df,labels=onehotendoder(df)

In [None]:
df.head()

In [None]:
df.drop('label_num',axis=1,inplace=True)

In [None]:
df.head()

## Save the vectors of preprocessed emails in pickle

In [None]:
# Persist the fitted TF-IDF vectorizer; the context manager closes the file
# handle as soon as the dump finishes.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

In [None]:
# Reload the vectorizer saved above; `with` closes the handle after reading.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open('vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

In [None]:
# Persist the dense TF-IDF matrix; the context manager closes the file handle
# as soon as the dump finishes.
with open('vectors.pkl', 'wb') as vectors_file:
    pickle.dump(vectors, vectors_file)

In [None]:
# Reload the TF-IDF matrix saved above; `with` closes the handle after reading.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open('vectors.pkl', 'rb') as file:
    vectors = pickle.load(file)

## Divide the data into X and Y

In [None]:
X=vectors
Y=df[['label_num_0','label_num_1']].values

## Splitting into train and test data

In [None]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [None]:
# Apply SMOTE to the training data only to solve imbalance data by oversampling
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

In [None]:
y_train_resampled

In [None]:
y_train_resampled=OneHotEncoder().fit_transform(y_train_resampled)

In [None]:
y_train_resampled=y_train_resampled.toarray()
y_train_resampled

## Training the models

In [None]:
lrc = LogisticRegression(solver='liblinear',penalty='l2',C=1.0)
multi_target_lrc = MultiOutputClassifier(lrc, n_jobs=-1)
multi_target_lrc.fit(X_train_resampled, y_train_resampled)

In [None]:
y_pred_train1 =multi_target_lrc.predict(X_train_resampled)
y_pred_test1 =multi_target_lrc.predict(x_test)

In [None]:
train_accuracy1 = accuracy_score(y_train_resampled, y_pred_train1)
test_accuracy1 = accuracy_score(y_test, y_pred_test1)
print("training accuracy of lrc : ",str(round(train_accuracy1*100,2)),'%')
print("testing  accuracy of lrc : ",str(round(test_accuracy1*100,2)),'%')

In [None]:
k_fold=KFold(n_splits = 5)
scores1=cross_val_score(multi_target_lrc,X,Y,cv=k_fold)
print("scores after applying cross validation of lrc : ",str(scores1))
print("average of scores of lrc : ",str(scores1.mean()))

In [None]:
svc = SVC(kernel='sigmoid',C=1.0)
multi_target_svc = MultiOutputClassifier(svc, n_jobs=-1)
multi_target_svc.fit(X_train_resampled, y_train_resampled)

In [None]:
y_pred_train2 =multi_target_svc.predict(X_train_resampled)
y_pred_test2 =multi_target_svc.predict(x_test)

In [None]:
train_accuracy2 = accuracy_score(y_train_resampled, y_pred_train2)
test_accuracy2 = accuracy_score(y_test, y_pred_test2)
print("training accuracy of SVC : ",str(round(train_accuracy2*100,2)),'%')
print("testing  accuracy of SVC : ",str(round(test_accuracy2*100,2)),'%')

In [None]:
scores2=cross_val_score(multi_target_svc,X,Y,cv=k_fold)
print("scores after applying cross validation of SVC : ",str(scores2))
print("average of scores of SVC : ",str(scores2.mean()))

In [None]:
DTC = DecisionTreeClassifier(criterion='gini',max_depth=12,random_state=42)
multi_target_DTC = MultiOutputClassifier(DTC, n_jobs=-1)
multi_target_DTC.fit(X_train_resampled, y_train_resampled)

In [None]:
y_pred_train3 =multi_target_DTC.predict(X_train_resampled)
y_pred_test3 =multi_target_DTC.predict(x_test)

In [None]:
train_accuracy3 = accuracy_score(y_train_resampled, y_pred_train3)
test_accuracy3 = accuracy_score(y_test, y_pred_test3)
print("training accuracy of DTC : ",str(round(train_accuracy3*100,2)),'%')
print("testing  accuracy of DTC : ",str(round(test_accuracy3*100,2)),'%')

In [None]:
KNN = KNeighborsClassifier(n_neighbors=3,p=2,algorithm='auto')
multi_target_KNN = MultiOutputClassifier(KNN, n_jobs=-1)
multi_target_KNN.fit(X_train_resampled, y_train_resampled)

In [None]:
y_pred_train4 =multi_target_KNN.predict(X_train_resampled)
y_pred_test4 =multi_target_KNN.predict(x_test)

In [None]:
train_accuracy4 = accuracy_score(y_train_resampled, y_pred_train4)
test_accuracy4 = accuracy_score(y_test, y_pred_test4)
print("training accuracy of KNN : ",str(round(train_accuracy4*100,2)),'%')
print("testing  accuracy of KNN : ",str(round(test_accuracy4*100,2)),'%')

In [None]:
RFC = RandomForestClassifier(n_estimators=100,criterion='gini',max_depth=12,random_state=42)
multi_target_RFC = MultiOutputClassifier(RFC, n_jobs=-1)
multi_target_RFC.fit(X_train_resampled, y_train_resampled)

In [None]:
y_pred_train5 =multi_target_RFC.predict(X_train_resampled)
y_pred_test5 =multi_target_RFC.predict(x_test)

In [None]:
train_accuracy5 = accuracy_score(y_train_resampled, y_pred_train5)
test_accuracy5 = accuracy_score(y_test, y_pred_test5)
print("training accuracy of RFC : ",str(round(train_accuracy5*100,2)),'%')
print("testing  accuracy of RFC : ",str(round(test_accuracy5*100,2)),'%')

In [None]:
NBC = GaussianNB()
multi_target_NBC = MultiOutputClassifier(NBC, n_jobs=-1)
multi_target_NBC.fit(X_train_resampled, y_train_resampled)

In [None]:
y_pred_train6 =multi_target_NBC.predict(X_train_resampled)
y_pred_test6 =multi_target_NBC.predict(x_test)

In [None]:
train_accuracy6 = accuracy_score(y_train_resampled, y_pred_train6)
test_accuracy6 = accuracy_score(y_test, y_pred_test6)
print("training accuracy of NBC : ",str(round(train_accuracy6*100,2)),'%')
print("testing  accuracy of NBC : ",str(round(test_accuracy6*100,2)),'%')

## Evaluating the models

In [None]:
def inverse_onehot(y):
    """Convert two-column one-hot rows back to class labels.

    Args:
        y: Array-like of shape ``(n, 2)`` where ``[1, 0]`` encodes class 0
            and ``[0, 1]`` encodes class 1.

    Returns:
        Integer array of shape ``(n, 1)`` with exactly one label per input
        row. Rows that are not a clean one-hot pattern (``[0, 0]`` or
        ``[1, 1]``) are resolved to class 0.
    """
    encoded = np.asarray(y)
    if encoded.size == 0:
        return np.empty((0, 1), dtype=int)

    # Every row must yield a label: dropping ambiguous rows would make the
    # result shorter than its input and misalign it with the array it is
    # compared against. argmax keeps the length and returns the first
    # maximum, so ties fall to class 0.
    return np.argmax(encoded[:, :2], axis=1).reshape(-1, 1)


In [None]:
# Generate classification report
report1 = classification_report(y_test, y_pred_test1)
print("classification report of LRC : \n")
print(report1)

In [None]:
#inverse one hot encoder
y_test1=inverse_onehot(y_test)
y_pred_test1=inverse_onehot(y_pred_test1)

In [None]:
# Generate confusion matrix
matrix1 = confusion_matrix(y_test1, y_pred_test1)
# Plot confusion matrix as heatmap
sns.heatmap(matrix1, annot=True, cmap="Blues", fmt="d")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix of LRC')
plt.show()

In [None]:
# Generate classification report
report2 = classification_report(y_test, y_pred_test2)
print("classification report of SVC : \n")
print(report2)

In [None]:
#inverse one hot encoder
y_test2=inverse_onehot(y_test)
y_pred_test2=inverse_onehot(y_pred_test2)

In [None]:
# Generate confusion matrix
matrix2 = confusion_matrix(y_test2, y_pred_test2)
# Plot confusion matrix as heatmap
sns.heatmap(matrix2, annot=True, cmap="Blues", fmt="d")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix of SVC')
plt.show()

In [None]:
# Generate classification report
report3 = classification_report(y_test, y_pred_test3)
print("classification report of DTC : \n")
print(report3)

In [None]:
#inverse one hot encoder
y_test3=inverse_onehot(y_test)
y_pred_test3=inverse_onehot(y_pred_test3)

In [None]:
# Generate confusion matrix
matrix3 = confusion_matrix(y_test3, y_pred_test3)
# Plot confusion matrix as heatmap
sns.heatmap(matrix3, annot=True, cmap="Blues", fmt="d")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix of DTC')
plt.show()

In [None]:
# Generate classification report
report4 = classification_report(y_test, y_pred_test4)
print("classification report of KNN : \n")
print(report4)

In [None]:
#inverse one hot encoder
y_test4=inverse_onehot(y_test)
y_pred_test4=inverse_onehot(y_pred_test4)

In [None]:
# Generate confusion matrix
matrix4 = confusion_matrix(y_test4, y_pred_test4)
# Plot confusion matrix as heatmap
sns.heatmap(matrix4, annot=True, cmap="Blues", fmt="d")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix of KNN')
plt.show()

In [None]:
# Generate classification report
report5 = classification_report(y_test, y_pred_test5)
print("classification report of RFC : \n")
print(report5)

In [None]:
#inverse one hot encoder
y_test5=inverse_onehot(y_test)
y_pred_test5=inverse_onehot(y_pred_test5)

In [None]:
# Generate confusion matrix
matrix5 = confusion_matrix(y_test5, y_pred_test5)
# Plot confusion matrix as heatmap
sns.heatmap(matrix5, annot=True, cmap="Blues", fmt="d")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix of RFC')
plt.show()

In [None]:
# Generate classification report
report6 = classification_report(y_test, y_pred_test6)
print("classification report of NBC : \n")
print(report6)

In [None]:
#inverse one hot encoder
y_test6=inverse_onehot(y_test)
y_pred_test6=inverse_onehot(y_pred_test6)

In [None]:
# Generate confusion matrix
matrix6 = confusion_matrix(y_test6, y_pred_test6)
# Plot confusion matrix as heatmap
sns.heatmap(matrix6, annot=True, cmap="Blues", fmt="d")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix of NBC')
plt.show()

## Save the models

In [None]:
# Save the fitted logistic-regression model; `with` closes the file handle.
with open('LRC_model.pkl', 'wb') as model_file:
    pickle.dump(multi_target_lrc, model_file)

In [None]:
# Save the fitted SVC model; `with` closes the file handle.
with open('SVC_model.pkl', 'wb') as model_file:
    pickle.dump(multi_target_svc, model_file)

In [None]:
# Save the fitted decision-tree model; `with` closes the file handle.
with open('DTC_model.pkl', 'wb') as model_file:
    pickle.dump(multi_target_DTC, model_file)

In [None]:
# Save the fitted k-nearest-neighbours model; `with` closes the file handle.
with open('KNN_model.pkl', 'wb') as model_file:
    pickle.dump(multi_target_KNN, model_file)

In [None]:
# Save the fitted random-forest model; `with` closes the file handle.
with open('RFC_model.pkl', 'wb') as model_file:
    pickle.dump(multi_target_RFC, model_file)

In [None]:
# Save the fitted naive-Bayes model; `with` closes the file handle.
with open('NBC_model.pkl', 'wb') as model_file:
    pickle.dump(multi_target_NBC, model_file)

## Load the models

In [None]:
def load_models(models_dir='models'):
    """Load the six pickled classifiers from *models_dir*.

    Reads ``LRC_model.pkl``, ``SVC_model.pkl``, ``DTC_model.pkl``,
    ``KNN_model.pkl``, ``RFC_model.pkl`` and ``NBC_model.pkl``. The save
    cells in this notebook write those files to the current working
    directory, so either move them into ``models`` or pass
    ``models_dir='.'``.

    Args:
        models_dir: Directory containing the ``*_model.pkl`` files.

    Returns:
        A tuple ``(LRC, SVC, DTC, KNN, RFC, NBC)`` of the unpickled objects.

    Raises:
        OSError: If one of the model files cannot be opened (for example
            ``FileNotFoundError`` when it does not exist).
    """
    import os  # local import: the notebook's import cell does not bring in os

    model_names = ('LRC', 'SVC', 'DTC', 'KNN', 'RFC', 'NBC')
    loaded_models = []
    for model_name in model_names:
        # os.path.join picks the platform's separator, so the path is no
        # longer tied to the Windows-only backslash.
        model_path = os.path.join(models_dir, model_name + '_model.pkl')
        # NOTE: pickle.load can execute arbitrary code - only load model
        # files that come from a trusted source.
        with open(model_path, 'rb') as model_file:
            loaded_models.append(pickle.load(model_file))
    return tuple(loaded_models)

In [None]:
# Unpack the six loaded models into notebook-level names.
# NOTE(review): this rebinds the global names SVC, DTC, KNN, RFC and NBC. In
# particular SVC stops referring to the sklearn.svm.SVC class imported at the
# top, so re-running the `SVC(kernel='sigmoid',C=1.0)` training cell after
# this one would call the loaded model object instead of the class.
LRC,SVC,DTC,KNN,RFC,NBC =load_models()