# **Week 4 project: Email Spam Detection**

In [None]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.svm import SVC

In [None]:
# importing warnings to ignore warnings
import warnings
warnings.filterwarnings('ignore')

CSV file has some characters that don’t follow the UTF-8 rules.<br>

UTF-8 and Latin-1 are two different rulebooks.

In [None]:
data=pd.read_csv('/content/email_spam_detection.csv',encoding='latin-1').iloc[:,:2]

In [None]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# renaming columns
data=data.rename(columns={"v1":"Category","v2":"Text"})
data.head()

Unnamed: 0,Category,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
print("Total Rows: ",len(data))

Total Rows:  5572


In [None]:
data.describe()

Unnamed: 0,Category,Text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [None]:
data.isna().sum()

Unnamed: 0,0
Category,0
Text,0


In [None]:
# printing total count for each class
print("Total Ham: ",data['Category'].value_counts()['ham'])
print("Total Spam: ",data['Category'].value_counts()['spam'])

Total Ham:  4825
Total Spam:  747


### **Apply Resampling to Deal with the Data Imbalance**

In [None]:
# importing utilities
from sklearn.utils import resample

In [None]:
ham=data[data['Category']=='ham']
spam=data[data['Category']=='spam']

In [None]:
# applying resampling to increase spam count equal to ham
resample_spam=resample(spam,
                       replace=True,
                       n_samples=len(ham),
                       random_state=42
)

In [None]:
# concatinating the resambled data with the other class
data=pd.concat([resample_spam,ham])

In [None]:
# printing the count again to check if the imbalance have been removed now
print("Total Ham: ",data['Category'].value_counts()['ham'])
print("Total Spam: ",data['Category'].value_counts()['spam'])

Total Ham:  4825
Total Spam:  4825


In [None]:
data.head()

Unnamed: 0,Category,Text
712,spam,08714712388 between 10am-7pm Cost 10p
3228,spam,Ur cash-balance is currently 500 pounds - to m...
1928,spam,Call from 08702490080 - tells u 2 call 0906635...
737,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...
504,spam,+123 Congratulations - in this week's competit...


In [None]:
# encoding the target class
data['Category']=data['Category'].map({'ham':0,'spam':1})

In [None]:
# checking for null values
print(data['Category'].unique())
print(data['Category'].isna().sum())

[1 0]
0


In [None]:
data.head()

Unnamed: 0,Category,Text
712,1,08714712388 between 10am-7pm Cost 10p
3228,1,Ur cash-balance is currently 500 pounds - to m...
1928,1,Call from 08702490080 - tells u 2 call 0906635...
737,1,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...
504,1,+123 Congratulations - in this week's competit...


### **Data Training**

In [None]:
x_trian,x_test,y_train,y_test=train_test_split(data['Text'],data['Category'],test_size=0.3,random_state=42)

It converts text data into numerical features that ML models can understand.<br>

Specifically, it creates a Bag-of-Words representation:<br>

Builds a vocabulary of all unique words in your dataset.<br>

Turns each document/message into a vector of word counts.<br>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# stop_words='english' will ignore the common words, max_features will limit the number of words
vectorizer=CountVectorizer(stop_words='english',max_features=3000)

In [None]:
x_train_vec=vectorizer.fit_transform(x_trian) #fit & transform on training data
x_test_vec=vectorizer.transform(x_test) #only transform on test data

In [None]:
# creating Logistic Regression object
model=LogisticRegression(max_iter=1000)

### **Logistic Regression**

In [None]:
model.fit(x_train_vec,y_train)

In [None]:
y_pred=model.predict(x_test_vec)

In [None]:
svc_model=SVC(kernel='linear')

### **Support Vector Machine**

In [None]:
svc_model.fit(x_train_vec,y_train)

In [None]:
y_pred_scv=svc_model.predict(x_test_vec)

**Printing Results**

In [None]:
print("--------------LOGISTIC REGRESSION RESULTS--------------")
print("\nAccuracy: ",accuracy_score(y_test,y_pred))
print("\nClassification Report: ",classification_report(y_test,y_pred))
print("\nConfusion Matrix: ",confusion_matrix(y_test,y_pred))


--------------LOGISTIC REGRESSION RESULTS--------------

Accuracy:  0.9951640759930915

Classification Report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00      1433
           1       1.00      1.00      1.00      1462

    accuracy                           1.00      2895
   macro avg       1.00      1.00      1.00      2895
weighted avg       1.00      1.00      1.00      2895


Confusion Matrix:  [[1426    7]
 [   7 1455]]


In [None]:
print("--------------SVM RESULTS--------------")
print("\nAccuracy: ",accuracy_score(y_test,y_pred_scv))
print("\nClassification Report: ",classification_report(y_test,y_pred_scv))
print("\nConfusion Matrix: ",confusion_matrix(y_test,y_pred_scv))


--------------SVM RESULTS--------------

Accuracy:  0.9958549222797928

Classification Report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00      1433
           1       1.00      1.00      1.00      1462

    accuracy                           1.00      2895
   macro avg       1.00      1.00      1.00      2895
weighted avg       1.00      1.00      1.00      2895


Confusion Matrix:  [[1426    7]
 [   5 1457]]


### **Testing**

In [None]:
input = ["Congratulations, you won free 100 Bitcoins in a lottery"]
new_vec = vectorizer.transform(input)
result=model.predict(new_vec)
if result==0:
  print("Not Spam")
else:
  print("Spam")

Spam
