In [1]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sb
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [1]:
# Read the whole of spam.csv into memory as one string, decoding it as ISO-8859-1.
with open('spam.csv', mode='r', encoding='ISO-8859-1') as file:
    content = file.read()


In [2]:
# Preview only the first 500 characters; echoing the entire raw file would
# flood the cell output and bloat the saved notebook.
content[:500]



In [2]:
# Load the e-mail dataset from emails.csv and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='emails.csv')
data.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
# Keep only the label ('spam') and the message body ('text').
# .copy() makes new_data an independent frame, so the column assignments in
# the following cells modify it directly instead of a slice of `data`
# (which would raise pandas' SettingWithCopyWarning).
new_data = data[['spam', 'text']].copy()
new_data

Unnamed: 0,spam,text
0,1,Subject: naturally irresistible your corporate...
1,1,Subject: the stock trading gunslinger fanny i...
2,1,Subject: unbelievable new homes made easy im ...
3,1,Subject: 4 color printing special request add...
4,1,"Subject: do not have money , get software cds ..."
...,...,...
5723,0,Subject: re : research and development charges...
5724,0,"Subject: re : receipts from visit jim , than..."
5725,0,Subject: re : enron case study update wow ! a...
5726,0,"Subject: re : interest david , please , call..."


In [4]:
# Count missing values per column (isnull is an alias of isna).
new_data.isnull().sum()

spam    0
text    0
dtype: int64

In [5]:
# Text clean-up applied before tokenization. The patterns are compiled once and
# the column is traversed a single time instead of four.
_AT_FRAGMENT_RE = re.compile(r'@\w+')    # "@" followed by word characters
_URL_RE = re.compile(r'http\S+')         # anything starting with "http" up to whitespace
_NON_WORD_RE = re.compile(r'\W')         # every non-word character (incl. whitespace)
# Collapse runs of whitespace. The previous pattern r'\s' replaced each
# whitespace character one-for-one, which was a no-op: by that point r'\W'
# had already turned every whitespace character into a plain space.
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def _clean_email_text(text):
    """Return ``text`` with @-fragments and URLs removed and punctuation blanked.

    The substitutions run in the same order as before: @-fragments, then URLs,
    then every non-word character is replaced by a space, and finally runs of
    whitespace are collapsed to a single space.
    """
    text = _AT_FRAGMENT_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _NON_WORD_RE.sub(' ', text)
    return _WHITESPACE_RUN_RE.sub(' ', text)


new_data['text'] = new_data['text'].apply(_clean_email_text)

In [6]:
# Fetch the NLTK resources used below: the English stop-word list and the
# Punkt tokenizer models needed by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK >= 3.8.2 loads the Punkt models from the 'punkt_tab' package instead of
# the pickled 'punkt' one; download both so the notebook runs on either.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hiten\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hiten\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Build a set of English stop words so membership tests are O(1) per token.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [8]:
def _tokenize_and_filter(text):
    """Lower-case and tokenize ``text``, dropping tokens present in ``stop_words``."""
    return [token for token in word_tokenize(text.lower()) if token not in stop_words]


# Replace each message with its list of lower-cased, non-stop-word tokens.
new_data['text'] = new_data['text'].apply(_tokenize_and_filter)

In [9]:
# Display the frame; at this point 'text' holds a list of tokens per message.
new_data

Unnamed: 0,spam,text
0,1,"[subject, naturally, irresistible, corporate, ..."
1,1,"[subject, stock, trading, gunslinger, fanny, m..."
2,1,"[subject, unbelievable, new, homes, made, easy..."
3,1,"[subject, 4, color, printing, special, request..."
4,1,"[subject, money, get, software, cds, software,..."
...,...,...
5723,0,"[subject, research, development, charges, gpg,..."
5724,0,"[subject, receipts, visit, jim, thanks, invita..."
5725,0,"[subject, enron, case, study, update, wow, day..."
5726,0,"[subject, interest, david, please, call, shirl..."


In [10]:
# Turn each token list back into one space-separated string, the input format
# expected by the text vectorizer.
new_data['text'] = new_data['text'].map(' '.join)

In [11]:
# Display the frame; 'text' is again a single string per message.
new_data

Unnamed: 0,spam,text
0,1,subject naturally irresistible corporate ident...
1,1,subject stock trading gunslinger fanny merrill...
2,1,subject unbelievable new homes made easy im wa...
3,1,subject 4 color printing special request addit...
4,1,subject money get software cds software compat...
...,...,...
5723,0,subject research development charges gpg forwa...
5724,0,subject receipts visit jim thanks invitation v...
5725,0,subject enron case study update wow day super ...
5726,0,subject interest david please call shirley cre...


In [12]:
 # Hold out 20% of the messages for evaluation; random_state=42 makes the
 # split reproducible.
 X_train, X_test, y_train, y_test = train_test_split(
     new_data['text'],
     new_data['spam'],
     test_size=0.20,
     random_state=42,
 )

In [13]:
# Class balance: number of rows per value of the 'spam' label.
sentiment_counts = new_data.loc[:, 'spam'].value_counts()
print(sentiment_counts)

0    4360
1    1368
Name: spam, dtype: int64


# 5. Feature Extraction

In [15]:
# TF-IDF features: learn the vocabulary and IDF weights on the training text
# only, then reuse that fitted vectorizer to encode the held-out text.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_text_vec = vectorizer.transform(raw_documents=X_test)


In [16]:
# Display the repr of the vectorized held-out set (X_test transformed above).
X_text_vec

<1146x33644 sparse matrix of type '<class 'numpy.float64'>'
	with 105576 stored elements in Compressed Sparse Row format>

# 6. Model Training

In [17]:
# Model training: Multinomial Naive Bayes
# (MultinomialNB is imported with the other dependencies in the import cell.)

# Initialize a Naive Bayes model with default hyper-parameters and fit it on
# the TF-IDF training matrix.
NB = MultinomialNB()
NB.fit(X_train_vec, y_train)


MultinomialNB()

In [18]:
# Model training: Support Vector Machine
# (SVC is already imported in the notebook's import cell, so it is not
# re-imported here.)

# Initialize an SVM with a linear kernel and regularization strength C=1.0.
svm_model = SVC(kernel='linear', C=1.0)

# Train the SVM on the TF-IDF training matrix.
svm_model.fit(X_train_vec, y_train)


SVC(kernel='linear')

# 7. Model Evaluation

In [19]:
# Model evaluation for Naive Bayes: predict labels for the held-out matrix.
y_pred_ = NB.predict(X=X_text_vec)

In [20]:
# Overall accuracy plus per-class precision / recall / F1 for Naive Bayes.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_)
print("Accuracy:", accuracy)
nb_report = classification_report(y_true=y_test, y_pred=y_pred_)
print(nb_report)

Accuracy: 0.887434554973822
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       856
           1       1.00      0.56      0.71       290

    accuracy                           0.89      1146
   macro avg       0.93      0.78      0.82      1146
weighted avg       0.90      0.89      0.88      1146



In [21]:
# Model evaluation for SVM: predict labels for the held-out matrix.
y_pred = svm_model.predict(X=X_text_vec)

In [22]:
# Overall accuracy plus per-class precision / recall / F1 for the SVM.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
svm_report = classification_report(y_true=y_test, y_pred=y_pred)
print(svm_report)

Accuracy: 0.9947643979057592
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       856
           1       0.99      0.99      0.99       290

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146

