<a href="https://colab.research.google.com/github/MohammadErfanRashidi/Spam_Email/blob/main/Spam_mail.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [82]:
# Importing the required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [83]:
# Fetch the NLTK stop-word corpus that `stopwords.words(...)` reads from.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [84]:
# Load the mail dataset from the CSV file into the DataFrame `dt`
# (the "Category" and "Message" columns are used further down).
dt = pd.read_csv("mail_data.csv")

In [85]:
# Preview the first rows of dt to sanity-check the loaded columns
dt.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [86]:
# Checking the shape of dt as (number of rows, number of columns)
dt.shape

(5572, 2)

In [87]:
# Count the missing values in each column of dt
dt.isna().sum()

Unnamed: 0,0
Category,0
Message,0


In [88]:
# Stemming Function
port_stem = PorterStemmer()

# Build the English stop-word lookup once. `stopwords.words("english")` returns
# a list, so calling it inside the comprehension rebuilt that list for every
# token and made each membership test a linear scan.
ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))


def stemming(content):
    """Normalise one message into a space-separated string of word stems.

    Steps:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case the text and split it on whitespace,
      3. drop English stop words,
      4. reduce each remaining token to its Porter stem.

    Parameters
    ----------
    content : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string when no
        token survives the filtering).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stemmed_tokens = [
        port_stem.stem(token)
        for token in tokens
        if token not in ENGLISH_STOP_WORDS
    ]
    return ' '.join(stemmed_tokens)


In [89]:
# Turn the "Message" column into a TF-IDF feature matrix stored in X
stemmed_text = dt["Message"].apply(stemming)

vectorizer = TfidfVectorizer()

# Learn the vocabulary / IDF weights and build the matrix in a single step.
# NOTE(review): the vectorizer is fitted on every message, including rows that
# the later train/test split assigns to the test set — consider fitting on the
# training rows only to avoid leaking test information.
X = vectorizer.fit_transform(stemmed_text)

In [90]:
# Storing the "Category" column (the label of each message) in Y
Y = dt['Category']

In [91]:
# Show the feature matrix X followed by the label Series Y
print(X, Y, sep="\n")

  (0, 190)	0.3522946643655987
  (0, 379)	0.26350491969128115
  (0, 736)	0.33630333732147566
  (0, 738)	0.29761995607435426
  (0, 964)	0.29761995607435426
  (0, 1169)	0.27282796669086984
  (0, 2171)	0.14066343975170745
  (0, 2208)	0.1649859743034801
  (0, 2245)	0.19460776670194488
  (0, 2827)	0.3522946643655987
  (0, 2932)	0.28506031120996994
  (0, 4091)	0.24055424511726686
  (0, 5957)	0.19460776670194488
  (0, 6135)	0.23616756554565888
  (1, 2794)	0.4745440766926726
  (1, 2960)	0.4218982744467187
  (1, 3760)	0.2809319560263009
  (1, 3785)	0.564793662023427
  (1, 6056)	0.44597659211687757
  (2, 262)	0.18752116579572622
  (2, 1058)	0.2181159425903744
  (2, 1220)	0.22327647280120547
  (2, 1673)	0.3983526060107063
  (2, 1791)	0.52682621884254
  (2, 1890)	0.18841663063918468
  :	:
  (5567, 5520)	0.20176693864555295
  (5567, 5644)	0.23763296461255506
  (5568, 1704)	0.6652366917601374
  (5568, 1996)	0.5740672391289212
  (5568, 2171)	0.29597505521175127
  (5568, 2457)	0.37457404553349233
  (55

In [92]:
# Train and Test Split
# Hold out 10% of the rows for testing; stratifying on Y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=1,
)

In [93]:
# Shapes of the full / train / test parts, features first and labels second
print(*(part.shape for part in (X, X_train, X_test)))
print(*(part.shape for part in (Y, Y_train, Y_test)))

(5572, 6296) (5014, 6296) (558, 6296)
(5572,) (5014,) (558,)


In [94]:
# Instantiating the model: a logistic-regression classifier with default settings
model = LogisticRegression()

In [95]:
# Fitting the model on the training features and their labels
model.fit(X_train, Y_train)

In [96]:
# Model Evaluation (train)
# Predict the labels of the rows the model was trained on.
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred): the ground-truth labels go first.
train_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of the train data : ', train_data_accuracy)

Accuracy score of the train data :  0.9734742720382927


In [97]:
# Model Evaluation (test)
# Predict the labels of the held-out rows.
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred): the ground-truth labels go first.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.967741935483871


In [98]:
# Predicting Emails
# Classify the whole test set with a single call instead of invoking the
# model once per row.
test_predictions = model.predict(X_test)

# Report every test position whose predicted label differs from the true one.
for i, actual_label in enumerate(Y_test):
    if test_predictions[i] != actual_label:
        # Slice (rather than index) so the label is rendered as a one-element
        # array, e.g. ['ham'].
        print(f"At X_test[{i}] we predicted {test_predictions[i:i + 1]} and it was not correct!")

At X_test[34] we predicted ['ham'] and it was not correct!
At X_test[81] we predicted ['ham'] and it was not correct!
At X_test[85] we predicted ['ham'] and it was not correct!
At X_test[186] we predicted ['ham'] and it was not correct!
At X_test[210] we predicted ['ham'] and it was not correct!
At X_test[213] we predicted ['ham'] and it was not correct!
At X_test[233] we predicted ['ham'] and it was not correct!
At X_test[237] we predicted ['ham'] and it was not correct!
At X_test[280] we predicted ['ham'] and it was not correct!
At X_test[286] we predicted ['ham'] and it was not correct!
At X_test[346] we predicted ['ham'] and it was not correct!
At X_test[358] we predicted ['ham'] and it was not correct!
At X_test[360] we predicted ['ham'] and it was not correct!
At X_test[376] we predicted ['ham'] and it was not correct!
At X_test[403] we predicted ['ham'] and it was not correct!
At X_test[517] we predicted ['ham'] and it was not correct!
At X_test[532] we predicted ['ham'] and it 