Create a machine learning model to predict whether a given mail is spam or not.

In [26]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [27]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [28]:
from nltk.corpus import stopwords

In [29]:
# Load the dataset from spam.csv (path is relative to the notebook's working directory).
df = pd.read_csv('spam.csv')

In [30]:
# Preview the first rows to see which columns were read.
df.head()

Unnamed: 0,label,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [31]:
# Remove the three unnamed columns so only `label` and `message` remain.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [32]:
# Show the frame again to confirm which columns are left.
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [33]:
# Summary statistics per column (count, unique values, most frequent value and its frequency).
df.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [34]:
# (rows, columns) of the current frame.
df.shape

(5572, 2)

In [35]:
# Total number of missing values across all columns (first sum is per column, second sums those).
df.isnull().sum().sum()

0

In [36]:
# Encode the target as integers: ham -> 0, spam -> 1.
#
# The result is assigned back to the column explicitly. Calling
# `.replace(..., inplace=True)` on `df['label']` operates on a selected column
# object; pandas 2.x warns about this pattern and, with Copy-on-Write enabled,
# the original frame is left unchanged.
#
# Values that are not keys of the mapping are passed through untouched, so
# re-running the cell on already-encoded labels is a no-op. Any unexpected
# non-numeric label makes the final cast fail loudly instead of being kept.
LABEL_CODES = {"ham": 0, "spam": 1}
df['label'] = df['label'].map(lambda value: LABEL_CODES.get(value, value)).astype('int64')

In [37]:
# Model input: the message text column.
x = df["message"]
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object

In [38]:
# Prediction target: the label column.
y = df["label"]
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: label, Length: 5572, dtype: int64

### Train/test split

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [40]:
# Sizes of the full input, then the training part, then the test part.
for subset in (x, x_train, x_test):
    print(subset.shape)

(5572,)
(4457,)
(1115,)


### TF-IDF vectorizer and Multinomial Naive Bayes

In [41]:
# Size of the full (unsplit) message Series.
x.shape

(5572,)

In [42]:
# Cast both label splits to the platform's default integer dtype before fitting.
y_train, y_test = (labels.astype("int") for labels in (y_train, y_test))

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- Feature extraction -----------------------------------------------------
# Lowercase the text, drop English stop words and keep at most 1000 features.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', max_features=1000)

# The vectorizer is fitted on the training messages only; the test messages are
# transformed with that already-fitted vectorizer so nothing is learned from
# the held-out data.
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

# --- Model ------------------------------------------------------------------
# fit() returns the estimator itself, so construction and training can be chained.
model1 = MultinomialNB().fit(x_train_tfidf, y_train)
y_pred = model1.predict(x_test_tfidf)

# --- Evaluation on the held-out split ---------------------------------------
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9802690582959641
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       960
           1       0.99      0.87      0.92       155

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [44]:
# Number of training samples per class.
y_train.value_counts()

0    3865
1     592
Name: label, dtype: int64

In [45]:
# Labels predicted by model1 for the test split.
y_pred

array([0, 1, 0, ..., 0, 0, 0])

In [46]:
# True labels of the test split, for comparison with the predictions.
y_test

2632    0
454     1
983     0
1282    0
4610    0
       ..
4827    0
5291    0
3325    0
3561    0
1136    1
Name: label, Length: 1115, dtype: int32

### Logistic Regression

In [47]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic regression classifier (default hyperparameters) on the same
# TF-IDF features used for model1.
model2 = LogisticRegression()
model2.fit(x_train_tfidf, y_train)

# Score it on the held-out split.
# Note: `accuracy` is rebound here, replacing the value computed for model1.
y_pred1 = model2.predict(x_test_tfidf)
accuracy = accuracy_score(y_test, y_pred1)
print("Accuracy:", accuracy)


Accuracy: 0.9730941704035875


### Predict on new data

In [48]:
# Classify a hand-written example with model1, reusing the fitted vectorizer.
input_your_mail = ["Congratulations! You have won a free vacation to an exotic destination. Click the link to claim your prize now!"]
input_data_features = tfidf_vectorizer.transform(input_your_mail)
prediction = model1.predict(input_data_features)
print(prediction)

# Label 0 is reported as ham; any other label is reported as spam.
print("Ham Mail" if prediction[0] == 0 else "Spam Mail")

[1]
Spam Mail


In [49]:
# Classify a hand-written example with model2, reusing the fitted vectorizer.
input_your_mail = ["You have won a free vacation to an exotic destination."]
input_data_features = tfidf_vectorizer.transform(input_your_mail)
prediction = model2.predict(input_data_features)
print(prediction)

# Label 0 is reported as ham; any other label is reported as spam.
print("Ham Mail" if prediction[0] == 0 else "Spam Mail")

[1]
Spam Mail


In [50]:
# Classify an everyday (non-promotional) example with model1.
input_your_mail = ["Do ou have free time on sunday for an outdoor meeting?."]
input_data_features = tfidf_vectorizer.transform(input_your_mail)
prediction = model1.predict(input_data_features)
print(prediction)

# Label 0 is reported as ham; any other label is reported as spam.
print("Ham Mail" if prediction[0] == 0 else "Spam Mail")

[0]
Ham Mail
