<a href="https://colab.research.google.com/github/Karanamshivakumar22/Machine_learning_projects/blob/main/spam_mail_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing the Dependencies**

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


**Getting the Data from the kaggle**

In [34]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("abdallahwagih/spam-emails")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/spam-emails


**Loading the Dataset**

In [35]:
raw_df = pd.read_csv(path+"/spam.csv")

In [36]:
#Checking the dataset
raw_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
raw_df.shape

(5572, 2)

In [38]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [39]:
raw_df.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


In [40]:
raw_df['Message'].value_counts()

Unnamed: 0_level_0,count
Message,Unnamed: 1_level_1
"Sorry, I'll call later",30
I cant pick the phone right now. Pls send a message,12
Ok...,10
"Say this slowly.? GOD,I LOVE YOU &amp; I NEED YOU,CLEAN MY HEART WITH YOUR BLOOD.Send this to Ten special people &amp; u c miracle tomorrow, do it,pls,pls do it...",4
Please call our customer service representative on FREEPHONE 0808 145 4742 between 9am-11pm as you have WON a guaranteed £1000 cash or £5000 prize!,4
...,...
I think asking for a gym is the excuse for lazy people. I jog.,1
I love your ass! Do you enjoy doggy style? :),1
"Come to me, slave. Your doing it again ... Going into your shell and unconsciously avoiding me ... You are making me unhappy :-(",1
Ok.. Ü finishing soon?,1


In [41]:
raw_df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


Label Encoding

In [42]:
# label spam mail -->0
# label ham mail --> 1

raw_df.loc[raw_df['Category'] == 'spam', 'Category',] = 0
raw_df.loc[raw_df['Category'] == 'ham', 'Category',] = 1

In [53]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [44]:
# Separating the Label and text
X= raw_df['Message']
Y= raw_df['Category']

Train test split

In [46]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [47]:
print(X.shape, X_train.shape, X_test.shape)

(5572,) (4457,) (1115,)


Feature Extraction

In [54]:
#Transform text data to data
from sklearn.feature_extraction.text import TfidfVectorizer
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Convert Y_train and Y_test values to integers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [56]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_features,Y_train)

In [60]:
#Checkin the training data

from sklearn.metrics import accuracy_score,classification_report

training_pred = model.predict(X_train_features)

print("Training Accuracy",accuracy_score(training_pred,Y_train))
print("Classification Report of Training Data")
print(classification_report(training_pred,Y_train))

Training Accuracy 0.9676912721561588
Classification Report of Training Data
              precision    recall  f1-score   support

           0       0.76      0.99      0.86       456
           1       1.00      0.97      0.98      4001

    accuracy                           0.97      4457
   macro avg       0.88      0.98      0.92      4457
weighted avg       0.97      0.97      0.97      4457



In [61]:
#Testing data predictions

testing_pred = model.predict(X_test_features)

print("Testing Accuracy",accuracy_score(testing_pred,Y_test))
print("Classification Report of Testing Data")
print(classification_report(testing_pred,Y_test))

Testing Accuracy 0.9668161434977578
Classification Report of Testing Data
              precision    recall  f1-score   support

           0       0.76      1.00      0.86       118
           1       1.00      0.96      0.98       997

    accuracy                           0.97      1115
   macro avg       0.88      0.98      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [62]:
from sklearn.naive_bayes import MultinomialNB
model2 = MultinomialNB()
model2.fit(X_train_features,Y_train)

In [65]:
#Checking the Training data accuracy
training_pred2 = model2.predict(X_train_features)

print("Training Accuracy",accuracy_score(training_pred2,Y_train))
print("Classification Report of Training Data")
print(classification_report(training_pred2,Y_train))

Training Accuracy 0.9807045097599282
Classification Report of Training Data
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       506
           1       1.00      0.98      0.99      3951

    accuracy                           0.98      4457
   macro avg       0.93      0.99      0.96      4457
weighted avg       0.98      0.98      0.98      4457



In [66]:
#checking The testing data for model2
testing_pred2 = model2.predict(X_test_features)

print("Testing Accuracy",accuracy_score(testing_pred2,Y_test))
print("Classification Report of Testing Data")
print(classification_report(testing_pred2,Y_test))


Testing Accuracy 0.9730941704035875
Classification Report of Testing Data
              precision    recall  f1-score   support

           0       0.81      1.00      0.89       125
           1       1.00      0.97      0.98       990

    accuracy                           0.97      1115
   macro avg       0.90      0.98      0.94      1115
weighted avg       0.98      0.97      0.97      1115



# **Predictive System**

In [77]:
input_mail = ["I HAVE A DATE ON SUNDAY WITH WILL!!"]

input_mail = feature_extraction.transform(input_mail)

prediction = model.predict(input_mail)
print(prediction)

if(prediction[0]==1):
  print("Ham Mail")
else:
  print("Spam Mail")

[1]
Ham Mail


In [76]:
input_mail = ["XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> h..."]

input_mail = feature_extraction.transform(input_mail)

prediction = model2.predict(input_mail)
print(prediction)

if(prediction[0]==1):
  print("Ham Mail")
else:
  print("Spam Mail")

[1]
Ham Mail
