In [None]:
#importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix


In [None]:
#loading dataset
# The CSV location is kept in one named constant so it can be changed in a single place.
# NOTE(review): the path sits under /content/drive/MyDrive — presumably a mounted
# Google Drive in Colab; confirm the mount exists before running elsewhere.
MAIL_DATA_CSV='/content/drive/MyDrive/ML_PROJECTS/SPAM_MAIL_DETECTION/mail_data.csv'
dataset=pd.read_csv(MAIL_DATA_CSV)

In [None]:
# Show the loaded frame; pandas truncates the printout to the first and last rows
print(dataset)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [None]:
# Print an element-wise boolean mask of missing values (True where a cell is null).
# NOTE(review): the printout is truncated, so it cannot confirm that no nulls exist
# anywhere in the frame — a per-column count would answer that directly.
print(dataset.isnull())

      Category  Message
0        False    False
1        False    False
2        False    False
3        False    False
4        False    False
...        ...      ...
5567     False    False
5568     False    False
5569     False    False
5570     False    False
5571     False    False

[5572 rows x 2 columns]


In [None]:
#replacing every null value with an empty string (non-null cells are kept unchanged)
null_cells=dataset.isnull()
mail_data=dataset.mask(null_cells,'')

In [None]:
#first five rows of the null-filled copy of the dataset
print(mail_data.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
#number of rows and columns of the dataset, printed as a (rows, columns) tuple
print(mail_data.shape)

(5572, 2)


In [None]:

#encode the text labels in place: spam -> 0, ham -> 1
for label_text, label_code in (('spam', 0), ('ham', 1)):
    is_label = mail_data['Category'] == label_text
    mail_data.loc[is_label, 'Category'] = label_code

In [None]:
#separating the data into text and labels:
#X holds the message text (model input), Y holds the encoded category (model output)
X, Y = mail_data['Message'], mail_data['Category']

In [None]:
# Show the message texts that serve as model input
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [None]:

# Show the encoded labels (0 = spam, 1 = ham)
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [None]:
#splitting the data: 20% is held out for testing; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Sizes of the full message set, the training split and the test split, in that order
for messages in (X, X_train, X_test):
    print(messages.shape)

(5572,)
(4457,)
(1115,)


In [None]:
#feature extraction: turn each message into a TF-IDF weighted term vector
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit the vectorizer on the training messages only, then apply the same fitted
# vectorizer to the test messages
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

#the label column is object-typed after the 0/1 encoding; cast both splits to integers
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

In [None]:
# Show the training messages; the index keeps each message's original row number
print(X_train)

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object


In [None]:
# Print the sparse TF-IDF matrix of the training split (shape plus stored coordinate/value pairs)
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34775 stored elements and shape (4457, 7431)>
  Coords	Values
  (0, 2329)	0.38783870336935383
  (0, 3811)	0.34780165336891333
  (0, 2224)	0.413103377943378
  (0, 4456)	0.4168658090846482
  (0, 5413)	0.6198254967574347
  (1, 3811)	0.17419952275504033
  (1, 3046)	0.2503712792613518
  (1, 1991)	0.33036995955537024
  (1, 2956)	0.33036995955537024
  (1, 2758)	0.3226407885943799
  (1, 1839)	0.2784903590561455
  (1, 918)	0.22871581159877646
  (1, 2746)	0.3398297002864083
  (1, 2957)	0.3398297002864083
  (1, 3325)	0.31610586766078863
  (1, 3185)	0.29694482957694585
  (1, 4080)	0.18880584110891163
  (2, 6601)	0.6056811524587518
  (2, 2404)	0.45287711070606745
  (2, 3156)	0.4107239318312698
  (2, 407)	0.509272536051008
  (3, 7414)	0.8100020912469564
  (3, 2870)	0.5864269879324768
  (4, 2870)	0.41872147309323743
  (4, 487)	0.2899118421746198
  :	:
  (4454, 2855)	0.47210665083641806
  (4454, 2246)	0.47210665083641806
  (4455, 4456)	0.24

In [None]:
#create the logistic regression classifier with default settings (it is fitted in the next cell)
model=LogisticRegression()

In [None]:
#training the logistic regression model on the TF-IDF training features and integer labels
model.fit(X_train_features,Y_train)

In [None]:
#evaluating the model on the data it was trained on
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_traning_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
# Report the accuracy measured on the training split
print("accuracy on traning data : ",accuracy_on_traning_data)

accuracy on traning data :  0.9676912721561588


In [None]:
#evaluating the model on the held-out test data
prediction_on_testing_data = model.predict(X_test_features)
accuracy_on_testing_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_testing_data,
)

In [None]:
# Report the accuracy measured on the held-out test split
print("accuracy on testing data : ",accuracy_on_testing_data)

accuracy on testing data :  0.9668161434977578


In [None]:
# Evaluate the logistic regression predictions on the held-out test split.
# Labels are encoded spam=0 / ham=1 and scikit-learn's binary metrics default to
# pos_label=1, so these three scores describe the HAM class. pos_label is spelled
# out so that this is visible to the reader.
precision = precision_score(Y_test, prediction_on_testing_data, pos_label=1)
recall = recall_score(Y_test, prediction_on_testing_data, pos_label=1)
f1 = f1_score(Y_test, prediction_on_testing_data, pos_label=1)

# Spam is the class this detector is meant to catch, so score it as the positive
# class as well: a ham recall of 1.0 says nothing about how much spam is missed.
spam_precision = precision_score(Y_test, prediction_on_testing_data, pos_label=0)
spam_recall = recall_score(Y_test, prediction_on_testing_data, pos_label=0)
spam_f1 = f1_score(Y_test, prediction_on_testing_data, pos_label=0)

# Pin the label order: row/column 0 is spam, row/column 1 is ham
# (rows are the true labels, columns are the predicted labels).
cm = confusion_matrix(Y_test, prediction_on_testing_data, labels=[0, 1])

print("Precision (ham):", precision)
print("Recall (ham):", recall)
print("F1 Score (ham):", f1)
print("Precision (spam):", spam_precision)
print("Recall (spam):", spam_recall)
print("F1 Score (spam):", spam_f1)
print("Confusion Matrix:\n", cm)

Precision: 0.9628886659979939
Recall: 1.0
F1 Score: 0.9810935104752172
Confusion Matrix:
 [[118  37]
 [  0 960]]


In [None]:

#building a predictive system: classify one example message
input_mail=["As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"]

#vectorize the text with the already-fitted TF-IDF vectorizer (transform only, no refit)
input_data_features = feature_extraction.transform(input_mail)

#predict the label and show the raw model output
prediction = model.predict(input_data_features)
print(prediction)

#label encoding used in this notebook: 1 = ham, anything else is reported as spam
print("HAM MAIL" if prediction[0] == 1 else "SPAM MAIL")

[1]
HAM MAIL


Comparing Naive Bayes vs Logistic Regression

In [None]:
from sklearn.naive_bayes import MultinomialNB


In [None]:
#Train a multinomial Naive Bayes model on the same TF-IDF training features
#(fit returns the fitted estimator, so construction and training are chained)
nb_model = MultinomialNB().fit(X_train_features, Y_train)

#Predict labels for the held-out test features
nb_predictions = nb_model.predict(X_test_features)


In [None]:
#Evaluate Naive Bayes on the held-out test split
nb_accuracy = accuracy_score(Y_test, nb_predictions)

# Labels are encoded spam=0 / ham=1 and scikit-learn's binary metrics default to
# pos_label=1, so these three scores describe the HAM class. pos_label is spelled
# out so that this is visible to the reader.
nb_precision = precision_score(Y_test, nb_predictions, pos_label=1)
nb_recall = recall_score(Y_test, nb_predictions, pos_label=1)
nb_f1 = f1_score(Y_test, nb_predictions, pos_label=1)

# Spam is the class this detector is meant to catch, so score it as the positive
# class as well: a ham recall of 1.0 says nothing about how much spam is missed.
nb_spam_precision = precision_score(Y_test, nb_predictions, pos_label=0)
nb_spam_recall = recall_score(Y_test, nb_predictions, pos_label=0)
nb_spam_f1 = f1_score(Y_test, nb_predictions, pos_label=0)

print("Naive Bayes Results")
print("Accuracy:", nb_accuracy)
print("Precision (ham):", nb_precision)
print("Recall (ham):", nb_recall)
print("F1 (ham):", nb_f1)
print("Precision (spam):", nb_spam_precision)
print("Recall (spam):", nb_spam_recall)
print("F1 (spam):", nb_spam_f1)


Naive Bayes Results
Accuracy: 0.9730941704035875
Precision: 0.9696969696969697
Recall: 1.0
F1: 0.9846153846153847
