# **Importing Necessary Libraries**

In [101]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

# **Data Collection and Pre Processing**

In [102]:
# Load the raw mail dataset into a DataFrame.
# NOTE(review): this is an absolute path under /content with a "(2)" suffix in the file name,
# which looks like a re-uploaded copy -- confirm the file exists under this exact name before re-running.
raw_mail_data=pd.read_csv('/content/mail_data (2).csv')

In [103]:
# raw_mail_data=pd.read_csv('/content/mail_data.csv')

In [104]:
# # Possible encodings
# encodings = ['utf-8','latin1','ISO-8859-1','cp1252']

# # Iterating over encodings to see the best encoding
# for encoding in encodings:
#   try:
#     raw_mail_data = pd.read_csv(file_path,encoding=encoding)
#     print(f"Successfully encoded,{encoding}")
#     break
#   except UnicodeDecodeError:
#     print(f"Failed to encode using {encoding}")
#     continue
# if 'raw_mail_data' in locals():
#   print(f'Encoded Successfully using {encoding}')
# else:
#   print('Any encoding is not used to encode the data')

In [105]:
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [106]:
raw_mail_data.shape

(5572, 2)

In [107]:
raw_mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [108]:
# Swap every missing cell for an empty string; the raw frame itself is left untouched
mail_data=raw_mail_data.mask(raw_mail_data.isna(),'')

In [109]:
mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [110]:
mail_data.shape

(5572, 2)

In [111]:
# Printing the first five rows
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [112]:
# Checking the no of rows and cols in data frame
mail_data.shape

(5572, 2)

# Label Encoding

In [113]:
# Label spam mail as 1 and ham mail as 0.
# LabelEncoder assigns integer codes in sorted class order, so 'ham' -> 0 and 'spam' -> 1
# (this matches the encoded Category values shown by the head() call in the next cell).
le=LabelEncoder()
# Overwrites the text labels in the Category column with their integer codes
mail_data['Category']=le.fit_transform(mail_data['Category'])

In [114]:
mail_data.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [115]:
# Separate the data into the input text (X) and the encoded target label (Y)
X,Y=mail_data['Message'],mail_data['Category']

In [116]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [117]:
Y.head()

0    0
1    0
2    1
3    0
4    0
Name: Category, dtype: int64

**Splitting the data into train data and test data**

In [118]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible
X_train,X_test,Y_train,Y_test=train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [119]:
X_train.head()

1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
Name: Message, dtype: object

In [120]:
Y_train.head()

1978    1
3989    0
3935    0
4078    0
4086    1
Name: Category, dtype: int64

In [121]:
X_test.head()

3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
Name: Message, dtype: object

In [122]:
Y_test.head()

3245    0
944     0
1044    0
2484    0
812     0
Name: Category, dtype: int64

In [123]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)


(5572,)
(4457,)
(1115,)


In [124]:
print(Y.shape)
print(Y_train.shape)
print(Y_test.shape)

(5572,)
(4457,)
(1115,)


# **Feature Extraction**

In [125]:
# Convert the Message column into numeric feature vectors so it can be fed as input to the logistic regression model
tfidf_vector=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)
# TfidfVectorizer turns each message into a vector of TF-IDF scores: a term scores higher the more often it occurs in the message, and lower the more messages in the corpus contain it.
# min_df=1 means a term is kept if it appears in at least 1 document, so no term is dropped for being rare (1 is also the library default).
# stop_words='english' removes common English words (the, is, about, did, ...) using scikit-learn's built-in list; lowercase=True lowercases all text before tokenizing.

In [126]:
X_train_feat=tfidf_vector.fit_transform(X_train)
# fit_transform first fits the vectorizer on the training messages (learning the vocabulary and IDF weights), then returns those messages as a TF-IDF matrix

In [127]:
X_test_feat=tfidf_vector.transform(X_test)
# Only transform() is used here: the test messages are encoded with the vocabulary and weights the vectorizer already learned from the training data, so nothing is fitted on the test set

In [128]:
# Converting Y_test and Y_train to integers
# NOTE(review): the Y.head() output above already reports dtype int64, so this cast appears to be a no-op -- confirm before removing
Y_train=Y_train.astype('int')
Y_test=Y_test.astype('int')

In [129]:
print(X_train_feat)

  (0, 5818)	0.22682143517864364
  (0, 2497)	0.2442158912653505
  (0, 694)	0.3171299579602537
  (0, 6264)	0.1898892037332199
  (0, 5800)	0.17558937755823417
  (0, 3262)	0.33791755486732394
  (0, 2049)	0.3034375179183143
  (0, 7300)	0.24288153842988894
  (0, 2724)	0.3544175987866074
  (0, 354)	0.3544175987866074
  (0, 7162)	0.2550284465664535
  (0, 258)	0.2379428657041507
  (0, 7222)	0.2173884735352799
  (0, 5512)	0.1898892037332199
  (1, 2555)	0.3840709491751004
  (1, 3804)	0.1902902346515268
  (1, 3932)	0.24325511357721427
  (1, 4509)	0.4028245991060671
  (1, 2440)	0.33870544648398715
  (1, 3333)	0.20665394084233096
  (1, 5650)	0.360444144470318
  (1, 2335)	0.2162321275166079
  (1, 6738)	0.28986069568918
  (1, 6109)	0.3239762634465801
  (1, 3267)	0.2678713077029217
  :	:
  (4452, 2438)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 3978)	0.4574160733416501
  (4452, 3290)	0.26370969643076225
  (4452, 3084)	0.22948428918295163
  (4452, 2236)	0.2676662072392096
  (4453, 387

In [130]:
print(X_test_feat)

  (0, 4942)	0.2260795967233104
  (0, 4100)	0.2783653715582127
  (0, 3955)	0.30969913392437864
  (0, 3395)	0.6599992933708911
  (0, 3225)	0.32999964668544557
  (0, 2173)	0.24736140852983116
  (0, 2065)	0.29632752800743906
  (0, 1751)	0.2863401438180079
  (1, 7158)	0.3981347747267476
  (1, 6986)	0.2493471978387002
  (1, 6642)	0.326271353777915
  (1, 6544)	0.2204999931204713
  (1, 5430)	0.387052012561607
  (1, 4044)	0.3234324946551934
  (1, 3443)	0.3234324946551934
  (1, 1975)	0.3578586983359201
  (1, 1361)	0.37034060973735533
  (2, 6570)	0.2671012270734155
  (2, 5597)	0.38473841792677693
  (2, 4369)	0.37140936745963093
  (2, 3510)	0.35262312595844614
  (2, 3084)	0.19302212472396826
  (2, 3067)	0.19302212472396826
  (2, 2377)	0.37140936745963093
  (2, 1292)	0.5530689808395817
  :	:
  (1110, 6142)	0.184212737624403
  (1110, 5204)	0.20379483330125558
  (1110, 4806)	0.21000774387718502
  (1110, 4497)	0.4617602822459726
  (1110, 4105)	0.19205506803469924
  (1110, 3938)	0.19408816272334495
  (

**Training Model**

**Logistic Regression**

In [131]:
# Logistic regression classifier created with no arguments, i.e. scikit-learn's default hyperparameters
model_lr=LogisticRegression()

In [132]:
# Train the logistic regression model on the TF-IDF training features and the encoded training labels
model_lr.fit(X_train_feat,Y_train)

**Evaluating the trained model**

In [133]:
# Evaluate the logistic regression model on the training split.
# confusion_matrix rows are the true classes and columns the predicted classes (0 = ham, 1 = spam);
# precision_score is computed for the positive class, i.e. spam (label 1).
predict_train_data=model_lr.predict(X_train_feat)
accuracy_on_train=accuracy_score(Y_train,predict_train_data)
print('Accuracy on Trained data:',accuracy_on_train)
print('Confusion Matrix:',confusion_matrix(Y_train,predict_train_data))
print('precision Score:',precision_score(Y_train,predict_train_data))

print()

# Same metrics on the held-out test split
predict_test_data=model_lr.predict(X_test_feat)
accuracy_on_test=accuracy_score(Y_test,predict_test_data)
print('Accuracy On Test data:',accuracy_on_test)
# Label fixed: was misspelled 'Confusion Matric:'
print('Confusion Matrix:',confusion_matrix(Y_test,predict_test_data))
print('Precision Score:',precision_score(Y_test,predict_test_data))

Accuracy on Trained data: 0.9661207089970832
Confusion Matrix: [[3855    4]
 [ 147  451]]
precision Score: 0.9912087912087912

Accuracy On Test data: 0.967713004484305
Confusion Matric: [[966   0]
 [ 36 113]]
Precision Score: 1.0


**Naive bayes**

In [134]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [135]:
# One instance of each Naive Bayes variant, all created with no arguments (library defaults)
gnb=GaussianNB()
mnb=MultinomialNB()
bnb=BernoulliNB()

In [136]:
# Convert the TF-IDF feature matrices to dense arrays for the Naive Bayes cells below.
# toarray() materializes every (row, feature) entry, including the zeros, so these copies are
# much larger in memory than the matrices they come from.
X_train_dense=X_train_feat.toarray()
X_test_dense=X_test_feat.toarray()


**Gaussian**

In [137]:
# Fit Gaussian Naive Bayes on the dense training features, then report accuracy,
# confusion matrix and precision on the training split.
gnb.fit(X_train_dense,Y_train)
predict_on_train_gnb=gnb.predict(X_train_dense)
accuracy_on_train_gnb=accuracy_score(Y_train,predict_on_train_gnb)
print('Accuracy on train GNB:',accuracy_on_train_gnb)
print('Confusion_matrix: ',confusion_matrix(Y_train,predict_on_train_gnb))
print('Precision_Score: ',precision_score(Y_train,predict_on_train_gnb))

print()

# Same metrics on the held-out test split
predict_on_test_gnb=gnb.predict(X_test_dense)
accuracy_on_test_gnb=accuracy_score(Y_test,predict_on_test_gnb)
print('Accuracy on test GNB:',accuracy_on_test_gnb)
print('Confusion_matrix: ',confusion_matrix(Y_test,predict_on_test_gnb))
print('Precision_Score: ',precision_score(Y_test,predict_on_test_gnb))

Accuracy on train GNB: 0.9371774736369756
Confusion_matrix:  [[3579  280]
 [   0  598]]
Precision_Score:  0.6810933940774487

Accuracy on test GNB: 0.8941704035874439
Confusion_matrix:  [[860 106]
 [ 12 137]]
Precision_Score:  0.5637860082304527


**Multinomial**

In [138]:
# Fit Multinomial Naive Bayes and report accuracy, confusion matrix and precision on the training split.
# The sparse TF-IDF matrices are passed directly: MultinomialNB accepts sparse input
# (the single-message prediction cell further down already calls mnb.predict on a sparse matrix),
# so the memory-hungry dense copies are not needed here.
mnb.fit(X_train_feat,Y_train)
predict_on_train_mnb=mnb.predict(X_train_feat)
accuracy_on_train_mnb=accuracy_score(Y_train,predict_on_train_mnb)
print('Accuracy on train mNB:',accuracy_on_train_mnb)
print('Confusion_matrix: ',confusion_matrix(Y_train,predict_on_train_mnb))
print('Precision_Score: ',precision_score(Y_train,predict_on_train_mnb))

print()

# Same metrics on the held-out test split
predict_on_test_mnb=mnb.predict(X_test_feat)
accuracy_on_test_mnb=accuracy_score(Y_test,predict_on_test_mnb)
print('Accuracy on test MNB:',accuracy_on_test_mnb)
print('Confusion_matrix: ',confusion_matrix(Y_test,predict_on_test_mnb))
print('Precision_Score: ',precision_score(Y_test,predict_on_test_mnb))

Accuracy on train mNB: 0.9813776082566749
Confusion_matrix:  [[3859    0]
 [  83  515]]
Precision_Score:  1.0

Accuracy on test MNB: 0.9766816143497757
Confusion_matrix:  [[966   0]
 [ 26 123]]
Precision_Score:  1.0


**Bernoulli**

In [139]:
# Fit Bernoulli Naive Bayes and report accuracy, confusion matrix and precision on the training split.
# The sparse TF-IDF matrices are passed directly: BernoulliNB accepts sparse input,
# so the memory-hungry dense copies are not needed here.
bnb.fit(X_train_feat,Y_train)
predict_on_train_bnb=bnb.predict(X_train_feat)
accuracy_on_train_bnb=accuracy_score(Y_train,predict_on_train_bnb)
print('Accuracy on train bNB:',accuracy_on_train_bnb)
print('Confusion_matrix: ',confusion_matrix(Y_train,predict_on_train_bnb))
print('Precision_Score: ',precision_score(Y_train,predict_on_train_bnb))

print()

# Same metrics on the held-out test split
predict_on_test_bnb=bnb.predict(X_test_feat)
accuracy_on_test_bnb=accuracy_score(Y_test,predict_on_test_bnb)
print('Accuracy on test BNB:',accuracy_on_test_bnb)
print('Confusion_matrix: ',confusion_matrix(Y_test,predict_on_test_bnb))
print('Precision_Score: ',precision_score(Y_test,predict_on_test_bnb))

Accuracy on train bNB: 0.9858649315683194
Confusion_matrix:  [[3856    3]
 [  60  538]]
Precision_Score:  0.9944547134935305

Accuracy on test BNB: 0.9757847533632287
Confusion_matrix:  [[965   1]
 [ 26 123]]
Precision_Score:  0.9919354838709677


**Predictive System**

In [140]:
# Single example message, wrapped in a list because the vectorizer expects an iterable of documents
input_mail = ["Hey, just checking in to see how you're doing. Let me know if you need anything!"]

# Encode the message with the already-fitted TF-IDF vectorizer (transform only, no refit)
input_data_feat=tfidf_vector.transform(input_mail)

# Predict with the logistic regression model; the printed array holds the encoded class (0 = ham, 1 = spam)
predicted_val=model_lr.predict(input_data_feat)
print(predicted_val)


[0]


In [141]:
# Same example message as in the logistic regression prediction cell
input_mail = ["Hey, just checking in to see how you're doing. Let me know if you need anything!"]

# Encode the message with the already-fitted TF-IDF vectorizer (transform only, no refit)
input_data_feat=tfidf_vector.transform(input_mail)

# Predict with the Multinomial Naive Bayes model.
# This overwrites predicted_val, so the label printed by the following cell reflects this model's prediction.
predicted_val=mnb.predict(input_data_feat)
print(predicted_val)


[0]


In [142]:
# Translate the numeric prediction into a readable label: 1 is spam, anything else is ham
print('Spam Mail' if predicted_val[0]==1 else 'Ham Mail')

Ham Mail


In [143]:
import pickle

# Persist the fitted TF-IDF vectorizer and the logistic regression model to disk.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling raises (the bare open() calls were never closed).
with open('vector.pkl','wb') as vector_file:
    pickle.dump(tfidf_vector,vector_file)
with open('model.pkl','wb') as model_file:
    pickle.dump(model_lr,model_file)

In [144]:
# Persist the fitted Multinomial Naive Bayes model; the `with` block closes the file
# handle deterministically (the bare open() call was never closed).
with open('mnb.pkl','wb') as mnb_file:
    pickle.dump(mnb,mnb_file)