In [63]:
import pandas as pd
import numpy as np

In [64]:
# Load the labelled messages; row 0 of the CSV supplies the column names.
df = pd.read_csv(filepath_or_buffer='mail_data.csv', header=0)

In [65]:
# Peek at the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [66]:
# Summarise the frame: index range, per-column non-null counts and dtypes,
# and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [67]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Category    0
Message     0
dtype: int64

In [68]:
# Show the column labels of the loaded frame.
df.columns

Index(['Category', 'Message'], dtype='object')

In [69]:
# List the distinct label values present in the 'Category' column.
pd.unique(df['Category'])

array(['ham', 'spam'], dtype=object)

In [70]:
# Encode the text labels as integers: ham -> 1, spam -> 0.
#
# An explicit per-value lookup is used instead of `df.replace(..., inplace=True)`:
#   * it does not depend on replace() silently downcasting an object column to int,
#     which newer pandas versions deprecate;
#   * values that are already encoded pass through unchanged, so re-running this
#     cell is harmless.
LABEL_CODES = {'ham': 1, 'spam': 0}
encoded_category = df['Category'].map(lambda label: LABEL_CODES.get(label, label))

# Fail loudly if the column holds anything other than the two known labels
# (including missing values) rather than training on a half-encoded column.
assert encoded_category.isin([0, 1]).all(), "unexpected value in 'Category' column"

# Make the integer dtype explicit so downstream estimators receive numeric labels.
df['Category'] = encoded_category.astype('int64')

In [71]:
# Re-inspect the first five rows to confirm the labels are now numeric.
df.head(n=5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [72]:
# Class balance: how many messages carry each label.
df.loc[:, 'Category'].value_counts()

1    4825
0     747
Name: Category, dtype: int64

In [73]:
# Split the frame into model input (message text) and target (label).
X, Y = df['Message'], df['Category']

In [74]:
# Shape of the message Series: a one-element tuple holding the number of messages.
X.shape

(5572,)

In [75]:
# Display the label Series (taken from the 'Category' column) to eyeball the values.
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: int64

In [76]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=10,
    test_size=0.2,
)

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer for the message text: lower-cases the input, drops English
# stop words, and uses a minimum document frequency of 1.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)


In [78]:
# Fit the vectorizer on the training messages only, then apply that same fitted
# vectorizer to the held-out messages.
X_train_features = feature_extraction.fit_transform(raw_documents=X_train)
X_test_features = feature_extraction.transform(raw_documents=X_test)

# Make sure both label Series are integer-typed before they reach the estimator.
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

In [79]:
# Print the training matrix produced by the vectorizer; each printed entry is a
# (row, column) position followed by its stored weight.
print(X_train_features)

  (0, 5218)	1.0
  (1, 4745)	0.4534886728220183
  (1, 7266)	0.39349136612703317
  (1, 3213)	0.47563189358898544
  (1, 5831)	0.47563189358898544
  (1, 3258)	0.33769704827191216
  (1, 3117)	0.2702256012963159
  (2, 6650)	0.4724534451200627
  (2, 4618)	0.4798905751162268
  (2, 7168)	0.5655453624599882
  (2, 7324)	0.4760790072128801
  (3, 2111)	0.6471064429181299
  (3, 4078)	0.3578260568100486
  (3, 6689)	0.3660825403036148
  (3, 2099)	0.34395340550548065
  (3, 4599)	0.44821132641606853
  (4, 4264)	0.19400419560355137
  (4, 644)	0.32783379543981434
  (4, 7038)	0.30174250755529963
  (4, 3727)	0.13960291836824296
  (4, 5822)	0.20666960669811668
  (4, 2096)	0.24955993178627037
  (4, 6552)	0.3408280379552965
  (4, 2262)	0.312571370431853
  (4, 7315)	0.312571370431853
  :	:
  (4454, 2701)	0.275356125521126
  (4454, 4208)	0.24725965866005303
  (4454, 2891)	0.2523713079039201
  (4454, 3932)	0.275356125521126
  (4454, 4241)	0.23543228813185407
  (4454, 5234)	0.22427484104284712
  (4454, 1981)	0.226

In [80]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with default settings, trained on the TF-IDF
# features of the training messages.
# NOTE(review): the value counts earlier in the notebook show the two classes are
# imbalanced (4825 vs 747) — consider whether class weighting is wanted here.
model = LogisticRegression()
model.fit(X=X_train_features, y=Y_train)

In [81]:
# Predict labels for the held-out messages.
Y_pred = model.predict(X=X_test_features)

In [82]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compute the metrics up front, then report them.
cfm = confusion_matrix(Y_test, Y_pred)
acc = accuracy_score(Y_test, Y_pred)

# A single print with newline separators produces the same layout as one print
# per item, including the empty line between the matrix and the report heading.
print(
    'Confusion Matrix',
    cfm,
    '',
    'Classification Report',
    classification_report(Y_test, Y_pred),
    sep='\n',
)

# accuracy_score returns a fraction; scale it to a percentage for display.
print('Accuracy of the model =', acc * 100, '%')

Confusion Matrix
[[ 92  51]
 [  1 971]]

Classification Report
              precision    recall  f1-score   support

           0       0.99      0.64      0.78       143
           1       0.95      1.00      0.97       972

    accuracy                           0.95      1115
   macro avg       0.97      0.82      0.88      1115
weighted avg       0.96      0.95      0.95      1115

Accuracy of the model = 95.33632286995515 %


In [87]:
# Classify one hand-written message with the fitted vectorizer and model.
input_your_mail = [
    "Spooktacular Fun 👻 awaits at 'Trick or Treat' Halloween Party for Kids at Jio World Drive!"
]

# Only transform here: the vectorizer was already fitted on the training messages.
input_features = feature_extraction.transform(raw_documents=input_your_mail)
prediction = model.predict(X=input_features)

# With the encoding used in this notebook, 1 stands for ham and 0 for spam.
print(prediction)

[1]
