# Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Load the labelled messages from the CSV file (relative path, resolved against the
# notebook's working directory) and display the frame; pandas truncates the preview.
df=pd.read_csv("mail_data.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Keep every non-missing cell as-is and substitute an empty string wherever a value is missing
df1 = df.where(df.notna(), '')

In [4]:
# Preview the first five rows of the null-free frame
df1.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Check the size of the cleaned frame as a (rows, columns) tuple
df1.shape

(5572, 2)

# Label Encoding

In [6]:
# Encode the target column in place: spam mail -> 0, ham mail -> 1.
# The mask is recomputed for each label, in the order spam then ham.
for label, code in (('spam', 0), ('ham', 1)):
    df1.loc[df1['Category'] == label, 'Category'] = code

Spam ----->0

ham ------>1

In [9]:
# Separate the data into texts and labels: x holds the raw message text (model input)
x=df1['Message']
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [10]:
# y holds the encoded labels (0 = spam, 1 = ham) taken from the Category column
y=df1['Category']
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [11]:
# Hold out 20% of the messages for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [12]:
# Report the sizes of the full, training and testing message sets, in that order
print(*(part.shape for part in (x, x_train, x_test)))

(5572,) (4457,) (1115,)


# Feature Extraction

tf-idf is a term-weighting measure: documents are turned into tf-idf vectors, and similarity measures such as cosine similarity are then applied to those vectors to find documents that are similar to a given search query.

TfidfVectorizer calculates tf-idf values

What is Term Frequency (tf)?

tf is the number of times a term appears in a particular document.

Inverse Document Frequency (idf)

idf is a measure of how common or rare a term is across the entire corpus of documents.

In [15]:
# Transform the text data into feature vectors that can be used as input to the logistic regression.
#   min_df=1             -> keep every term that appears in at least one document
#   stop_words='english' -> drop common English stop words
#   lowercase=True       -> fold all text to lower case before tokenizing
# NOTE: lowercase must be the boolean True, not the string 'True'; scikit-learn releases
# that validate parameter types reject a string for this argument.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [28]:
# Fit the vectorizer on the training messages only and vectorize them in one step,
# then reuse that fitted vectorizer (transform, no refit) for the test messages so
# nothing is learned from the test split.
x_train_features=feature_extraction.fit_transform(x_train)
x_test_features=feature_extraction.transform(x_test)

In [29]:
# Cast both label series to integers (they are object dtype after the in-place encoding)
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [30]:
# Display the raw training messages (the text, not the TF-IDF features)
x_train

3890                    Unlimited texts. Limited minutes.
5553                          Hahaha..use your brain dear
4366    Ujhhhhhhh computer shipped out with address to...
3968    YOU HAVE WON! As a valued Vodafone customer ou...
3771    Love it! The girls at the office may wonder wh...
                              ...                        
3335    That's fine, have him give me a call if he kno...
1099    NO GIFTS!! You trying to get me to throw mysel...
2514    U have won a nokia 6230 plus a free digital ca...
3606                      Jordan got voted out last nite!
2575    Your next amazing xxx PICSFREE1 video will be ...
Name: Message, Length: 4457, dtype: object

In [31]:
# Print the TF-IDF training matrix; it is shown as (row, column) index pairs with their weights
print(x_train_features)

  (0, 4334)	0.42941702167641554
  (0, 3958)	0.6161071828926097
  (0, 6586)	0.44333254982109394
  (0, 6927)	0.48935591439341625
  (1, 2121)	0.3573617143022146
  (1, 1428)	0.5869421390016223
  (1, 6971)	0.42812434651556874
  (1, 3168)	0.5869421390016223
  (2, 5115)	0.3408491178137899
  (2, 7353)	0.31988118061968496
  (2, 3852)	0.3408491178137899
  (2, 4884)	0.35749230587184955
  (2, 5695)	0.35749230587184955
  (2, 806)	0.26730249393705324
  (2, 5894)	0.35749230587184955
  (2, 1876)	0.28751725124107325
  (2, 6878)	0.35749230587184955
  (3, 197)	0.36522237107066735
  (3, 3723)	0.16297045459835785
  (3, 2435)	0.26698378141852
  (3, 1825)	0.26858331513730566
  (3, 5231)	0.2266831802864503
  (3, 300)	0.2915969875465198
  (3, 7248)	0.23571908490908416
  (3, 5005)	0.3169028431039865
  :	:
  (4454, 2244)	0.2526916142542512
  (4454, 666)	0.28653660324238944
  (4454, 1575)	0.20946314330145205
  (4454, 1094)	0.24862733340971144
  (4454, 5068)	0.22284357632450164
  (4454, 311)	0.19547195974237946
  

In [32]:
# Display the raw held-out test messages
x_test

5086    Omg if its not one thing its another. My cat h...
2120                I hope you know I'm still mad at you.
2318    Waqt se pehle or naseeb se zyada kisi ko kuch ...
2917      What time should I tell my friend to be around?
1352                       Yo theres no class tmrw right?
                              ...                        
884                           Dude we should go sup again
3821    I got arrested for possession at, I shit you n...
1066                            No my mum went 2 dentist.
208                          Aight yo, dats straight dogg
1378    Double Mins & Double Txt & 1/2 price Linerenta...
Name: Message, Length: 1115, dtype: object

In [33]:
# Display the training labels after the integer cast (0 = spam, 1 = ham)
y_train

3890    1
5553    1
4366    1
3968    0
3771    1
       ..
3335    1
1099    1
2514    0
3606    1
2575    0
Name: Category, Length: 4457, dtype: int32

# Model Training

Logistic Regression

In [34]:
# Train a logistic regression with default settings on the TF-IDF training features;
# fit() returns the estimator itself, so it can be bound directly and then displayed.
lr = LogisticRegression().fit(x_train_features, y_train)
lr

LogisticRegression()

# Model Evaluation

In [35]:
# Score the model on the same data it was trained on (an optimistic baseline)
training_data_prediction = lr.predict(x_train_features)
training_accuracy_score = metrics.accuracy_score(
    y_true=y_train,
    y_pred=training_data_prediction,
)
print("Accuracy on training data:", training_accuracy_score)

Accuracy on training data: 0.9683643706529056


In [36]:
# Prediction on the held-out testing data, which the model did not see during fitting
testing_data_prediction = lr.predict(x_test_features)
testing_accuracy_score = metrics.accuracy_score(y_test, testing_data_prediction)
# Label corrected: this score is measured on the test split, not the training split
print("Accuracy on testing data:", testing_accuracy_score)

Accuracy on training data: 0.9524663677130045


# Building a predictive System

In [39]:
# Message to classify, wrapped in a list so it is passed as a one-document collection
input_mail = [
    "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
]
# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit)
input_data_features = feature_extraction.transform(input_mail)
# Predict the label for the single message: 1 = ham, 0 = spam
prediction = lr.predict(input_data_features)
print(prediction)

print("The mail is not spam(i.e.,Ham mail)" if prediction[0] == 1 else "The mail is spam")

[1]
The mail is not spam(i.e.,Ham mail)
