In [2]:
#importing dependencies
import numpy as np #arrays 
import pandas as pd #dataframe and data analysis
from sklearn.model_selection import train_test_split #this function is used to plit the dataset into training and testing 
from sklearn.feature_extraction.text import TfidfVectorizer #text data into numerical values
from sklearn.linear_model import LogisticRegression #used for classifying spam and ham
from sklearn.metrics import accuracy_score #Evaluate or model and check how well the model is function

In [3]:
# Read the mail dataset from the CSV file into a pandas DataFrame.
df = pd.read_csv('mail_data.csv')

In [4]:
# Display the loaded frame to check its columns (Category, Message) and row count.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# Replace missing values with an empty string.
# DataFrame.where keeps every cell for which the condition is True (here: the
# cell is not null) and substitutes '' everywhere else.
# The result is bound back to `df` because every later cell reads `df`;
# previously the cleaned frame was stored only in `mail_data` and never used,
# so the null handling had no effect on the rest of the notebook.
df = df.where(pd.notnull(df), '')
# Keep the original name available as an alias for the cleaned frame.
mail_data = df


In [6]:
df.head() # displays the first five rows as the cell output

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Label encoding: spam -> 0, ham -> 1.
# Both row masks are computed from the original string labels first, then the
# numeric codes are written into the 'Category' column in place.
is_spam = df['Category'] == 'spam'
is_ham = df['Category'] == 'ham'
df.loc[is_spam, 'Category'] = 0
df.loc[is_ham, 'Category'] = 1

In [8]:
# Separate the data into the input texts and the target labels.
# X holds the message texts (model input); Y holds the encoded category (model target).
X, Y = df['Message'], df['Category']

In [9]:
# Inspect the message texts selected from the 'Message' column.
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [10]:
# Inspect the labels selected from the 'Category' column (0 = spam, 1 = ham).
Y


0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [11]:
# Split the messages and their labels into training and test sets.
# train_test_split returns four objects; X and Y are split together, so each
# message stays paired with its own label.
#   test_size=0.2  -> 20% of the rows go to (X_test, Y_test), 80% to (X_train, Y_train)
#   random_state=3 -> fixes the shuffle seed, so the same split is produced on every run
split_options = dict(test_size=0.2, random_state=3)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [12]:
# Print the shape of the full, training and test message sets, in that order.
# Each is a one-dimensional Series, so the shape is a 1-tuple holding the row count.
for messages in (X, X_train, X_test):
    print(messages.shape)

(5572,)
(4457,)
(1115,)


In [13]:
# Feature extraction
# Turn the raw message text into TF-IDF feature vectors that can be used as
# input to the logistic regression.
#   min_df=1             -> keep every term that occurs in at least one training
#                           message (min_df is a document-frequency cut-off,
#                           not a threshold on the TF-IDF score)
#   stop_words='english' -> drop common English words (was, is, are, ...) that
#                           carry little information
#   lowercase=True       -> convert all text to lowercase before tokenizing.
#                           This must be the boolean True: the string 'true' is
#                           rejected by scikit-learn's parameter validation in
#                           recent releases.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# fit_transform does two things: it learns the vocabulary and IDF weights from
# the training messages, then converts those messages into numeric vectors.
X_train_features = feature_extraction.fit_transform(X_train)

# The test messages are only transformed with the already-fitted vectorizer,
# so nothing is learned from the test set.
X_test_features = feature_extraction.transform(X_test)

# The labels are held with object dtype; cast them to integers for the classifier.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')


In [14]:
# Printing the sparse feature matrix lists one "(row, column)  value" entry per stored (non-zero) weight.
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [15]:
# Create the classifier: a logistic regression with scikit-learn's default settings.
# It is only constructed here; fitting happens in the next cell.
model = LogisticRegression()


In [16]:
# Train the logistic regression model on the training data.
# X_train_features holds the numeric feature vectors built from X_train and
# Y_train holds the corresponding labels. fit() returns the fitted estimator,
# which is shown as this cell's output.
model.fit(X=X_train_features, y=Y_train)

LogisticRegression()

In [17]:
# Evaluate the trained model on the training data.
# The model predicts a label for every training message...
prediction_on_training_data = model.predict(X_train_features)
# ...and the predictions are compared with the true labels in Y_train.
accuracy_on_training_data = accuracy_score(y_true=Y_train, y_pred=prediction_on_training_data)

In [18]:
# Report the fraction of training messages that were classified correctly.
print(f"Accuracy on training data=  {accuracy_on_training_data}")

Accuracy on training data=  0.9670181736594121


In [19]:
# Evaluate the trained model on the held-out test data.
# Predict a label for every test message, then compare with the true labels in Y_test.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(y_true=Y_test, y_pred=prediction_on_test_data)

In [20]:
# Report the fraction of test messages that were classified correctly.
print(f"Accuracy on test data=  {accuracy_on_test_data}")

Accuracy on test data=  0.9659192825112107


In [21]:
# Predictive system: classify a new, hand-written message.
input_mail = ['Hello how are you']

# Vectorize the message with the already-fitted TF-IDF vectorizer.
input_data_features = feature_extraction.transform(input_mail)

# Predict the label and show the raw prediction array.
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 means ham; anything else is reported as spam.
print("Ham mail" if prediction[0] == 1 else "Spam mail")
    

[1]
Ham mail
