In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer #it will be used to convert the text data into the numeriacal value, that is more understandable by machine

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score # this will be used to evaluate our model, i.e. how well our model is running

# Data Collection & Pre-Processing

In [110]:
# Loading the data from csv file.
data = pd.read_csv('Data/mail_data.csv')
#data = pd.read_csv('Data/spam.csv')

In [111]:
raw_mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [112]:
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [113]:
data = data.drop(columns=(['Unnamed: 2']), axis='1')

KeyError: "['Unnamed: 2'] not found in axis"

In [114]:
# data = data.drop(columns=(['Unnamed: 4', 'Unnamed: 3']), axis='1')

In [115]:
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [116]:
len(data) == len(raw_mail_data)

True

In [117]:
# replacing the null values with the null string

mail_data = data.where((pd.notnull(data)), '') # we are removing the null with the null string

In [118]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [119]:
data.rename(columns= {'v1' : 'Category', 'v2' : 'Message'}, inplace=True)

In [120]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [121]:
data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [122]:
data.shape

(5572, 2)

In [141]:
# since we are working on the categorical data, and using Logistic Regression, thus we'll be changing the `Category` of the data from Ham : 0(True) and Spam : 1(False). this is called *Label Encoding*
data.loc[data['Category'] == 'spam', 'Category'] = 0
data.loc[data['Category'] == 'ham', 'Category'] = 1

In [125]:
# seperating the data as text and labels. So that we can give the values to the ML Model. This will work similar to X-Axis value and Y-Axis value.
x = data['Message'] # labels
y = data['Category'] # labels
print(x)


0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [143]:
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [160]:
x = x.drop(1115)

In [161]:
# x.iloc[1115]

'So lets make it saturday or monday as per convenience.'

In [162]:
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


## Splitting Data into Training and Testing Data

In [163]:
X_train, x_test, Y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=3)

ValueError: Found input variables with inconsistent numbers of samples: [5570, 5572]

In [164]:
print(x.shape)
print(X_train.shape)
print(x_test.shape)

(5570,)
(4457,)
(1115,)


## Feature Extraction

(Converting Text data to numerical values)

In [165]:
# converting the data to feature vector, so that we can use these values to the LogisticRegession Model, as it work on the numerical data not categorical data.


In [178]:
feature_ext = TfidfVectorizer(min_df=1, stop_words='english', lowercase='True')
# Tfidfvectorizer = it looks at the data that we have added i.e. 'data' it will go through all the words in the dataset, and the word that is repeated too many times it will get score i.e. it will give value to the words that are used too many times.
# The LOGISTIC_REGRESSION model will link all the words that are used too many times which they will be related to the spam point, and so on. Thus the score value is taken into consideration i.e. given by the Tfidfvectorizer().
# here we've put the minimum value as 1, if the score >1 we will include it. that is the score min_df= 1 will be give initially to all the words.
# stop_words = english, means that we added the words that are the stop words in the english language for ex. that, like and all.... So, we will be using these words for making the prediction.
# lowercase = 'True, all the letters will ce changed to the lowecase coz it is easy for computation pursopse.
# convering the words to numbers, the words in the X_train will be converted into the numerical values taht will be used for the processing and prediction.
X_train_fts = feature_ext.fit_transform(X_train) # this will fit all the mails in the vectorizer function and will transform into the feature vectors i.e. numerical values.
x_test_fts = feature_ext.fit_transform(x_test) # here we will not fit the data, bcoz we dont want our model to look at the x_test data coz then it will be converted into numerical values.


In [179]:
# Since the labels for span and ham mail are labeled as 0 & 1. They are the labels not the integer values. SO we will convert them to the integer values.
Y_train = Y_train.astype('int')

In [180]:
y_test = y_test.astype('int')

In [181]:
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [182]:
print(X_train_fts)
# thus a score is

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

## Training the Model

Logistic Regression

In [183]:
model = LogisticRegression()

In [184]:
model.fit(X_train_fts, Y_train)

LogisticRegression()

In [185]:
Y_train

3075    1
1787    1
1614    1
4304    1
3266    0
       ..
789     0
968     1
1667    1
3321    1
1688    0
Name: Category, Length: 4457, dtype: int32

In [186]:
prediction_on_model =  model.predict(x_test_fts)
accuracy_on_data = accuracy_score(Y_train, prediction_on_model)

ValueError: X has 3296 features per sample; expecting 7431

In [188]:
# x_test_fts.loc([4457, 1115])

Now we compare the predicted values by our model with the `y_test` walue

In [189]:
print(accuracy_on_data)

NameError: name 'accuracy_on_data' is not defined