#### Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#### Data Collection and Data Processing

In [2]:
# read the labelled messages from the CSV file into a pandas DataFrame
mail_data = pd.read_csv(filepath_or_buffer='mail_data.csv')

In [3]:
# preview the first rows of the raw data (columns: Category, Message)
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# (number of rows, number of columns) of the raw data
mail_data.shape

(5572, 2)

In [5]:
# keep every non-null cell as is and swap each missing value for an empty string
data = mail_data.where(mail_data.notna(), other='')

#### Label Encoding
#### 0 ---> spam
#### 1 ---> ham

In [6]:
# encode the text labels in place: spam -> 0, ham -> 1
data.loc[data['Category'].eq('spam'), 'Category'] = 0
data.loc[data['Category'].eq('ham'), 'Category'] = 1

In [7]:
# preview the first rows again to check that Category now holds 0/1 codes
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# separate the message text (features) from the encoded category (labels)
X, Y = data['Message'], data['Category']

In [9]:
# display the message texts
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [10]:
# display the encoded labels (0 = spam, 1 = ham)
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

#### Train_Test_Split

In [11]:
# hold out 20% of the messages for testing; stratify on the labels so both
# splits keep the same spam/ham ratio, and fix the seed so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [12]:
# shapes of the full, training and test message sets, in that order
print(*(messages.shape for messages in (X, X_train, X_test)))

(5572,) (4457,) (1115,)


#### Feature Extraction - transform text data to feature vectors (Numericals)

In [13]:
# TF-IDF vectorizer that lowercases the text and drops English stop words.
# `lowercase` must be the boolean True: the string 'True' only worked because it
# is truthy, and scikit-learn releases that validate parameter types reject it.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# learn the vocabulary and IDF weights from the training messages only,
# then reuse that fitted vectorizer to transform the test messages
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# the labels are held in an object-dtype Series, so cast them to integers
# before handing them to the classifier
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')


In [14]:
# print the training feature matrix as (row, column) -> TF-IDF weight entries
print(X_train_features)

  (0, 7289)	0.517250079608171
  (0, 2823)	0.517250079608171
  (0, 3764)	0.22046319970004674
  (0, 2262)	0.4931693086193514
  (0, 7438)	0.2996693624522655
  (0, 4768)	0.28858793133473676
  (1, 4136)	0.4717788963273522
  (1, 6517)	0.49481520325330863
  (1, 1558)	0.4236400720998954
  (1, 4972)	0.49481520325330863
  (1, 3317)	0.32904344933475643
  (2, 5798)	0.2821824162510531
  (2, 3835)	0.2623708342584191
  (2, 4943)	0.33789703751914013
  (2, 5837)	0.1845655907506494
  (2, 1430)	0.28509060215711635
  (2, 6641)	0.20096909705626312
  (2, 3722)	0.24768901862403342
  (2, 3935)	0.3671145612703168
  (2, 3118)	0.18009671431232455
  (2, 4269)	0.2543939099135892
  (2, 3398)	0.20665621299033204
  (2, 2136)	0.180851695270251
  (2, 3086)	0.27449720225122765
  (2, 4099)	0.186263215205624
  :	:
  (4454, 5765)	0.27366476899994313
  (4454, 4205)	0.27366476899994313
  (4454, 6404)	0.2834859847167938
  (4454, 387)	0.2598225428978842
  (4454, 865)	0.26604684225670366
  (4454, 2972)	0.2598225428978842
  (445

In [15]:
# print the test feature matrix as (row, column) -> TF-IDF weight entries
print(X_test_features)

  (0, 6885)	0.15260537497993798
  (0, 6513)	0.2657436287350355
  (0, 5030)	0.24210715613503428
  (0, 4827)	0.21680692811499552
  (0, 4755)	0.2137047703002642
  (0, 4625)	0.1629132197579507
  (0, 4419)	0.2358899506086862
  (0, 4365)	0.2003761424782757
  (0, 3912)	0.20823705036803863
  (0, 3154)	0.17077412764771363
  (0, 2906)	0.28559070500052114
  (0, 2375)	0.466320953046431
  (0, 2046)	0.1984270278883612
  (0, 1585)	0.2137047703002642
  (0, 1283)	0.19658332365071185
  (0, 398)	0.2831628958086886
  (0, 45)	0.23885705786351533
  (1, 7196)	0.5256391808173945
  (1, 6271)	0.681815528764269
  (1, 4768)	0.5087543968611168
  (2, 5317)	0.3771071267667238
  (2, 4475)	0.47232298297806397
  (2, 3539)	0.47232298297806397
  (2, 3339)	0.27401894009219313
  (2, 3154)	0.2848556310871354
  :	:
  (1111, 3012)	0.24407682892924834
  (1111, 2849)	0.5805643327270962
  (1111, 845)	0.4669254648913738
  (1112, 4036)	0.42046222473394385
  (1112, 1321)	0.5893917852568756
  (1112, 1133)	0.6898034800169506
  (1113,

#### Training the model - Logistic Regression

In [16]:
# logistic regression classifier, created with no arguments (library defaults)
model = LogisticRegression()

In [17]:
# train the classifier on the TF-IDF features and integer labels of the training split
model.fit(X=X_train_features, y=Y_train)

LogisticRegression()

#### Evaluating the model

In [18]:
# score the model on the same messages it was trained on
prediction_training_data = model.predict(X_train_features)
accuracy_train_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_training_data,
)

In [19]:
# fraction of training messages classified correctly
accuracy_train_data 

0.9672425398249944

In [20]:
# score the model on the held-out test messages
prediction_test_data = model.predict(X_test_features)
accuracy_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_test_data,
)

In [21]:
# fraction of held-out test messages classified correctly
accuracy_test_data

0.9704035874439462

#### Building a predictive system

In [22]:
# a single example message, wrapped in a list because the vectorizer expects an iterable of texts
input_mail = [
    "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.",
]

# turn the message into a TF-IDF feature vector using the already-fitted vectorizer
input_feature = feature_extraction.transform(input_mail)

# classify the message with the trained model (0 = spam, 1 = ham)
prediction = model.predict(input_feature)

prediction


array([1])

In [23]:
# label 1 is reported as ham; any other label is reported as spam
print('ham' if prediction[0] == 1 else 'spam')

ham
