In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import  train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the raw dataset into a DataFrame.
# NOTE(review): the path is absolute ('/content/...' looks like a Colab location) — adjust when running elsewhere.
raw_mail_data = pd.read_csv('/content/mail_data.csv')

In [None]:
# Plain-text dump of the raw frame; pandas abbreviates the middle rows and long messages.
print(raw_mail_data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [None]:
# Swap every missing cell for an empty string so later text processing never sees nulls.
mail_data = raw_mail_data.fillna('')

In [None]:
# Peek at the first rows after null replacement.
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# (rows, columns) of the cleaned frame.
mail_data.shape

(5572, 2)

In [None]:
# Class distribution of the Category column; shows how unbalanced ham vs. spam is.
mail_data.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [None]:
# Numeric label column: 1 where Category is exactly 'spam', 0 for everything else.
# A vectorised comparison replaces the per-row lambda; the result is still int64.
mail_data['Category_num'] = mail_data['Category'].eq('spam').astype('int64')

In [None]:
# Check that the new Category_num column lines up with Category.
mail_data.head()

Unnamed: 0,Category,Message,Category_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Spam = 1
Ham = 0

## ***Handling Imbalanced Data***

In [None]:
# Partition the frame by numeric label: 1 -> spam rows, 0 -> ham rows.
spam_data = mail_data.loc[mail_data['Category_num'].eq(1)]
ham_data = mail_data.loc[mail_data['Category_num'].eq(0)]

In [None]:
# Number of spam rows (and columns).
spam_data.shape

(747, 3)

In [None]:
# Number of ham rows (and columns).
ham_data.shape

(4825, 3)

In [None]:
# Undersample the ham majority down to the number of spam rows so both classes
# end up equally represented.
# A fixed random_state makes the drawn subset -- and therefore the split and every
# accuracy reported further down -- reproducible across Restart & Run All.
# The seed value matches the one passed to train_test_split in this notebook.
ham_data_sample = ham_data.sample(n=spam_data.shape[0], random_state=3)

In [None]:
# The sample should have as many rows as spam_data.
ham_data_sample.shape

(747, 3)

## ***New Dataset with Balanced Data***

In [None]:
# Stack all spam rows on top of the ham sample (row-wise), keeping original index labels.
mail_data_new = pd.concat(
    objs=[spam_data, ham_data_sample],
    axis='index',
)

In [None]:
# (rows, columns) of the combined frame.
mail_data_new.shape

(1494, 3)

In [None]:
# First rows of the combined frame (spam_data was concatenated first).
mail_data_new.head()

Unnamed: 0,Category,Message,Category_num
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1


In [None]:
# Last rows of the combined frame (ham_data_sample was concatenated last).
mail_data_new.tail()

Unnamed: 0,Category,Message,Category_num
5022,ham,:-( sad puppy noise,0
4437,ham,"House-Maid is the murderer, coz the man was mu...",0
2140,ham,But i juz remembered i gotta bathe my dog today..,0
2118,ham,Wish u many many returns of the day.. Happy bi...,0
1149,ham,I'm not driving... Raining! Then i'll get caug...,0


In [None]:
# Confirm both labels now occur equally often.
mail_data_new['Category_num'].value_counts()

1    747
0    747
Name: Category_num, dtype: int64

## ***Separating Data and Labels***

In [None]:
# Model input: the Message column of the combined frame.
X = mail_data_new['Message']

In [None]:
# Target: the numeric label column (1 = spam, 0 = ham).
Y = mail_data_new['Category_num']

In [None]:
# Text dump of the message Series (pandas abbreviates the middle rows).
print(X)

2       Free entry in 2 a wkly comp to win FA Cup fina...
5       FreeMsg Hey there darling it's been 3 week's n...
8       WINNER!! As a valued network customer you have...
9       Had your mobile 11 months or more? U R entitle...
11      SIX chances to win CASH! From 100 to 20,000 po...
                              ...                        
5022                                  :-( sad puppy noise
4437    House-Maid is the murderer, coz the man was mu...
2140    But i juz remembered i gotta bathe my dog today..
2118    Wish u many many returns of the day.. Happy bi...
1149    I'm not driving... Raining! Then i'll get caug...
Name: Message, Length: 1494, dtype: object


In [None]:
# Text dump of the label Series (pandas abbreviates the middle rows).
print(Y)

2       1
5       1
8       1
9       1
11      1
       ..
5022    0
4437    0
2140    0
2118    0
1149    0
Name: Category_num, Length: 1494, dtype: int64


## ***Train Test Split***

In [None]:
# Hold out 20% of the samples for testing. Stratifying on Y keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=3,
)

In [None]:
# Sizes of the full set, the training part and the test part, one per line.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(1494,)
(1195,)
(299,)


## ***Feature Extraction***

In [None]:
# TF-IDF vectoriser configured to lowercase text and drop English stop words;
# min_df=1 keeps every term that appears in at least one message.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit on the training messages only, then reuse the fitted vectoriser for the
# test messages so no information from the test set enters the features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [None]:
# Text dump of the training messages (pandas abbreviates the middle rows).
print(X_train)

4506    Mobile Club: Choose any of the top quality ite...
639                  What will we do in the shower, baby?
5241                            Its a part of checking IQ
2300    Congrats! 1 year special cinema pass for 2 is ...
4577    Congratulations ur awarded 500 of CD vouchers ...
                              ...                        
4514    Money i have won wining number 946 wot do i do...
2386    Someone has contacted our dating service and e...
579     our mobile number has won £5000, to claim call...
1638    0A$NETWORKS allow companies to bill for SMS, s...
4183    Urgent! Please call 0906346330. Your ABTA comp...
Name: Message, Length: 1195, dtype: object


In [None]:
# Text dump of the TF-IDF training matrix; each line is (row, column) followed by the weight.
print(X_train_features)

  (0, 553)	0.4410178814294106
  (0, 2006)	0.4410178814294106
  (0, 2827)	0.4410178814294106
  (0, 1126)	0.3450170447514276
  (0, 1155)	0.3293673643824658
  (0, 2376)	0.4347298734723464
  (1, 850)	0.6692974587390039
  (1, 3137)	0.742994556995885
  (2, 1999)	0.7189774068987932
  (2, 1106)	0.6950334440651665
  (3, 2357)	0.22382164169637872
  (3, 1419)	0.20025171180323662
  (3, 300)	0.2728112942996661
  (3, 505)	0.25863973731516976
  (3, 1996)	0.25863973731516976
  (3, 1020)	0.2651322645004963
  (3, 1666)	0.13012481920052738
  (3, 3285)	0.2822096672083118
  (3, 2301)	0.2822096672083118
  (3, 3378)	0.2822096672083118
  (3, 191)	0.2822096672083118
  (3, 2632)	0.25301565445234525
  (3, 1131)	0.2728112942996661
  (3, 3248)	0.2104089891613871
  (3, 3889)	0.23593825174452973
  :	:
  (1193, 738)	0.3317007472752787
  (1193, 254)	0.3317007472752787
  (1193, 2487)	0.3135102517732568
  (1193, 3125)	0.25522015280777394
  (1193, 3203)	0.20459349794428427
  (1193, 2052)	0.15664544689176144
  (1194, 366)

## ***Training The Model***

In [None]:
# Logistic-regression classifier, created with the library defaults (no arguments passed).
model = LogisticRegression()

In [None]:
# Train the classifier on the TF-IDF features of the training messages and their labels.
model.fit(X_train_features,Y_train)

### ***Prediction on Training Data***

In [None]:
# Score the fitted model on the same samples it was trained on.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
# Report the fraction of training samples that were classified correctly.
print(f'Training Data Accuracy : {accuracy_on_training_data}')

Training Data Accuracy : 0.9874476987447699


### ***Prediction on Test Data***

In [None]:
# Score the fitted model on the held-out test samples.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [None]:
# Report the fraction of held-out test samples that were classified correctly.
print(f'Test Data Accuracy : {accuracy_on_test_data}')

Test Data Accuracy : 0.959866220735786


In [None]:
# Alternative example message to try instead of the one below (uncomment to use):
#input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

# Vectorise the message with the already-fitted TF-IDF vectoriser, then classify it.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 is reported as spam; any other label is reported as ham.
print('Spam mail' if prediction[0] == 1 else 'Ham mail')

[1]
Spam mail
