In [1]:
#import basic packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('spams.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Labels,Message
0,0,ham,go jurong point crazy available bugis n great ...
1,1,ham,ok lar joking wif u oni
2,2,spam,free entry wkly comp win fa cup final tkts st ...
3,3,ham,u dun say early hor u c already say
4,4,ham,nah think go usf life around though


In [3]:
#we don't want to have the 'unnamed column'

df = df.iloc[:,1:]
df.head()

Unnamed: 0,Labels,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


In [4]:
#check for missing data

df.isnull().sum()

Labels     0
Message    8
dtype: int64

In [5]:
#we have 8 missing data so drop them
#we won't impute them because if we fill them with same data so no point

newdf = df.dropna()

In [6]:
newdf.isnull().sum()

Labels     0
Message    0
dtype: int64

In [None]:
#labels : dependent variable
#messages : independent variable

In [8]:
newdf['Labels'].value_counts()

ham     4817
spam     747
Name: Labels, dtype: int64

# its imbalanced, so we will balance it with sampling techniques(oversampling)


In [10]:
ham = newdf[newdf['Labels']=='ham']
spam = newdf[newdf['Labels']=='spam']

In [11]:
ham.shape,spam.shape

((4817, 2), (747, 2))

In [13]:
spam = spam.sample(ham.shape[0],replace =True) 
# we make the spam sample shape same to ham sample shape
#replace = True to copy paste the same number, by default replace= False
#make sure you do the above line or else value error, since we are taking the sample of smaller and repeating to get larger number

In [12]:
ham.shape[0]

4817

In [14]:
ham.shape,spam.shape
#now we have same shape

((4817, 2), (4817, 2))

In [15]:
#now we append both of them in same table and go ahead
dataset = ham.append(spam, ignore_index =True)
dataset.head() #ham will come first and spam will come later

Unnamed: 0,Labels,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,ham,u dun say early hor u c already say
3,ham,nah think go usf life around though
4,ham,even brother like speak treat like aid patent


In [16]:
dataset.tail()

Unnamed: 0,Labels,Message
9629,spam,u secret admirer looking make contact u find r...
9630,spam,prize go another customer c www c biz p min po...
9631,spam,hot live fantasy call p per min ntt ltd po box...
9632,spam,think ur smart win week weekly quiz text play ...
9633,spam,guaranteed cash prize claim yr prize call cust...


# Splitting the data train-test

In [17]:
#splitting the dataset into train and test for building model and for prediction

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(dataset['Message'], dataset['Labels'], test_size =0.25,random_state=101)

In [18]:
x_train

3862     lem know swing pick free basically time semester
8893    sexy sexy cum text im wet warm ready porn u fu...
9090    guaranteed award even cashto claim ur award ca...
3956    hi engagement fixd lt gt th next month know re...
8827    free entry wkly comp win fa cup final tkts st ...
                              ...                        
599                           k u also dont msg reply msg
5695    congrats mobile g videophones r call videochat...
8006    bored speed dating try speedchat txt speedchat...
1361          charge transfer charge withdraw anyhow like
1547                                       bus leaf lt gt
Name: Message, Length: 7225, dtype: object

In [19]:
y_train

3862     ham
8893    spam
9090    spam
3956     ham
8827    spam
        ... 
599      ham
5695    spam
8006    spam
1361     ham
1547     ham
Name: Labels, Length: 7225, dtype: object

In [20]:
x_test

3462                  gon na worry nothing give money use
8889    tried call reply sm video mobile min unlimited...
8235    free st week nokia tone ur mob every week txt ...
7156    great news call freefone claim guaranteed cash...
1998                              apps class varaya elaya
                              ...                        
3899                                      project pa come
7482    camera awarded sipix digital camera call fromm...
9071    eerie nokia tone u rply tone title eg tone dra...
9512    important customer service announcement premie...
1572         hello thanx taking call got job start monday
Name: Message, Length: 2409, dtype: object

In [21]:
y_test

3462     ham
8889    spam
8235    spam
7156    spam
1998     ham
        ... 
3899     ham
7482    spam
9071    spam
9512    spam
1572     ham
Name: Labels, Length: 2409, dtype: object

# Building Classification model

In [22]:
#Feature Extraction - since models work on numbers
from sklearn.feature_extraction.text import TfidfVectorizer

#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

#combining both feature extraction and machine learning together by the help of pipeline

from sklearn.pipeline import Pipeline

In [23]:
classifier = Pipeline([('tfidf', TfidfVectorizer()),
                       ('classifier', RandomForestClassifier(n_estimators =100, criterion = 'gini'))])

In [24]:
classifier.fit(x_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', RandomForestClassifier())])

In [25]:
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

# Performance Matrix

In [26]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [27]:
print('Training ')
print(classification_report(y_train, y_pred_train))
print(' ')
print('Testing ')
print(classification_report(y_test, y_pred_test))

Training 
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3585
        spam       1.00      1.00      1.00      3640

    accuracy                           1.00      7225
   macro avg       1.00      1.00      1.00      7225
weighted avg       1.00      1.00      1.00      7225

 
Testing 
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      1232
        spam       1.00      1.00      1.00      1177

    accuracy                           1.00      2409
   macro avg       1.00      1.00      1.00      2409
weighted avg       1.00      1.00      1.00      2409



In [29]:
#check your model performance

test1 = ["Hi, can you please share the class notes what we discussed yesterday."]
test2 = ["Congratulation, You won a lottery ticket worth $1 Millons ! To claim call @1111111"]
test3 = ["Click here 'www.glajsdlfjasld.com' to get yours prize money"]
test4 = ['please find the attached file for the review']
test5 = ["Get rich in less than a month once you click on the given link 'www.asdjf;lasdjfla.com' and share your account details"]
test6 = ['Your credit card is blocked']
test7 = ["You are lucky one! Congratulation, you will win 1 lakh rupees to click on the given link 'https.www.ggajdajsd.com'"]
test8 = ['your mobile no.  won the  first prize  worth 1 Cr']
test9 = ['Share yours otp to unlock yours account']
test10 = ["sorry we don't have vacancy for your qualification."]

In [31]:
print("test1" , classifier.predict(test1))
print("test2" , classifier.predict(test2))
print("test3" , classifier.predict(test3))
print("test4" , classifier.predict(test4))
print("test5" , classifier.predict(test5))
print("test6" , classifier.predict(test6))
print("test7" , classifier.predict(test7))
print("test8" , classifier.predict(test8))
print("test9" , classifier.predict(test9))
print("test10" , classifier.predict(test10))

test1 ['ham']
test2 ['spam']
test3 ['spam']
test4 ['ham']
test5 ['spam']
test6 ['ham']
test7 ['spam']
test8 ['spam']
test9 ['ham']
test10 ['ham']
