In [157]:
# Perform std imports 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
import nltk
# nltk.download('stopwords') #Downloding stop word corpus
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [158]:
dataset = pd.read_csv('Extracted Data/SMSSpamCollection',sep='\t',header=None)

In [159]:
dataset.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [160]:
dataset.columns = ['label','msg']

In [161]:
dataset.isnull().sum()

label    0
msg      0
dtype: int64

#### Preprocessing

In [162]:
ps = PorterStemmer()

In [163]:
corpus = []
for sent in dataset['msg']:
    corpus.append(' '.join([ps.stem(word) for word in re.sub('[^a-zA-z]',' ',sent).lower().split() if word not in  stopwords.words('english')]))

In [164]:
dataset['processed_text'] = corpus

In [165]:
dataset.head()

Unnamed: 0,label,msg,processed_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [166]:
X = dataset['processed_text'].values
y = dataset['label'].values

In [167]:
from sklearn.model_selection import train_test_split

In [168]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [169]:
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

In [170]:
pip = Pipeline([('tfid',TfidfVectorizer()),('clf',LogisticRegression())])

In [171]:
pip.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('tfid',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [172]:
pip.predict(X_test)

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [173]:
from sklearn.metrics import confusion_matrix, classification_report

In [174]:
print(classification_report(y_test,pip.predict(X_test)))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1593
        spam       0.97      0.77      0.86       246

    accuracy                           0.97      1839
   macro avg       0.97      0.88      0.92      1839
weighted avg       0.97      0.97      0.96      1839



In [175]:
confusion_matrix(y_test,pip.predict(X_test))

array([[1587,    6],
       [  57,  189]], dtype=int64)

In [176]:
confusion_matrix(y_train,pip.predict(X_train))

array([[3227,    5],
       [ 110,  391]], dtype=int64)