<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#importing-librarys-and-dataset" data-toc-modified-id="importing-librarys-and-dataset-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>importing librarys and dataset</a></span></li><li><span><a href="#Tokenization" data-toc-modified-id="Tokenization-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Tokenization</a></span></li><li><span><a href="#Filtering-stopwords" data-toc-modified-id="Filtering-stopwords-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Filtering stopwords</a></span></li><li><span><a href="#Lemmatization" data-toc-modified-id="Lemmatization-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Lemmatization</a></span></li><li><span><a href="#Feature-Extraction" data-toc-modified-id="Feature-Extraction-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Feature Extraction</a></span></li><li><span><a href="#PCA---Dimentionality-Reduction" data-toc-modified-id="PCA---Dimentionality-Reduction-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>PCA - Dimentionality Reduction</a></span></li><li><span><a href="#split-dataset" data-toc-modified-id="split-dataset-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>split dataset</a></span></li><li><span><a href="#Training-Classifier" data-toc-modified-id="Training-Classifier-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Training Classifier</a></span><ul class="toc-item"><li><span><a href="#using-GaussianNB" data-toc-modified-id="using-GaussianNB-8.1"><span class="toc-item-num">8.1&nbsp;&nbsp;</span>using GaussianNB</a></span></li><li><span><a href="#using-LSTM" data-toc-modified-id="using-LSTM-8.2"><span class="toc-item-num">8.2&nbsp;&nbsp;</span>using LSTM</a></span></li></ul></li></ul></div>

# importing librarys and dataset

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('Datasets/smsText.csv')
data.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   sms     5574 non-null   object
 1   label   5574 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [4]:
df = data.copy()

# Tokenization

In [5]:
sms = df.sms.apply(lambda row: row.lower())
label = df.label

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize
tokenized_sms = sms.apply(word_tokenize)
tokenized_sms.iloc[0]

['go',
 'until',
 'jurong',
 'point',
 ',',
 'crazy',
 '..',
 'available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 '...',
 'cine',
 'there',
 'got',
 'amore',
 'wat',
 '...']

# Filtering stopwords

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHEHA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [13]:
def remove_stopwords(sentence: list) -> list:
    return [word for word in sentence if (word.casefold() not in stop_words) and word.isalpha()]

In [14]:
filtered_tokens = pd.Series(map(remove_stopwords, tokenized_sms))
filtered_tokens

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, wkly, comp, win, fa, cup, final,...
3           [u, dun, say, early, hor, u, c, already, say]
4          [nah, think, goes, usf, lives, around, though]
                              ...                        
5569    [time, tried, contact, u, pound, prize, claim,...
5570                   [ü, b, going, esplanade, fr, home]
5571                            [pity, mood, suggestions]
5572    [guy, bitching, acted, like, interested, buyin...
5573                                   [rofl, true, name]
Length: 5574, dtype: object

In [16]:
filtered_tokens.iloc[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

# Lemmatization

In [17]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SHEHA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\SHEHA\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [18]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatize(sentence: list) -> list:
    return ' '.join(map(lambda word: lemmatizer.lemmatize(word), sentence))

In [19]:
lemmatized_tokens = filtered_tokens.apply(lemmatize)
lemmatized_tokens

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry wkly comp win fa cup final tkts may...
3                     u dun say early hor u c already say
4                     nah think go usf life around though
                              ...                        
5569    time tried contact u pound prize claim easy ca...
5570                          ü b going esplanade fr home
5571                                 pity mood suggestion
5572    guy bitching acted like interested buying some...
5573                                       rofl true name
Length: 5574, dtype: object

In [20]:
lemmatized_tokens.iloc[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

# Feature Extraction

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
features = pd.DataFrame(tfidf_vectorizer.fit_transform(lemmatized_tokens).toarray()).astype(np.float16)
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6677,6678,6679,6680,6681,6682,6683,6684,6685,6686
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# PCA - Dimentionality Reduction

In [23]:
from sklearn.decomposition import PCA

dim_reductor = PCA(n_components=100)
features = dim_reductor.fit_transform(features)

In [31]:
features.shape

(5574, 100)

# split dataset

In [25]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(features, label, test_size=0.3, random_state=33) 

# Training Classifier

## using GaussianNB

In [26]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(xtrain, ytrain)

In [40]:
from sklearn.metrics import classification_report
ypred = clf.predict(xtest)
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.91      0.93      1449
           1       0.56      0.75      0.64       224

    accuracy                           0.89      1673
   macro avg       0.76      0.83      0.79      1673
weighted avg       0.91      0.89      0.89      1673



In [37]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=ytest, y_pred=ypred)

array([[1317,  132],
       [  56,  168]], dtype=int64)

## using LSTM

In [28]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [95]:
# Vocabulary size
# using loss='binary_crossentropy' as it is a binary classification problem

voc_size = 5000
embedding_vector_features = 40 # features representation
model=Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=100))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 100, 40)           200000    
                                                                 
 lstm_4 (LSTM)               (None, 100)               56400     
                                                                 
 dense_4 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [96]:
model.fit(xtrain, ytrain, validation_data=(xtest, ytest), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x147ffb360e0>

In [97]:
model.fit(xtrain, ytrain)



<keras.callbacks.History at 0x1478842d840>

In [98]:
ypred_lstm1 = model.predict(xtest)



In [99]:
ypred_lstm1.shape

(1673, 1)

In [100]:
ypred_lstm1

array([[0.16614172],
       [0.16614172],
       [0.16614172],
       ...,
       [0.16614172],
       [0.16614172],
       [0.16614172]], dtype=float32)

In [110]:
ypred_lstm2 = ypred_lstm1.copy()

In [111]:
ypred_lstm2 = np.where(ypred_lstm > 16614172, 0,1)

In [112]:
ypred_lstm2

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [113]:
report = classification_report(y_true=ytest, y_pred=ypred_lstm2)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1449
           1       0.13      1.00      0.24       224

    accuracy                           0.13      1673
   macro avg       0.07      0.50      0.12      1673
weighted avg       0.02      0.13      0.03      1673



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




<keras.callbacks.History at 0x147f7e70ca0>

In [74]:
ypred_lstm1 = model.predict(xtest)



In [58]:
ytrain.value_counts()

0    3378
1     523
Name: label, dtype: int64

In [48]:
ypred_lstm.shape

(1673, 1)

In [50]:
ypred

array([0, 1, 0, ..., 1, 0, 0], dtype=int64)

In [49]:
ypred_lstm

array([[0.14365998],
       [0.14365998],
       [0.14365998],
       ...,
       [0.14365998],
       [0.14365998],
       [0.14365998]], dtype=float32)

In [45]:
ypred_lstm = np.where(ypred_lstm > 0.5, 1,0)

In [52]:
ypred_lstm1 = model.predict(xtest)
ypred_lstm1



array([[0.14365998],
       [0.14365998],
       [0.14365998],
       ...,
       [0.14365998],
       [0.14365998],
       [0.14365998]], dtype=float32)

In [46]:
confusion_matrix(ytest, ypred_lstm)

array([[1449,    0],
       [ 224,    0]], dtype=int64)

In [44]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest, ypred_lstm)

0.8661087866108786

In [71]:
report = classification_report(y_true=ytest, y_pred=ypred_lstm1)
print(report)

              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1449
           1       0.00      0.00      0.00       224

    accuracy                           0.87      1673
   macro avg       0.43      0.50      0.46      1673
weighted avg       0.75      0.87      0.80      1673



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [90]:
ytest.value_counts()

0    1449
1     224
Name: label, dtype: int64

In [63]:
xtest.shape

(1673, 100)