## BOW, TF-IDF and ML-Algorithms

#### 1) Text preprocessing
#### 2) Train Test split
#### 3) BOW and TF-IDF
#### 4) Model Training

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


messages = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=['labels', 'messages'])

In [2]:
messages

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Data cleaning and preprocessing

In [4]:
ps = PorterStemmer()

In [43]:
corpus = []

for i in messages['messages']:
    review = re.sub('^a-zA-z', ' ', i)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)

    corpus.append(review)

y = pd.get_dummies(messages['labels'], dtype=int, drop_first=True).values.flatten()

In [6]:
# corpus

In [56]:
x_train, x_test, y_train, y_test = train_test_split(corpus, y)

## Create bag of words

In [57]:
from sklearn.feature_extraction.text import CountVectorizer

In [58]:
cv = CountVectorizer(max_features=2500, ngram_range=(1, 2))

x_train

['mmmmm ... sooooo good wake word morning, love!! mmmm fuck ... love too, lion ... *devour kiss across sea*',
 'wish you. hold tightly. make see import are. much mean ... much need ... life ...',
 "party' place usf, charg (but contribut way greatli appreciated) yeah, got room one",
 "ur chanc win £250 wkli shop spree txt: shop 80878. t's&c' www.txt-2-shop.com custcar 08715705022, 1x150p/wk",
 'ü come out?',
 'er mw im fill tuth aight',
 'no. meant calcul same. &lt;#&gt; unit &lt;#&gt; . school realli expensive. start practic accent. important. decid 4year dental school nmde exam.',
 'ü decid faster co si go home liao..',
 'im wonder right now?',
 'need. person give na.',
 'okie...',
 'tkt euro2004 cup final £800 cash, collect call 09058099801 b4190604, pobox 7876150ppm',
 'today voda number end 1225 select receiv £50award. match pleas call 08712300220 quot claim code 3100 standard rate app',
 '2 babe feel let 4get it+both tri +cheer up+not fit soo muchxxlov u locaxx',
 'get offici engl

In [59]:
x_train = cv.fit_transform(x_train).toarray()
x_test = cv.transform(x_test).toarray()

In [64]:
cv.vocabulary_  

{'good': 926,
 'wake': 2330,
 'word': 2433,
 'morning': 1436,
 'love': 1293,
 'fuck': 857,
 'too': 2192,
 'lion': 1248,
 'kiss': 1173,
 'across': 164,
 'sea': 1863,
 'kiss across': 1174,
 'across sea': 165,
 'wish': 2414,
 'you': 2486,
 'hold': 1044,
 'make': 1322,
 'see': 1872,
 'import': 1104,
 'are': 238,
 'much': 1453,
 'mean': 1352,
 'need': 1476,
 'life': 1235,
 'place': 1663,
 'usf': 2290,
 'charg': 463,
 'but': 376,
 'way': 2357,
 'yeah': 2474,
 'got': 941,
 'room': 1820,
 'one': 1574,
 'yeah got': 2475,
 'ur': 2256,
 'chanc': 459,
 'win': 2408,
 '250': 73,
 'wkli': 2426,
 'shop': 1919,
 'spree': 2014,
 'txt': 2226,
 'www': 2454,
 'com': 517,
 'custcar': 586,
 '08715705022': 23,
 '1x150p': 61,
 'wk': 2424,
 'ur chanc': 2260,
 'chanc win': 460,
 'win 250': 2410,
 'shop spree': 1920,
 'txt shop': 2231,
 '1x150p wk': 62,
 'come': 518,
 'out': 1605,
 'come out': 522,
 'er': 731,
 'im': 1100,
 'fill': 790,
 'aight': 191,
 'no': 1506,
 'meant': 1353,
 'same': 1845,
 'lt': 1301,
 'gt'

In [62]:
import numpy as np

np.set_printoptions(edgeitems=100)

In [63]:
y

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0])

## Train Test Split

In [65]:
model = MultinomialNB()
model.fit(x_train, y_train)

In [66]:
y_pred = model.predict(x_test)

In [67]:
from sklearn.metrics import accuracy_score, classification_report

In [68]:
accuracy_score(y_test, y_pred)

0.9849246231155779

In [69]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1201
           1       0.97      0.92      0.94       192

    accuracy                           0.98      1393
   macro avg       0.98      0.96      0.97      1393
weighted avg       0.98      0.98      0.98      1393



In [70]:
new_message = ['My name is Harsh']
new_message_transformed = cv.transform(new_message).toarray() 
prediction = model.predict(new_message_transformed)

In [71]:
prediction

array([0])

In [72]:
y

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0])

### Create TF-IDF Model

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
tfidf = TfidfVectorizer(max_features=2500, ngram_range=(1, 2))

In [30]:
x_train = tfidf.fit_transform(x_train).toarray()
x_test = tfidf.transform(x_te)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

accuracy_score(y_test, y_pred)

0.9784637473079684