### BOW,TFIDF,Machine Learning Algorithms
1. Preprocessing And Cleaning
2. Train Test Split
3. BOW And TF-IDF (Sentences--->vectors) {Preventing Data Leakage}
4. Trained Our Models

In [60]:
import pandas as pd

df = pd.read_csv("../datasets/smsspamcollection.csv",
                 sep='\t',
                 names=['lables','messages'])
df

Unnamed: 0,lables,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [61]:
# Cleaning the data
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
lemm = WordNetLemmatizer()

corpus=[]
for i in range(0,len(df)):
    special_characters_removed = re.sub('[^A-Za-z1-9]',' ',df['messages'][i])
    lower_df = special_characters_removed.lower()
    list_words_df = lower_df.split()
    stem_df = [ps.stem(word) for word in list_words_df if not word in stopwords.words('english')]
    #lemm_df = [lemm.lemmatize(word) for word in list_words_df if word not in stopwords.words('english')]
    cleaned_df = ' '.join(stem_df)
    corpus.append(cleaned_df)

In [62]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2 5 text fa 87121 receiv entri question std txt rate c appli 845281 75over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 5 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 9 prize reward claim call 9 617 1461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 8 2986 3',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 1 2 pound txt csh11 send 87575 cost 15 p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free membership 1 prize j

In [63]:
# as we can see the columns of get_dummies in latest version of pandas is boolean so we get true r false.
# so let's convert them from booleans to integers

# y = pd.get_dummies(df['lables'])
# print(y.dtypes)
# y = y.iloc[:,0].values



# lets convert them to integer so that we get 0's an 1's as output instead of true or false
y = pd.get_dummies(df['lables']).astype(int)
y = y.iloc[:, 0].values



#or we can convert only that column alone to int 

# y = pd.get_dummies(df['lables'])
# y.iloc[:, 0] = y.iloc[:, 0].astype(int)
# y = y.iloc[:, 0].values

In [64]:
#Train Test Splie

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(corpus, y, test_size=0.2)

In [65]:
len(X_train),len(y_train)

(4457, 4457)

In [66]:
len(X_test),len(y_test)

(1115, 1115)

In [67]:
# Now create the bag of words

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500, ngram_range=(1,2))
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()


In [68]:
X_train, X_test

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64))

In [69]:
X_train.shape, X_test.shape

((4457, 2500), (1115, 2500))

In [70]:
cv.vocabulary_

{'long': 1283,
 'sinc': 1948,
 'scream': 1880,
 'princess': 1718,
 'sorri': 1996,
 'went': 2401,
 'bed': 298,
 'earli': 704,
 'hello': 1034,
 'news': 1501,
 'job': 1156,
 'make': 1336,
 'wait': 2342,
 'week': 2384,
 'yeah': 2480,
 'im': 1109,
 'still': 2039,
 'home': 1066,
 'late': 1210,
 'wen': 2399,
 'ur': 2277,
 'free': 836,
 'come': 517,
 'also': 198,
 'tel': 2117,
 'vikki': 2325,
 'hav': 1022,
 'sent': 1909,
 'mail': 1334,
 'better': 310,
 'even': 745,
 'il': 1107,
 'today': 2183,
 'aftr': 176,
 'wen ur': 2400,
 'come home': 520,
 'import': 1113,
 'custom': 590,
 'servic': 1914,
 'announc': 209,
 'call': 379,
 'freephon': 854,
 'custom servic': 592,
 'servic announc': 1915,
 'say': 1872,
 'give': 912,
 'friend': 858,
 'got': 955,
 'money': 1437,
 'definit': 631,
 'buy': 371,
 'end': 725,
 'give call': 913,
 'got money': 959,
 'walk': 2347,
 'mom': 1433,
 'right': 1827,
 'pass': 1613,
 'left': 1234,
 'address': 167,
 'lt': 1315,
 'gt': 978,
 'lt gt': 1317,
 'togeth': 2190,
 'thinki

In [71]:
# Model training

from sklearn.naive_bayes import MultinomialNB

NB = MultinomialNB()

spam_detect_model = NB.fit(X_train,y_train)
predicted_y = spam_detect_model.predict(X_test)

In [72]:
predicted_y

array([1, 1, 1, ..., 1, 1, 1])

In [73]:
from sklearn.metrics import classification_report, accuracy_score

print(f"accuracy report of naivebayes model for BOW is : {accuracy_score(y_test,predicted_y)}")
print(f"classification report of naivebayes model for BOW is : {classification_report(y_test,predicted_y)}")


accuracy report of naivebayes model for BOW is : 0.9883408071748879
classification report of naivebayes model for BOW is :               precision    recall  f1-score   support

           0       0.97      0.94      0.95       143
           1       0.99      1.00      0.99       972

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [74]:
# TFIDF
# first do train test split

X_train,X_test,y_train,y_test = train_test_split(corpus,y,test_size=0.2)



In [78]:
len(X_train),len(X_test),len(y_train),len(y_test)

(4457, 1115, 4457, 1115)

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(max_features=2500, ngram_range=(1,2))

X_train = tf.fit_transform(X_train).toarray()
X_test = tf.transform(X_test).toarray()

In [80]:
X_train,X_test

(array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.27948303, 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]]),
 array([[0.        , 0.        , 0.        , ..., 0.        , 0.18516346,
         0.18959923],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0. 

In [82]:
tf.vocabulary_

{'support': 2068,
 'text': 2112,
 'ad': 163,
 'next': 1485,
 'bill': 301,
 'fuck': 846,
 'know': 1168,
 'weak': 2373,
 'grin': 952,
 'push': 1715,
 'belli': 294,
 'pull': 1711,
 'head': 1002,
 'forget': 809,
 'smile': 1951,
 'import': 1092,
 'messag': 1368,
 'final': 781,
 'contact': 521,
 'attempt': 248,
 'wait': 2334,
 'custom': 575,
 'claim': 455,
 'expir': 748,
 '13': 8,
 'call': 369,
 'messag wait': 1371,
 'lol': 1266,
 'great': 946,
 'im': 1087,
 'get': 869,
 'hungri': 1070,
 'ok': 1539,
 'hi': 1020,
 'think': 2131,
 'match': 1334,
 'never': 1476,
 'day': 596,
 'ur': 2263,
 'life': 1232,
 'good': 919,
 'give': 889,
 'happi': 991,
 'bad': 273,
 'experi': 747,
 'god': 911,
 'bless': 313,
 'morn': 1427,
 'ur life': 2271,
 'good day': 921,
 'day give': 598,
 'good morn': 925,
 'convinc': 533,
 'tht': 2145,
 'possibl': 1674,
 'hurt': 1072,
 'feel': 772,
 'nobodi': 1498,
 'town': 2194,
 'open': 1558,
 'sent': 1882,
 'money': 1422,
 'yesterday': 2487,
 'oh': 1536,
 'new': 1477,
 'year':

In [85]:
from sklearn.naive_bayes import MultinomialNB

NB = MultinomialNB()

tfidf_model_precdictor = NB.fit(X_train,y_train)
y_predicted = tfidf_model_precdictor.predict(X_test)



In [86]:
from sklearn.metrics import classification_report, accuracy_score

print(f"accuracy report of naivebayes model for BOW is : {accuracy_score(y_test,y_predicted)}")
print(f"classification report of naivebayes model for BOW is : {classification_report(y_test,y_predicted)}")

accuracy report of naivebayes model for BOW is : 0.9901345291479821
classification report of naivebayes model for BOW is :               precision    recall  f1-score   support

           0       1.00      0.92      0.96       133
           1       0.99      1.00      0.99       982

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115

