# Changing directory

In [1]:
# Setup
!pip install -q wordcloud
import wordcloud

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import io
import unicodedata
import numpy as np
import re
import string
import warnings
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report,f1_score

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
  

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the tab-separated SMS dataset on the mounted Drive.
path = "/content/drive/MyDrive/CSV/sms_spam_ham_Question5_exam1_2.tsv"

In [None]:
# Load the tab-separated file; `names` supplies the two column labels
# (label column first, message text second).
dataset = pd.read_csv(path, sep='\t', names=['pred', 'message'])
dataset.head()

In [None]:
# Class balance of the label column before it is encoded as integers.
dataset['pred'].value_counts()

In [None]:
# Encode the label column as integers: 1 for 'spam', 0 for everything else.
# The dtype guard makes the cell safe to re-run: without it a second run would
# compare the already-encoded integers with 'spam' and turn every label into 0.
if not pd.api.types.is_numeric_dtype(dataset['pred']):
    dataset['pred'] = (dataset['pred'] == 'spam').astype(int)

In [None]:
# Class balance again, now on the integer-encoded labels.
dataset['pred'].value_counts()

In [None]:
#importing all the nlp packages
import re
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Shared text-processing helpers used by the corpus-building loop.
tokenizer = TreebankWordTokenizer()
stop_words = stopwords.words("english")  # English stop-word list
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [None]:
# Build the cleaned corpus: lower-case each message, replace every character
# that is not a-z or 0-9 with a space, tokenize, drop stop words, then
# lemmatize and stem each remaining token.
non_alnum = re.compile('[^a-z0-9]')  # compiled once instead of once per message
stop_word_set = set(stop_words)      # constant-time membership instead of a list scan per token

corpus = []  # one cleaned string per message, in dataset order
# Iterate over the values directly instead of `dataset.message[i]`, which is a
# label lookup and only works while the index is the default 0..n-1 range.
for message in dataset.message:
    text = non_alnum.sub(' ', message.lower())
    tokens = tokenizer.tokenize(text)
    kept = [w for w in tokens if w not in stop_word_set]
    # Lemmatize first, then stem the lemma.
    roots = [stemmer.stem(lemmatizer.lemmatize(w)) for w in kept]
    corpus.append(' '.join(roots))

In [None]:
import nltk
# Peek at the first ten English stop words.
# The list gets its own name: binding it to `stopwords` would shadow the
# `nltk.corpus.stopwords` corpus reader imported under that name, and any later
# `stopwords.words(...)` call would then fail on a plain list.
english_stopwords = nltk.corpus.stopwords.words('english')
print(english_stopwords[:10])

In [None]:
# Demo: find adjectives that directly follow a negation ("not", "no", "n't"),
# so that the pair can be merged into a single negated token.
from nltk import bigrams, pos_tag, word_tokenize

NEGATIONS = {"n't", "not", "no"}

s = "Torrez is not happy. He wasn't free last night. he was too busy"
wt = word_tokenize(s)   # word-level tokens
p = pos_tag(wt)         # (word, tag) pairs
bg = bigrams(p)         # consecutive pairs of tagged tokens

for (w1, t1), (w2, t2) in bg:
    # Set membership and logical `and` replace the bitwise `|` / `&` that the
    # cell applied to booleans. A tag starting with 'J' is treated as an adjective.
    if w1.lower() in NEGATIONS and t2[0] == 'J':
        print("not"+ w2)

In [None]:
# Bag of Words (BoW): a CountVectorizer turns the cleaned corpus into token counts.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# Learn the vocabulary from the cleaned corpus.
cv.fit(corpus)

In [None]:
# Each word is a feature; its value in `vocabulary_` is the column index.
# Show only a small sample: the full dictionary has one entry per distinct
# token and would flood the cell output.
dict(list(cv.vocabulary_.items())[:10])

In [None]:
# Alphabetically ordered vocabulary terms, used as the column labels of X.
features = sorted(cv.vocabulary_)

In [None]:
# Turn the corpus into a dense document-term count table (one row per message,
# one column per vocabulary term) and take the label column as the target.
X = pd.DataFrame(data=cv.transform(corpus).toarray(), columns=features)
y = dataset['pred']

In [None]:
# (number of messages, number of vocabulary terms)
X.shape

In [None]:
# `vocabulary_` is a dict from term to column index, so a term's index is a plain key lookup.
cv.vocabulary_["wow"]

In [None]:
# Count of the word "wow" in the first message.
# The column is looked up from the vocabulary instead of a hard-coded index,
# which silently points at a different word whenever the corpus changes.
X.iloc[0, cv.vocabulary_["wow"]]

In [None]:
# Column index of the term "love".
cv.vocabulary_["love"]

In [None]:
# Value for the word "love" in the first message; the column comes from the
# vocabulary rather than a hard-coded index that goes stale when the corpus changes.
X.iloc[0, cv.vocabulary_["love"]]

In [None]:
# Spot-check of one cell in the second message (row 1).
# NOTE(review): 4025 is a hard-coded column index and the term it refers to is
# not shown here; confirm which word was intended.
X.iloc[1,4025]

##### Since we converted our unstructured dataset into a structured one, we can apply any classification algorithm for prediction

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
# Split X and y into train and test parts; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state =0)

In [None]:
# TF-IDF alternative to the bag-of-words table: same corpus, TF-IDF weights
# instead of raw counts.
# NOTE(review): X and y are rebound here, but X_train/X_test are not re-split
# in this cell, so they keep whatever features they were created from.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer().fit(corpus)
features = sorted(tfidf.vocabulary_)
X = pd.DataFrame(data=tfidf.transform(corpus).toarray(), columns=features)
y = dataset['pred']

In [None]:
# First rows of the current feature table.
X.head()

# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# AdaBoost with library-default settings; random_state=0 fixes the seed.
model8= AdaBoostClassifier(random_state=0)

In [None]:
# Train on the training split, then score on the held-out test split.
model8.fit(X_train,y_train)
model8.score(X_test,y_test)

0.9770279971284996

In [None]:
# AdaBoost over depth-3 decision trees, with n_estimators set to 40 and a fixed seed.
model8_2= AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),n_estimators=40,random_state=0)

In [None]:
# Train the depth-3 variant on the training split.
model8_2.fit(X_train,y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                   n_estimators=40, random_state=0)

In [None]:
# Score the depth-3 variant on the held-out test split.
model8_2.score(X_test,y_test)

0.9705671213208902

In [None]:
# Three short sample texts to classify with a fitted model.
message = ['good food good place','plate dirt','nice decor']

In [None]:
# Refit on the full feature table, then classify the sample messages.
# The messages must go through the same vectorizer that produced X. When the
# cells run top to bottom, X holds TF-IDF weights at this point, so raw counts
# from `cv` would not match the features the model is fitted on.
model8.fit(X,y)
model8.predict(tfidf.transform(message).toarray())

array([0, 0, 0])

### Tuning

In [None]:
# Grid-search n_estimators over 10, 20, ..., 100 with 4-fold CV, scored by F1.
# random_state=0 matches the other AdaBoost models in this notebook and makes
# the search repeatable from run to run.
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': range(10, 110, 10)}
clf8 = GridSearchCV(
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), random_state=0),
    param_grid,
    cv=4,
    scoring='f1',
)

In [None]:
# Run the search on the training split and show the winning parameter value.
clf8.fit(X_train,y_train)
clf8.best_params_

{'n_estimators': 100}

In [None]:
# Rebuild the model with n_estimators=100, train it on the training split,
# and keep its test-split predictions for the evaluation cells.
adab=AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),n_estimators=100,random_state=0)
adab.fit(X_train,y_train)
y_pred83 =adab.predict(X_test)

### Model evaluation

In [None]:
# Score of the tuned model on the test split.
adab.score(X_test,y_test)

0.9712849964106246

In [None]:
# Confusion matrix of true test labels against the tuned model's predictions.
confusion_matrix(y_test,y_pred83)

array([[1188,   20],
       [  20,  165]])

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test,y_pred83))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1208
           1       0.89      0.89      0.89       185

    accuracy                           0.97      1393
   macro avg       0.94      0.94      0.94      1393
weighted avg       0.97      0.97      0.97      1393



In [None]:
"""K-FOLD CROSSVALIDATION"""
# 4-fold cross-validation on the full X / y with scoring left at its default.
ADA = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3),
    n_estimators=20,
    random_state=0,
)
scores = cross_val_score(ADA, X, y, cv=4)
ADA_DS_score = scores.mean()  # average over the four folds
ADA_DS_score

0.9680545585068199

In [None]:
"""K-FOLD CROSSVALIDATION"""
# Same 4-fold cross-validation on the full X / y, scored by F1.
ADA = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3),
    n_estimators=20,
    random_state=0,
)
scores = cross_val_score(ADA, X, y, cv=4, scoring='f1')
ADA_DS_f1score = scores.mean()  # average over the four folds
ADA_DS_f1score

0.8764879737247061