## Modeling Exercises - NLP

In [1]:
import pandas as pd
import nltk
import unicodedata
import re
import acquire
import prepare
from env import get_db_url
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
query = 'SELECT * FROM spam'
df = pd.read_sql(query, get_db_url('spam_db'))
df.head()

Unnamed: 0,id,label,text
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df['clean_text'] = df.text.apply(prepare.basic_clean).apply(prepare.tokenize).apply(prepare.lemmatize).apply(prepare.remove_stopwords)
df.head()


Unnamed: 0,id,label,text,clean_text
0,0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think go usf life around though


In [4]:
X = df.clean_text
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=12)

In [5]:
cv = CountVectorizer()
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X_bow, y_train)

tree.score(X_bow, y_train)

0.9302221225039264

In [6]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train)
tree.fit(X_tfidf, y_train)
tree.score(X_tfidf, y_train)

0.9540049360556428

In [7]:
tree.score(tfidf.transform(X_test), y_test)

0.9246636771300448

---

#### Modeling with bigrams

In [8]:
X_train.head()

1562               dude saw parked car sunroof popped sux
3362                                               ' free
3686                       great shoot big load get ready
2457                                     kkhow sister kid
353     yo guy ever figure much need alcohol jay tryin...
Name: clean_text, dtype: object

In [9]:
y_train.head()

1562    ham
3362    ham
3686    ham
2457    ham
353     ham
Name: label, dtype: object

In [10]:
y_train.value_counts()

ham     3860
spam     597
Name: label, dtype: int64

In [11]:
cv = CountVectorizer(ngram_range=(2, 2))
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

0.8893874803679606

In [12]:
tfidf = TfidfVectorizer(ngram_range=(2, 2))
X_tfidf = tfidf.fit_transform(X_train)
tree.fit(X_tfidf, y_train)
tree.score(X_tfidf, y_train)

0.8896118465335427

**Unigrams and Bigrams**

In [13]:
cv = CountVectorizer(ngram_range=(1, 2))
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

0.9302221225039264

In [14]:
tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(X_train)
tree.fit(X_tfidf, y_train)
tree.score(X_tfidf, y_train)

0.9546780345523895

**Unigrams, Bigrams, and Trigrams**

In [15]:
cv = CountVectorizer(ngram_range=(1, 3))
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

0.9302221225039264

In [16]:
tfidf = TfidfVectorizer(ngram_range=(1, 3))
X_tfidf = tfidf.fit_transform(X_train)
tree.fit(X_tfidf, y_train)
tree.score(X_tfidf, y_train)

0.9535562037244784

---

### Train, Validate, Test split for further model development / evaluation

In [18]:
df2 = df.copy()
df2.head()

Unnamed: 0,id,label,text,clean_text
0,0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think go usf life around though


In [19]:
def split_data(df):
    '''
    this function takes the full dataset and splits it into three parts (train, validate, test) 
    and returns the resulting dataframes
    '''
    train_val, test = train_test_split(df, train_size = 0.8, random_state=123)
    train, validate = train_test_split(train_val, train_size = 0.7, random_state=123)
    return train, validate, test

In [20]:
train, validate, test = split_data(df2)
train.head()

Unnamed: 0,id,label,text,clean_text
5116,5116,ham,Thanks. Fills me with complete calm and reassu...,thanks fill complete calm reassurance
3735,3735,ham,Hows the street where the end of library walk is?,hows street end library walk
2538,2538,ham,The monthly amount is not that terrible and yo...,monthly amount terrible pay anything till 6mon...
3743,3743,ham,Nobody names their penis a girls name this sto...,nobody name penis girl name story ' add
3573,3573,ham,Yeah sure I'll leave in a min,yeah sure ' leave min


In [21]:
train.shape, validate.shape, test.shape

((3119, 4), (1338, 4), (1115, 4))

In [22]:
X_train = train.clean_text
y_train = train.label
X_val = validate.clean_text
y_val = validate.label
X_test = test.clean_text
y_test = test.label

### CountVectorizer (ngram range 1,2) and Decision Tree - Max depth exploration

Max Depth = 60

In [26]:
#Train data
cv = CountVectorizer(ngram_range=(1, 2))
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=60)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

1.0

In [27]:
#Validate data
X_bow_val = cv.transform(X_val)
tree.score(X_bow_val, y_val)

0.9603886397608371

Max Depth = 50

In [34]:
cv = CountVectorizer(ngram_range=(1, 2))
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=50)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

0.9987175376723308

In [35]:
X_bow_val = cv.transform(X_val)
tree.score(X_bow_val, y_val)

0.9626307922272048

Max Depth = 40

In [36]:
cv = CountVectorizer(ngram_range=(1, 2))
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=40)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

0.9958319974350753

In [37]:
X_bow_val = cv.transform(X_val)
tree.score(X_bow_val, y_val)

0.9603886397608371

Max Depth = 30

In [38]:
cv = CountVectorizer(ngram_range=(1, 2))
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=30)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

0.9923052260339853

In [39]:
X_bow_val = cv.transform(X_val)
tree.score(X_bow_val, y_val)

0.9603886397608371

Max Depth = 20

In [40]:
cv = CountVectorizer(ngram_range=(1, 2))
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=20)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

0.9846104520679705

In [41]:
X_bow_val = cv.transform(X_val)
tree.score(X_bow_val, y_val)

0.9573991031390134

Max Depth = 10

In [42]:
cv = CountVectorizer(ngram_range=(1, 2))
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=10)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

0.9621673613337608

In [43]:
X_bow_val = cv.transform(X_val)
tree.score(X_bow_val, y_val)

0.952914798206278

**Best Model - Max Depth = 50**

In [45]:
#test Data
cv = CountVectorizer(ngram_range=(1, 2))
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=50)
tree.fit(X_bow, y_train)

X_bow_test = cv.transform(X_test)
tree.score(X_bow_test, y_test)

0.957847533632287

---

### TfidfVectorizer (ngram range 1,2) and Decision Tree - Max depth evaluation

Max Depth = 60

In [72]:
tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=60)
tree.fit(X_tfidf, y_train)
tree.score(X_tfidf, y_train)

1.0

In [73]:
X_tfidf_val = tfidf.transform(X_val)
tree.score(X_tfidf_val, y_val)

0.952914798206278

Max Depth = 50

In [74]:
tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=50)
tree.fit(X_tfidf, y_train)
tree.score(X_tfidf, y_train)

1.0

In [75]:
X_tfidf_val = tfidf.transform(X_val)
tree.score(X_tfidf_val, y_val)

0.9521674140508222

Max Depth = 40

In [76]:
tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=40)
tree.fit(X_tfidf, y_train)
tree.score(X_tfidf, y_train)

0.9967938441808272

In [77]:
X_tfidf_val = tfidf.transform(X_val)
tree.score(X_tfidf_val, y_val)

0.952914798206278

Max Depth = 30

In [78]:
tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=30)
tree.fit(X_tfidf, y_train)
tree.score(X_tfidf, y_train)

0.9932670727797371

In [79]:
X_tfidf_val = tfidf.transform(X_val)
tree.score(X_tfidf_val, y_val)

0.9521674140508222

Max Depth = 20

In [80]:
tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=20)
tree.fit(X_tfidf, y_train)
tree.score(X_tfidf, y_train)

0.9871753767233088

In [81]:
X_tfidf_val = tfidf.transform(X_val)
tree.score(X_tfidf_val, y_val)

0.9521674140508222

---