In [1]:
import numpy as np
import pandas as pd

from _00_runBuilder import RunBuilder
from IPython.display import display ,clear_output

In [2]:
from _01_baseModel_2 import preProc, get_cross_val_score

from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

In [3]:
# Load the training set. The CSV carries a `fold` column alongside the
# tweet text and target (presumably a precomputed cross-validation fold
# assignment — confirm against whatever script produced train_folds.csv).
data = pd.read_csv('./dataset/train_folds.csv')
# Preview the first rows to sanity-check the columns.
data.head()

Unnamed: 0,keyword,location,text,target,fold
0,,,Our Deeds are the Reason of this #earthquake M...,1,2.0
1,,,Forest fire near La Ronge Sask. Canada,1,3.0
2,,,All residents asked to 'shelter in place' are ...,1,2.0
3,,,"13,000 people receive #wildfires evacuation or...",1,1.0
4,,,Just got sent this photo from Ruby #Alaska as ...,1,3.0


# Raw tweets

In [4]:
# Split the frame into the pieces handed to get_cross_val_score below.
# NOTE: these are module-level names that the *_runs() functions read as
# globals, so rebinding any of them changes what those functions evaluate.
xData = data.text    # raw tweet text (model input)
yData = data.target  # label column (prediction target)
folds = data.fold    # per-row fold id taken from the CSV

## Count Vectorizer

In [5]:
# Baseline: bag-of-n-grams counts (1- to 3-grams, vocabulary capped at 5000)
# fed into a multinomial Naive Bayes classifier, scored with accuracy.
fe     = CountVectorizer(max_features=5000, ngram_range=(1,3))
clf    = MultinomialNB()
metric = accuracy_score

# get_cross_val_score is a project helper; the printed result is a list of
# scores (presumably one per fold — confirm in _01_baseModel_2).
score  = get_cross_val_score(xData, yData, folds, fe, clf, metric)
print(score)

[0.8023637557452397, 0.7890932982917214, 0.7806959947472094, 0.8015768725361366, 0.7971109652002626]


In [6]:
# Search space shared by both vectorizer grid searches below.
hyper_params = {
    'max_features':[100, 1000, 2500, 5000, 7500],  # vocabulary size cap
    'max_ngram':[1,2,3,4],                         # upper bound of ngram_range=(1, max_ngram)
}
# RunBuilder is a project helper; presumably it expands the dict into one
# run per parameter combination (5 x 4 = 20) — confirm in _00_runBuilder.
runs = RunBuilder.get_runs(hyper_params)

In [7]:
def CountVectorizer_runs():
    """Evaluate CountVectorizer + MultinomialNB for every entry in ``runs``.

    Reads the notebook globals ``runs``, ``xData``, ``yData`` and ``folds``.
    For each run, a ``CountVectorizer`` is built with that run's
    ``max_features`` and an n-gram range of ``(1, max_ngram)``; the values
    returned by ``get_cross_val_score`` (with ``accuracy_score`` as metric)
    are averaged into a single ``score``.

    A live table of the results collected so far is shown after each run
    and replaced when the next output arrives.

    Returns:
        list[dict]: one dict per run containing the run's parameters plus
        a ``'score'`` key holding the mean score.
    """
    collected = []
    for params in runs:
        vectorizer = CountVectorizer(
            max_features=params['max_features'],
            ngram_range=(1, params['max_ngram']),
        )
        fold_scores = get_cross_val_score(
            xData, yData, folds, vectorizer, MultinomialNB(), accuracy_score
        )
        collected.append({**params, 'score': np.mean(fold_scores)})

        # Show progress; wait=True defers the clear until new output arrives,
        # so the table stays visible while the next run is computed.
        display(pd.DataFrame(collected))
        clear_output(wait=True)
    clear_output(wait=True)
    return collected

In [8]:
# Grid search on the raw tweets (xData has not been pre-processed yet at this
# point in the notebook), then show the five best-scoring configurations.
results_CountVectorizer = CountVectorizer_runs()
pd.DataFrame(results_CountVectorizer).sort_values('score',ascending=False).head()

Unnamed: 0,max_features,max_ngram,score
16,7500,1,0.799949
12,5000,1,0.79916
17,7500,2,0.798897
13,5000,2,0.796925
18,7500,3,0.796401


## Tfidf Vectorizer

In [9]:
# Same baseline as the CountVectorizer cell, but with TF-IDF weighted
# n-gram features (1- to 3-grams, vocabulary capped at 5000).
# NOTE: this rebinds the globals fe / clf / metric / score from the earlier cell.
fe     = TfidfVectorizer(max_features=5000, ngram_range=(1,3))
clf    = MultinomialNB()
metric = accuracy_score
score  = get_cross_val_score(xData, yData, folds, fe, clf, metric)
print(score)

[0.7918581746552856, 0.7923784494086727, 0.7813525935653316, 0.8015768725361366, 0.7984241628365069]


In [10]:
def TfidfVectorizer_runs():
    """Evaluate TfidfVectorizer + MultinomialNB for every entry in ``runs``.

    Reads the notebook globals ``runs``, ``xData``, ``yData`` and ``folds``.
    Each run supplies ``max_features`` and ``max_ngram``; the latter is used
    as the upper bound of ``ngram_range=(1, max_ngram)``. The values returned
    by ``get_cross_val_score`` (metric: ``accuracy_score``) are averaged and
    stored under ``'score'``.

    The running results table is displayed after every run and cleared once
    newer output is produced.

    Returns:
        list[dict]: the run parameters merged with their mean ``'score'``,
        in the same order as ``runs``.
    """
    rows = []
    for cfg in runs:
        ngram_span = (1, cfg['max_ngram'])
        extractor = TfidfVectorizer(max_features=cfg['max_features'], ngram_range=ngram_span)
        model = MultinomialNB()

        per_fold = get_cross_val_score(xData, yData, folds, extractor, model, accuracy_score)
        row = dict(cfg)
        row['score'] = np.mean(per_fold)
        rows.append(row)

        # Progress table; the deferred clear (wait=True) keeps it on screen
        # until the next display call replaces it.
        display(pd.DataFrame(rows))
        clear_output(wait=True)
    clear_output(wait=True)
    return rows

In [11]:
# TF-IDF grid search on the raw tweets, then the five best configurations.
results_TfidfVectorizer = TfidfVectorizer_runs()
pd.DataFrame(results_TfidfVectorizer).sort_values('score',ascending=False).head()

Unnamed: 0,max_features,max_ngram,score
12,5000,1,0.801788
13,5000,2,0.800867
16,7500,1,0.800474
17,7500,2,0.800342
8,2500,1,0.79916


# Pre-Processing 

In [12]:
# Pre-process the tweets with the project's preProc helper and a Porter stemmer.
#
# The result is derived from the untouched `data.text` column rather than from
# `xData` itself, so re-running this cell never pre-processes text that was
# already pre-processed (the cell is idempotent). On a fresh top-to-bottom run
# the outcome is unchanged, because `xData` is bound to `data.text` earlier.
#
# `xData` is deliberately rebound: the *_runs() functions read it as a global,
# so every grid search executed after this cell sees the pre-processed tweets.
stemmer = PorterStemmer()
xData = data.text.map(lambda tweet: preProc(tweet, stemmer))

In [13]:
# Repeat the CountVectorizer grid search. The function reads the global xData,
# which the pre-processing cell above has rebound to the pre-processed tweets.
results_CountVectorizer_proc = CountVectorizer_runs()
pd.DataFrame(results_CountVectorizer_proc).sort_values('score',ascending=False).head()

Unnamed: 0,max_features,max_ngram,score
13,5000,2,0.803494
17,7500,2,0.803232
18,7500,3,0.799291
14,5000,3,0.798503
19,7500,4,0.797452


In [14]:
# Repeat the TF-IDF grid search on the pre-processed tweets (global xData).
results_TfidfVectorizer_proc = TfidfVectorizer_runs()
pd.DataFrame(results_TfidfVectorizer_proc).sort_values('score',ascending=False).head()

Unnamed: 0,max_features,max_ngram,score
8,2500,1,0.799423
13,5000,2,0.799161
17,7500,2,0.798504
12,5000,1,0.797847
16,7500,1,0.797322
