In [18]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from utility.get_twitter import *
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from utility.text_normalizer import *
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('../data/tweets_for_tagging.csv')

In [3]:
df.head()

Unnamed: 0,tweet,relevant
0,"Our Mountain Lions are ""Capturing Kid's Hearts...",0
1,Awesome tasty fruit pop treat every Bobcat tod...,0
2,@MattfromKC Idk but he looks like Bobcat Goldt...,0
3,Mountain lion attacks dog; bear chills at loca...,1
4,In Defense of Animals gives #SaveLACougars a g...,1


In [4]:
df.fillna(0, inplace=True)

In [5]:
df['relevant'] = df['relevant'].astype('int64')

In [6]:
df.head()

Unnamed: 0,tweet,relevant
0,"Our Mountain Lions are ""Capturing Kid's Hearts...",0
1,Awesome tasty fruit pop treat every Bobcat tod...,0
2,@MattfromKC Idk but he looks like Bobcat Goldt...,0
3,Mountain lion attacks dog; bear chills at loca...,1
4,In Defense of Animals gives #SaveLACougars a g...,1


In [7]:
# Save
df.to_csv('../data/tweets_for_tagging.csv', index=None)

In [8]:
X = df['tweet']
y = df['relevant']

In [13]:
count_vec = CountVectorizer()
nb = MultinomialNB()
pipeline = Pipeline(steps=[('wordcount', count_vec), ('bayes', nb)])
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'wordcount', 'bayes', 'wordcount__analyzer', 'wordcount__binary', 'wordcount__decode_error', 'wordcount__dtype', 'wordcount__encoding', 'wordcount__input', 'wordcount__lowercase', 'wordcount__max_df', 'wordcount__max_features', 'wordcount__min_df', 'wordcount__ngram_range', 'wordcount__preprocessor', 'wordcount__stop_words', 'wordcount__strip_accents', 'wordcount__token_pattern', 'wordcount__tokenizer', 'wordcount__vocabulary', 'bayes__alpha', 'bayes__class_prior', 'bayes__fit_prior'])

In [14]:
params = {
    'wordcount__input': ['content'],
    'wordcount__decode_error': ['ignore'],
    'wordcount__preprocessor': [tweet_preprocessor],
    'wordcount__ngram_range': [(1,2), (1,1), (2,2)],
    'wordcount__stop_words': ['english'],
    'wordcount__lowercase': [True, False],
    'wordcount__strip_accents': ['ascii'],
    'wordcount__min_df': [0, 1, 2, 3],
    'bayes__alpha': [0.01, 0.1, 0.5, 1, 2],
    'memory': ['wordcount']
}

In [15]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((140,), (36,), (140,), (36,))

In [16]:
# Find best hyper-parameters for CountVectorizer() and MultinomialNB()
search = GridSearchCV(pipeline, param_grid=params, n_jobs=-1)

In [17]:
%%time
search.fit(x_train, y_train)

CPU times: user 3.75 s, sys: 311 ms, total: 4.06 s
Wall time: 3min 19s


GridSearchCV(estimator=Pipeline(steps=[('wordcount', CountVectorizer()),
                                       ('bayes', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'bayes__alpha': [0.01, 0.1, 0.5, 1, 2],
                         'memory': ['wordcount'],
                         'wordcount__decode_error': ['ignore'],
                         'wordcount__input': ['content'],
                         'wordcount__lowercase': [True, False],
                         'wordcount__min_df': [0, 1, 2, 3],
                         'wordcount__ngram_range': [(1, 2), (1, 1), (2, 2)],
                         'wordcount__preprocessor': [<function tweet_preprocessor at 0x7f7f4a8eaa60>],
                         'wordcount__stop_words': ['english'],
                         'wordcount__strip_accents': ['ascii']})

In [19]:
search.best_params_

{'bayes__alpha': 1,
 'memory': 'wordcount',
 'wordcount__decode_error': 'ignore',
 'wordcount__input': 'content',
 'wordcount__lowercase': True,
 'wordcount__min_df': 0,
 'wordcount__ngram_range': (2, 2),
 'wordcount__preprocessor': <function utility.text_normalizer.tweet_preprocessor(tweet: str) -> str>,
 'wordcount__stop_words': 'english',
 'wordcount__strip_accents': 'ascii'}

In [21]:
best_pipline = search.best_estimator_

In [24]:
search.best_params_

{'bayes__alpha': 1,
 'memory': 'wordcount',
 'wordcount__decode_error': 'ignore',
 'wordcount__input': 'content',
 'wordcount__lowercase': True,
 'wordcount__min_df': 0,
 'wordcount__ngram_range': (2, 2),
 'wordcount__preprocessor': <function utility.text_normalizer.tweet_preprocessor(tweet: str) -> str>,
 'wordcount__stop_words': 'english',
 'wordcount__strip_accents': 'ascii'}

In [25]:
search.best_score_

0.8214285714285715

In [26]:
best_pipline.score(x_test, y_test)

0.75

In [27]:
# Log loss on test set
y_hat_test = best_pipline.predict_proba(x_test)[:,1]
log_loss(y_test, y_hat_test)

0.4664119283922401

In [28]:
roc_auc_score(y_test, y_hat_test)

0.8277591973244147

In [29]:
# Save Model
import joblib

In [30]:
with open('bayes_pipeline.pkl', 'wb') as f:
    joblib.dump(best_pipline, f)