In [12]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from utility.get_twitter import *
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss

In [4]:
# Load manually tagged tweets
df = pd.read_csv('../data/tagged_tweets.csv')
df.head()

Unnamed: 0,tweet_id,tweet,revelant,sighting,lost_pet,coyote_death,eating,pet_coyote_interaction,traffic,howling
0,145595200000000.0,@OnlineAlison Several comments later… “I feed ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,145595200000000.0,@johnlevenstein If we hadn’t defunded police t...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,145593300000000.0,@TheOGpianoGeek That her COYOTE dues are late...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,145593200000000.0,Hey Coyotes! There is a virtual PFC meeting ne...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,145593000000000.0,@DunkaPeacecraft The Dems have been in power f...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
X = df['tweet']
y = df['revelant']

# Find best hyperparameters

In [6]:
# Initialize CountVectorizer and Naive Bayes object to pass into a pipeline.
# Pipeline does the following:
# Tweets -> CountVectorizer to get word count -> Naive Bayes model to predict relevancy
count_vec = CountVectorizer()
nb = MultinomialNB()
pipeline = Pipeline(steps=[('wordcount', count_vec), ('bayes', nb)])
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'wordcount', 'bayes', 'wordcount__analyzer', 'wordcount__binary', 'wordcount__decode_error', 'wordcount__dtype', 'wordcount__encoding', 'wordcount__input', 'wordcount__lowercase', 'wordcount__max_df', 'wordcount__max_features', 'wordcount__min_df', 'wordcount__ngram_range', 'wordcount__preprocessor', 'wordcount__stop_words', 'wordcount__strip_accents', 'wordcount__token_pattern', 'wordcount__tokenizer', 'wordcount__vocabulary', 'bayes__alpha', 'bayes__class_prior', 'bayes__fit_prior'])

In [7]:
from utility.text_normalizer import *

In [8]:
# Different hyper-parameters to try
params = {
    'wordcount__input': ['content'],
    'wordcount__decode_error': ['ignore'],
    'wordcount__preprocessor': [tweet_preprocessor],
    'wordcount__ngram_range': [(1,2), (1,1), (2,2)],
    'wordcount__stop_words': ['english'],
    'wordcount__lowercase': [True, False],
    'wordcount__strip_accents': ['ascii'],
    'wordcount__min_df': [0, 1, 2, 3],
    'bayes__alpha': [0, 1, 2],
    'memory': ['wordcount']
}

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((337,), (85,), (337,), (85,))

In [10]:
# Find best hyper-parameters for CountVectorizer() and MultinomialNB()
search = GridSearchCV(pipeline, param_grid=params, n_jobs=-1)

In [11]:
search.fit(x_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('wordcount', CountVectorizer()),
                                       ('bayes', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'bayes__alpha': [0, 1, 2], 'memory': ['wordcount'],
                         'wordcount__decode_error': ['ignore'],
                         'wordcount__input': ['content'],
                         'wordcount__lowercase': [True, False],
                         'wordcount__min_df': [0, 1, 2, 3],
                         'wordcount__ngram_range': [(1, 2), (1, 1), (2, 2)],
                         'wordcount__preprocessor': [<function tweet_preprocessor at 0x7fe5c9bf0040>],
                         'wordcount__stop_words': ['english'],
                         'wordcount__strip_accents': ['ascii']})

In [13]:
best_pipeline = search.best_estimator_

In [14]:
search.best_score_

0.8488147497805091

In [16]:
# Best hyper-parameters
search.best_params_

{'bayes__alpha': 1,
 'memory': 'wordcount',
 'wordcount__decode_error': 'ignore',
 'wordcount__input': 'content',
 'wordcount__lowercase': True,
 'wordcount__min_df': 2,
 'wordcount__ngram_range': (1, 1),
 'wordcount__preprocessor': <function utility.text_normalizer.tweet_preprocessor(tweet: str) -> str>,
 'wordcount__stop_words': 'english',
 'wordcount__strip_accents': 'ascii'}

In [21]:
y_hat = best_pipeline.predict_proba(x_test)[:,1]
y_hat

array([6.61814984e-02, 3.91921439e-02, 3.33496306e-03, 2.49258160e-01,
       9.41983985e-01, 6.08708933e-02, 6.94038331e-02, 1.56703296e-02,
       5.87431151e-01, 7.47566113e-01, 9.51139572e-01, 8.51713473e-01,
       1.17310184e-01, 3.82652498e-02, 3.04415305e-01, 2.75189484e-01,
       9.70886221e-01, 9.82590299e-01, 2.02122870e-01, 3.61827831e-01,
       5.28528559e-01, 8.25088652e-01, 2.54679035e-01, 4.58305827e-02,
       5.23896974e-04, 1.42923820e-01, 8.10301163e-01, 2.99230928e-01,
       7.69296297e-01, 3.04477872e-01, 1.96197365e-01, 2.38099453e-01,
       8.36376978e-06, 2.49258160e-01, 4.97306061e-04, 1.08361378e-01,
       1.67659956e-02, 1.85536801e-01, 9.99624057e-01, 9.99912774e-01,
       4.06251105e-03, 3.53540663e-01, 2.94169337e-01, 3.12106899e-01,
       5.08221706e-01, 3.50476659e-01, 5.36553976e-02, 2.20646960e-03,
       1.05017416e-02, 4.18647520e-05, 2.49258160e-01, 9.21075802e-01,
       2.19627484e-01, 5.30578411e-01, 3.38867348e-01, 2.25862170e-01,
      

In [22]:
log_loss(y_test, y_hat)

0.41224883226657494

In [23]:
# Compare on test set
best_pipeline.score(x_test, y_test)

0.8588235294117647

In [24]:
# Sample tweets to see how model does
tweets = ['The Coyotes are my favorite team', 'I like wolfs', 'The coyote attacked my dog']

In [25]:
best_pipeline.predict(tweets)

array([0., 0., 1.])

# Save model

In [26]:
import joblib

In [27]:
with open('bayes_pipeline.pkl', 'wb') as f:
    joblib.dump(best_pipeline, f)

---------------

{'bayes__alpha': 1,
 'wordcount__decode_error': 'ignore',
 'wordcount__input': 'content',
 'wordcount__lowercase': True,
 'wordcount__min_df': 2,
 'wordcount__stop_words': 'english',
 'wordcount__strip_accents': 'ascii',
 'wordcount__tokenizer': <function text_normalizer.normalize_tweet(tweet)>}