In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from get_twitter import *
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from text_normalizer import *

In [2]:
# Load manually tagged tweets
df = pd.read_csv('../data/tagged_tweets.csv')
df.head()

Unnamed: 0,tweet_id,tweet,revelant,sighting,lost_pet,coyote_death,eating,pet_coyote_interaction,traffic,howling
0,145595200000000.0,@OnlineAlison Several comments later… “I feed ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,145595200000000.0,@johnlevenstein If we hadn’t defunded police t...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,145593300000000.0,@TheOGpianoGeek That her COYOTE dues are late...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,145593200000000.0,Hey Coyotes! There is a virtual PFC meeting ne...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,145593000000000.0,@DunkaPeacecraft The Dems have been in power f...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
X = df['tweet']
y = df['revelant']

# Find best hyperparameters

In [4]:
# Initialize CountVectorizer and Naive Bayes object to pass into a pipeline.
# Pipeline does the following:
# Tweets -> CountVectorizer to get word count -> Naive Bayes model to predict relevancy
count_vec = CountVectorizer()
nb = MultinomialNB()
pipeline = Pipeline(steps=[('wordcount', count_vec), ('bayes', nb)])
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'wordcount', 'bayes', 'wordcount__analyzer', 'wordcount__binary', 'wordcount__decode_error', 'wordcount__dtype', 'wordcount__encoding', 'wordcount__input', 'wordcount__lowercase', 'wordcount__max_df', 'wordcount__max_features', 'wordcount__min_df', 'wordcount__ngram_range', 'wordcount__preprocessor', 'wordcount__stop_words', 'wordcount__strip_accents', 'wordcount__token_pattern', 'wordcount__tokenizer', 'wordcount__vocabulary', 'bayes__alpha', 'bayes__class_prior', 'bayes__fit_prior'])

In [6]:
# Different hyper-parameters to try
params = {
    'wordcount__input': ['content'],
    'wordcount__decode_error': ['ignore'],
    'wordcount__preprocessor': [tweet_preprocessor],
    'wordcount__ngram_range': [(1,2), (1,1), (2,2)],
    'wordcount__stop_words': ['english'],
    'wordcount__lowercase': [True, False],
    'wordcount__strip_accents': ['ascii'],
    'wordcount__min_df': [0, 1, 2, 3],
    'bayes__alpha': [0, 1, 2],
    'memory': ['wordcount']
}

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((337,), (85,), (337,), (85,))

In [8]:
# Find best hyper-parameters for CountVectorizer() and MultinomialNB()
search = GridSearchCV(pipeline, param_grid=params, n_jobs=-1)

In [9]:
search.fit(x_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('wordcount', CountVectorizer()),
                                       ('bayes', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'bayes__alpha': [0, 1, 2], 'memory': ['wordcount'],
                         'wordcount__decode_error': ['ignore'],
                         'wordcount__input': ['content'],
                         'wordcount__lowercase': [True, False],
                         'wordcount__min_df': [0, 1, 2, 3],
                         'wordcount__ngram_range': [(1, 2), (1, 1), (2, 2)],
                         'wordcount__preprocessor': [<function tweet_preprocessor at 0x7f9fda8f8430>,
                                                     <function tweet_preprocessor_lowercase at 0x7f9fda8f83a0>],
                         'wordcount__stop_words': ['english'],
                         'wordcount__strip_accents': ['ascii']})

In [10]:
best_pipeline = search.best_estimator_

In [11]:
search.best_score_

0.8485952589991219

In [13]:
# Best hyper-parameters
search.best_params_

{'bayes__alpha': 2,
 'memory': 'wordcount',
 'wordcount__decode_error': 'ignore',
 'wordcount__input': 'content',
 'wordcount__lowercase': True,
 'wordcount__min_df': 2,
 'wordcount__ngram_range': (1, 2),
 'wordcount__preprocessor': <function text_normalizer.tweet_preprocessor(tweet: str) -> str>,
 'wordcount__stop_words': 'english',
 'wordcount__strip_accents': 'ascii'}

In [14]:
search.best_score_

0.8485952589991219

In [15]:
# Compare on test set
best_pipeline.score(x_test, y_test)

0.8823529411764706

In [16]:
# Sample tweets to see how model does
tweets = ['sports and balls ball ball ball', 'I like wolfs', 'The coyote attacked my dog']

In [17]:
best_pipeline.predict(tweets)

array([0., 0., 1.])

# Save model

In [18]:
import joblib

In [19]:
with open('bayes_pipeline.pkl', 'wb') as f:
    joblib.dump(best_pipeline, f)

---------------

{'bayes__alpha': 1,
 'wordcount__decode_error': 'ignore',
 'wordcount__input': 'content',
 'wordcount__lowercase': True,
 'wordcount__min_df': 2,
 'wordcount__stop_words': 'english',
 'wordcount__strip_accents': 'ascii',
 'wordcount__tokenizer': <function text_normalizer.normalize_tweet(tweet)>}