### Setting Up Standard Imports

In [31]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Resolve the directory two levels above the current working directory and
# register it on the import path once, so repeated runs do not add duplicates.
module_path = os.path.abspath(os.path.join(*[os.pardir] * 2))
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)
    
import pandas as pd
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import train_test_split


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Importing Clean Data

In [2]:
# Read the cleaned corpus and drop the 'Unnamed: 0' column in one chained
# expression, so no frame is mutated in place.
corpus = pd.read_csv('data/cleaned.csv').drop(columns='Unnamed: 0')
corpus

Unnamed: 0,body,target,company
0,"['wesley', 'i', 'have', 'a', 'g', 'iphone', 'a...",0,apple
1,"['jessedee', 'know', 'about', 'fludapp', 'awes...",2,apple
2,"['swonderlin', 'can', 'not', 'wait', 'for', 'i...",2,apple
3,"['sxsw', 'i', 'hope', 'this', 'year', 'festiva...",0,apple
4,"['sxtxstate', 'great', 'stuff', 'on', 'fri', '...",2,google
...,...,...,...
8158,"['ipad', 'everywhere', 'sxsw', 'link']",2,apple
8159,"['wave', 'buzz', 'rt', 'mention', 'we', 'inter...",1,google
8160,"['google', 'zeiger', 'a', 'physician', 'never'...",1,google
8161,"['some', 'verizon', 'iphone', 'customer', 'com...",1,apple


### Setting Up Train-Test-Split for Modeling

In [3]:
# Features are the 'body' column; labels are the 'target' column.
X, y = corpus['body'], corpus['target']

In [7]:
# Hold out 25% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [11]:
# Split the training portion again (75% fit / 25% validation) so candidate
# models can be compared without touching the test set.
X_t, X_val, y_t, y_val = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.25,
)

### Using Count Vectorizer 

In [12]:
# Count vectorizer with all default options.
cv = CountVectorizer()

In [13]:
# Fit the vectorizer on the training split and wrap the sparse counts in a
# DataFrame whose columns are the vocabulary terms and whose rows keep the
# training labels' index. Arguments are evaluated left to right, so
# fit_transform runs before cv.vocabulary_ is read.
X_t_vec = pd.DataFrame.sparse.from_spmatrix(
    cv.fit_transform(X_t),
    index=y_t.index,
    columns=sorted(cv.vocabulary_),
)

In [15]:
# Transform the validation split with the already-fitted vectorizer (no
# refitting), labelling columns by vocabulary term and rows by the validation
# labels' index.
X_val_vec = pd.DataFrame.sparse.from_spmatrix(
    cv.transform(X_val),
    index=y_val.index,
    columns=sorted(cv.vocabulary_),
)

### First Model -- Multinomial Naive Bayes (with count vectorizer)

In [24]:
# Baseline classifier: multinomial Naive Bayes with default options.
baseline_model = MultinomialNB()

In [38]:
# Fit the baseline on the count-vectorized training split.
baseline_model.fit(X_t_vec, y_t)

In [39]:
# Predict validation labels with the fitted baseline model. The previous name
# `mnb` is not defined anywhere in this notebook and fails on a fresh kernel.
y_hat = baseline_model.predict(X_val_vec)

In [17]:
from sklearn.model_selection import cross_val_score

In [40]:
# Cross-validate the baseline on the training split, using cross_val_score's
# default folds and scoring.
baseline_cv = cross_val_score(baseline_model, X_t_vec, y_t)

In [41]:
# Mean of the per-fold cross-validation scores.
baseline_cv.mean()

0.6416961222888381

In [42]:
# Class proportions in the training labels, as a reference point for a
# majority-class guess.
y_t.value_counts(normalize=True)

1    0.568068
2    0.361359
0    0.070573
Name: target, dtype: float64

If we were to predict the majority class (1) every time, we would have an accuracy of about 57%. Our baseline model has a cross-validated accuracy of about 64%, which is already better than always predicting the majority class.

In [43]:
# Accuracy of the predictions in y_hat against the validation labels.
accuracy_score(y_val, y_hat)

0.6681907250163291