In [1]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from datetime import timedelta
import os
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import Phrases
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from itertools import islice
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import ElasticNet
from sklearn.feature_extraction.text import TfidfTransformer
%matplotlib inline

In [2]:
os.chdir("../data/trainingandtestdata")

In [27]:
tweets = pd.read_csv('training.csv', encoding='latin-1', header=None)
tweets.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
del tweets['id']
del tweets['date']
del tweets['user']
del tweets['query']

In [33]:
stemmer = SnowballStemmer("english")
tokenizer = TweetTokenizer()
df_base = pd.DataFrame(tweets['text'])

df_base['text_token'] = df_base["text"].apply(tokenizer.tokenize)
df_base['text_token'] = df_base['text_token'].apply(lambda x: [unicode(y.encode("utf-8"), errors='ignore') for y in x])
df_base['text_stemmed'] = df_base["text_token"].apply(lambda x: [stemmer.stem(y) for y in x])

vectorizer = TfidfVectorizer(min_df=.0001, max_df=.4, stop_words='english', ngram_range=(1,2))

In [34]:
df_base['text_stemmed']=df_base["text_stemmed"].apply(lambda x: [filter(None, y) for y in x])
df_base['text_processed']=df_base['text_stemmed'].apply(' '.join)
train_vecs = vectorizer.fit_transform(df_base['text_processed'])

In [35]:
classifier_nb = MultinomialNB()
classifier_rf = ExtraTreesClassifier()
classifier_gb = GradientBoostingClassifier()
classifier_en = ElasticNet(alpha=0.1, l1_ratio=0.7)

In [36]:
classifier_nb.fit(train_vecs, tweets['sentiment'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
classifier_rf.fit(train_vecs, tweets['sentiment'])

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [8]:
classifier_gb.fit(train_vecs, tweets['sentiment'])

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [9]:
classifier_en.fit(train_vecs, tweets['sentiment'])

ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.7,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [37]:
tweets_test = pd.read_csv('test.csv', encoding='latin-1', header=None)
tweets_test.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
del tweets_test['id']
del tweets_test['date']
del tweets_test['user']
del tweets_test['query']

In [38]:
df_base_test = tweets_test['text']
test_vecs = vectorizer.transform(df_base_test)

In [39]:
prediction_nb = classifier_nb.predict(test_vecs)
#prediction_rf = classifier_rf.predict(test_vecs)
#prediction_gb = classifier_gb.predict(test_vecs)
#prediction_en = classifier_en.predict(test_vecs)

In [40]:
tweets_test['pred_nb'] = prediction_nb
#tweets_test['pred_rf'] = prediction_rf
#tweets_test['pred_gb'] = prediction_gb
#tweets_test['pred_en'] = prediction_en

In [43]:
tweets_test['success_nb'] = (tweets_test['pred_nb']-tweets_test['sentiment'])**2
#tweets_test['success_rf'] = (tweets_test['pred_rf']-tweets_test['sentiment'])**2
#tweets_test['success_gb'] = (tweets_test['pred_gb']-tweets_test['sentiment'])**2
#tweets_test['success_en'] = (tweets_test['pred_en']-tweets_test['sentiment'])**2

In [44]:
tweets_test['success_nb'].value_counts()

0     269
4     139
16     90
Name: success_nb, dtype: int64

In [16]:
tweets_test['success_rf'].value_counts()

0     286
4     139
16     73
Name: success_rf, dtype: int64

In [17]:
tweets_test['success_gb'].value_counts()

0     228
4     139
16    131
Name: success_gb, dtype: int64

In [18]:
tweets_test['success_en'].value_counts()

4.0    359
0.0    139
Name: success_en, dtype: int64

In [19]:
(tweets_test['pred_nb']==tweets_test['sentiment']).value_counts()

True     294
False    204
dtype: int64

In [21]:
(tweets_test['pred_rf']==tweets_test['sentiment']).value_counts()

True     286
False    212
dtype: int64

In [22]:
(tweets_test['pred_gb']==tweets_test['sentiment']).value_counts()

False    270
True     228
dtype: int64

In [23]:
(tweets_test['pred_en']==tweets_test['sentiment']).value_counts()

False    359
True     139
dtype: int64

In [47]:
import pickle
from sklearn.externals import joblib

In [57]:
os.chdir("../../NotablyLoftPotential")
filename = 'finalized_sentiment_model.pkl'
model_pkl = open(filename, 'wb')
pickle.dump(classifier_nb, model_pkl)
model_pkl.close()

'/Users/asimonoff/Documents/Capstone'