In [1]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from datetime import timedelta
import os
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import Phrases
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from itertools import islice
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import ElasticNet
from sklearn.feature_extraction.text import TfidfTransformer
%matplotlib inline

In [2]:
# Switch the working directory to the dataset folder; the CSV reads in the
# following cells use bare file names relative to it.
data_dir = "../data/trainingandtestdata"
os.chdir(data_dir)

In [3]:
# Load the header-less training file and label its six columns.
tweets = pd.read_csv('training.csv', encoding='latin-1', header=None)
tweets.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
# Keep only 'sentiment' and 'text'; the other four columns are discarded.
tweets = tweets.drop(['id', 'date', 'user', 'query'], axis=1)

In [4]:
# Text preprocessing and TF-IDF feature extraction for the training tweets.
# `stemmer`, `tokenizer` and `vectorizer` are reused later for the test set.
stemmer = SnowballStemmer("english")
tokenizer = TweetTokenizer()
df_base = pd.DataFrame(tweets['text'])

# Split every tweet into a list of tokens.
df_base['text_token'] = df_base["text"].apply(tokenizer.tokenize)
# Strip non-ASCII characters from each token. The encode/decode round trip
# with "ignore" yields the same result as the former
# `unicode(y.encode("utf-8"), errors='ignore')` but does not rely on the
# Python-2-only `unicode` builtin.
df_base['text_token'] = df_base['text_token'].apply(
    lambda x: [y.encode("ascii", "ignore").decode("ascii") for y in x])
df_base['text_stemmed'] = df_base["text_token"].apply(lambda x: [stemmer.stem(y) for y in x])

# Drop tokens that became empty after the ASCII strip. The previous
# `filter(None, y)` iterated over the characters of each token, so it never
# removed empty tokens, and on Python 3 it produced filter objects that
# `' '.join` cannot concatenate.
df_base['text_stemmed'] = df_base["text_stemmed"].apply(lambda x: [y for y in x if y])
df_base['text_processed'] = df_base['text_stemmed'].apply(' '.join)
df_base = df_base[df_base.text_processed.notnull()]
# The classifiers are fitted against tweets['sentiment'], so the feature rows
# must stay aligned one-to-one with `tweets`; fail loudly if rows were dropped.
assert len(df_base) == len(tweets), "preprocessing dropped rows; labels would be misaligned"

# Unigram + bigram TF-IDF; terms in fewer than 0.01% or more than 40% of the
# documents are discarded, as are English stop words.
vectorizer = TfidfVectorizer(min_df=.0001, max_df=.4, stop_words='english', ngram_range=(1,2))
train_vecs = vectorizer.fit_transform(df_base['text_processed'])

In [5]:
# Candidate models for the sentiment labels.
classifier_nb = MultinomialNB()
# NOTE: despite the "rf" suffix this is an ExtraTreesClassifier.
# random_state is fixed so that repeated fits give reproducible results.
classifier_rf = ExtraTreesClassifier(random_state=0)
classifier_gb = GradientBoostingClassifier(random_state=0)
# NOTE(review): ElasticNet is a linear regression model, so predict() returns
# continuous values rather than class labels; confirm that an exact-match
# comparison against the integer sentiment labels is really intended.
classifier_en = ElasticNet(alpha=0.1, l1_ratio=0.7)

In [6]:
# Train Naive Bayes on the TF-IDF matrix against the sentiment column.
classifier_nb.fit(train_vecs, y=tweets['sentiment'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Train the extra-trees model on the same features and labels.
classifier_rf.fit(train_vecs, y=tweets['sentiment'])

In [None]:
# Train the gradient-boosting model on the same features and labels.
classifier_gb.fit(train_vecs, y=tweets['sentiment'])

In [None]:
# Train the elastic-net model on the same features and labels.
classifier_en.fit(train_vecs, y=tweets['sentiment'])

In [7]:
# Load the header-less test file with the same six-column layout as training.
tweets_test = pd.read_csv('test.csv', encoding='latin-1', header=None)
tweets_test.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
# Keep only 'sentiment' and 'text'; the other four columns are discarded.
tweets_test = tweets_test.drop(['id', 'date', 'user', 'query'], axis=1)

In [8]:
# Preprocess the test tweets with the same steps as the training tweets and
# project them into the already-fitted TF-IDF space. Relies on `tokenizer`,
# `stemmer` and `vectorizer` created in the training preprocessing cell.
df_base_test = pd.DataFrame(tweets_test['text'])

df_base_test['text_token'] = df_base_test['text'].apply(tokenizer.tokenize)
# Strip non-ASCII characters from each token. Same result as the former
# `unicode(y.encode("utf-8"), errors='ignore')`, without depending on the
# Python-2-only `unicode` builtin.
df_base_test['text_token'] = df_base_test['text_token'].apply(
    lambda x: [y.encode("ascii", "ignore").decode("ascii") for y in x])
df_base_test['text_stemmed'] = df_base_test["text_token"].apply(lambda x: [stemmer.stem(y) for y in x])
# Drop tokens left empty by the ASCII strip so the test text goes through the
# same empty-token step that the training pipeline has.
df_base_test['text_stemmed'] = df_base_test['text_stemmed'].apply(lambda x: [y for y in x if y])
df_base_test['text_processed'] = df_base_test['text_stemmed'].apply(' '.join)
df_base_test = df_base_test[df_base_test.text_processed.notnull()]
# Predictions are written back onto `tweets_test` row by row, so the feature
# rows must stay aligned with it; fail loudly if any row was dropped.
assert len(df_base_test) == len(tweets_test), "preprocessing dropped rows; predictions would be misaligned"
# transform (not fit_transform): reuse the vocabulary and IDF weights learned
# from the training tweets.
test_vecs = vectorizer.transform(df_base_test['text_processed'])

In [9]:
# Predict sentiment for the test tweets with the fitted Naive Bayes model.
prediction_nb = classifier_nb.predict(X=test_vecs)
# Predictions for the other three models are disabled; their fit cells carry
# no execution count in this notebook.
#prediction_rf = classifier_rf.predict(test_vecs)
#prediction_gb = classifier_gb.predict(test_vecs)
#prediction_en = classifier_en.predict(test_vecs)

In [10]:
# Attach the Naive Bayes predictions to the test frame as a new column.
tweets_test.loc[:, 'pred_nb'] = prediction_nb
# Columns for the other models stay disabled together with their predictions.
#tweets_test['pred_rf'] = prediction_rf
#tweets_test['pred_gb'] = prediction_gb
#tweets_test['pred_en'] = prediction_en

In [11]:
# Squared gap between predicted and true label: 0 is an exact match, larger
# values mean the prediction is further from the label.
# NOTE(review): a value of 4 means prediction and label differ by 2, which
# suggests the test file holds a label the model never predicts - confirm.
tweets_test['success_nb'] = (tweets_test['pred_nb'] - tweets_test['sentiment']).pow(2)
#tweets_test['success_rf'] = (tweets_test['pred_rf']-tweets_test['sentiment'])**2
#tweets_test['success_gb'] = (tweets_test['pred_gb']-tweets_test['sentiment'])**2
#tweets_test['success_en'] = (tweets_test['pred_en']-tweets_test['sentiment'])**2

In [12]:
# Distribution of the Naive Bayes squared errors.
tweets_test.success_nb.value_counts()

0     293
4     139
16     66
Name: success_nb, dtype: int64

In [None]:
# 'success_rf' only exists once the commented-out extra-trees prediction and
# scoring lines in the earlier cells are enabled; show nothing instead of
# raising KeyError when it is absent.
tweets_test['success_rf'].value_counts() if 'success_rf' in tweets_test.columns else None

In [None]:
# 'success_gb' only exists once the commented-out gradient-boosting prediction
# and scoring lines in the earlier cells are enabled; show nothing instead of
# raising KeyError when it is absent.
tweets_test['success_gb'].value_counts() if 'success_gb' in tweets_test.columns else None

In [None]:
# 'success_en' only exists once the commented-out elastic-net prediction and
# scoring lines in the earlier cells are enabled; show nothing instead of
# raising KeyError when it is absent.
tweets_test['success_en'].value_counts() if 'success_en' in tweets_test.columns else None

In [15]:
# Count exact matches (True) and misses (False) for Naive Bayes.
tweets_test['pred_nb'].eq(tweets_test['sentiment']).value_counts()

True     293
False    205
dtype: int64

In [None]:
# 'pred_rf' only exists once the commented-out extra-trees prediction lines in
# the earlier cells are enabled; show nothing instead of raising KeyError.
(tweets_test['pred_rf']==tweets_test['sentiment']).value_counts() if 'pred_rf' in tweets_test.columns else None

In [None]:
# 'pred_gb' only exists once the commented-out gradient-boosting prediction
# lines in the earlier cells are enabled; show nothing instead of raising
# KeyError.
(tweets_test['pred_gb']==tweets_test['sentiment']).value_counts() if 'pred_gb' in tweets_test.columns else None

In [None]:
# 'pred_en' only exists once the commented-out elastic-net prediction lines in
# the earlier cells are enabled; show nothing instead of raising KeyError.
(tweets_test['pred_en']==tweets_test['sentiment']).value_counts() if 'pred_en' in tweets_test.columns else None

In [13]:
import pickle
from sklearn.externals import joblib

In [32]:
# Persist the fitted Naive Bayes model and its TF-IDF vectorizer.
# NOTE: the relative chdir makes this cell non-idempotent; running it twice
# moves the working directory a second time.
os.chdir("../../NotablyLoftyPotential")

# protocol=2 is the newest pickle protocol Python 2 can read. The `with`
# blocks guarantee each file is flushed and closed before the next cell
# archives it.
filename = 'NB_sentiment_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_nb, model_file, protocol=2)


filename = 'NB_vectorizer.pkl'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file, protocol=2)

In [33]:
import tarfile

# Pack each pickle into its own gzip-compressed tar archive.
# NOTE: despite the '.pkl.gz' suffix these are tar archives ('w:gz'), so they
# must be opened with tarfile rather than gzip alone.
# The context manager closes the archive even if add() raises.
with tarfile.open('NB_sentiment_model.pkl.gz', 'w:gz') as tar:
    tar.add('NB_sentiment_model.pkl')

with tarfile.open('NB_vectorizer.pkl.gz', 'w:gz') as tar:
    tar.add('NB_vectorizer.pkl')