In [102]:
from __future__ import division
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import enchant
from pymongo import MongoClient
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.naive_bayes import MultinomialNB
import seaborn as sns
import pdb
%matplotlib inline

## Get data

In [82]:
connection = MongoClient()
db = connection.wta

def mongo2PandasClean(mongodb, drop_id=True):
    df = pd.DataFrame(list(mongodb.find()))
    if drop_id:
        del df['_id']
    return df

df = mongo2PandasClean(db.trainingTR)

In [92]:
d = enchant.Dict("en_US")

def dropNonEnglish(row):
    eng = 0
    words = 0
    for word in row.split():
        words += 1
        if d.check(word):
            eng += 1
    return eng/words


df = df[df['Text'].map(dropNonEnglish) > 0.5]
df['Positive'] = [1 if int(r) > 3 else 0 for r in df['Rating']]

In [93]:
X = df['Text']
y = df['Positive']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, )

## Vectorize TRAINING data only

    * Max_df = 0.25
    * max_features = 100

In [94]:
tfidf = TfidfVectorizer(stop_words='english', max_df=0.25, max_features=100).fit(X_train)
X_train = tfidf.transform(X_train)
bag_o_words = tfidf.get_feature_names()

In [107]:
def prettyPrint(model, X_test, y_test, with_words=True):
    print "Accuracy: ", accuracy_score(y_test, model.predict(X_test.toarray()))
    print "Recall: ", recall_score(y_test, model.predict(X_test.toarray()))
    print "Precision: ", precision_score(y_test, model.predict(X_test.toarray()))
    if with_words:
        print "Top 20 Most Important Features: "
        ix = np.argsort(model.feature_importances_)[-10:]
        for i in ix:
            print bag_o_words[i]
    

## Transform test data based on trained model

In [96]:
X_test = tfidf.transform(X_test)

In [100]:
for model in [RandomForestClassifier(), GradientBoostingClassifier(), AdaBoostClassifier()]:
    mod = model.fit(X_train, y_train)
    print str(model)
    prettyPrint(model, X_test, y_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Accuracy:  0.934426229508
Recall:  1.0
Precision:  0.934426229508
Top 20 Most Important Features: 
want
road
time
love
really
place
way
waterfall
nice
trip
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Accuracy:  0.934426229508
Recall:  1.0
Precision:  0.934426229508
Top 20 Most Important Features: 
rocks
love
walk
nice
mi

## Vectorizing again
    * max_df=0.5
    * max_features=100
    * bigrams

In [115]:
X = df['Text']
y = df['Positive']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

tfidf = TfidfVectorizer(stop_words='english', ngram_range=[1,2], max_df=0.5, max_features=100).fit(X_train)
X_train = tfidf.transform(X_train)
bag_o_words = tfidf.get_feature_names()
X_test = tfidf.transform(X_test)
for model in [RandomForestClassifier(), GradientBoostingClassifier(), AdaBoostClassifier()]:
    mod = model.fit(X_train, y_train)
    print str(model)
    prettyPrint(model, X_test, y_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Accuracy:  0.934426229508
Recall:  1.0
Precision:  0.934426229508
Top 20 Most Important Features: 
10
doing
waterfalls
nice
time
really
trip
hike
trail
ll
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Accuracy:  0.926229508197
Recall:  0.991228070175
Precision:  0.933884297521
Top 20 Most Important Features: 
awesome
day h

## Vectorizing again
    * max_df=0.5
    * max_features=100
    * bigrams

In [116]:
X = df['Text']
y = df['Positive']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

tfidf = TfidfVectorizer(stop_words='english', ngram_range=[1,3], max_df=0.5, max_features=150).fit(X_train)
X_train = tfidf.transform(X_train)
bag_o_words = tfidf.get_feature_names()
X_test = tfidf.transform(X_test)
for model in [RandomForestClassifier(), GradientBoostingClassifier(), AdaBoostClassifier()]:
    mod = model.fit(X_train, y_train)
    print str(model)
    prettyPrint(model, X_test, y_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Accuracy:  0.934426229508
Recall:  1.0
Precision:  0.934426229508
Top 20 Most Important Features: 
waterfall
elevation
did
hike
nice
time
long
past
love
trail
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Accuracy:  0.934426229508
Recall:  1.0
Precision:  0.934426229508
Top 20 Most Important Features: 
springs
left
entire


## Attempting with Multinomial Bayes Base For ADAboosted model

http://www.aclweb.org/anthology/S14-2018

In [109]:
X = df['Text']
y = df['Positive']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

tfidf = TfidfVectorizer(stop_words='english', max_df=0.1, max_features=100).fit(X_train)
X_train = tfidf.transform(X_train)
bag_o_words = tfidf.get_feature_names()
X_test = tfidf.transform(X_test)
for model in [AdaBoostClassifier(base_estimator=MultinomialNB()), AdaBoostClassifier()]:
    mod = model.fit(X_train, y_train)
    print str(model)
    prettyPrint(model, X_test, y_test, with_words=False)
    print "--------------------------------------------------------------------------"

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          learning_rate=1.0, n_estimators=50, random_state=None)
Accuracy:  0.934426229508
Recall:  1.0
Precision:  0.934426229508
--------------------------------------------------------------------------
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Accuracy:  0.926229508197
Recall:  0.991228070175
Precision:  0.933884297521
--------------------------------------------------------------------------


## Attempting with Multinomial Bayes Base For ADAboosted model
 with bigrams and trigrams

In [119]:
X = df['Text']
y = df['Positive']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

tfidf = TfidfVectorizer(stop_words='english', max_df=0.1, max_features=100).fit(X_train)
X_train = tfidf.transform(X_train)
bag_o_words = tfidf.get_feature_names()
X_test = tfidf.transform(X_test)
for model in [AdaBoostClassifier(base_estimator=MultinomialNB()), AdaBoostClassifier()]:
    mod = model.fit(X_train, y_train)
    print str(model)
    prettyPrint(model, X_test, y_test, with_words=False)
    print "--------------------------------------------------------------------------"

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          learning_rate=1.0, n_estimators=50, random_state=None)
Accuracy:  0.934426229508
Recall:  1.0
Precision:  0.934426229508
--------------------------------------------------------------------------
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Accuracy:  0.926229508197
Recall:  0.991228070175
Precision:  0.933884297521
--------------------------------------------------------------------------


## Time for some GridSearching!


In [110]:
from sklearn.grid_search import GridSearchCV

In [None]:
AdaBoostClassifier()

In [117]:
X = df['Text']
y = df['Positive']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

tfidf = TfidfVectorizer(stop_words='english', ngram_range=[1,3], max_df=0.1, max_features=100).fit(X_train)
X_train = tfidf.transform(X_train)
bag_o_words = tfidf.get_feature_names()
X_test = tfidf.transform(X_test)
tuning_params = [{'learning_rate': [0.25,0.5,0.75,1], 'n_estimators': [50, 100, 150, 500]}]
clf = GridSearchCV(AdaBoostClassifier(base_estimator=MultinomialNB()), tuning_params, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [50, 100, 150, 500], 'learning_rate': [0.25, 0.5, 0.75, 1]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [118]:
clf.best_params_

{'learning_rate': 0.25, 'n_estimators': 50}

## Pickling time

In [120]:
import cPickle as pickle

In [127]:
X = df['Text']
y = df['Positive']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

tfidf = TfidfVectorizer(stop_words='english', ngram_range=[1,3], max_df=0.1, max_features=100).fit(X_train)
X_train = tfidf.transform(X_train)

with open('../pickle/vectorizer.pkl', 'wb') as fid:
    pickle.dump(tfidf, fid)  
    
model = AdaBoostClassifier(base_estimator=MultinomialNB(), learning_rate=0.25, n_estimators=50)
model = model.fit(X_train, y_train)
with open('../pickle/SAmodel.pkl', 'wb') as fid:
    pickle.dump(model, fid)