In [2]:
from __future__ import division
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import re
import enchant
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler
import seaborn as sns
import pdb
%matplotlib inline

In [3]:
connection = MongoClient()
db = connection.wta

def mongo2PandasClean(mongodb, drop_id=True):
    df = pd.DataFrame(list(mongodb.find()))
    if drop_id:
        del df['_id']
    return df

In [4]:
train_df = mongo2PandasClean(db.trainingTR)

In [5]:
train_df.head()

Unnamed: 0,Name,Rating,Text
0,Pine Ridge Trail to Sykes Hot Springs,5,Not much to see but the hot springs are worth ...
1,Pine Ridge Trail to Sykes Hot Springs,3,The proximity to San Francisco is super conven...
2,Pine Ridge Trail to Sykes Hot Springs,1,We hiked in and back in one day. A little ove...
3,Pine Ridge Trail to Sykes Hot Springs,3,This hike is very strenuous. The first 4 mile...
4,Pine Ridge Trail to Sykes Hot Springs,5,"Challenging, beautiful and worth it! go mid we..."


In [6]:
train_df['Rating'].value_counts()

5    369
4    127
3     20
1      4
2      3
Name: Rating, dtype: int64

So imbalanced!  So for this I'm going to use scikit-learn's imbalance package.

https://github.com/scikit-learn-contrib/imbalanced-learn/blob/master/examples/over-sampling/plot_random_over_sampling.py

In order to use this example, I have to complete the following steps:

    1) Check that all of my entries are in English and remove those that aren't
    
    2) Vectorize the text into featues and apply PCA
    
    3) Apply oversampling

### Drop nonEnglish rows

In [7]:
d = enchant.Dict("en_US")

In [8]:
def dropNonEnglish(row):
    eng = 0
    words = 0
    for word in row.split():
        words += 1
        if d.check(word):
            eng += 1
    return eng/words
        

In [9]:
train_df.shape

(523, 3)

In [10]:
train_df = train_df[train_df['Text'].map(dropNonEnglish) > 0.5]

In [11]:
train_df.shape

(487, 3)

In [12]:
train_df['Positive'] = [1 if int(r) > 3 else 0 for r in train_df['Rating']]

### Vectorize Text

In [None]:
tfidf = TfidfVectorizer(stop_words='english', max_df=0.5, max_features=100).fit(train_df['Text'])

In [None]:
features = tfidf.transform(train_df['Text'])

In [None]:
train_df['Positive'].value_counts()

In [None]:
X = features.toarray()
y = train_df['Positive']

In [None]:
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

In [None]:
sns.set()
almost_black = '#262626'
palette = sns.color_palette()
# Apply the random over-sampling
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[0][y == 0], X_vis[1][y == 0], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[1][y == 1], X_vis[1][y == 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('Random over-sampling')

### Resampling with SMOTE

In [13]:
sample = train_df[train_df['Positive'] == 0].index.tolist()
num_samples = len(train_df[train_df['Positive']==1]) - len(train_df[train_df['Positive']==0]) 
ix = np.random.choice(sample, size=num_samples, replace=True)
train_df_equal = train_df.append(train_df.ix[ix])

In [14]:
train_df_equal['Positive'].value_counts()

1    466
0    466
Name: Positive, dtype: int64

In [15]:
X = train_df['Text']
y = train_df['Positive']

## Sentiment Analyzer

https://github.com/zipfian/DSCI6004-instructor/blob/master/week5/5_1_SentimentMultinomialBayes/Text%20classification%20and%20Naive%20Bayes.pdf

http://www.nltk.org/book/ch06.html

![MNB classifier](MNB.png)

In [24]:
import re, math, collections, itertools, os
import nltk, nltk.classify.util, nltk.metrics.scores
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.cross_validation import train_test_split
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist




In [68]:
def trainModel(df):
    pos = df['Text'][df['Positive']==1]
    neg = df['Text'][df['Positive']==0]
    posWords = []
    negWords = []

    for row in pos:
        words = row.lower().split()
        words = [dict([(word.strip(string.punctuation), True) for word in words]), 'pos']
        posWords.append(words)
    for row in neg:
        words = row.lower().split()
        words = [dict([(word.strip(string.punctuation), True) for word in words]), 'neg']
        posWords.append(words)

    posCutoff = int(math.floor(len(posWords)*3/4))
    negCutoff = int(math.floor(len(negWords)*3/4))
    trainFeatures = posWords[:posCutoff] + negWords[:negCutoff]
    testFeatures = posWords[posCutoff:] + negWords[negCutoff:]

    classifier = NaiveBayesClassifier.train(trainFeatures)

    print(nltk.classify.accuracy(classifier, testFeatures))
    print classifier.show_most_informative_features(5)

In [72]:
trainModel(train_df_equal)

1.0
Most Informative Features
                   paper = True              neg : pos    =     39.2 : 1.0
                    test = True              neg : pos    =     33.9 : 1.0
                location = True              neg : pos    =     27.5 : 1.0
                  groups = True              neg : pos    =     24.6 : 1.0
               depending = True              neg : pos    =     24.6 : 1.0
None


0.9568788501026694