In [2]:
#!/usr/bin/env python

import os
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import roc_auc_score as AUC
from Word2VecUtility3 import Word2VecUtility3



In [3]:
# Load the full review dataset and preview the first rows.
data = pd.read_csv(
    'AFF_binarysub100k.csv',
    sep=',',
    index_col=False,
    encoding='utf-8',
)
data.head()

Unnamed: 0,Score,Text
0,0,Greenies tries to position itself as a healthy...
1,0,I can't comment on the other flavors of Silk s...
2,0,I have been giving my dog these treats for yea...
3,0,The Good: The Marley Coffee One Love coffee po...
4,0,Never received the salsa and the vendor never ...


In [4]:
# Split the dataset into a training half and a held-out test half.
# A 0.5 fraction was used to speed up cross-validation times initially.
train_data = data.sample(frac=0.5, random_state=200)
# Everything that was not sampled into the training half becomes the test set.
test_data = data.drop(train_data.index)

# Persist both halves so later cells can reload them from disk.
for frame, path in ((train_data, 'train.csv'), (test_data, 'test.csv')):
    frame.to_csv(path, index=False, sep=',', encoding='utf-8')

In [5]:
# Reload the training half written to disk above and preview it.
df = pd.read_csv(
    'train.csv',
    index_col=False,
)
df.head()

Unnamed: 0,Score,Text
0,1,We first tasted this cheese at Epcot at WDW. I...
1,1,brings enough heat to the table without scorch...
2,0,I normally love this brand of kcups. However ...
3,1,"It is great...I want to buy more, but the pric..."
4,1,I have told any number of people when I can't ...


In [7]:
# Carve the training data into a training part (80%) and a validation part.
# Note: the validation part is stored under the name `test`.
train_i, test_i = train_test_split(
    np.arange(len(df)),
    train_size=0.8,
    random_state=44,
)

# Select rows by position using the shuffled index arrays.
train, test = df.iloc[train_i], df.iloc[test_i]

In [11]:
# Parse the training reviews: run each text through the project helper and
# join the resulting word list back into one space-separated string.
clean_train_reviews = [
    " ".join(Word2VecUtility3.review_to_wordlist(text))
    for text in train['Text']
]

# Parse the validation reviews (the `test` split) the same way.
clean_test_reviews = [
    " ".join(Word2VecUtility3.review_to_wordlist(text))
    for text in test['Text']
]

In [12]:
# Sanity check: number of cleaned reviews in the training and validation splits.
for cleaned in (clean_train_reviews, clean_test_reviews):
    print(len(cleaned))

36960
9241


In [13]:
# Create TF-IDF features over 1- to 3-grams, capped at 5000 features,
# with sublinear term-frequency scaling enabled.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),
    sublinear_tf=True,
)

# Fit the vectorizer on the training reviews only, then reuse the fitted
# vectorizer to transform the validation reviews.
train_data_features = vectorizer.fit_transform(clean_train_reviews)
test_data_features = vectorizer.transform(clean_test_reviews)



In [None]:
# X_train = train_data_features
# X_test = test_data_features

# Model Tuning 

In [14]:
from __future__ import print_function
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn import svm
import logging

In [15]:
# Hyper-parameter grid: a linear-kernel SVC evaluated at four values of C.
parameter_candidates = [
    {'C': [0.1, 0.5, 1, 10], 'kernel': ['linear']},
]

# Grid search with 5-fold cross-validation, using all available cores.
clf = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=parameter_candidates,
    cv=5,
    n_jobs=-1,
)

# Fit on the TF-IDF training features against the training labels.
clf.fit(train_data_features, train['Score'])

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'C': [0.1, 0.5, 1, 10], 'kernel': ['linear']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [19]:
# Pull the grid-search results into named values before reporting them.
best_cv_score = clf.best_score_
best_c = clf.best_estimator_.C

# Best cross-validation score, and the C of the best estimator found.
print('Best score for data:', best_cv_score)
print('Best C:', best_c)

Best score for data: 0.927218614719
Best C: 1


In [20]:
# Score the grid-searched classifier on the validation split
# (the split stored under the name `test`).
clf.score(X=test_data_features, y=test['Score'])

0.92803809111568014

In [29]:
# Fit a standalone SVC on the training split using the hyper-parameters the
# grid search selected. Reading them from clf.best_params_ (rather than
# hand-copying C=1 and the kernel from the printed output) keeps this cell
# consistent with the search if the grid or the data changes on a re-run.
SVM = svm.SVC(**clf.best_params_).fit(train_data_features, train['Score'])

In [28]:
# Score the classifier trained with the best grid-search parameters on the
# validation split. The accuracy gets its own name so that `SVM` stays bound
# to the fitted estimator: the held-out evaluation further down calls
# SVM.score(...), which fails if SVM has been rebound to a float here.
svm_val_accuracy = SVM.score(test_data_features, test['Score'])
svm_val_accuracy

# Testing on Heldout Test Data

In [23]:
# Reload the held-out test half written to disk earlier and preview it.
test_data = pd.read_csv(
    'test.csv',
    index_col=False,
)
test_data.head()


Unnamed: 0,Score,Text
0,0,Greenies tries to position itself as a healthy...
1,0,I can't comment on the other flavors of Silk s...
2,0,I have been giving my dog these treats for yea...
3,0,The Good: The Marley Coffee One Love coffee po...
4,0,Never received the salsa and the vendor never ...


In [32]:
# Number of rows in the held-out test set.
print(test_data.shape[0])

46201


In [34]:
# Parse the held-out reviews with the same helper used for the training
# data, joining each word list into one space-separated string.
clean_test2_reviews = [
    " ".join(Word2VecUtility3.review_to_wordlist(text))
    for text in test_data['Text']
]

In [35]:
# Vectorize the held-out reviews with the already-fitted vectorizer
# (transform only, no refit).
test_data_features2 = vectorizer.transform(clean_test2_reviews)

In [36]:
# Score the classifier trained on the training data against the held-out
# test set.
SVM.score(X=test_data_features2, y=test_data['Score'])

0.93251228328391156