In [38]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score

import json

### Load In Data

#### Data Class

In [2]:
class Category:
    ELECTRONICS = "ELECTRONICS"
    
class Sentiment:
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"

class Review:
    def __init__(self, category, text, score):
        self.category = category
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: # Amazon review is a 4 or 5
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
    
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_x(self, vectorizer):
        return vectorizer.transform(self.get_text())
    
    def get_y(self):
        return [x.sentiment for x in self.reviews]

#### Load in Data

In [3]:
file_name = './data/Electronics_small.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review_json = json.loads(line)
        review = Review(Category.ELECTRONICS, review_json['reviewText'], review_json['overall'])
        reviews.append(review)

### Data Prep

In [42]:
train, test = train_test_split(reviews, test_size = 0.33, random_state=42)

train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

corpus = train_container.get_text()
# vectorizer = CountVectorizer(binary=True)
# vectorizer.fit(corpus)
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

print("here")

train_x = train_container.get_x(vectorizer)
train_y = train_container.get_y()

test_x = test_container.get_x(vectorizer)
test_y = test_container.get_y()

here


### Classification

In [82]:
from sklearn import svm

clf = svm.SVC(C=1, kernel='linear', gamma='auto')
clf.fit(train_x, train_y)



SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [83]:
test_set = ['great', "wouldn't good good good good great", 'this is not good']
new_test = vectorizer.transform(test_set)

clf.predict(new_test)



array(['POSITIVE', 'POSITIVE', 'POSITIVE'], dtype='<U8')

In [93]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

# print(train_x)
gnb.fit(train_x,train_y)
gnb.score(test_x,test_y)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

#### Performance

In [84]:
y_pred = clf.predict(test_x)

f1_score(test_y, y_pred, average=None)

# for i in range(len(y_pred)):
#     print(y_pred[i], test_y[i])




array([0.        , 0.        , 0.90728477])

In [87]:
test_y.count(Sentiment.NEGATIVE)
len(test_y)

330

In [85]:
clf.score(test_x, test_y)

0.8303030303030303

#### Tuning (with grid search)

In [63]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear', 'rbf'), 'C':[0.1,1,8,16,32,64], 'gamma': ('auto','scale')}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(train_x, train_y)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1, 8, 16, 32, 64],
                         'gamma': ('auto', 'scale'),
                         'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [64]:
clf.score(test_x, test_y)

0.8303030303030303