In [41]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import json

### Load in Data

#### Data Class

In [48]:
class Category:
    """Namespace of string labels identifying the product category of a review."""
    ELECTRONICS = "ELECTRONICS"
    
class Sentiment:
    """Namespace of string labels for the three sentiment classes of a review."""
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"

class Review:
    """A single product review together with its derived sentiment label.

    The constructor stores the category, text and score it is given and then
    computes ``sentiment`` from the score via ``get_sentiment``.
    """

    def __init__(self, category, text, score):
        self.category, self.text, self.score = category, text, score
        # Derived once at construction time from the stored score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the Sentiment label that corresponds to ``self.score``.

        Scores of 2 or below are NEGATIVE, a score of exactly 3 is NEUTRAL,
        and every other score (an Amazon rating of 4 or 5) is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        # Amazon review is a 4 or 5
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Wraps a collection of reviews and exposes them as model inputs and labels."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return a list with the ``text`` of every review, in order."""
        texts = []
        for review in self.reviews:
            texts.append(review.text)
        return texts

    def get_x(self, vectorizer):
        """Return ``vectorizer.transform`` applied to the review texts."""
        documents = self.get_text()
        return vectorizer.transform(documents)

    def get_y(self):
        """Return a list with the ``sentiment`` of every review, in order."""
        return list(map(lambda review: review.sentiment, self.reviews))

#### Load in Data

In [33]:
file_name = './data/Electronics_small.json'

# The file is read as JSON Lines: one review object per line, each providing
# the 'reviewText' and 'overall' keys used below.
reviews = []
# JSON text is UTF-8 by specification; naming the encoding keeps the load from
# depending on the platform's default locale encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line at end of file), which
        # json.loads would otherwise reject with a JSONDecodeError.
        if not line.strip():
            continue
        review_json = json.loads(line)
        review = Review(Category.ELECTRONICS, review_json['reviewText'], review_json['overall'])
        reviews.append(review)
        

        

### Data Prep

In [57]:
# Hold out 33% of the reviews for evaluation; the fixed random_state makes
# the split reproducible across runs.
train, test = train_test_split(reviews, test_size=0.33, random_state=42)
train_container, test_container = ReviewContainer(train), ReviewContainer(test)

# Fit the bag-of-words vocabulary on the training text only, so the test set
# is encoded with a vocabulary it did not contribute to.
corpus = train_container.get_text()
vectorizer = CountVectorizer().fit(corpus)

# Encode both splits with the fitted vectorizer and collect their labels.
train_x, train_y = train_container.get_x(vectorizer), train_container.get_y()
test_x, test_y = test_container.get_x(vectorizer), test_container.get_y()

### Classification

In [59]:
# Count how many training labels are POSITIVE — a quick look at class balance.
train_y.count(Sentiment.POSITIVE)


540

In [76]:
# NOTE(review): consider moving this import into the notebook's top import cell
# so all dependencies are declared in one place.
from sklearn import svm

# Support vector classifier with a linear kernel and C=32, fitted on the
# vectorized training text (train_x) and its sentiment labels (train_y).
# NOTE(review): the rationale for C=32 is not recorded here — confirm.
clf = svm.SVC(C=32, kernel='linear')
clf.fit(train_x, train_y)

SVC(C=32, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [81]:
# Hand-written sentences used as a quick sanity check of the trained classifier.
test_set = ['this product sucked', 'this is terrible', 'this is great']
# transform (not fit) so the sentences are encoded with the vocabulary that was
# already fitted on the training corpus.
new_test = vectorizer.transform(test_set)

# NOTE(review): the recorded output labels all three sentences POSITIVE, including
# the two clearly negative ones — possibly class imbalance in the training labels;
# confirm before relying on this model.
clf.predict(new_test)



array(['POSITIVE', 'POSITIVE', 'POSITIVE'], dtype='<U8')

#### Performance

#### Tuning (with grid search)