### Libraries

In [1]:
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
import json

### Data Class

In [2]:
class Category:
    """String labels for the product categories a review can belong to."""

    ELECTRONICS = "ELECTRONICS"
    BOOKS = "BOOKS"
    CLOTHING = "CLOTHING"
    GROCERY = "GROCERY"
    PATIO = "PATIO"
    
class Sentiment:
    """String labels describing how strongly a review recommends a product."""

    # NOTE(review): this value does not match the attribute name, unlike the
    # other labels -- confirm whether "Very_Suggested" is intentional.
    Highly_Suggested = "Very_Suggested"
    Medium_Suggested = "Medium_Suggested"
    Fairly_Suggested = "Fairly_Suggested"
    Low_Suggested = "Low_Suggested"
    IN_REVIEW = "IN_REVIEW"
    Not_Suggested = "Not_Suggested"
    
class Review:
    """One product review: its category, raw text, score and derived sentiment.

    The sentiment label is computed once, at construction time, from ``score``.
    """

    def __init__(self, category, text, score):
        self.category = category
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the Sentiment label that corresponds to ``self.score``.

        Scores 5, 4, 3, 2 and 1 each map to their own label; any other value
        falls through to ``Sentiment.Not_Suggested``.
        """
        # Checked in order with ==, so 5 and 5.0 are treated alike.
        score_labels = (
            (5, Sentiment.Highly_Suggested),
            (4, Sentiment.Medium_Suggested),
            (3, Sentiment.Fairly_Suggested),
            (2, Sentiment.Low_Suggested),
            (1, Sentiment.IN_REVIEW),
        )
        for rating, label in score_labels:
            if self.score == rating:
                return label
        return Sentiment.Not_Suggested

class ReviewContainer:
    """Wraps a sequence of review objects and exposes them column-wise.

    Each accessor returns a list with one entry per review, in the same
    order as ``self.reviews``.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` attribute of every review."""
        texts = []
        for review in self.reviews:
            texts.append(review.text)
        return texts

    def get_x(self, vectorizer):
        """Return ``vectorizer.transform`` applied to the review texts."""
        documents = self.get_text()
        return vectorizer.transform(documents)

    def get_y(self):
        """Return the ``sentiment`` attribute of every review."""
        sentiments = []
        for review in self.reviews:
            sentiments.append(review.sentiment)
        return sentiments

    def get_category(self):
        """Return the ``category`` attribute of every review."""
        categories = []
        for review in self.reviews:
            categories.append(review.category)
        return categories

### Load Data

In [3]:
# Input files and the category label assigned to every review read from each
# one; the two lists are paired by position.
file_names = ['Electronics_small.json', 'Books_small.json', 'Clothing_small.json', 'Grocery_small.json', 'Patio_small.json']
file_categories = [Category.ELECTRONICS, Category.BOOKS, Category.CLOTHING, Category.GROCERY, Category.PATIO]
assert len(file_names) == len(file_categories), "each file needs exactly one category"

reviews = []
for file_name, category in zip(file_names, file_categories):
    # Each file holds one JSON object per line. Decode explicitly as UTF-8 so
    # the result does not depend on the platform's default encoding.
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of crashing in json.loads.
                continue
            review_json = json.loads(line)
            review = Review(category, review_json['reviewText'], review_json['overall'])
            reviews.append(review)

### Data Preparation

In [4]:
# Hold out 33% of the reviews for evaluation; the fixed seed keeps the split
# identical across runs.
train, test = train_test_split(reviews, test_size=0.33, random_state=42)
train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

# Fit the TF-IDF vectorizer on the training text only, so nothing from the
# held-out reviews influences the learned vocabulary or weights.
corpus = train_container.get_text()
vectorizer = TfidfVectorizer().fit(corpus)

# Features are the TF-IDF vectors of the review text.
train_x = train_container.get_x(vectorizer)
test_x = test_container.get_x(vectorizer)

# Targets are the product categories (not the sentiment labels).
train_y = train_container.get_category()
test_y = test_container.get_category()

### Classification

In [5]:
from sklearn import svm

# Train a linear-kernel support vector classifier on the TF-IDF features,
# using the product categories in train_y as targets.
# NOTE(review): per the scikit-learn docs gamma only applies to the 'rbf',
# 'poly' and 'sigmoid' kernels -- confirm it has no effect here.
clf = svm.SVC(C=16, kernel='linear', gamma='auto')
clf.fit(train_x, train_y)

SVC(C=16, gamma='auto', kernel='linear')

In [6]:
# Ad-hoc sanity check: run a few hand-written phrases through the already
# fitted vectorizer and ask the classifier for a label for each one.
test_set = ["very brilliant", "bad book do not buy", "horrible waste of time"]
new_test = vectorizer.transform(test_set)

clf.predict(new_test)

array(['CLOTHING', 'BOOKS', 'BOOKS'], dtype='<U11')

In [7]:
from sklearn.naive_bayes import GaussianNB
# Baseline for comparison: Gaussian naive Bayes on the same features/targets.
gnb = GaussianNB()

# The TF-IDF features are sparse and are densified before being passed to
# GaussianNB. Use .toarray() (a plain ndarray) instead of .todense() (an
# np.matrix): recent scikit-learn input validation rejects np.matrix.
gnb.fit(train_x.toarray(), train_y)
gnb.score(test_x.toarray(), test_y)

0.8109090909090909

### Performance

In [8]:
# Predict a label for every held-out review with the current clf.
y_pred = clf.predict(test_x)

# average=None asks for one F1 score per class rather than a single aggregate.
f1_score(test_y, y_pred, average=None)

# Debugging aid (disabled): print each prediction next to its true label.
# for i in range(len(y_pred)):
#     print(y_pred[i], test_y[i])

array([0.95111111, 0.89323308, 0.88567294, 0.89891135, 0.91693291])

In [9]:
# Score the fitted classifier on the held-out set (for scikit-learn
# classifiers, score() is documented as mean accuracy).
clf.score(test_x, test_y)

0.9090909090909091

### Tuning with GridSearch

In [10]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: two kernels crossed with five values of C.
parameters = {'kernel':('linear', 'rbf'), 'C':[0.1,1,8,16,32]}
svc = svm.SVC()
# cv=5 requests 5-fold cross-validation for each grid point.
# NOTE: this rebinds clf, replacing the SVC fitted in the earlier cell.
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.1, 1, 8, 16, 32], 'kernel': ('linear', 'rbf')})

In [11]:
# Score the tuned model on the held-out set; clf is the fitted GridSearchCV
# at this point (presumably scoring with its best estimator -- see the
# scikit-learn GridSearchCV.score docs).
clf.score(test_x, test_y)


0.9187878787878788

### Pickle Model

### Save the model

In [12]:
import pickle

# Persist the fitted model to disk.
with open('Amazon_Reviews.pkl', 'wb') as f:
    pickle.dump(clf, f)
    
# Persist the fitted vectorizer to its own file; it is needed alongside the
# model to turn new text into features.
with open('Amazon_Reviews_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

### Load the model

In [13]:
import pickle 

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load pickle files that you created yourself or otherwise trust.
with open('Amazon_Reviews.pkl', 'rb') as f:
    clf = pickle.load(f)

# The vectorizer lives in its own file. Reading 'Amazon_Reviews.pkl' here
# would bind `vectorizer` to the classifier and break vectorizer.transform().
with open('Amazon_Reviews_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
    