# Create classes to easily access review text and score

In [2]:
import random

class Sentiment:
    """String labels for the three sentiment classes a review can be given."""

    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"

class Review:
    """A single review: its text, its numeric score, and the derived sentiment label.

    Attributes:
        text: The review body.
        score: The numeric rating attached to the review.
        sentiment: One of the ``Sentiment`` labels, computed once from ``score``
            when the object is built (it is not refreshed if ``score`` is
            reassigned afterwards).
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        Scores up to 2 are NEGATIVE, scores above 2 and up to 3 are NEUTRAL,
        and anything higher is POSITIVE.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        # `<= 3` rather than `== 3`: a fractional score such as 2.5 must not
        # fall through to POSITIVE.
        if self.score <= 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Holds a list of review objects and exposes them as parallel lists."""

    def __init__(self, reviews: list):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in the container's current order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in the container's current order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` in place to equal NEGATIVE and POSITIVE counts.

        Only NEGATIVE and POSITIVE reviews are kept (any other label, e.g.
        NEUTRAL, is dropped). Whichever of the two classes is larger is cut
        down to the size of the smaller one by keeping its first items, and the
        combined list is then shuffled. The counts before and after are printed.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]

        # Trim BOTH classes to the smaller size, so the result is balanced even
        # when negatives outnumber positives.
        keep = min(len(negative), len(positive))
        negative_shrunk = negative[:keep]
        positive_shrunk = positive[:keep]

        self.reviews = negative_shrunk + positive_shrunk
        random.shuffle(self.reviews)  # interleave the two classes

        print(f"Before: {len(negative)} NEG, {len(positive)} POS")
        print(f"After: {len(negative_shrunk)} NEG, {len(positive_shrunk)} POS")

# Import data and append it into a list

In [3]:
import json
import pandas as pd

# NOTE(review): hard-coded absolute local path; point this at your own copy of the dataset.
file_name = r'C:\Users\kgonzales21\Downloads\IE things\IE things\Datasets\Books small 10000.json'.replace("\\", "/")

PREVIEW_COUNT = 2 # how many raw records to echo; printing every record floods the cell output

reviews = [] # filled with one Review(review['reviewText'], review['overall']) per record in the file

# The file is read as JSON Lines: one JSON object per line.
with open(file_name, encoding="utf-8") as f: # explicit encoding instead of the platform default
    for line in f:
        if not line.strip():
            continue # skip blank lines (e.g. a trailing newline) instead of failing in json.loads
        review = json.loads(line) # parse one line into a python dict {'reviewerID': ..., 'reviewText': ..., 'overall': ..., ...}
        if len(reviews) < PREVIEW_COUNT: # show only the first few records so the fields can be inspected
            for key, value in review.items():
                print(f"{key} : {value}")
            print("\n")
        reviews.append(Review(review['reviewText'], review['overall'])) # keep only the text and the score of each record

reviewerID : A1F2H80A1ZNN1N
asin : B00GDM3NQC
reviewerName : Connie Correll
helpful : [0, 0]
reviewText : I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  These are books you won't be disappointed with.
overall : 5.0
summary : Can't stop reading!
unixReviewTime : 1390435200
reviewTime : 01 23, 2014


reviewerID : AI3DRTKCSK4KX
asin : B00A5MREAM
reviewerName : Grandma
helpful : [0, 0]
reviewText : I enjoyed this short book. But it was way way to short ....I can see how easily it would have been to add several chapters.
overall : 3.0
summary : A leaf on the wind of all hallows
unixReviewTime : 1399593600
reviewTime : 05 9, 2014


reviewerID : A3KAKFHY9DAC8A
asin : 

In [4]:
# Inspect one parsed review (index 5) and confirm how many reviews were loaded.
sample_review = reviews[5]
print(f"Comment:  {sample_review.text}")
print(f"Score:  {sample_review.score}")
print(f"Sentiment:  {sample_review.get_sentiment()}") # recomputed from the score
print(f"Sentiment:  {sample_review.sentiment}") # value stored when the Review was built
print(f"Data size:  {len(reviews)}")

Comment:  I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia's trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character's voice on a strong subject and making it so that other peoples story may be heard through Mia's.
Score:  5.0
Sentiment:  POSITIVE
Sentiment:  POSITIVE
Data size:  10000


# Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the Review objects for testing; random_state is fixed so the split is repeatable.
train, test = train_test_split(reviews, test_size=0.33, random_state=42)

# Wrap each split so its texts and sentiments can be pulled out as parallel lists.
train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

In [6]:
print("train: ", len(train)) # number of Review objects in the training split
print("test: ", len(test)) # number of Review objects in the test split

train:  6700
test:  3300


In [7]:
train_container.evenly_distribute() # rebalances train_container.reviews in place: NEGATIVE reviews plus a matching number of POSITIVE ones are kept, NEUTRAL is dropped, and the list is shuffled

Before: 436 NEG, 5611 POS
After: 436 NEG, 436 POS


In [8]:
train_x = train_container.get_text() # training inputs: list of review texts [comment, comment, ...]
train_y = train_container.get_sentiment() # training labels, in the same order as train_x [NEG, POS, ...]

In [9]:
test_container.evenly_distribute() # rebalances test_container.reviews in place, so the test set used below also holds only NEGATIVE and POSITIVE reviews

Before: 208 NEG, 2767 POS
After: 208 NEG, 208 POS


In [10]:
test_x = test_container.get_text() # test inputs: list of review texts
test_y = test_container.get_sentiment() # test labels, in the same order as test_x

In [11]:
# Sanity check: inputs and labels should have the same length within each split.
print(f"Train data: {len(train_x)}, {len(train_y)}")
print(f"Test data: {len(test_x)}, {len(test_y)}")

Train data: 872, 872
Test data: 416, 416


# Feature Extraction (Tokenization)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer() # creates a CountVectorizer instance (a class, not a function); nothing is learned until it is fitted

In [13]:
X_train_vectors = vectorizer.fit_transform(train_x) # fit: learn the vocabulary from train_x; transform: encode each text as a row of token counts (no scaling involved)
print(X_train_vectors.toarray()) # dense copy, for display only
print(vectorizer.get_feature_names_out()) # vocabulary

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['00' '000' '01' ... 'zombies' 'zone' 'zora']


In [14]:
X_test_vectors = vectorizer.transform(test_x) # transform only: the vocabulary learned from train_x is reused, so it is not re-fitted on test data
print(X_test_vectors.toarray()) # dense copy, for display only
print(vectorizer.get_feature_names_out()) # same vocabulary as printed after fitting on train_x

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['00' '000' '01' ... 'zombies' 'zone' 'zora']


# Classification Models

## Linear SVM

In [15]:
from sklearn import svm

# Model
clf_svm = svm.SVC(kernel='linear') # support-vector classifier with a linear kernel
clf_svm.fit(X_train_vectors, train_y) # train on the vectorized training texts and their sentiment labels

# Sample
print(test_x[0])
clf_svm.predict(X_test_vectors[0]) # predict the label of the first test review (row 0 of the vectorized test set)

Right away one could tell that this book was not true historical fiction. The innocent , beautiful, chaste, but learned bath maid enraptured everyone she met including a prince, a doctor, and a brewer. In the meantime religion fights with science and bloodletting fights with medicine.  Many of the characters were so stock it was like shopping at a big box store. However it is a quick read and if you like ripped bodice type of romance novels this will fit the bill.


array(['POSITIVE'], dtype='<U8')

## Decision Tree 

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Model
# random_state pins the estimator's own randomness, so re-running the notebook
# does not change the tree (and its accuracy) for that reason alone.
clf_dec_tree = DecisionTreeClassifier(random_state=42)
clf_dec_tree.fit(X_train_vectors, train_y) # train on the vectorized training texts and their sentiment labels

# Sample
print(test_x[0])
clf_dec_tree.predict(X_test_vectors[0]) # predict the label of the first test review

Right away one could tell that this book was not true historical fiction. The innocent , beautiful, chaste, but learned bath maid enraptured everyone she met including a prince, a doctor, and a brewer. In the meantime religion fights with science and bloodletting fights with medicine.  Many of the characters were so stock it was like shopping at a big box store. However it is a quick read and if you like ripped bodice type of romance novels this will fit the bill.


array(['POSITIVE'], dtype='<U8')

## Naive Bayes

In [17]:
class DenseTransformer():
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    It exposes ``fit`` and ``transform`` so it can be placed in a pipeline
    ahead of an estimator that needs dense input.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X.toarray()`` when ``X`` provides it, otherwise ``X`` unchanged."""
        # Input that is already dense has no toarray(); pass it through
        # instead of raising AttributeError.
        return X.toarray() if hasattr(X, "toarray") else X

In [18]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline # we use pipeline and store all functions inside of it

# Model
# Two steps: 'to_dense' converts the vectorized input with toarray(), then 'classifier' is the GaussianNB model.
pipeline = Pipeline([('to_dense', DenseTransformer()), 
                     ('classifier', GaussianNB())])
pipeline.fit(X_train_vectors, train_y) # train on the vectorized training texts and their sentiment labels

# Sample
print(test_x[0])
pipeline.predict(X_test_vectors[0]) # predict the label of the first test review

Right away one could tell that this book was not true historical fiction. The innocent , beautiful, chaste, but learned bath maid enraptured everyone she met including a prince, a doctor, and a brewer. In the meantime religion fights with science and bloodletting fights with medicine.  Many of the characters were so stock it was like shopping at a big box store. However it is a quick read and if you like ripped bodice type of romance novels this will fit the bill.


array(['POSITIVE'], dtype='<U8')

## Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

# Model
clf_log = LogisticRegression() # logistic regression with the library's default settings
clf_log.fit(X_train_vectors, train_y) # train on the vectorized training texts and their sentiment labels

# Sample
print(test_x[0])
clf_log.predict(X_test_vectors[0]) # predict the label of the first test review

Right away one could tell that this book was not true historical fiction. The innocent , beautiful, chaste, but learned bath maid enraptured everyone she met including a prince, a doctor, and a brewer. In the meantime religion fights with science and bloodletting fights with medicine.  Many of the characters were so stock it was like shopping at a big box store. However it is a quick read and if you like ripped bodice type of romance novels this will fit the bill.


array(['POSITIVE'], dtype='<U8')

# Evaluation Metrics

### Mean Accuracy

In [20]:
# Mean accuracy of each fitted model on the vectorized test set.
for label, model in (
    ("Decision Tree: ", clf_dec_tree),
    ("Naive Bayes: ", pipeline),
    ("SVM: ", clf_svm),
    ("Logistic Regression", clf_log),
):
    print(label, model.score(X_test_vectors, test_y))


Decision Tree:  0.6442307692307693
Naive Bayes:  0.6346153846153846
SVM:  0.7980769230769231
Logistic Regression 0.8173076923076923


### F1 Score

In [21]:
from sklearn.metrics import f1_score

# Per-class F1 on the test set; each printed array is ordered POSITIVE, NEGATIVE, NEUTRAL.
f1_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL]
for label, model in (("SVM: ", clf_svm), ("Logistic Regression: ", clf_log)):
    predictions = model.predict(X_test_vectors)
    print(label, f1_score(test_y, predictions, average=None, labels=f1_labels, zero_division=0))

SVM:  [0.8028169  0.79310345 0.        ]
Logistic Regression:  [0.82325581 0.81094527 0.        ]


# Testing out the model

In [24]:
# Interactive check: this cell waits for a typed comment, so it will pause a "Run All".
prototype_data = [input("Enter a comment: ")] # wrapped in a list, like the lists of texts vectorized earlier
new_test = vectorizer.transform(prototype_data) # encode with the vocabulary already fitted on the training texts

print(prototype_data)
clf_svm.predict(new_test) # last expression of the cell, so the predicted label is displayed

['I enjoyed reading it. The plot was great and it caught my attention']


array(['POSITIVE'], dtype='<U8')

# Saving our Model

In [83]:
import pickle

# NOTE(review): hard-coded absolute local path; adjust before running on another machine.
model_path = r'C:\Users\kgonzales21\Downloads\IE things\IE things\ML Models/Sentiment_Classifier.pkl'.replace("\\", "/")

# The classifier only accepts input encoded by the fitted `vectorizer`, so a
# fresh session needs that object too. Save it in a sibling file; the
# classifier file keeps its original name and content.
vectorizer_path = model_path.replace("Sentiment_Classifier.pkl", "Sentiment_Vectorizer.pkl")

with open(model_path, 'wb') as f:
    pickle.dump(clf_svm, f)

with open(vectorizer_path, 'wb') as f:
    pickle.dump(vectorizer, f)

# Load Model

In [84]:
# Read the pickled classifier back from model_path.
# NOTE: pickle.load can run arbitrary code while loading, so only open files you created or trust.
with open(model_path, 'rb') as f:
    loaded_clf_svm = pickle.load(f)

In [85]:
# Predict with the reloaded classifier (the loading itself happens in the previous cell).
# The typed comment is encoded with the in-memory `vectorizer`, so that variable must still be defined.

prototype_data = [input("Enter a comment: ")] # interactive: waits for a typed comment
new_test = vectorizer.transform(prototype_data) # encode with the vocabulary fitted on the training texts

loaded_clf_svm.predict(new_test) # last expression of the cell, so the predicted label is displayed

array(['NEGATIVE'], dtype='<U8')