In [130]:
import pandas as pd

pet_df = pd.read_json('pet_supplies_3000.json', lines=True)

In [131]:
pet_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A14CK12J7C7JRK,1223000893,Consumer in NorCal,"[0, 0]",I purchased the Trilogy with hoping my two cat...,3,Nice Distraction for my cats for about 15 minutes,1294790400,"01 12, 2011"
1,A39QHP5WLON5HV,1223000893,Melodee Placial,"[0, 0]",There are usually one or more of my cats watch...,5,Entertaining for my cats,1379116800,"09 14, 2013"
2,A2CR37UY3VR7BN,1223000893,Michelle Ashbery,"[0, 0]",I bought the triliogy and have tested out all ...,4,Entertaining,1355875200,"12 19, 2012"
3,A2A4COGL9VW2HY,1223000893,Michelle P,"[2, 2]",My female kitty could care less about these vi...,4,Happy to have them,1305158400,"05 12, 2011"
4,A2UBQA85NIGLHA,1223000893,"Tim Isenhour ""Timbo""","[6, 7]","If I had gotten just volume two, I would have ...",3,You really only need vol 2,1330905600,"03 5, 2012"


In [132]:
pd.value_counts(pet_df['helpful'])

[0, 0]      2092
[1, 1]       373
[2, 2]        99
[0, 1]        71
[3, 3]        44
            ... 
[22, 27]       1
[26, 27]       1
[1, 7]         1
[15, 17]       1
[7, 11]        1
Name: helpful, Length: 103, dtype: int64

In [108]:
pd.value_counts(pet_df['overall'])

5    74178
4    20744
3    11907
2     6912
1     6907
Name: overall, dtype: int64

In [109]:
class Sentiment:
    positive = "POSITIVE"
    neutral = "NEUTRAL"
    negative = "NEGATIVE"


class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = Sentiment.negative if score <= 2 else Sentiment.neutral if score <= 3 else Sentiment.positive

In [110]:
import re
import string
import contractions


def clean_text(text):
    text = text.lower()

    # Remove punctuation marks
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove digits from the text
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def fix_text(text):
    # Contractions will expand words such as it's to it is
    text = contractions.fix(text)
    # Call our clean_text function
    text = clean_text(text)

    # Read the stopwords from the stopwords.txt file and remove line breaks
    # The result will be a list of strings
    stopwords = [word.strip() for word in open('stopwords.txt')]
    # Remove all stopwords from the text
    text = ' '.join([word for word in text.split() if word not in stopwords])

    return text

In [111]:
import json

reviews = []

with open('pet_supplies_30000.json', 'r') as in_file:
    for line in in_file:
        data = json.loads(line)
        text = data['reviewText']
        score = data['overall']
        text = fix_text(text)
        reviews.append(Review(text, score))

In [112]:
print(reviews[0].text)

purchased trilogy hoping two cats age would interested yr old cat fascinated minutes pictures came got bored year old watched minutes walked away possible wonderful courtyard full greenery trees one neighbors bird feeder enough going outside prefer real life versus taped version likely pass friend cats much wildlife watch mine


In [113]:
print(reviews[1].score)

5.0


In [114]:
print(reviews[1].sentiment)

POSITIVE


In [115]:
print(len(reviews))

30000


In [116]:
from sklearn.model_selection import train_test_split

training_data, test_data = train_test_split(reviews, test_size=0.33, random_state=42)

In [117]:
print(len(training_data))
print(len(test_data))

20100
9900


In [118]:
print(training_data[0].text)
print('*'*40)
print(test_data[0].text)

use curved glass aquarium whole slew fish breeding crowded situation started three fish least sixty heavy duty filter keeps water clean clear despite huge population aquarium inhabitants saved dayupdate september changed cartridges filter must say despite utterly filthy get water still stays crystal clear cannot think higher accolade gazillion platies green severum cory cat two clown loaches picture health hail tetra whisper
****************************************
kids love food treat meal keep going back bowl look morei love fact red barn never recall great job redbarn


In [119]:
train_x = [review.text for review in training_data]
train_y = [review.sentiment for review in training_data]

test_x = [review.text for review in test_data]
test_y = [review.sentiment for review in test_data]

In [120]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

In [121]:
vectorizer.get_feature_names()



['aa',
 'aaa',
 'aaaand',
 'aafco',
 'aand',
 'aannounce',
 'aaquaclear',
 'aaqueon',
 'aat',
 'ab',
 'aback',
 'abandon',
 'abandoned',
 'abandonment',
 'abandonned',
 'abandons',
 'abate',
 'abated',
 'abbey',
 'abbeys',
 'abby',
 'abdomen',
 'abhor',
 'abhors',
 'abide',
 'abigail',
 'abilities',
 'abilitiessummary',
 'ability',
 'abissell',
 'abit',
 'able',
 'ablebeen',
 'abnormal',
 'abnormally',
 'aboout',
 'abosrbant',
 'abotu',
 'abound',
 'aboundsynergy',
 'abouta',
 'aboutbut',
 'abouti',
 'abouts',
 'aboutthat',
 'aboutthe',
 'aboutthis',
 'aboutupdate',
 'aboutwhat',
 'aboutwill',
 'aboveaverage',
 'abovekaytee',
 'aboverubber',
 'abovetank',
 'aboveupdate',
 'abraded',
 'abrasion',
 'abrasions',
 'abrasive',
 'abscess',
 'absence',
 'absencehowever',
 'absent',
 'absessed',
 'absoloutely',
 'absolute',
 'absolutely',
 'absolutelyrefused',
 'absolutley',
 'absorb',
 'absorbance',
 'absorbancy',
 'absorbant',
 'absorbanti',
 'absorbed',
 'absorbency',
 'absorbent',
 'absorb

In [122]:
train_x_vectors.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [123]:
from sklearn.svm import SVC

# Create a support vector machine classifier model
clf_svm = SVC(kernel='linear')
# Train the model
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

In [124]:
test_vectors = vectorizer.transform(test_x)

clf_svm.score(test_vectors, test_y)

0.7730303030303031

In [125]:
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression Model
clf_log = LogisticRegression(max_iter=1000)
clf_log.fit(train_x_vectors, train_y)

LogisticRegression(max_iter=1000)

In [126]:
clf_log.score(test_vectors, test_y)

0.8066666666666666