In [1]:
import numpy as np 
import pandas as pd
import gzip

In [7]:
def parse(path):
  g = gzip.open(path, 'r')
  for l in g:
    yield eval(l)

data = []
for review in parse('reviews_Musical_Instruments_5.json.gz'):
    data.append([review['reviewText'], review['overall']])
df = pd.DataFrame(data, columns=["Review_text", "Review_class"])
df

Unnamed: 0,Review_text,Review_class
0,"Not much to write about here, but it does exac...",5.0
1,The product does exactly as it should and is q...,5.0
2,The primary job of this device is to block the...,5.0
3,Nice windscreen protects my MXL mic and preven...,5.0
4,This pop filter is great. It looks and perform...,5.0
...,...,...
10256,"Great, just as expected. Thank to all.",5.0
10257,I've been thinking about trying the Nanoweb st...,5.0
10258,I have tried coated strings in the past ( incl...,4.0
10259,"Well, MADE by Elixir and DEVELOPED with Taylor...",4.0


In [8]:
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [14]:
def clean_line(text):
    text = text.lower()
    pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = pattern.sub('', text)
    text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
    tokens = word_tokenize(text)
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stop_words = set(stopwords.words("english"))
    stop_words.discard("not")
    PS = PorterStemmer()
    words = [PS.stem(w) for w in words if not w in stop_words]
    words = ' '.join(words)
    return words

def clean_text(df):
    all_reviews = list()
    lines = df["Review_text"].values.tolist()
    for text in lines:
        all_reviews.append(clean_line(text))
    return all_reviews

all_reviews = clean_text(df)

In [15]:
all_reviews[0:20]

['not much write exactli suppos filter pop sound record much crisp one lowest price pop filter amazon might well buy honestli work despit price',
 'product exactli quit affordablei not realiz doubl screen arriv even better expecteda ad bonu one screen carri small hint smell old grape candi use buy reminisc sake not stop put pop filter next nose smell record dif need pop filter work well expens one may even come pleas aroma like mine didbuy product',
 'primari job devic block breath would otherwis produc pop sound allow voic pass notic reduct volum high frequenc doubl cloth filter block pop let voic color metal clamp mount attach mike stand secur enough keep attach goos neck need littl coax stay put',
 'nice windscreen protect mxl mic prevent pop thing gooseneck margin abl hold screen posit requir care posit clamp avoid sag',
 'pop filter great look perform like studio filter record vocal elimin pop get record sing',
 'good bought anoth one love heavi cord gold connector bass sound grea

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
TV = TfidfVectorizer(min_df=3)
X = TV.fit_transform(all_reviews).toarray()
y = (df.to_numpy()[:,1]).astype('double')
print(np.shape(X))
print(np.shape(y))

(10261, 5794)
(10261,)


In [37]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# from sklearn.tree import DecisionTreeClassifier
# model = DecisionTreeClassifier(criterion="entropy", random_state=41)
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score, precision_score
print(accuracy_score(y_test, y_pred))
# print(f1_score(y_test, y_pred))
# print(precision_score(y_test, y_pred))

0.2990745250852411


In [61]:
def sentiment_predictor(text):
    text = clean_line(text)
    # print(text)
    text = TV.transform([text]).toarray()
    return model.predict(text)

In [62]:
inputFile = open("input.txt", "r")
inputText = str(inputFile.read())
print(sentiment_predictor(inputText))

[5.]


In [69]:
sentenceList = []
for sampleText in inputText.split("."):
    sentenceList.append(sentiment_predictor(sampleText)[0])
sentenceList = [int(txt) for txt in sentenceList]
sentenceList

[1, 1, 1, 4, 5, 1, 1, 5, 5, 2, 5, 2, 3, 1, 5, 5, 5, 1, 1, 5, 5, 5, 5, 4, 5, 1]

In [72]:
print("average: " + str(sum(sentenceList) / len(sentenceList)))
from collections import Counter
print(Counter(sentenceList))

average: 3.230769230769231
Counter({5: 12, 1: 9, 4: 2, 2: 2, 3: 1})
