In [None]:
import pandas as pd
import gzip
import json

def parse(path):
  """Yield one decoded JSON value per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file in which every non-blank line is a
      complete JSON document.

  Yields:
    The result of ``json.loads`` for each non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager releases the file handle once the generator is
  # exhausted or closed; previously the handle was never closed.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # A whitespace-only line (e.g. a stray trailing newline) carries no
      # record and would make json.loads raise, so skip it.
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Each record is keyed by its 0-based position in the stream; those
  positions become the row index of the returned frame.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

# Load the full review dump into a single DataFrame.
# NOTE(review): absolute path under /content/drive -- presumably a Google
# Drive mount in Colab; confirm the file is available there before running.
df = getDF('/content/drive/MyDrive/Video_Games_5.json.gz')

In [None]:
# Quick sanity check of the load: column labels first, then (rows, columns).
print(df.columns)
print(df.shape)

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'style', 'image'],
      dtype='object')
(497577, 12)


In [None]:
# Per-product aggregates, keyed by "asin".
# `count` holds, for every other column, the number of non-null values per product.
count = df.groupby("asin", as_index=False).count()
# numeric_only=True restricts the average to numeric columns. The frame also
# carries text/dict/list columns, and pandas >= 2.0 raises TypeError when asked
# to average those instead of silently dropping them.
mean = df.groupby("asin", as_index=False).mean(numeric_only=True)

# Attach the per-product counts to every review row. Columns present on both
# sides get the "_x" (review row) / "_y" (per-product count) suffixes.
dfMerged = pd.merge(df, count, how='right', on=['asin'])

# Friendlier aliases for the three merged columns used below.
dfMerged["totalReviewers"] = dfMerged["reviewerID_y"]
dfMerged["overallScore"] = dfMerged["overall_x"]
dfMerged["summaryReview"] = dfMerged["summary_x"]

dfNew = dfMerged[['asin','summaryReview','overallScore',"totalReviewers"]]

# Keep only products with at least 50 counted reviews.
dfMerged = dfMerged.sort_values(by='totalReviewers', ascending=False)
dfCount = dfMerged[dfMerged.totalReviewers >= 50]

# Same per-product averages as `mean`; copied rather than recomputed so the
# two names remain independent objects.
dfProductReview = mean.copy()
# One row per product: the list of all its review summaries.
ProductReviewSummary = dfCount.groupby("asin")["summaryReview"].apply(list)
ProductReviewSummary = pd.DataFrame(ProductReviewSummary)

In [None]:
import re
import string
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer

# Write the per-product summary lists to disk and read them straight back.
# NOTE(review): after the round trip "summaryReview" is presumably the text
# form of each list rather than a list object -- the cleaning step that
# follows treats it as a string.
ProductReviewSummary.to_csv("ProductReviewSummary.csv")
df3 = (
    pd.read_csv("ProductReviewSummary.csv")
    # Bring in the per-product averages; only products present on both sides survive.
    .merge(dfProductReview, on="asin", how='inner')
    [['asin','summaryReview','overall']]
)

# Matches any run of one or more characters outside lowercase a-z.
regEx = re.compile('[^a-z]+')
def cleanReviews(reviewText):
    """Return ``reviewText`` lowercased and reduced to space-separated a-z words.

    Every run of characters other than a-z (after lowercasing) becomes a
    single space; leading and trailing whitespace is then removed.
    """
    return regEx.sub(' ', reviewText.lower()).strip()

# Normalised text for every product's combined review summaries.
df3["summaryClean"] = df3["summaryReview"].apply(cleanReviews)
# NOTE(review): this keeps only the last product for each distinct average
# rating, discarding every other product that shares that exact average.
# Confirm this is intended rather than de-duplicating on "asin".
df3 = df3.drop_duplicates(['overall'], keep='last')
# Renumber rows 0..n-1 (the previous index is kept as an "index" column) so
# row positions and index labels agree from here on.
df3 = df3.reset_index()

# Bag-of-words counts over the 300 most frequent non-stop-word terms.
reviews = df3["summaryClean"]
countVector = CountVectorizer(max_features = 300, stop_words='english')
transformedReviews = countVector.fit_transform(reviews)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on releases that
# predate the new one.
if hasattr(countVector, "get_feature_names_out"):
    featureNames = countVector.get_feature_names_out()
else:
    featureNames = countVector.get_feature_names()

# toarray() is the explicit, documented way to densify a SciPy sparse result
# (the `.A` shorthand is not available on every sparse container).
dfReviews = DataFrame(transformedReviews.toarray(), columns=featureNames)
dfReviews = dfReviews.astype(int)

In [None]:
import numpy as np

# Dense feature matrix, one row per product in dfReviews.
X = np.array(dfReviews)

# Fraction of rows (taken from the top, in existing order) used for training.
tpercent = 0.9
tsize = int(np.floor(tpercent * len(dfReviews)))

# Cut once at row `tsize`: everything before it trains, the rest tests.
dfReviews_train, dfReviews_test = np.split(X, [tsize])

lentrain = dfReviews_train.shape[0]
lentest = dfReviews_test.shape[0]

print(lentrain, lentest)

1594 178


In [None]:
from sklearn.neighbors import NearestNeighbors

# Unsupervised nearest-neighbour index built from the training rows; each
# query returns 3 neighbours.
neighbor = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(dfReviews_train)

# Query the index with the same rows it was fitted on.
# NOTE(review): since the query points are the fitted points, each row's
# closest neighbour is presumably itself at distance 0 -- confirm before
# relying on these arrays.
distances, indices = neighbor.kneighbors(dfReviews_train)

In [None]:
from sklearn import neighbors
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Targets: the per-product average rating cast to int, split at the same
# row positions as the feature matrix.
# NOTE(review): the integer cast drops the fractional part of the average
# (it does not round) -- confirm that is the intended labelling.
df5_train_target = df3["overall"].iloc[:lentrain].astype(int)
df5_test_target = df3["overall"].iloc[lentrain:lentrain + lentest].astype(int)

# Distance-weighted k-nearest-neighbour classifier over the word-count features.
n_neighbors = 3
knnclf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance').fit(
    dfReviews_train, df5_train_target
)
knnpreds_test = knnclf.predict(dfReviews_test)

# Per-class metrics followed by overall accuracy on the held-out rows.
print(classification_report(df5_test_target, knnpreds_test))
print(accuracy_score(df5_test_target, knnpreds_test))

              precision    recall  f1-score   support

           2       0.00      0.00      0.00         1
           3       0.41      0.33      0.36        40
           4       0.82      0.87      0.84       137

    accuracy                           0.74       178
   macro avg       0.41      0.40      0.40       178
weighted avg       0.72      0.74      0.73       178

0.7415730337078652


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Look up the neighbours of every held-out product in one batched query
# instead of one query per loop iteration. Row i of the result lists the
# positions (within the fitted training rows) of test row i's neighbours,
# nearest first.
test_neighbor_indices = neighbor.kneighbors(dfReviews_test, return_distance=False)

for i in range(lentest):
    # Index the array directly. The previous str(...).strip('[]') -> int(...)
    # round trip relied on NumPy integers printing as bare digits, which is
    # no longer the case on NumPy >= 2 (they print as "np.int64(5)") and so
    # raised ValueError there.
    first_related_product = int(test_neighbor_indices[i][0])
    second_related_product = int(test_neighbor_indices[i][1])

    # Training rows occupy positions 0..lentrain-1 of df3 and test row i sits
    # at lentrain + i; df3's index was reset, so these positions double as labels.
    print ("Based on product reviews, for ", df3["asin"][lentrain + i] ," average rating is ",df3["overall"][lentrain + i])
    print ("The first similar product is ", df3["asin"][first_related_product] ," average rating is ",df3["overall"][first_related_product])
    print ("The second similar product is ", df3["asin"][second_related_product] ," average rating is ",df3["overall"][second_related_product])
    print ("-----------------------------------------------------------")


Based on product reviews, for  B00ZGT8ILI  average rating is  4.547169811320755
The first similar product is  B00LH6CBA8  average rating is  4.098039215686274
The second similar product is  B008OQTS0U  average rating is  4.545454545454546
-----------------------------------------------------------
Based on product reviews, for  B00ZJ20YQO  average rating is  3.7333333333333334
The first similar product is  B002ELCV00  average rating is  3.9186991869918697
The second similar product is  B00LLIQS6U  average rating is  3.7761194029850746
-----------------------------------------------------------
Based on product reviews, for  B00ZJ211Q6  average rating is  4.04054054054054
The first similar product is  B0022TNO7S  average rating is  4.177083333333333
The second similar product is  B00XK9OVSU  average rating is  4.365168539325842
-----------------------------------------------------------
Based on product reviews, for  B00ZJBSCJQ  average rating is  4.337662337662338
The first similar pro