In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
# Load the review dump: the file holds one JSON object per line.
# The context manager closes the file handle as soon as parsing finishes
# (a bare open() inside the comprehension never closes it explicitly).
with open("dataset/Electronics_5.json", "r") as reviews_file:
    electronics_json_data = [json.loads(line) for line in reviews_file]

In [None]:
# Build a DataFrame with one row per parsed review record.
electronics_data = pd.DataFrame(electronics_json_data)

In [None]:
# Preview the first five rows of the raw review frame.
electronics_data.head(5)

In [None]:
# Count how many reviews carry each distinct "overall" value.
electronics_data["overall"].value_counts()

In [None]:
# List the column labels of the frame.
electronics_data.columns

<br><b>Column descriptions, according to the dataset documentation:</b>
<li>reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B</li>
<li>asin - ID of the product, e.g. 0000013714</li>
<li>reviewerName - name of the reviewer</li>
<li>helpful - helpfulness rating of the review, e.g. 2/3</li>
<li>reviewText - text of the review</li>
<li>overall - rating of the product</li>
<li>summary - summary of the review</li>
<li>unixReviewTime - time of the review (unix time)</li>
<li>reviewTime - time of the review (raw)</li>

In [None]:
# Number of null entries in each column.
electronics_data.isnull().sum()

In [None]:
# (rows, columns) of the full frame, before it is cut down to a subset.
electronics_data.shape

In [None]:
# Work on the first 100,000 rows only.
# .copy() makes the subset an independent frame, so the column assignments in the
# following cells do not write into a slice of the full frame (SettingWithCopyWarning).
electronics_data = electronics_data.iloc[:100000].copy()

In [None]:
# (rows, columns) after subsetting.
electronics_data.shape




# Data Wrangling

#### <u>Adding helpful and not-helpful column</u>

In [None]:
# Each "helpful" entry is indexed as a pair: position 0 is stored as the helpful count,
# and (position 1 - position 0) as the not-helpful count.
# NOTE(review): presumably the pair is [helpful_votes, total_votes] - confirm against the data description.
helpful_pairs = electronics_data["helpful"]
list_helpful_rev = [pair[0] for pair in helpful_pairs]
list_not_helpful_rev = [pair[1] - pair[0] for pair in helpful_pairs]

# Votes that found the review useful.
electronics_data["helpful_rev"] = list_helpful_rev
# Votes that did not find the review useful.
electronics_data["not_helpful_rev"] = list_not_helpful_rev

# The pair column is redundant once it has been split into the two count columns.
electronics_data = electronics_data.drop(columns=["helpful"])

In [None]:
# Check the two new vote-count columns on the first five rows.
electronics_data.head(5)

#### <u>Adding rating_class column</u>

In [None]:
# Collapse the "overall" rating into two classes: ratings strictly above the
# threshold are "good"; everything else (the threshold itself included) is "bad".
threshold = 3

# Use the named threshold rather than repeating the literal, so that changing it
# in one place really changes the classification.
electronics_data["rating_class"] = np.where(electronics_data["overall"] > threshold, "good", "bad")
    

In [None]:
# Show five randomly drawn rows to spot-check the new rating_class column.
electronics_data.sample(5)

In [None]:
# Plot and tabulate how many reviews fall into each rating class.
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="darkgrid")
sns.countplot(data=electronics_data, x="rating_class")
plt.show()

# Keep the per-class counts around; the next cell derives a percentage from them.
val = electronics_data["rating_class"].value_counts()
print(val)

In [None]:
# Gap between the good and bad review counts, as a percentage of all reviews.
# Look the counts up by class label: value_counts() is indexed by label, integer keys
# on such a Series rely on a deprecated positional fallback, and positional order
# follows frequency, so it would silently swap the operands if "bad" were the majority.
good_count = val.get("good", 0)
bad_count = val.get("bad", 0)
print("{:.2f}% more percentage of good rev than bad reviews.".format(((good_count - bad_count) / (good_count + bad_count)) * 100))

#### <u>Adding helpful vote ratio for each review</u>

In [None]:
# helpful_rev_ratio = helpful votes / (helpful + not-helpful votes), computed per row.
# Rows with no votes at all produce NaN here.
total_votes = electronics_data["helpful_rev"] + electronics_data["not_helpful_rev"]
electronics_data["helpful_rev_ratio"] = electronics_data["helpful_rev"].div(total_votes)

In [None]:
# Spot-check the new ratio column on two random rows.
electronics_data.sample(2)

In [None]:
# Replace the missing ratios with 0.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which warns on recent pandas and
# leaves the frame unchanged under Copy-on-Write.
electronics_data["helpful_rev_ratio"] = electronics_data["helpful_rev_ratio"].fillna(0)

In [None]:
# Confirm the ratio column on the first two rows after filling.
electronics_data.head(2)

In [None]:
# reviewerName and unixReviewTime are not used in the rest of the notebook.
unused_columns = ["reviewerName", "unixReviewTime"]
electronics_data = electronics_data.drop(columns=unused_columns)

In [None]:
# Parse the raw reviewTime strings into pandas datetime values.
# No explicit format is passed, so pandas infers it from the data.
electronics_data["reviewTime"] = pd.to_datetime(electronics_data["reviewTime"])

In [None]:
# Check the converted reviewTime column on the first two rows.
electronics_data.head(2)

In [None]:
# Rename "overall" to "rating"; every later cell refers to the column as "rating".
electronics_data = electronics_data.rename(columns = {"overall":"rating"})

In [None]:
# Summary statistics for the vote-count, rating and ratio columns.
electronics_data[["helpful_rev", "not_helpful_rev", "rating", "helpful_rev_ratio"]].describe()

# Visualizing Data

#### <u>Number of Reviews over the Years</u>

In [None]:
# Derive the calendar year and month of every review from its parsed timestamp.
review_dates = electronics_data["reviewTime"].dt
electronics_data["rev_year"] = review_dates.year
electronics_data["rev_month"] = review_dates.month

In [None]:
# Reviews per year: count the reviewerID entries within each rev_year group and
# expose the count as a "Number_Of_Reviews" column.
Yearly = (
    electronics_data.groupby("rev_year")["reviewerID"]
    .count()
    .reset_index(name="Number_Of_Reviews")
)
Yearly.head(5)

In [None]:
# Bar chart of the review count per year, with the year labels rotated to vertical.
ax = sns.barplot(data=Yearly, x="rev_year", y="Number_Of_Reviews")
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90)
plt.show()

We see that 2013 has the highest number of reviews.


#### <u>Number Of Reviews By Month</u>

In [None]:
# Reviews per calendar month (all years pooled): count the reviewerID entries within
# each rev_month group and expose the count as a "Number_Of_Reviews" column.
Monthly = (
    electronics_data.groupby("rev_month")["reviewerID"]
    .count()
    .reset_index(name="Number_Of_Reviews")
)
Monthly.head(2)

In [None]:
import calendar

# Replace each month number with its abbreviated month name for nicer axis labels.
# Running this cell a second time fails: the column then holds strings, not numbers.
Monthly["rev_month"] = Monthly["rev_month"].map(lambda month_number: calendar.month_abbr[month_number])
Monthly

In [None]:
# Bar chart of the review count per month, with the month labels rotated to vertical.
ax = sns.barplot(data=Monthly, x="rev_month", y="Number_Of_Reviews")
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90)
plt.show()

#### <u>Average overall rating over years</u>

In [None]:
# Mean rating per review year, exposed as an "avg_rating" column.
Yearly_avg_rating = (
    electronics_data.groupby("rev_year")["rating"]
    .mean()
    .reset_index(name="avg_rating")
)

In [None]:
# Preview the first three yearly averages.
Yearly_avg_rating.head(3)

In [None]:
# Line chart of the mean rating against the review year.
sns.lineplot(x = "rev_year", y = "avg_rating", data = Yearly_avg_rating)

#### <u>Average Rating V/S Average Helpfulness Rating by the User</u>

In [None]:
# Working frame for the per-user statistics.
# .copy() detaches it from electronics_data, so the columns added in the next cells
# are written to an independent frame instead of a column subset of the original
# (which is what raises SettingWithCopyWarning).
user_rating_stats = electronics_data[["reviewerID", "helpful_rev", "not_helpful_rev", "rating"]].copy()

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, including any
# pandas warnings raised by the column assignments in later cells - consider narrowing
# the filter to a specific warning category.
warnings.filterwarnings('ignore')

In [None]:
# Total votes on each review (helpful + not helpful).
user_rating_stats["total_rev_given"] = user_rating_stats["helpful_rev"] + user_rating_stats["not_helpful_rev"]
# Share of those votes that were helpful, in percent (NaN when a review has no votes).
user_rating_stats["percentage_helpful"] = (user_rating_stats["helpful_rev"] / user_rating_stats["total_rev_given"]) * 100
# Mean rating of the reviewer, broadcast back onto each of that reviewer's rows.
# transform("mean") returns a Series aligned with the frame's own index; a plain
# groupby(...).mean() is indexed by reviewerID, so assigning it here matches no
# row label and fills the whole column with NaN.
user_rating_stats["avg_rating_given"] = user_rating_stats.groupby("reviewerID")["rating"].transform("mean")

In [None]:
# Show one randomly drawn row to spot-check the derived columns.
user_rating_stats.sample()

In [None]:
# The raw inputs are no longer needed once the derived columns exist.
source_columns = ["helpful_rev", "not_helpful_rev", "rating"]
user_rating_stats = user_rating_stats.drop(columns=source_columns)

In [None]:

# Replace missing values in the two derived columns with 0.
# Rebind the result instead of calling fillna(inplace=True) on a column selection:
# that form is chained assignment, which warns on recent pandas and leaves the
# frame unchanged under Copy-on-Write.
user_rating_stats = user_rating_stats.fillna({"percentage_helpful": 0, "avg_rating_given": 0})

In [None]:
# Bring the helpful-vote count back in under a new name; helpful_rev itself was dropped
# from user_rating_stats in an earlier cell. The assignment aligns on the row index.
user_rating_stats["helpful_rev_given"] = electronics_data["helpful_rev"]

In [None]:
# Show five random rows of the assembled statistics, in a readable column order.
user_rating_stats[["reviewerID", "avg_rating_given", "total_rev_given", "helpful_rev_given", "percentage_helpful"]].sample(5)

In [None]:
# Scatter of total_rev_given against percentage_helpful, one point per row of user_rating_stats.
sns.scatterplot(x = "total_rev_given", y = "percentage_helpful", data = user_rating_stats)


#### <u>Length of Review V/S Rating Given</u>

In [None]:
# Working frame holding only the review text and its rating class.
# .copy() detaches it from electronics_data, so the length column added in the next
# cell is written to an independent frame instead of a column subset of the original.
cmp_review_text_rating = electronics_data[["reviewText", "rating_class"]].copy()

In [None]:
# Character count of every review text.
# .str.len() is vectorized and yields NaN for a missing text, whereas calling len()
# on each element raises TypeError on the first missing value.
cmp_review_text_rating["reviewText_length"] = cmp_review_text_rating["reviewText"].str.len()

In [None]:
# Preview the first five rows with the new length column.
cmp_review_text_rating.head(5)

In [None]:
# Bar chart of reviewText_length per rating class (seaborn's barplot aggregates with the mean by default).
sns.barplot(x = "rating_class", y = "reviewText_length", data = cmp_review_text_rating )

In [None]:
# Persist the enriched frame as a comma-separated, UTF-8 encoded CSV without the row index.
electronics_data.to_csv("electronics_data_2.csv", sep=',', encoding='utf-8', index = False)