In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

In [2]:
# 80/10/10 train/validation/test split.
# The shuffle uses a fixed random_state (42) so every run produces the same
# split; without it the exported dataset files would differ from run to run.
data = pd.read_csv('data-crawling/final_books_dataset_duplicates_removed.csv')

# Shuffle once, then cut at the 80% and 90% row marks. Positional .iloc slices
# select the same rows np.split(shuffled, [train_end, validate_end]) would, but
# do not go through DataFrame.swapaxes, which pandas has deprecated (np.split
# on a DataFrame emits a FutureWarning on recent pandas versions).
shuffled = data.sample(frac=1, random_state=42)
train_end = int(.8 * len(data))
validate_end = int(.9 * len(data))

# .copy() gives each split its own data, so adding a column to a split later
# does not write into a slice of `shuffled`.
train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()


In [3]:
# Summarise the test split: number of rows, column dtypes and non-null counts.
test.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1825 entries, 5542 to 15795
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   productAsin        1825 non-null   object
 1   ratingScore        1825 non-null   int64 
 2   reviewTitle        1825 non-null   object
 3   reviewReaction     317 non-null    object
 4   reviewDescription  1825 non-null   object
 5   isVerified         1825 non-null   bool  
 6   category           1825 non-null   object
dtypes: bool(1), int64(1), object(5)
memory usage: 101.6+ KB


In [4]:
# Preview the first 10 rows of the test split; the row labels are still the
# shuffled labels from the original dataset, not 0..n-1.
test.head(10)

Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category
5542,670062510,4,Good item,,Good item,True,children
7993,1484707230,5,Great Series üëçüèª,,Absolutely wonderful book series! I ordered th...,True,children
10900,1542046599,5,Engaging Story,,Before You Go is an engaging read with a weave...,True,mystery
7758,1589255518,5,Reading is fundamental,,My grandson loves this story,True,children
4837,385376715,5,perfect gift for my new grand-baby!,11 people found this helpful,"Beautifully illustrated, with healthy & empowe...",True,children
6548,9387779262,5,so nice and helpful,,My kids love it,True,children
4481,B096MWJLNW,5,Amazing,2 people found this helpful,"I loved the cover, hard cover and excellent qu...",True,humor_entertainment
3716,63076098,5,Great gift for your Dave Grohl super fans.,,My wife is a super fan and was so happy to rec...,True,humor_entertainment
10837,1542046599,5,I enjoyed this writer and am eager to read mor...,,Intense story line that pulls you in and keep ...,True,mystery
6140,578629097,5,AMAZING!,,This book is AMAZING! Perfect way to build a c...,True,children


In [5]:
# Re-number the test rows from 0. drop=True discards the old (shuffled) row
# labels instead of first materialising them as an extra "index" column.
test_data = test.reset_index(drop=True)
test_data.head()

Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category
0,670062510,4,Good item,,Good item,True,children
1,1484707230,5,Great Series üëçüèª,,Absolutely wonderful book series! I ordered th...,True,children
2,1542046599,5,Engaging Story,,Before You Go is an engaging read with a weave...,True,mystery
3,1589255518,5,Reading is fundamental,,My grandson loves this story,True,children
4,385376715,5,perfect gift for my new grand-baby!,11 people found this helpful,"Beautifully illustrated, with healthy & empowe...",True,children


In [6]:
# Summarise the training split: number of rows, column dtypes and non-null counts.
# The recorded output shows one missing reviewTitle and one missing
# reviewDescription (14594 non-null out of 14595 rows).
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14595 entries, 217 to 7006
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   productAsin        14595 non-null  object
 1   ratingScore        14595 non-null  int64 
 2   reviewTitle        14594 non-null  object
 3   reviewReaction     2442 non-null   object
 4   reviewDescription  14594 non-null  object
 5   isVerified         14595 non-null  bool  
 6   category           14595 non-null  object
dtypes: bool(1), int64(1), object(5)
memory usage: 812.4+ KB


In [7]:
# Derive a Polarity label from the star rating:
#   ratingScore > 3 -> 1,  ratingScore < 3 -> -1,  ratingScore == 3 -> 0.
# np.sign(ratingScore - 3) produces exactly these values for integer ratings
# and is vectorised, unlike a row-wise .apply(lambda ...).
#
# .assign() returns a new DataFrame, so `train` itself stays without the
# Polarity column. A plain `train_data = train` alias would add the column to
# `train` as well, and `train` is later exported on its own as train.csv.
train_data = train.assign(Polarity=np.sign(train["ratingScore"] - 3))
train_data.head(10)

Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category,Polarity
217,451524934,4,Clear why this is a classic,One person found this helpful,A remarkable book. I sometimes hesitate to rea...,True,humor_entertainment,1
5681,670062510,5,Read it. It's worth it.,,All of my kids had to read this and watch the ...,True,children,1
13138,1542046599,5,DEFINITELY read this,,I am a big fan of books from each characters v...,True,mystery,1
10557,1542046599,1,This Book Is Awful,One person found this helpful,"The story was convoluted, repetitive and borin...",True,mystery,-1
17533,399587683,5,This story was too cute!!!,,"I think I loved this book as much as ""The Wedd...",True,romance,1
563,451526341,1,I never got it because I cancelled the order.,,I never got it because I cancelled the order. ...,True,humor_entertainment,-1
18092,399587683,5,Fun LA based romantic comedy!,2 people found this helpful,I have been waiting to read The Proposal since...,True,romance,1
2929,B0176M3U10,5,Such a good read,,Loved this book. Trigger warning for DA.,True,humor_entertainment,1
2136,1451673310,5,and Social media is destroyed at any temp,One person found this helpful,A warning all the more relevant today and when...,True,humor_entertainment,1
3287,1250181909,5,Order it,,Love this! Makes bettering yourself fun!,True,humor_entertainment,1


In [8]:
# Derive a Polarity label from the star rating:
#   ratingScore > 3 -> 1,  ratingScore < 3 -> -1,  ratingScore == 3 -> 0.
# np.sign(ratingScore - 3) produces exactly these values for integer ratings
# and is vectorised, unlike a row-wise .apply(lambda ...).
#
# .assign() returns a new DataFrame, so `validate` itself stays without the
# Polarity column. A plain `validation_data = validate` alias would add the
# column to `validate` as well, and `validate` is later exported on its own as
# validation.csv.
validation_data = validate.assign(Polarity=np.sign(validate["ratingScore"] - 3))
validation_data.head(10)

Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category,Polarity
5102,399226907,5,Classic book,,"A classic book to teach kids some food, number...",True,children,1
13379,1542046599,4,"This one definitely kept my interest, and kept...",,"This one definitely kept my interest, and kept...",True,mystery,1
11623,1542046599,4,Good read!,,Great story! Interesting how it all comes toge...,True,mystery,1
6599,9387779262,4,Fast delivery,2 people found this helpful,"Fast delivery. I would love to give 5 star, ho...",True,children,1
3764,63076098,5,Amazing stories from a great person,3 people found this helpful,Been a fan for many years and still Dave has t...,True,humor_entertainment,1
16235,1984806734,5,Fabulous!,,Sooo well written! A bit predictable but I lo...,True,romance,1
6410,9387779262,5,Great books,,My son loves these books,True,children,1
15499,1984806734,2,Throw it into a bonfire on an actual beach for...,2 people found this helpful,I read this book to give myself a mindless bre...,False,romance,-1
13705,1542046599,5,Good read!,,I much enjoyed how the characters were intertw...,True,mystery,1
25,451524934,5,1984 and my childhood,6 people found this helpful,"Mindlessly mechanical, very few things move he...",True,humor_entertainment,1


In [9]:
# Re-number the labelled splits from 0, discarding the shuffled row labels.
train_data = train_data.reset_index(drop=True)
validation_data = validation_data.reset_index(drop=True)

# Write each split to its CSV file in the working directory. to_csv is called
# with its defaults, so the DataFrame index is written as the first column.
# NOTE(review): `test_data` (the re-indexed test split) is not exported; only
# `test` with its shuffled row labels is written — confirm this is intended.
exports = {
    'train.csv': train,
    'validation.csv': validate,
    'test.csv': test,
    'train_data_polarity.csv': train_data,
    'validation_data_polarity.csv': validation_data,
}
for csv_name, frame in exports.items():
    frame.to_csv(csv_name)