In [58]:
# Importing all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [59]:
df = pd.read_csv("kindle_reviews.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [60]:
# Sampling the dataset 
df.describe()

Unnamed: 0.1,Unnamed: 0,overall,unixReviewTime
count,982619.0,982619.0,982619.0
mean,491309.0,4.347801,1378183000.0
std,283657.816418,0.955056,22984760.0
min,0.0,1.0,952214400.0
25%,245654.5,4.0,1365206000.0
50%,491309.0,5.0,1383523000.0
75%,736963.5,5.0,1396051000.0
max,982618.0,5.0,1406074000.0


In [61]:
unique_items_count = df['asin'].nunique()
print(unique_items_count)

61934


In [62]:
len(df)

982619

In [63]:
# Finding the reviewerID of those reviewers who have reviewed a book only once
grouped = df.groupby('reviewerName')
single_reviewers = []
count = 0
for group_name, group_df in grouped:
    if(len(group_df)==1):
        print(group_name)
        single_reviewers.append(group_name)
        
df = df[~df['reviewerName'].isin(single_reviewers)]

# These are the reviews we can delete from our dataset as these reviewers cannot reccommend anyone anything


A book a day
A. Brinskele "Ang"
ANNE JOLiN BOOK BLOG
Amazon Customer "Stacey W"
Amazon Customer "The Book guru"
Amazon Customer "lover of books"
Ande Lyons "Ande Lyons"
Antwan Floyd Sr.
Aramann
Arec
Arizona Paul
Blondie "mino"
Bob Holley
Brigitte Antoinette Ware
Cassandra Sherred "CYSherred"
Cassy Taylor
Cecily's Book Review "cecily bonney"
Chris's Book Addiction
Cocoa
Dannae L.
David P. Ehrlich
Dawn M Blackmon
Days fan
DebSunshine
Deborah Capers
Deep S
Duane Hennessy
Elizabeth Fidler
Good Karma
GreatExpectations
HonestMabel
Horseluver
J. Sabla
James Greenwood
Jessica Hunt "Geniune Reviews"
Jessica Wilson "specialjjjj"
Jo-Ann Eshbach
Judith Ann Abbott
Karie
Katie L. Carroll
Kelley M. Rogers
Kindle Customer "Cats   ^..^"
Kindle Customer "firstloves"
Kris @Imaginary Reads
LED
Lace "pollywog"
Love&amp;amp;amp;amp;amp;amp;amp;amp;amp;Sports
MM Read
Marion Rudnick "Marion Rudnick"
Mary Laprade
Megan C. Christmas "Megan"
Melstan
Michael L. Fowler "rare horror movie fan"
Michael brown
Milce
M

In [64]:
null_counts = df.isnull().sum()
print(null_counts)
df = df.dropna(subset=['reviewText'])
# We can also delete those rows where the review text is 

Unnamed: 0           0
asin                 0
helpful              0
overall              0
reviewText          22
reviewTime           0
reviewerID           0
reviewerName      3816
summary              1
unixReviewTime       0
dtype: int64


In [65]:
column_dtype = df['helpful'].dtype
print(column_dtype)

object


In [66]:
# We can remove those reviews who have a helpfulness rating of 0 and less than 20%
unique_values = df['helpful'].unique()
print(unique_values)

['[0, 0]' '[2, 2]' '[1, 1]' ... '[23, 62]' '[116, 121]' '[56, 80]']


In [67]:
helpless_ratings = []
for i in range (0,len(unique_values)-1):
    chk = eval(unique_values[i])
    if(chk[0]==0):
        helpless_ratings.append(unique_values[i])
        continue
    if((chk[0]/chk[1])<=0.6):
        helpless_ratings.append(unique_values[i])


In [68]:
df = df[~df['helpful'].isin(helpless_ratings)]
len(df)

356008

In [80]:
# Unicode Normalization 
import unicodedata
normalized_text = df['reviewText'].apply(lambda x: unicodedata.normalize('NFC', x))
df['reviewText'] = normalized_text

In [81]:
# Remove unwanted spaces,extra trailing and leading spaces,punctuations,full-stops,inverted commas,special characters
import re
def clean_text(text):
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text) 
    return cleaned_text.strip()
df['reviewText'] = df['reviewText'].apply(lambda x: clean_text(x))


In [83]:
# Removing stopwords from the dataset
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
def remove_stopwords(text):
    tokens = word_tokenize(text)    
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
df['reviewText'] = df['reviewText'].apply(lambda x: remove_stopwords(x))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shrishail\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shrishail\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
