In [1]:
import pandas as pd
import json
import pyspark
import numpy as np
import re
import string
from langdetect import detect

In [2]:
# Relative path to the tab-separated Amazon US "Digital Video Games" review file read in the next cell.
digital_videogames_data_path = 'Data/amazon_reviews_us_Digital_Video_Games_v1_00.tsv'

In [3]:
# Load the tab-separated review file into a DataFrame.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0. Its replacement is
# `on_bad_lines`; 'warn' reproduces the old behaviour (skip the malformed row and emit a warning).
# Requires pandas >= 1.3.
digital_videogames_df = pd.read_table(digital_videogames_data_path, sep='\t', on_bad_lines='warn')

In [4]:
# (rows, columns) of the freshly loaded frame.
digital_videogames_df.shape

(144724, 15)

In [5]:
# Bare expression so the notebook renders the frame; the saved output shows it truncated with an ellipsis row.
digital_videogames_df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,21269168,RSH1OZ87OYK92,B013PURRZW,603406193,Madden NFL 16 - Xbox One Digital Code,Digital_Video_Games,2,2,3,N,N,A slight improvement from last year.,I keep buying madden every year hoping they ge...,2015-08-31
1,US,133437,R1WFOQ3N9BO65I,B00F4CEHNK,341969535,Xbox Live Gift Card,Digital_Video_Games,5,0,0,N,Y,Five Stars,Awesome,2015-08-31
2,US,45765011,R3YOOS71KM5M9,B00DNHLFQA,951665344,Command & Conquer The Ultimate Collection [Ins...,Digital_Video_Games,5,0,0,N,Y,Hail to the great Yuri!,If you are prepping for the end of the world t...,2015-08-31
3,US,113118,R3R14UATT3OUFU,B004RMK5QG,395682204,Playstation Plus Subscription,Digital_Video_Games,5,0,0,N,Y,Five Stars,Perfect,2015-08-31
4,US,22151364,RV2W9SGDNQA2C,B00G9BNLQE,640460561,Saints Row IV - Enter The Dominatrix [Online G...,Digital_Video_Games,5,0,0,N,Y,Five Stars,Awesome!,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144719,US,53011810,R2G7DI8NYXZB5R,B001AUEITS,163061733,Crazy Machines 2 [Download],Digital_Video_Games,4,2,3,N,N,Worked first try for me,"I was worried due to the 2 reviews I saw here,...",2008-12-25
144720,US,53094564,R3QRKP4DS759BP,B001AU6TQ8,801870836,Crazy Machines 1 - The Wacky Contraptions Game...,Digital_Video_Games,1,13,16,N,N,"The Software May be Great, But I'll Never Know",I downloaded this as a Christmas present for m...,2008-12-24
144721,US,37181147,R24K4C0ZC3093U,B001AUEITS,163061733,Crazy Machines 2 [Download],Digital_Video_Games,3,3,3,N,N,Some install problems but good otherwise,The previous reviewer is correct in noting tha...,2008-09-10
144722,US,18614365,R13OA3TRCM8IBM,B001AUEITS,163061733,Crazy Machines 2 [Download],Digital_Video_Games,1,20,22,N,N,Do Not Download This!,I downloaded this for my son's birthday yester...,2008-09-01


In [6]:
# Column dtypes as loaded; in the saved output review_date is a plain object column, not a datetime.
digital_videogames_df.dtypes

marketplace          object
customer_id           int64
review_id            object
product_id           object
product_parent        int64
product_title        object
product_category     object
star_rating           int64
helpful_votes         int64
total_votes           int64
vine                 object
verified_purchase    object
review_headline      object
review_body          object
review_date          object
dtype: object

In [7]:
# Compiled once so the patterns are not rebuilt for every row they are mapped over.
# Raw string: '\w' and '\d' are invalid escapes in a normal string literal and trigger
# a DeprecationWarning (SyntaxWarning on Python 3.12+).
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def alphanumeric(x):
    """Replace every run of word characters that contains a digit with one space.

    Parameters
    ----------
    x : str
        Text to filter.

    Returns
    -------
    str
        ``x`` with each digit-bearing token (e.g. ``'2'``, ``'a3b'``) replaced by ``' '``.
        Surrounding whitespace is left as is, so runs of spaces can appear.
    """
    return _DIGIT_TOKEN_RE.sub(' ', x)


def punc_lower(x):
    """Lower-case ``x`` and replace each ``string.punctuation`` character with one space.

    Parameters
    ----------
    x : str
        Text to normalise.

    Returns
    -------
    str
        The lower-cased text with punctuation characters turned into spaces.
    """
    return _PUNCTUATION_RE.sub(' ', x.lower())

In [8]:
# Coerce review_body to str so the regex helpers never receive a non-string value.
# Missing bodies are filled with '' first: a plain `.apply(str)` would turn NaN into the
# literal text 'nan', which would then be treated as a real review downstream.
digital_videogames_df['review_body'] = digital_videogames_df['review_body'].fillna('').astype(str)

# Drop tokens that contain digits, then lower-case the text and replace punctuation with spaces.
digital_videogames_df['review_body_clean'] = digital_videogames_df['review_body'].map(alphanumeric).map(punc_lower)

digital_videogames_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,review_body_clean
0,US,21269168,RSH1OZ87OYK92,B013PURRZW,603406193,Madden NFL 16 - Xbox One Digital Code,Digital_Video_Games,2,2,3,N,N,A slight improvement from last year.,I keep buying madden every year hoping they ge...,2015-08-31,i keep buying madden every year hoping they ge...
1,US,133437,R1WFOQ3N9BO65I,B00F4CEHNK,341969535,Xbox Live Gift Card,Digital_Video_Games,5,0,0,N,Y,Five Stars,Awesome,2015-08-31,awesome
2,US,45765011,R3YOOS71KM5M9,B00DNHLFQA,951665344,Command & Conquer The Ultimate Collection [Ins...,Digital_Video_Games,5,0,0,N,Y,Hail to the great Yuri!,If you are prepping for the end of the world t...,2015-08-31,if you are prepping for the end of the world t...
3,US,113118,R3R14UATT3OUFU,B004RMK5QG,395682204,Playstation Plus Subscription,Digital_Video_Games,5,0,0,N,Y,Five Stars,Perfect,2015-08-31,perfect
4,US,22151364,RV2W9SGDNQA2C,B00G9BNLQE,640460561,Saints Row IV - Enter The Dominatrix [Online G...,Digital_Video_Games,5,0,0,N,Y,Five Stars,Awesome!,2015-08-31,awesome


In [9]:
def lang_detect(lang_string):
    """Return the language code that ``langdetect.detect`` guesses for ``lang_string``.

    Parameters
    ----------
    lang_string : str
        Text whose language should be guessed.

    Returns
    -------
    str or float
        Whatever ``detect`` returns, or ``np.nan`` when ``detect`` raises an
        ordinary exception. This lets the function be mapped over a whole
        column without aborting on a single bad row.
    """
    try:
        return detect(lang_string)
    except Exception:
        # `Exception` rather than a bare `except`: KeyboardInterrupt and SystemExit
        # must still propagate so a long-running column map can be interrupted.
        return np.nan

In [10]:
# Add a review_language column by running lang_detect once per row on the raw review_body.
# NOTE(review): labels for one-word reviews look unreliable — the saved output below tags
# "Awesome" as 'af' while "Perfect" is tagged 'en'.
digital_videogames_df['review_language'] = digital_videogames_df['review_body'].map(lang_detect)

In [11]:
# Preview the first rows, now including review_body_clean and review_language.
digital_videogames_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,review_body_clean,review_language
0,US,21269168,RSH1OZ87OYK92,B013PURRZW,603406193,Madden NFL 16 - Xbox One Digital Code,Digital_Video_Games,2,2,3,N,N,A slight improvement from last year.,I keep buying madden every year hoping they ge...,2015-08-31,i keep buying madden every year hoping they ge...,en
1,US,133437,R1WFOQ3N9BO65I,B00F4CEHNK,341969535,Xbox Live Gift Card,Digital_Video_Games,5,0,0,N,Y,Five Stars,Awesome,2015-08-31,awesome,af
2,US,45765011,R3YOOS71KM5M9,B00DNHLFQA,951665344,Command & Conquer The Ultimate Collection [Ins...,Digital_Video_Games,5,0,0,N,Y,Hail to the great Yuri!,If you are prepping for the end of the world t...,2015-08-31,if you are prepping for the end of the world t...,en
3,US,113118,R3R14UATT3OUFU,B004RMK5QG,395682204,Playstation Plus Subscription,Digital_Video_Games,5,0,0,N,Y,Five Stars,Perfect,2015-08-31,perfect,en
4,US,22151364,RV2W9SGDNQA2C,B00G9BNLQE,640460561,Saints Row IV - Enter The Dominatrix [Online G...,Digital_Video_Games,5,0,0,N,Y,Five Stars,Awesome!,2015-08-31,awesome,af


In [14]:
# Remove the review_language column again.
# Reassigning (instead of inplace=True) and passing errors='ignore' makes the cell safe to
# re-run: the original raised KeyError on a second execution because the column was already gone.
digital_videogames_df = digital_videogames_df.drop(columns='review_language', errors='ignore')

In [12]:
# Persist the frame to a pickle file in the working directory.
# NOTE(review): the saved execution counts show this cell ran as In [12], before the
# review_language drop cell (In [14]), so the pickle currently on disk may still contain
# that column — re-run the notebook top to bottom to confirm.
digital_videogames_df.to_pickle("digital_videogames_df.pkl")

Ignore Below (Possible Future Work, Saved)

In [None]:
import spacy
# Load the 'en_core_web_sm' pipeline with the parser, ner, lemmatizer and textcat components disabled.
# NOTE(review): a later cell prints token.lemma_; with 'lemmatizer' disabled that attribute is
# presumably empty — confirm against the spaCy version in use.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'lemmatizer', 'textcat'])

In [None]:
# Run the pipeline on the cleaned text of the row labelled 0.
doc = nlp(digital_videogames_df.review_body_clean[0])

In [None]:
# Print, for each token of `doc`, its text, part-of-speech tag, lemma and stop-word flag.
for token in doc:
    print(token.text, token.pos_, token.lemma_, token.is_stop)

In [None]:
def review_lemmatizer(review_string, accepted_pos=['VERB','NOUN','PRON','ADJ']):
    """Keep only the non-stop-word tokens of ``review_string`` whose POS tag is accepted.

    The string is run through the module-level ``nlp`` pipeline; every token whose
    ``pos_`` is listed in ``accepted_pos`` and whose ``is_stop`` flag is not set is
    kept, and the survivors are joined with single spaces.

    NOTE(review): despite the name, ``str(token)`` is emitted, not ``token.lemma_``.

    Parameters
    ----------
    review_string : str
        Text to process.
    accepted_pos : list of str
        POS tags to keep. The default list is only read, never mutated.

    Returns
    -------
    str
        Space-separated string of the kept tokens, in original order.
    """
    kept = []
    for tok in nlp(review_string):
        if tok.pos_ not in accepted_pos:
            continue
        if not tok.is_stop:
            kept.append(str(tok))
    return ' '.join(kept)

In [None]:
# Time a single review_lemmatizer call on the cleaned text of the row labelled 0.
%timeit review_lemmatizer(digital_videogames_df.review_body_clean[0])

In [None]:
from statistics import mean

In [None]:
# Number of entries in the review_body_clean column.
len(digital_videogames_df.review_body_clean)

In [None]:
# Mean length, in characters, of the raw review_body strings.
# Uses the vectorised pandas reduction rather than copying the column into a Python list
# for statistics.mean; the result is a float.
avge_review_length = digital_videogames_df.review_body.str.len().mean()

In [None]:
# Display the average review length computed above.
avge_review_length

In [None]:
# Per-character cost: the constant 10 divided by the character count of the row labelled 0.
# NOTE(review): 10 is presumably the per-call time read off the %timeit cell (unit not recorded) — confirm.
avge_character_time = 10/len(digital_videogames_df.review_body[0])

In [None]:
# Display the per-character estimate.
avge_character_time

In [None]:
# Estimated processing cost of one average-length review (average length x per-character cost).
avge_pos_time = avge_review_length * avge_character_time

In [None]:
# Display the per-review estimate.
avge_pos_time

In [None]:
# Scale the per-review estimate to every row of the frame, then divide by 10, 60 and 60.
# The row count is taken from the frame instead of the hard-coded 144724 so the estimate
# tracks the data actually loaded.
# NOTE(review): the /10 presumably converts the timing unit and /60/60 converts seconds to hours — confirm.
(((avge_pos_time * len(digital_videogames_df)) / 10) / 60) / 60

In [None]:
# Character count of the raw review text in the row labelled 0.
len(digital_videogames_df.review_body[0])

In [None]:
# Apply review_lemmatizer to every row and store the result in a new review_body_lemmatized column.
# NOTE(review): this runs on the raw review_body, while the %timeit cell used review_body_clean —
# confirm which input is intended.
digital_videogames_df['review_body_lemmatized'] = digital_videogames_df.review_body.apply(review_lemmatizer)

In [None]:
# Display the raw review_body column.
digital_videogames_df.review_body

In [None]:
# Preview the first rows of the frame.
digital_videogames_df.head()

In [None]:
# The ten most frequent product titles and their review counts.
digital_videogames_df.product_title.value_counts().head(10)