# Section 1: imports


In [1]:
import pandas as pd

from tqdm import tqdm
import json

# Section 2: Raw Data Ingestion

## Goals:

The goal of this section is to parse the .json files containing the reviews and the product metadata.

In [3]:
# path to review data (one JSON object per line)
path_to_reviews = r'./Musical_Instruments.json'

# NOTE(review): not referenced by any cell visible in this notebook
amount_of_reviews = 1_000_000

# read all the reviews
# - explicit UTF-8 so parsing does not depend on the platform's default encoding
# - the file is iterated lazily; readlines() would hold a second full copy of
#   the raw text in memory next to the parsed reviews
# - blank lines (e.g. a trailing newline) are skipped because json.loads rejects them
all_parsed_data = []
with open(path_to_reviews, 'r', encoding='utf-8') as review_file:
    for review in tqdm(review_file):
        if review.strip():
            all_parsed_data.append(json.loads(review))

# output 1 review to see structure
print(all_parsed_data[1])

# all the item numbers we are trying to find in the meta
# using this should help us read the metafile quicker
items_to_find_in_meta = list({review['asin'] for review in all_parsed_data})


100%|██████████| 1512530/1512530 [00:08<00:00, 188550.20it/s]


{'overall': 4.0, 'vote': '2', 'verified': True, 'reviewTime': '04 6, 2017', 'reviewerID': 'A29OWR79AM796H', 'asin': '0470536454', 'style': {'Format:': ' Hardcover'}, 'reviewerName': 'Amazon Customer', 'reviewText': 'Very helpful...', 'summary': 'Four Stars', 'unixReviewTime': 1491436800}


In [4]:
# function to normalize the description of a product-meta entry
def parse_item_meta(item):
    """Normalize ``item['description']`` in place and return ``item``.

    - missing key      -> ''
    - list value       -> its first element, or '' when the list is empty
    - any other value  -> left as it is
    """
    description = item.get('description', '')
    if type(description) is list:
        # keep only the first description entry
        description = description[0] if description else ''
    item['description'] = description
    return item


In [5]:
import ast

# path to meta file
path_to_meta = r'./meta_Musical_Instruments.json'

# read all the metas for each product
# - each line is parsed with ast.literal_eval (not json.loads), so every line
#   is expected to be a Python literal
# - explicit UTF-8 so parsing does not depend on the platform's default encoding
# - the file is iterated lazily instead of materializing it with readlines()
# - blank lines are skipped because literal_eval cannot parse them
all_parsed_meta_data = []
with open(path_to_meta, 'r', encoding='utf-8') as meta_file:
    for meta in tqdm(meta_file):
        if not meta.strip():
            continue
        meta = ast.literal_eval(meta)
        all_parsed_meta_data.append(parse_item_meta(meta))


# turns the list of meta into a dataframe
meta_data = pd.DataFrame(all_parsed_meta_data)

# prints the first few samples to see structure
meta_data.head()

100%|██████████| 120310/120310 [00:25<00:00, 4757.52it/s]


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Musical Instruments, Drums & Percussion, Hand...",,Cricket Rubbing the spine with the wooden stic...,,Wooden Percussion 2 Piece Set of 3 Inch Cricke...,"[B00NP8GYVS, B00NP80XMO, B00NP8M098]",,WADSUWAN SHOP,"[Wood percussion, Owl whistle*, Includes woode...","[>#141,729 in Musical Instruments (See Top 100...",[],Musical Instruments,,"December 2, 2013",,989983,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
1,"[Musical Instruments, Drums & Percussion, Hand...",,Frog - Rubbing its spine with the wooden stick...,,"Wooden Percussion 3 Piece Set Frog, Cricket an...","[B00NP8GYVS, B00NP80XMO, B01MY48HK5, B00AZZ1AJ...",,WADSUWAN SHOP,"[Wood percussion, Small 3 inches, Creates orig...","[>#1,622 in Musical Instruments (See Top 100 i...",[],Musical Instruments,,"December 2, 2013",$0.91,98906,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
2,"[Musical Instruments, Instrument Accessories, ...",,Vivaldi's famous set of four violin concertos ...,,Hal Leonard Vivaldi Four Seasons for Piano (Or...,[],,Hal Leonard,"[., ., .]","[>#330,653 in Musical Instruments (See Top 100...",[],Musical Instruments,,"May 10, 2011",$62.93,41291905,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
3,[],,"The Turn of the Screw (op. 54) vocal score, pu...",,The Turn of the Screw (vocal score),"[0486266842, 0793507669, 0393008789, 142341280...",,Boosey &amp; Hawkes,[],"[>#86,354 in Musical Instruments (See Top 100 ...",[],Musical Instruments,,"May 23, 2007",$107.79,60015500,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
4,[],,,,Suite for Organ (including the Trumpet Volunta...,[],,,[],"[>#482,025 in Musical Instruments (See Top 100...",[],Musical Instruments,,"February 8, 2013",,193757710,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,


# Section 3: Data prep

## Goals:

The goal of this section is to take the raw data from above and put it into a dataframe we can work with. This dataframe will include the following features:

1) Product Name

        Name of the product

2) Product Price

        Price of the product. Set to -1.0 if not known.

3) Product Description

        Description of the product.

4) Review Score

        Review score from 1 to 5.

5) Review Title

        Review summary; The title of the review.

6) Review Text

        The raw review text.


In [6]:
def modify_price( price_entry ):
    """Convert a raw price entry into a float; -1.0 marks an unknown price.

    Parameters
    ----------
    price_entry : str, int, float, list or anything else
        A price such as ``'$62.93'``, a plain number, or a list whose first
        element is one of those.

    Returns
    -------
    float
        The parsed price, or ``-1.0`` when the entry is empty, NaN, not
        parseable as a single number, or of an unsupported type.
    """
    raw_price = price_entry

    # a list entry is represented by its first element
    if isinstance(raw_price, list):
        if not raw_price:
            return -1.0
        raw_price = raw_price[0]

    if isinstance(raw_price, str):
        try:
            # strip the currency sign and thousands separators ('$1,234.50')
            raw_price = float(raw_price.replace('$', '').replace(',', ''))
        except ValueError:
            # e.g. '' or a price range like '$5.00 - $10.00'
            return -1.0
    elif isinstance(raw_price, bool) or not isinstance(raw_price, (int, float)):
        # bool is an int subclass but never a meaningful price
        return -1.0

    numeric_price = float(raw_price)
    # NaN is the only float that differs from itself; treat it as unknown
    if numeric_price != numeric_price:
        return -1.0
    return numeric_price

# run modify_price over every raw price entry and store the results as a new column
meta_data['parsed_price'] = list(map(modify_price, meta_data['price']))
print(meta_data['parsed_price'])

0          -1.00
1           0.91
2          62.93
3         107.79
4          -1.00
           ...  
120305      7.99
120306      0.91
120307     -1.00
120308     -1.00
120309      7.99
Name: parsed_price, Length: 120310, dtype: float64


In [7]:
def modify_text(product_name_entry):
    """Reduce a raw text entry to a plain string.

    Parameters
    ----------
    product_name_entry : str, list or anything else
        A string, or a list whose first element is the text to keep.

    Returns
    -------
    str
        The string itself (or the first list element when that is a
        string); ``''`` for an empty list and for every non-string value,
        so callers always receive a ``str`` and never ``None``.
    """
    raw_name = product_name_entry

    # a list entry is represented by its first element
    if isinstance(raw_name, list):
        if not raw_name:
            return ''
        raw_name = raw_name[0]

    return raw_name if isinstance(raw_name, str) else ''


# normalize the title and description columns with modify_text
meta_data['parsed_title'] = list(map(modify_text, meta_data['title']))
meta_data['parsed_description'] = list(map(modify_text, meta_data['description']))

# show both new columns
for parsed_column in ('parsed_title', 'parsed_description'):
    print(meta_data[parsed_column])


0         Wooden Percussion 2 Piece Set of 3 Inch Cricke...
1         Wooden Percussion 3 Piece Set Frog, Cricket an...
2         Hal Leonard Vivaldi Four Seasons for Piano (Or...
3                       The Turn of the Screw (vocal score)
4         Suite for Organ (including the Trumpet Volunta...
                                ...                        
120305    10 pcs 3pdt Stomp Footswitch incl PCB incl met...
120306    5 pcs 3pdt Stomp Footswitch incl. PCB, metal w...
120307    Optical Clear Quartz Crystal Singing Bowl Note...
120308    2x Deluxe Small 2&quot; Wood Frog Guiro Rasp -...
120309    Creanoso Guitar Strap Locks and Buttons Black ...
Name: parsed_title, Length: 120310, dtype: object
0         Cricket Rubbing the spine with the wooden stic...
1         Frog - Rubbing its spine with the wooden stick...
2         Vivaldi's famous set of four violin concertos ...
3         The Turn of the Screw (op. 54) vocal score, pu...
4                                                 

In [8]:
# drop duplicate data
# drop_duplicates returns a NEW frame, so its result has to be assigned back;
# calling it without assignment leaves the duplicates in place.
# The index-name guard keeps this cell safe to re-run once asin is the index.
if meta_data.index.name != 'asin':
    # set asin as index to support fast lookup
    meta_data = (
        meta_data
        .drop_duplicates(subset=['asin'], keep='first')
        .set_index('asin')
    )

print(meta_data)

test:  120310
test:  120310
                                                     category tech1  \
asin                                                                  
0000989983  [Musical Instruments, Drums & Percussion, Hand...         
0000098906  [Musical Instruments, Drums & Percussion, Hand...         
0041291905  [Musical Instruments, Instrument Accessories, ...         
0060015500                                                 []         
0193757710                                                 []         
...                                                       ...   ...   
B01HJDOF2Y  [Musical Instruments, Amplifiers & Effects, Gu...         
B01HJDJ1PA  [Musical Instruments, Amplifiers & Effects, Gu...         
B01HJEFFTK  [Musical Instruments, Drums & Percussion, Hand...         
B01HJEHEH6  [Musical Instruments, Drums & Percussion, Hand...         
B01HJETSF2  [Musical Instruments, Instrument Accessories, ...         

                                                

In [9]:
def generate_data_point_from_review( review ):
    """Combine one review with its product metadata into a flat record.

    Parameters
    ----------
    review : dict
        A parsed review. 'summary', 'reviewText' and 'asin' are needed to
        build a record; 'overall' is read as the score.

    Returns
    -------
    dict or None
        A record with the keys product_name, product_price,
        product_description, review_score, review_title and review_text.
        ``None`` when the review has no summary/text or when its asin is
        not present in the ``meta_data`` index.
    """

    # if we have no text for a review, skip this review
    if 'summary' not in review or 'reviewText' not in review:
        return None

    # find the item for the review; a missing or unknown asin raises KeyError
    try:
        review_item = meta_data.loc[str(review['asin'])]
    except KeyError:
        return None

    # when the index holds the same asin more than once, .loc returns a
    # DataFrame; use its first row so every field below is a scalar
    if isinstance(review_item, pd.DataFrame):
        if review_item.empty:
            return None
        review_item = review_item.iloc[0]

    # if there are no items for this review
    if len(review_item) == 0:
        return None

    # return the new data point
    return {
        'product_name': review_item['parsed_title'],
        'product_price': review_item['parsed_price'],
        'product_description': review_item['parsed_description'],
        'review_score': float(review['overall']),
        'review_title': review['summary'],
        'review_text': review['reviewText'],
    }


# build one data point per review; the generator is consumed by the list
# comprehension below, so tqdm still reports progress over every review
print("starting")
candidate_points = (
    generate_data_point_from_review(review)
    for review in tqdm(all_parsed_data)
)

# keep only the truthy results - a None marks a review that was skipped
data_points = [point for point in candidate_points if point]

# turn review information into a dataframe
df = pd.DataFrame(data_points)
df.head()


starting


100%|██████████| 1512530/1512530 [2:10:23<00:00, 193.32it/s]


Unnamed: 0,product_name,product_price,product_description,review_score,review_title,review_text
0,Wiley Publishers Crocheting For Dummies Revised,-1,,5.0,Terrific Book for Learning the Art of Crochet,Crocheting for Dummies by Karen Manthey & Susa...
1,Wiley Publishers Crocheting For Dummies Revised,-1,,4.0,Four Stars,Very helpful...
2,Wiley Publishers Crocheting For Dummies Revised,-1,,5.0,Five Stars,EASY TO UNDERSTAND AND A PROMPT SERVICE TOO
3,Wiley Publishers Crocheting For Dummies Revised,-1,,4.0,Four Stars,My girlfriend use quite often
4,Wiley Publishers Crocheting For Dummies Revised,-1,,5.0,Very happy.,Arrived as described. Very happy.


# Data Output

---

In this section we output all the processed data, as well as a subset of the inputs to feed the model in the next step.

In [12]:
# save processed data for later use
# index=False leaves the DataFrame's row index out of the CSV file
df.to_csv('processed_dataset.csv', index=False)

In [None]:
# Build the text that is fed to the model for a single row
def gen_x(row):
    """Return the three-line model input for ``row``.

    ``row`` must support lookup of 'product_name', 'review_score' and
    'review_text'; the values are formatted into a name / score / review
    text block separated by newlines.
    """
    template = "Product Name: {}\nProduct Score: {}\nProduct Review: {}"
    return template.format(
        row['product_name'],
        row['review_score'],
        row['review_text'],
    )

# apply gen_x to every row (axis=1) and store the resulting text in a new column
df['model_inputs'] = df.apply(gen_x, axis=1)

In [None]:
# save the model inputs for next step
# only the first 500,000 entries of the column are written; index=False
# leaves the row index out of the CSV file
df['model_inputs'][:500_000].to_csv('real_reviews.csv', index=False)