### Processing pipeline for Amazon product metadata files (currently configured for 'meta_Books.json.gz')

<font color='royalblue' size=3>
<b>

Parameters for the notebook

In [4]:

# Folder containing the dataset files; the trailing slash is required because
# file paths are built by plain string concatenation.
datapath = "../DATA/"
# Base name (no extension) of the metadata file processed by this notebook.
df_name = 'meta_Books'


###
###  Change here which fields will be extracted from the amazon product
###
def set_df_cell_from_product(amazonProduct, rowNumber, fieldName1, fieldName2) :
    """Copy the fields of one looked-up product into ``amazon_products_df``.

    The module-level ``amazon_products_df`` is modified in place.

    Parameters
    ----------
    amazonProduct : object or None
        Product object exposing the attributes ``actors``, ``directors``,
        ``creators``, ``authors`` and ``model``; None when no product is
        available for the row.
    rowNumber :
        Index label of the dataframe row to fill.
    fieldName1, fieldName2 : str
        Columns that receive an empty list when ``amazonProduct`` is None.
        They are ignored when a product is given.
    """
    # `.at` replaces DataFrame.set_value, which no longer exists in
    # pandas >= 1.0. The target columns must have dtype object to hold lists.
    if amazonProduct is None :
        amazon_products_df.at[rowNumber, fieldName1] = []
        amazon_products_df.at[rowNumber, fieldName2] = []
    else :
        # NOTE: there is definitely no "artists" field on the product.
        for field in ("actors", "directors", "creators", "authors", "model") :
            amazon_products_df.at[rowNumber, field] = getattr(amazonProduct, field)

# Number of rows between two automatic saves of the dataframe to disk
data_save_freq = 10000
# Consecutive network errors tolerated before the query loop gives up
network_errors_limit = 7
# Columns kept from the raw metadata file
features = [
    'asin',
    'categories',
    'description',
    'title',
    'salesRank',
]
# Columns filled in from the Amazon API results
columns_to_add = [
    'actors',
    'directors',
    'creators',
    'authors',
    'model',
]
# File name of the gzipped metadata file to load
json_name = '{}.json.gz'.format(df_name)

In [None]:
#essential imports
import pandas as pd
import numpy as np
import json

# Sleep
import time

# Strict JSON conversion
import json 
import gzip 

# Progress display
from IPython.display import clear_output

# Amazon API querying
from amazon.api import AmazonAPI
from amazon.api import AsinNotFound

from urllib.request import HTTPError
from socket import gaierror
from urllib.request import URLError


<font color='royalblue' size=3>
<b>

Open metadata file

In [None]:
## Load all ASINs we're going to query - use the metadata files
## for this, as they contain each ASIN once and only once.
##

def gz_to_dataframe(datapath, filename):
    """Load a gzipped file holding one record per line into a dataframe.

    Adapted from the loader provided on the amazon dataset's webpage.

    Parameters
    ----------
    datapath : str
        Folder of the file, including the trailing separator (the path is
        built as ``datapath + filename``).
    filename : str
        Name of the gzipped file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, indexed 0..n-1 in file order.
    """
    def parse(path):
        # `with` closes the archive even if a line fails to parse
        with gzip.open(path, 'rb') as g:
            for l in g:
                # SECURITY(review): eval() executes whatever the line contains.
                # Only use this loader on trusted dataset files.
                yield eval(l)

    records = dict(enumerate(parse(datapath + filename)))
    return pd.DataFrame.from_dict(records, orient='index')
    
# Parse the whole metadata archive into memory (one row per line of the file)
amazon_products_df = gz_to_dataframe(datapath, json_name)
# Preview the first rows to check that the load worked
amazon_products_df.head(3)

In [None]:
# Keep only the columns of interest. `.copy()` makes the result an independent
# dataframe, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
amazon_products_df = amazon_products_df[features].copy()
amazon_products_df.head(3)

<font color='royalblue' size=3>
<b>

Prepare for amazon api usage

In [None]:
## Sign in with amazon API 
##

def get_amazon_interface(creds_path="api_creds"):
    """Build the Amazon API client from a credentials file.

    Parameters
    ----------
    creds_path : str, optional
        Path of the credentials file. Its first three lines are passed, in
        order, as the three positional arguments of ``AmazonAPI``
        (presumably access key, secret key and associate tag - confirm against
        the API library in use). Defaults to ``"api_creds"`` in the working
        directory.

    Returns
    -------
    AmazonAPI
        The client object.

    Raises
    ------
    ValueError
        If the file holds fewer than three lines.
    """
    # `with` closes the file; strip() drops stray spaces and '\r' on each line
    with open(creds_path) as f:
        ar = [line.strip() for line in f.read().splitlines()]
    if len(ar) < 3:
        raise ValueError("credentials file '%s' must contain at least 3 lines" % creds_path)
    return AmazonAPI(ar[0], ar[1], ar[2])

# Module-level API client; the lookup helper get_prod() reads it as a global
amazon = get_amazon_interface()

In [None]:
## Here, we define some API query helpers
##

def get_prod(asin) :
    """Look up one or several products with the API.

    Uses the module-level ``amazon`` client.

    Parameters
    ----------
    asin : str or sequence of str
        A single ASIN ('one by one' lookup) or several ASINs ('bulk lookup').
        Bulk lookup provides better performance.

    Returns
    -------
    Whatever ``amazon.lookup`` returns for the request.

    Raises
    ------
    ValueError
        If an empty sequence of ASINs is given.
    """
    if isinstance(asin, str) :
        return amazon.lookup(ItemId=asin)
    if len(asin) == 0 :
        raise ValueError("get_prod needs at least one ASIN")
    # Each ASIN appears exactly once in the comma-separated request
    # (the previous hand-rolled accumulation sent the first ASIN twice).
    return amazon.lookup(ItemId=','.join(asin))
    
def gen_bulk_index(start, end, bulksize=10, includeEnd=False):
    """Split the interval [start-end] into consecutive bulks of indices.

    Parameters
    ----------
    start : int
        First index of the interval.
    end : int
        Upper bound of the interval; part of the result only when
        ``includeEnd`` is True.
    bulksize : int, optional
        Maximum number of indices per bulk (default 10).
    includeEnd : bool, optional
        Whether ``end`` itself belongs to the interval (default False).

    Returns
    -------
    list of list of int
        Bulks covering every index exactly once, in increasing order. All
        bulks hold ``bulksize`` indices except possibly the last one, which
        holds the remainder. Empty list when the interval is empty.

    Raises
    ------
    ValueError
        If ``bulksize`` is smaller than 1.
    """
    if bulksize < 1 :
        raise ValueError("bulksize must be a positive integer")
    # Exclusive upper bound of the indices to produce
    stop = end + 1 if includeEnd else end
    return [list(range(first, min(first + bulksize, stop)))
            for first in range(start, stop, bulksize)]


In [None]:
## Also, we have some functions to save the state of our data structure - in case we need
## to shut down the computer and restart the query loop at a later time
## (restoring is only used when `fresh_run` is set to False)
##

def save_progress(dataframe, nb_rows_processed):
    """Write the dataframe and the progress counter to disk.

    Two files are written under ``datapath``, both named after ``df_name``:
    ``<df_name>_temp.csv`` (the dataframe, without its index) and
    ``<df_name>_progress`` (the counter as text).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to save.
    nb_rows_processed : int
        Progress counter stored next to the data.
    """
    dataframe.to_csv(datapath+df_name+"_temp.csv", index=False)
    # `with` closes the file, so the counter is flushed to disk on return
    with open(datapath+df_name+"_progress", "w") as file:
        file.write(str(nb_rows_processed))

    
def load_progress():
    """Read back the files written by ``save_progress``.

    Returns
    -------
    (pandas.DataFrame, int)
        The dataframe read from ``<df_name>_temp.csv`` and the progress
        counter read from the first line of ``<df_name>_progress``, both
        located under ``datapath``.

    Notes
    -----
    The dataframe goes through a CSV round trip, so cells that held Python
    lists when saved come back as their string representation.
    """
    dataframe = pd.read_csv(datapath+df_name+"_temp.csv")
    # `with` closes the progress file once the counter has been read
    with open(datapath+df_name+"_progress", "r") as file:
        nb_rows_processed = file.readline()
    return dataframe, int(nb_rows_processed)


<font color='royalblue' size=3>
<b>

Bulk lookups parameters and loop

In [None]:
## Parameters & initialization for bulk item lookup
##

# Number of ASINs sent to the API per bulk lookup
bulksize = 10

# Set to False to resume from the files written by save_progress()
fresh_run = True

if not fresh_run :
    # Resume: reload both the dataframe and the last processed row
    amazon_products_df, lastItemLookedUp = load_progress()
else :
    # Fresh start: begin at row 0 and create the (empty) columns to fill.
    # lastItemLookedUp is also used to restart after an unexpected network error.
    lastItemLookedUp = 0
    for col in columns_to_add :
        amazon_products_df[col] = pd.Series(dtype=object)

print("last item looked up : ", lastItemLookedUp,  "  -  time : ",time.strftime("%H:%M:%S"), "\n\n")

In [None]:
## Querying loop
##

# Row at which the current pass over the dataframe starts; also the reference
# point for the progress display and for the periodic saves
ref_for_progress = lastItemLookedUp
lastErrorMet = lastItemLookedUp
loop_complete = False
caught_httperrors = 0
caught_gaierrors = 0
# Network errors since the last successful query
errors_counter = 0


# Each iteration of the while loop is one pass over the remaining rows; a
# network error aborts the pass and the next one restarts at lastItemLookedUp.
while not loop_complete :
    try :

        for bulk in gen_bulk_index(ref_for_progress, amazon_products_df.shape[0], bulksize=bulksize) :
            # Nothing to look up in an empty bulk
            if not bulk :
                continue

            # update progress every 100 items
            if ((bulk[0]-(ref_for_progress)) % 100 == 0) :
                clear_output()
                print("    ",int(100 * (bulk[0]+1) / amazon_products_df.shape[0]), "% completed (",bulk[0], " rows)", "  -  time : ",time.strftime("%H:%M:%S"))
                print("         Last Item Looked up : ", lastItemLookedUp, " / ", amazon_products_df.shape[0])
                print("         HTTPErrors caught : ", caught_httperrors)
                print("         gaierrors caught : ", caught_gaierrors)
                print("\n\n\n")

            # get asins for the bulk and fetch the matching AmazonProducts
            asins = amazon_products_df['asin'][bulk].tolist()
            noAsinFound = False
            try :
                prods = get_prod(asins)
            except AsinNotFound :
                noAsinFound = True

            # the query did not raise a network error: reset the error counter
            errors_counter = 0

            # Then, process each product to add necessary informations in the dataframe
            if noAsinFound :
                # Skip this bulk
                print("No Asin Found for bulk : ", bulk)
            elif isinstance(prods, list) and (len(prods) == len(bulk)) :
                # Case : we found exactly one result per ASIN
                #        process by bulk
                # NOTE(review): assumes the API returns products in request order - confirm
                for i, prod in enumerate(prods) :
                    set_df_cell_from_product(prod, bulk[i], "actors", "directors")
            else :
                # Case : we obtained a shorter list of AmazonProducts or a single AmazonProduct
                #        fallback to 1-by-1 querying
                # (a plain `else` avoids referencing the product class, whose
                #  previous spelling `AmazonApi.AmazonProduct` was an undefined name)
                for n in bulk :
                    asin = amazon_products_df['asin'][n]
                    try :
                        prod = get_prod(asin)
                    except AsinNotFound :
                        prod = None
                    set_df_cell_from_product(prod, n, "actors", "directors")
                    time.sleep(0.5)

            # Save progress; bulk[-1] stays valid when the last bulk is shorter than bulksize
            lastItemLookedUp = bulk[-1]

            # Save data to file according to specified frequency
            if ((bulk[0]-(ref_for_progress)) % data_save_freq == 0) :
                save_progress(amazon_products_df, lastItemLookedUp)

            # limit query frequency to avoid 503 errors
            time.sleep(min(bulksize/25, 5))

        loop_complete = True


    # HTTPError must be handled before URLError (it is a subclass of it)
    except HTTPError :
        errors_counter += 1
        caught_httperrors += 1
        # If we didn't make any progress, something must be wrong
        if errors_counter > network_errors_limit :
            print("HTTPError caught too many times  -  breaking")
            break
        # else retry
        print("\n\nhttpError\n\n")
        ref_for_progress = lastItemLookedUp
        lastErrorMet = lastItemLookedUp

    except (gaierror, URLError) :
        errors_counter += 1
        caught_gaierrors += 1
        if errors_counter > network_errors_limit :
            print("gaierror/urlerror caught too many times  -  breaking")
            break
        # else retry
        print("\n\ngaierror/urlerror\n\n")
        ref_for_progress = lastItemLookedUp
        lastErrorMet = lastItemLookedUp


if loop_complete :
    clear_output()
    # save results
    amazon_products_df.to_csv(datapath+df_name+".csv")
    print("amazon query loop completed !")
    

In [None]:
# Display the dataframe as it stands after the query loop
amazon_products_df

In [None]:
## Run this cell to manually save results
## (writes the temporary CSV and the progress counter through save_progress,
##  so the query loop can be resumed later)

save_progress(amazon_products_df, lastItemLookedUp)