### Processing pipeline for 'meta_Digital_Music.json.gz'

In [1]:
# Where the compressed metadata dump lives (directory prefix + file name)
datapath = 'DATA/'
filename = 'meta_Digital_Music.json.gz'

# Columns kept from the raw metadata
features = [
    'asin',
    'title',
    'description',
]

In [2]:
# Standard library
import ast    # safe parsing of Python-literal records
import gzip   # compressed metadata dump
import json
import time   # sleep between API calls / timestamps

# Third-party
import numpy as np
import pandas as pd

# Progress display
from IPython.display import clear_output

In [9]:
def gz_to_dataframe(datapath, filename):
    """Load a gzip-compressed, one-record-per-line dump into a DataFrame.

    Every non-blank line must hold one Python dict literal. Lines are parsed
    with ``ast.literal_eval``, which only accepts literals; the previous
    ``eval`` would have executed any expression found in the data file.

    Parameters
    ----------
    datapath : str
        Directory prefix. It is concatenated as-is with ``filename``, so it
        must end with a path separator.
    filename : str
        Name of the ``.gz`` file inside ``datapath``.

    Returns
    -------
    pandas.DataFrame
        One row per record, indexed 0..n-1 in file order; columns are the
        union of the keys found in the records.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a valid Python literal.
    """
    def parse(path):
        # Text mode + context manager: the handle is closed even when a line
        # fails to parse or the consumer stops early.
        with gzip.open(path, 'rt', encoding='utf-8') as lines:
            for line in lines:
                line = line.strip()
                if line:  # tolerate blank lines (e.g. a trailing newline)
                    yield ast.literal_eval(line)

    # Key each record by its position so the frame gets a 0..n-1 index.
    records = dict(enumerate(parse(datapath + filename)))
    return pd.DataFrame.from_dict(records, orient='index')
    
# Load the raw metadata and keep only the columns of interest.
# .copy() makes product_df an independent frame: columns are added to it by
# the lookup loop later in this notebook, and adding columns to a plain
# column selection triggers pandas' SettingWithCopyWarning.
product_df = gz_to_dataframe(datapath, filename)[features].copy()

In [10]:
# Preview the first rows of the selected columns
product_df.head()

Unnamed: 0,asin,title,description
0,5555991584,Memory of Trees,
1,6308051551,Don't Drink His Blood,NEW Combo BLUWAVS CD and FLAC FILE
2,7901622466,On Fire,
3,B0000000ZW,Changing Faces,
4,B00000016W,Pet Sounds,


In [13]:
# Look up a single product (the asin of row 0 in the preview above) through the API.
# get_prod is defined in the "API query helpers" cell further down; that cell
# has to be run before this one.
# https://www.amazon.com/Memory-Trees-Enya/dp/B00122OCJ0/ref=sr_1_1?ie=UTF8&qid=1511729210&sr=8-1&keywords=5555991584
p = get_prod('5555991584')

In [18]:
##
## Open question: which attribute of the product holds the artist?
## `artist` is not it - the access below raises AttributeError.
## NOTE(review): inspect dir(p) / the client library docs to find the
## intended field, then replace this probe.
##

p.artist

AttributeError: 'AmazonProduct' object has no attribute 'artist'

### 1. Fetch artist names with the Amazon API

In [11]:
## Identify with amazon api servers
##

from amazon.api import AmazonAPI

def get_api_keys(path="api_creds"):
    """Read the Amazon API credentials from a plain-text file.

    The first three lines of the file are returned in order; the caller
    unpacks them as access key, secret key and associate tag.

    Parameters
    ----------
    path : str, optional
        Location of the credentials file. Defaults to ``"api_creds"`` in the
        current working directory.

    Returns
    -------
    tuple of (str, str, str)
        The first three lines, with surrounding whitespace removed.

    Raises
    ------
    ValueError
        If the file holds fewer than three lines.
    """
    # Context manager so the handle is closed as soon as the file is read.
    with open(path) as creds_file:
        lines = [line.strip() for line in creds_file.read().splitlines()]
    if len(lines) < 3:
        raise ValueError(
            "credentials file %r must contain three lines "
            "(access key, secret key, associate tag)" % (path,)
        )
    return lines[0], lines[1], lines[2]

# Unpack the three values returned by get_api_keys(), in file order
AMAZON_ACCESS_KEY, AMAZON_SECRET_KEY, AMAZON_ASSOC_TAG = get_api_keys()

# API client object; get_prod calls amazon.lookup on it
amazon = AmazonAPI(AMAZON_ACCESS_KEY, AMAZON_SECRET_KEY, AMAZON_ASSOC_TAG)

In [12]:
## API query helpers
##


def get_prod(asin):
    """Look up one or several products through the Amazon API client.

    Parameters
    ----------
    asin : str or sequence of str
        A single ASIN ('one by one' lookup) or a sequence of ASINs
        ('bulk' lookup). A bulk lookup sends all ASINs in one request,
        which performs better than one request per ASIN.

    Returns
    -------
    object
        Whatever ``amazon.lookup`` returns for the given ``ItemId``.

    Raises
    ------
    ValueError
        If ``asin`` is an empty sequence.
    """
    if isinstance(asin, str):
        return amazon.lookup(ItemId=asin)

    asins = list(asin)
    if not asins:
        raise ValueError("asin must contain at least one ASIN")
    # join() emits every ASIN exactly once; the previous hand-rolled loop
    # started from asin[0] and then appended every element again, so the
    # first ASIN was sent twice.
    return amazon.lookup(ItemId=','.join(asins))
    
    
def gen_bulk_index(start, end, bulksize=10, includeEnd=False):
    """Split the interval [start, end) into consecutive bulks of indices.

    Parameters
    ----------
    start : int
        First index of the interval.
    end : int
        End of the interval; excluded unless ``includeEnd`` is true.
    bulksize : int, optional
        Maximum number of indices per bulk. The last bulk is shorter when
        the interval length is not a multiple of ``bulksize``.
    includeEnd : bool, optional
        When true, ``end`` itself is part of the interval.

    Returns
    -------
    list of list of int
        Non-overlapping bulks covering every index of the interval exactly
        once, in increasing order. Empty when the interval is empty.

    Raises
    ------
    ValueError
        If ``bulksize`` is smaller than 1.
    """
    if bulksize < 1:
        raise ValueError("bulksize must be a positive integer")
    stop = end + 1 if includeEnd else end
    # Each bulk starts right after the previous one ends and is clipped at
    # `stop`, so no index is repeated and no empty trailing bulk is produced.
    return [
        list(range(first, min(first + bulksize, stop)))
        for first in range(start, stop, bulksize)
    ]


def get_directors(prod):
    """Return the ``directors`` attribute of a product object."""
    directors = prod.directors
    return directors

def get_actors(prod):
    """Return the ``actors`` attribute of a product object."""
    actors = prod.actors
    return actors

#### 1.a API bulk querying

In [7]:
## Parameters for bulk item lookup, values should be retained between runs
##

# Number of row indices per bulk; passed as bulksize to gen_bulk_index
# by the lookup loop
bulksize = 10

# Used to restart from where we were in case of an error: the lookup loop
# starts at this row and updates it after each bulk.
# Re-running this cell resets it to 0.
lastItemLookedUp = 0

In [8]:
## Print the starting row and the wall-clock time in this cell, because the
## lookup loop in the next cell clears its own output on progress updates.

print("last item looked up : ", lastItemLookedUp,  "  -  time : ",time.strftime("%H:%M:%S"), "\n\n")

last item looked up :  0   -  time :  21:42:13 




In [None]:
# Remember where this run started so the progress display fires every 100 rows
original_lastItemLookedUp = lastItemLookedUp
n_rows = product_df.shape[0]

# Create the result columns up front with object dtype so that a whole list
# can be stored in a single cell with .at (DataFrame.set_value no longer exists).
# NOTE(review): the section is about artists, but the fields fetched here are
# actors / directors - confirm these are the intended fields for this dataset.
for col in ("actors", "directors"):
    if col not in product_df.columns:
        product_df[col] = None

for bulk in gen_bulk_index(lastItemLookedUp, n_rows, bulksize=bulksize):
    # Keep only row labels that actually exist; skip bulks left empty
    rows = [row for row in bulk if row in product_df.index]
    if not rows:
        continue

    # display progress
    if (rows[0] - original_lastItemLookedUp) % 100 == 0:
        clear_output()
        print("    ",int(100 * (rows[0]+1) / n_rows), "% completed (",rows[0], " rows)", "  -  time : ",time.strftime("%H:%M:%S"))
        print("     Last Item Looked up : ", lastItemLookedUp, " / ", n_rows)
        print("\n\n\n")

    # get asins for the bulk
    asins = product_df.loc[rows, 'asin'].tolist()
    # fetch products using the bulk lookup
    prods = get_prod(asins)
    # NOTE(review): the client appears to return a bare product instead of a
    # list when only one item matches - confirm against the library docs.
    if not isinstance(prods, (list, tuple)):
        prods = [prods]

    # Match products to rows by ASIN rather than by position: if the API
    # returns fewer items than requested, positions no longer line up.
    # Assumes product objects expose their ASIN as `.asin` - TODO confirm.
    prods_by_asin = {prod.asin: prod for prod in prods}
    for row, asin in zip(rows, asins):
        prod = prods_by_asin.get(asin)
        if prod is None:
            # nothing returned for this ASIN; leave the row untouched
            continue
        product_df.at[row, "actors"] = get_actors(prod)
        product_df.at[row, "directors"] = get_directors(prod)

    # Save progress (last row of this bulk, which may be shorter than bulksize)
    lastItemLookedUp = rows[-1]

    # limit query frequency to avoid getting 503'd
    time.sleep(min(bulksize/25, 10))