### Processing pipeline for 'meta_Movies_and_TV.json'

In [4]:
datapath = 'DATA/'
filename = 'meta_Movies_and_TV.json.gz'

features = ['asin', 'title', 'description']

In [None]:
# Essential imports
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

# Strict JSON conversion
import json 
import gzip 

# Sleep
import time

# Progress display
from IPython.display import clear_output

In [7]:
def gz_to_dataframe(datapath, filename):
    def parse(path): 
        g = gzip.open(path, 'rb') 
        for l in g: 
            yield eval(l) 
    def getDF(path): 
        i = 0 
        df = {} 
        for d in parse(path): 
            df[i] = d 
            i += 1 
        return pd.DataFrame.from_dict(df, orient='index') 
    return getDF(datapath+filename)
    
product_df = gz_to_dataframe(datapath, filename)

In [17]:
product_df[features].shape

(208321, 3)

In [10]:
deaths=pd.read_csv("DATA/deaths.csv")
deaths.head(10)

Unnamed: 0,Name,Birth Date,Death Date,Description,Actor,Author,Musician
0,Jack Weston,1924,1996-05-03,", american actor",True,False,False
1,John Beradino,1917,1996-05-19,", american baseball player and actor",True,False,False
2,Jon Pertwee,1919,1996-05-20,", british actor",True,False,False
3,Lash LaRue,1917,1996-05-21,", american actor",True,False,False
4,Paul Delph,1957,1996-05-21,", american musician and producer",False,False,True
5,Jacob Druckman,1928,1996-05-24,", american composer",False,False,True
6,Enrique Álvarez Félix,1934,1996-05-24,", mexican actor",True,False,False
7,Joseph Mitchell (writer)|Joseph Mitchell,1908,1996-05-24,", american writer",False,True,False
8,Bradley Nowell,1968,1996-05-25,", american musician",False,False,True
9,Tamara Toumanova,1919,1996-05-29,", russian dancer and actress",True,False,False


### 1. fetch artists names with amazon API

In [1]:
## Identify with amazon api servers
##

from amazon.api import AmazonAPI

def get_api_keys():
    f = open("api_creds")
    ar = f.read().split("\n")
    return ar[0], ar[1], ar[2]

AMAZON_ACCESS_KEY, AMAZON_SECRET_KEY, AMAZON_ASSOC_TAG = get_api_keys()

amazon = AmazonAPI(AMAZON_ACCESS_KEY, AMAZON_SECRET_KEY, AMAZON_ASSOC_TAG)

In [109]:
## API query helpers
##


''' Product lookup with API, asin can be a string ('one by one' lookup)
    or a list of strings ('bulk lookup').
    bulk lookup provides better performance
'''
def get_prod(asin) : 
    if not isinstance(asin, str): 
        acc_str = asin[0]
        for e in asin : 
            acc_str += ','+e
        print(acc_str)
        return amazon.lookup(ItemId=acc_str)
    else :
        return amazon.lookup(ItemId=asin)
    
    
''' Splits the interval [start-end] into bulks of size bulksize
'''    
def gen_bulk_index(start, end, bulksize=10, includeEnd=False):
    size = end - start + 1
    bulks = [list(range(start+(i*bulksize), start + (i+1)*bulksize)) for i in range(0, int(size/bulksize))]
    if includeEnd : 
        bulks.append(list(range(bulks[len(bulks)-1][bulksize-1], end+1)))
    else : 
        bulks.append(list(range(bulks[len(bulks)-1][bulksize-1], end)))
    return bulks    


def get_directors(prod) : 
    return prod.directors

def get_actors(prod) : 
    return prod.actors

In [126]:
## API "one-by-one" querying
##

product_df['actors'] = pd.Series(dtype=object)
product_df['directors'] = pd.Series(dtype=object)

for i, prod_row in enumerate(product_df.iterrows()) : 
    if i%100 == 0 : print(int(100 * (i+1) / product_df.shape[0]), "% completed (",i, " rows)", "  -  time : ",time.strftime("%H:%M:%S"))
    prod = get_prod(prod_row[1]['asin'])
    product_df.set_value(i, 'actors', get_actors(prod))
    product_df.set_value(i, 'directors', get_directors(prod))
    time.sleep(1)
    
    if(i>=300):
        break
    

0 % completed ( 0  rows)   -  time :  20:18:17




0 % completed ( 100  rows)   -  time :  20:21:00


KeyboardInterrupt: 

### API "bulk" querying

In [127]:
## Parameters for bulk item lookup, should be kept between runs
##

bulksize = 10

# used to restart from where we were in case of an error
lastItemLookedUp = 0

In [147]:
print("last item looked up : ", lastItemLookedUp,  "  -  time : ",time.strftime("%H:%M:%S"), "\n\n")

last item looked up :  17835   -  time :  21:16:07 




In [None]:
original_lastItemLookedUp = lastItemLookedUp


for bulk in gen_bulk_index(lastItemLookedUp, product_df.shape[0], bulksize=bulksize) : 
    # display progess
    if ((bulk[0]-(original_lastItemLookedUp)) % 100 == 0) : 
        clear_output()
        print("    ",int(100 * (bulk[0]+1) / product_df.shape[0]), "% completed (",bulk[0], " rows)", "  -  time : ",time.strftime("%H:%M:%S"))
        print("     Last Item Looked up : ", lastItemLookedUp, " / ", product_df.shape[0])
        print("\n\n\n")
    
    # get asins for the bulk
    asins = product_df['asin'][bulk].tolist()
    # fetch products using the bulk_lookup
    prods = get_prod(asins)

    # Then, process each product to add necessary informations in the dataframe
    for i, prod in enumerate(prods) : 
        actors = get_actors(prod)
        product_df.set_value(bulk[i], "actors", actors)
        directors = get_directors(prod)
        product_df.set_value(bulk[i], "directors", directors)
        
    # Save progress
    lastItemLookedUp = bulk[bulksize-1]
        
    # limit query frequency to avoid getting 503'd
    time.sleep(min(bulksize/50, 10))

     15 % completed ( 32771  rows)   -  time :  21:48:15
     Last Item Looked up :  32770  /  208322




B0000065RB,B0000065RB,B0000066X5,B0000066QT,B0000066S1,B0000067AO,B0000066HK,B00000671U,B0000065MC,B00000663A,B0000066IH


In [None]:
product_df.head()