### Processing pipeline for 'meta_Movies_and_TV.json'

In [1]:
datapath = 'DATA/'
filename = 'meta_Movies_and_TV.json.gz'

features = ['asin', 'title', 'description']

In [2]:
# Essential imports
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

# Strict JSON conversion
import json 
import gzip 

# Sleep
import time

# Progress display
from IPython.display import clear_output

In [3]:
def gz_to_dataframe(datapath, filename):
    def parse(path): 
        g = gzip.open(path, 'rb') 
        for l in g: 
            yield eval(l) 
    def getDF(path): 
        i = 0 
        df = {} 
        for d in parse(path): 
            df[i] = d 
            i += 1 
        return pd.DataFrame.from_dict(df, orient='index') 
    return getDF(datapath+filename)
    
product_df = gz_to_dataframe(datapath, filename)[features]

In [38]:
product_df.shape

(208321, 11)

In [5]:
deaths=pd.read_csv("DATA/deaths.csv")
deaths.head(10)

Unnamed: 0,Name,Birth Date,Death Date,Description,Actor,Author,Musician
0,Jack Weston,1924,1996-05-03,", american actor",True,False,False
1,John Beradino,1917,1996-05-19,", american baseball player and actor",True,False,False
2,Jon Pertwee,1919,1996-05-20,", british actor",True,False,False
3,Lash LaRue,1917,1996-05-21,", american actor",True,False,False
4,Paul Delph,1957,1996-05-21,", american musician and producer",False,False,True
5,Jacob Druckman,1928,1996-05-24,", american composer",False,False,True
6,Enrique Álvarez Félix,1934,1996-05-24,", mexican actor",True,False,False
7,Joseph Mitchell (writer)|Joseph Mitchell,1908,1996-05-24,", american writer",False,True,False
8,Bradley Nowell,1968,1996-05-25,", american musician",False,False,True
9,Tamara Toumanova,1919,1996-05-29,", russian dancer and actress",True,False,False


### 1. fetch artists names with amazon API

In [6]:
## Identify with amazon api servers
##

from amazon.api import AmazonAPI
from amazon.api import AsinNotFound

def get_api_keys():
    f = open("api_creds")
    ar = f.read().split("\n")
    return ar[0], ar[1], ar[2]

AMAZON_ACCESS_KEY, AMAZON_SECRET_KEY, AMAZON_ASSOC_TAG = get_api_keys()

amazon = AmazonAPI(AMAZON_ACCESS_KEY, AMAZON_SECRET_KEY, AMAZON_ASSOC_TAG)

In [35]:
## API query helpers
##


''' Product lookup with API, asin can be a string ('one by one' lookup)
    or a list of strings ('bulk lookup').
    bulk lookup provides better performance
'''
def get_prod(asin) : 
    if not isinstance(asin, str): 
        acc_str = asin[0]
        for e in asin : 
            acc_str += ','+e
        print(acc_str)
        return amazon.lookup(ItemId=acc_str)
    else :
        return amazon.lookup(ItemId=asin)
    
    
''' Splits the interval [start-end] into bulks of size bulksize
'''    
def gen_bulk_index(start, end, bulksize=10, includeEnd=False):
    size = end - start + 1
    bulks = [list(range(start+(i*bulksize), start + (i+1)*bulksize)) for i in range(0, int(size/bulksize))]
    if includeEnd : 
        bulks.append(list(range(bulks[len(bulks)-1][bulksize-1], end+1)))
    else : 
        bulks.append(list(range(bulks[len(bulks)-1][bulksize-1], end)))
    return bulks    


''' Save and load progress to file, as opposed to keeping it in memory inbetween cells
'''
def save_progress(dataframe, nb_rows_processed):
    dataframe.to_csv(datapath+"meta_movies_TV_temp.csv")
    file = open(datapath+"meta_movies_TV_progress", "w")
    file.write(str(nb_rows_processed))

    
def load_progress():
    dataframe = pd.read_csv(datapath+"meta_movies_TV_temp.csv")
    file = open(datapath+"meta_movies_TV_progress", "r")
    nb_rows_processed = file.readline()
    return dataframe, int(nb_rows_processed)


def get_directors(prod) : 
    return prod.directors

def get_actors(prod) : 
    return prod.actors

### API "bulk" querying

In [9]:
## Parameters for bulk item lookup, should be kept between runs
##

bulksize = 10

# used to restart from where we were in case of an error
lastItemLookedUp = 0
incompleterows = 0
emptyrows = 0

In [39]:
fresh_run = False

if fresh_run : 
    product_df = product_df[features]
    product_df['actors'] = pd.Series(dtype=object)
    product_df['directors'] = pd.Series(dtype=object)
else : 
    product_df, lastItemLookedUp = load_progress()

  if self.run_code(code, result):


In [40]:
print("last item looked up : ", lastItemLookedUp,  "  -  time : ",time.strftime("%H:%M:%S"), "\n\n")

last item looked up :  115733   -  time :  13:39:29 




In [44]:
original_lastItemLookedUp = lastItemLookedUp

''' Helper : extracts wanted data from the amazon product 'amazonProduct' and 
             puts it in the dataframe at the given positions   
'''
def set_df_values_from_product(amazonProduct, rowNumber, fieldName1, fieldName2) :
    if (amazonProduct is None) : 
        product_df.set_value(rowNumber, fieldName1, [])
        product_df.set_value(rowNumber, fieldName2, [])     
    else : 
        actors = get_actors(amazonProduct)
        product_df.set_value(rowNumber, fieldName1, actors)
        directors = get_directors(amazonProduct)
        product_df.set_value(rowNumber, fieldName2, directors)


for bulk in gen_bulk_index(lastItemLookedUp, product_df.shape[0], bulksize=bulksize) : 
    # display progess
    if ((bulk[0]-(original_lastItemLookedUp)) % 100 == 0) : 
        clear_output()
        print("    ",int(100 * (bulk[0]+1) / product_df.shape[0]), "% completed (",bulk[0], " rows)", "  -  time : ",time.strftime("%H:%M:%S"))
        print("     Last Item Looked up : ", lastItemLookedUp, " / ", product_df.shape[0])
        print("     Incomplete rows : ", incompleterows)
        print("     Empty rows : ", emptyrows)
        print("\n\n\n")
    
    # get asins for the bulk and fetch the matching AmazonProducts
    asins = product_df['asin'][bulk].tolist()
    prods = get_prod(asins)

    # Then, process each product to add necessary informations in the dataframe
    # Problem : the API can fail silently (no ASIN found -> the result will contain one less product
    #           than given asins) which breaks the matching ASIN <-> result if we process in bulk
    # Solution : detect discrepancy and fall back to 'one-by-one' querying
    if (type(prods) is list) and (len(prods) == bulksize) :              
        # Case : we found exactly one result per ASIN
        #        process by bulk
            for i, prod in enumerate(prods) : 
                set_df_values_from_product(prod, bulk[i], "actors", "directors")
    elif (type(prods) is list) or (type(prods) is AmazonApi.AmazonProduct) :  
        # Case : we obtained a list of AmazonProducts or a single AmazonProduct
        #        fallback to 1-by-1 querying
        for n in bulk :               
            asin = product_df['asin'][n]
            try : 
                prod = get_prod(asin)
            except(AsinNotFound): 
                prod = None
            set_df_values_from_product(prod, n, "actors", "directors")
            time.sleep(0.15)
        incompleterows += 1
    else : 
        emtpyrows += 1
        
    # Save progress
    lastItemLookedUp = bulk[bulksize-1]
        
    # limit query frequency to avoid getting 503'd
    time.sleep(min(bulksize/15, 10))

     59 % completed ( 123142  rows)   -  time :  14:09:22
     Last Item Looked up :  123141  /  208321
     Incomplete rows :  367
     Empty rows :  0




B000XPXTR8,B000XPXTR8,B000XPXSC4,B000XPXUQI,B000XPXURM,B000XPXTUA,B000XPXSK6,B000XPXU1S,B000XPXTIC,B000XPXT96,B000XPYI6O
B000XPZDS6,B000XPZDS6,B000XPXUM2,B000XPXSXI,B000XPZTJE,B000XPZSKE,B000XPZSB8,B000XPZSAO,B000XPZSWW,B000XPZSPO,B000XPZTL2
B000XPZUEI,B000XPZUEI,B000XPZT12,B000XPZT6M,B000XPZSMC,B000XPXSA6,B000XPZSDQ,B000XPZHN2,B000XPZTLC,B000XPXUMM,B000XPZSKO
B000XQ1OL0,B000XQ1OL0,B000XQ1NYI,B000XPZS6I,B000XPZTJY,B000XQ1P3M,B000XPZSD6,B000XQ1P50,B000XQ1Q68,B000XQ1P2S,B000XPZTLM
B000XQ27GQ,B000XQ27GQ,B000XQ2NJW,B000XQ3LAC,B000XQ4HQE,B000XQ1OQK,B000XQ173U,B000XQ4HR8,B000XQ4HPA,B000XQ3I7I,B000XQ4HQO
B000XQ94GC,B000XQ94GC,B000XQ9C6O,B000XQ95IY,B000XQCXZQ,B000XQBTE2,B000XQFIBW,B000XQHRAM,B000XQMBAS,B000XQRBI0,B000XQR626
B000XQXFDA,B000XQXFDA,B000XQSY0O,B000XR2Y3Q,B000XR6UC2,B000XR8UL6,B000XR8SRC,B000XR9R5E,B000XR9R5Y,B000XR6QWG,B000XR9

HTTPError: HTTP Error 503: Service Unavailable

In [45]:
save_progress(product_df, lastItemLookedUp)