### Processing pipeline for 'meta_Movies_and_TV.json'

In [1]:
datapath = 'DATA/'
filename = 'meta_Movies_and_TV.json.gz'

features = ['asin', 'title', 'description']

In [2]:
# Essential imports
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

# Strict JSON conversion
import json 
import gzip 

# Sleep
import time

# Progress display
from IPython.display import clear_output

In [3]:
def gz_to_dataframe(datapath, filename):
    def parse(path): 
        g = gzip.open(path, 'rb') 
        for l in g: 
            yield eval(l) 
    def getDF(path): 
        i = 0 
        df = {} 
        for d in parse(path): 
            df[i] = d 
            i += 1 
        return pd.DataFrame.from_dict(df, orient='index') 
    return getDF(datapath+filename)
    
product_df = gz_to_dataframe(datapath, filename)

In [4]:
product_df[features].shape

(208321, 3)

In [5]:
deaths=pd.read_csv("DATA/deaths.csv")
deaths.head(10)

Unnamed: 0,Name,Birth Date,Death Date,Description,Actor,Author,Musician
0,Jack Weston,1924,1996-05-03,", american actor",True,False,False
1,John Beradino,1917,1996-05-19,", american baseball player and actor",True,False,False
2,Jon Pertwee,1919,1996-05-20,", british actor",True,False,False
3,Lash LaRue,1917,1996-05-21,", american actor",True,False,False
4,Paul Delph,1957,1996-05-21,", american musician and producer",False,False,True
5,Jacob Druckman,1928,1996-05-24,", american composer",False,False,True
6,Enrique Álvarez Félix,1934,1996-05-24,", mexican actor",True,False,False
7,Joseph Mitchell (writer)|Joseph Mitchell,1908,1996-05-24,", american writer",False,True,False
8,Bradley Nowell,1968,1996-05-25,", american musician",False,False,True
9,Tamara Toumanova,1919,1996-05-29,", russian dancer and actress",True,False,False


### 1. fetch artists names with amazon API

In [6]:
## Identify with amazon api servers
##

from amazon.api import AmazonAPI

def get_api_keys():
    f = open("api_creds")
    ar = f.read().split("\n")
    return ar[0], ar[1], ar[2]

AMAZON_ACCESS_KEY, AMAZON_SECRET_KEY, AMAZON_ASSOC_TAG = get_api_keys()

amazon = AmazonAPI(AMAZON_ACCESS_KEY, AMAZON_SECRET_KEY, AMAZON_ASSOC_TAG)

In [7]:
## API query helpers
##


''' Product lookup with API, asin can be a string ('one by one' lookup)
    or a list of strings ('bulk lookup').
    bulk lookup provides better performance
'''
def get_prod(asin) : 
    if not isinstance(asin, str): 
        acc_str = asin[0]
        for e in asin : 
            acc_str += ','+e
        print(acc_str)
        return amazon.lookup(ItemId=acc_str)
    else :
        return amazon.lookup(ItemId=asin)
    
    
''' Splits the interval [start-end] into bulks of size bulksize
'''    
def gen_bulk_index(start, end, bulksize=10, includeEnd=False):
    size = end - start + 1
    bulks = [list(range(start+(i*bulksize), start + (i+1)*bulksize)) for i in range(0, int(size/bulksize))]
    if includeEnd : 
        bulks.append(list(range(bulks[len(bulks)-1][bulksize-1], end+1)))
    else : 
        bulks.append(list(range(bulks[len(bulks)-1][bulksize-1], end)))
    return bulks    


def get_directors(prod) : 
    return prod.directors

def get_actors(prod) : 
    return prod.actors

In [None]:
## API "one-by-one" querying
##

product_df['actors'] = pd.Series(dtype=object)
product_df['directors'] = pd.Series(dtype=object)

for i, prod_row in enumerate(product_df.iterrows()) : 
    if i%100 == 0 : print(int(100 * (i+1) / product_df.shape[0]), "% completed (",i, " rows)", "  -  time : ",time.strftime("%H:%M:%S"))
    prod = get_prod(prod_row[1]['asin'])
    product_df.set_value(i, 'actors', get_actors(prod))
    product_df.set_value(i, 'directors', get_directors(prod))
    time.sleep(1)
    
    if(i>=300):
        break
    

In [25]:
product_df[product_df['actors'].isna() == True]
product_df['actors'].isna() 

0         False
1         False
2         False
3         False
4         False
5         False
6         False
7         False
8         False
9         False
10        False
11        False
12        False
13        False
14        False
15        False
16        False
17        False
18        False
19        False
20        False
21        False
22        False
23        False
24        False
25        False
26        False
27        False
28        False
29        False
          ...  
208292     True
208293     True
208294     True
208295     True
208296     True
208297     True
208298     True
208299     True
208300     True
208301     True
208302     True
208303     True
208304     True
208305     True
208306     True
208307     True
208308     True
208309     True
208310     True
208311     True
208312     True
208313     True
208314     True
208315     True
208316     True
208317     True
208318     True
208319     True
208320     True
208321     True
Name: actors, Length: 20

### API "bulk" querying

In [45]:
## Parameters for bulk item lookup, should be kept between runs
##

bulksize = 10

# used to restart from where we were in case of an error
lastItemLookedUp = 0
incompleterows = 0
emptyrows = 0

In [46]:
print("last item looked up : ", lastItemLookedUp,  "  -  time : ",time.strftime("%H:%M:%S"), "\n\n")

last item looked up :  0   -  time :  13:41:26 




In [None]:
original_lastItemLookedUp = lastItemLookedUp

''' Helper : extracts wanted data from the amazon product 'amazonProduct' and 
             puts it in the dataframe at the given positions   
'''
def set_df_values_from_product(amazonProduct, rowNumber, fieldName1, fieldName2) :
    if (amazonProduct==None) : 
        product_df.set_value(rowNumber, fieldName1, [])
        product_df.set_value(rowNumber, fieldName2, [])        
    actors = get_actors(amazonProduct)
    product_df.set_value(rowNumber, fieldName1, actors)
    directors = get_directors(amazonProduct)
    product_df.set_value(rowNumber, fieldName2, directors)


for bulk in gen_bulk_index(lastItemLookedUp, product_df.shape[0], bulksize=bulksize) : 
    # display progess
    if ((bulk[0]-(original_lastItemLookedUp)) % 100 == 0) : 
        clear_output()
        print("    ",int(100 * (bulk[0]+1) / product_df.shape[0]), "% completed (",bulk[0], " rows)", "  -  time : ",time.strftime("%H:%M:%S"))
        print("     Last Item Looked up : ", lastItemLookedUp, " / ", product_df.shape[0])
        print("     Incomplete rows : ", incompleterows)
        print("     Empty rows : ", emptyrows)
        print("\n\n\n")
    
    # get asins for the bulk and fetch the matching AmazonProducts
    asins = product_df['asin'][bulk].tolist()
    prods = get_prod(asins)

    # Then, process each product to add necessary informations in the dataframe
    # Problem : the API can fail silently (no ASIN found -> the result will contain one less product
    #           than given asins) which breaks the matching ASIN <-> result if we process in bulk
    # Solution : detect discrepancy and fall back to 'one-by-one' querying
    if (type(prods) is list) and (len(prods) == bulksize) :              
        # Case : we found exactly one result per ASIN
        #        process by bulk
            for i, prod in enumerate(prods) : 
                set_df_values_from_product(prod, bulk[i], "actors", "directors")
    elif (type(prods) is list) or (type(prods) is AmazonApi.AmazonProduct) :  
        # Case : we obtained a list of AmazonProducts or a single AmazonProduct
        #        fallback to 1-by-1 querying
        for n in bulk :               
            asin = product_df['asin'][n]
            prod = get_prod(asin)
            set_df_values_from_product(prod, n, "actors", "directors")
        incompleterows += 1
    else : 
        emtpyrows += 1
        
    # Save progress
    lastItemLookedUp = bulk[bulksize-1]
        
    # limit query frequency to avoid getting 503'd
    time.sleep(min(bulksize/25, 10))

     0 % completed ( 300  rows)   -  time :  13:42:27
     Last Item Looked up :  299  /  208322
     Incomplete rows :  0
     Empty rows :  0




0563412879,0563412879,0570088933,0578000601,0578002019,0578009927,057804546X,0578046725,0578047861,0578045427,0578057247
0578058758,0578058758,0578057239,0590295918,0590582380,0594580897,0609607510,0609810553,0615111017,0615111114,0615113702
0615114180,0615114180,0615115187,0615117120,0615113729,0615118240,0615127894,0615122434,0615172083,0615181848,0615195555
0615208185,0615208185,0615212050,061522072X,061522153X,0615219225,0615219675,0615229476,0615229492,0615239420,061524226X
0615255868,0615255868,0615254721,0615270778,0615270557,0615274641,0615278094,0615290183,061529118X,061530091X,0615315763
0615328091,0615328091,0615329071,0615327087,0615334423,0615336132,0615344216,0615348572,0615353789,0615363628,0615377777


In [None]:
features = ['asin', 'categories', 'description', 'title', 'salesRank', 'actors', 'directors']
product_df = product_df[features]
product_df.head()

In [172]:
product_df.to_csv("meta_movies_TV_processed.csv")

In [12]:
product_df = pd.read_csv("meta_movies_TV_processed.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [174]:
test.shape

(208322, 8)

In [176]:
test.head()

Unnamed: 0.1,Unnamed: 0,asin,categories,description,title,salesRank,actors,directors
0,0,0000143561,"[['Movies & TV', 'Movies']]","3Pack DVD set - Italian Classics, Parties and ...","Everyday Italian (with Giada de Laurentiis), V...",{'Movies & TV': 376041},[],[]
1,1,0000589012,"[['Movies & TV', 'Movies']]",,Why Don't They Just Quit? DVD Roundtable Discu...,{'Movies & TV': 1084845},[],[]
2,2,0000695009,"[['Movies & TV', 'Movies']]",,Understanding Seizures and Epilepsy DVD,{'Movies & TV': 1022732},[],[]
3,3,000107461X,"[['Movies & TV', 'Movies']]",,Live in Houston [VHS],{'Movies & TV': 954116},['Douglas Miller'],[]
4,4,0000143529,"[['Movies & TV', 'Movies']]",Disc 1: Flour Power (Scones; Shortcakes; South...,My Fair Pastry (Good Eats Vol. 9),{'Movies & TV': 463562},['Alton Brown'],[]
