In [1]:
import pandas as pd 
import numpy as np 
import requests
import os



# Time Series Acquire Exercises scratch notebook
- All exercises for this lesson can be found in the acquire.py file
- This notebook holds only the scratch work, where I ran some tests

In [42]:
################# Exercise 1 #################
# Using the code from the lesson as a guide and the REST API
# from https://python.zach.lol/api/v1/items as we did in the lesson,
# create a dataframe named items that has all of the data for items.

items_url = 'https://python.zach.lol/api/v1/items'

# Request page 1 explicitly: its payload carries both the first batch of
# items and max_page, so the first page is not downloaded twice.
response = requests.get(items_url + '?page=1')
response.raise_for_status()  # stop on an HTTP error instead of failing later on a missing key
data = response.json()

# set maxpage to the payload's max_page
maxpage = data['payload']['max_page']

# start the list with the items already received for page 1
items_list = list(data['payload']['items'])

# loop through the remaining pages to get them all together
# need maxpage + 1 because range excludes its upper bound
for page in range(2, maxpage + 1):
    url = items_url + '?page=' + str(page)
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()
    page_items = data['payload']['items']
    items_list += page_items

# one row per item dictionary collected above
item_df = pd.DataFrame(items_list)

In [44]:
# Stores: only the endpoint's base URL is requested here (no page loop).
stores_url = 'https://python.zach.lol/api/v1/stores'

response = requests.get(stores_url)
# Stop on an HTTP error rather than failing later while decoding or indexing the body.
response.raise_for_status()

data = response.json()

# The store records live under payload -> stores in the response.
stores_df = pd.DataFrame(data['payload']['stores'])

In [126]:
sales_url = 'https://python.zach.lol/api/v1/sales'

def get_api_pages(base_url, endpoint):
    '''
    This function downloads every page of a paginated api endpoint and
    returns all of the records in a single dataframe.

    base_url: full url of the endpoint, without a page query string
    endpoint: the "thing string", i.e. the key inside the response's
              'payload' that holds the list of records for a page
    returns:  dataframe built from the records of pages 1 through max_page
    raises:   requests.HTTPError if any page comes back with an error status

    ex. get_api_pages('https://python.zach.lol/api/v1/sales', 'sales')
    '''
    # Ask for page 1 explicitly. Its payload has both max_page and the first
    # batch of records, so page 1 is only downloaded once.
    first_response = requests.get(base_url + '?page=1')
    first_response.raise_for_status()
    first_payload = first_response.json()['payload']

    maxpage = first_payload['max_page'] #set maxpage to the payload's max_page

    items_list = list(first_payload[endpoint]) # start with page 1's records

    # remaining pages; maxpage + 1 because range excludes its upper bound
    for page in range(2, maxpage + 1):
        response = requests.get(base_url + '?page=' + str(page))
        response.raise_for_status()
        items_list += response.json()['payload'][endpoint]

    return pd.DataFrame(items_list)

In [47]:
# Download every page of the sales endpoint into a single DataFrame.
sales_df = get_api_pages(sales_url, 'sales')

In [48]:
# Check the (rows, columns) dimensions of the downloaded sales data.
sales_df.shape

(913000, 5)

In [50]:
# Save the sales data to sales.csv. index=False leaves the DataFrame index out
# of the file, so reading it back does not produce a duplicate index column.
sales_df.to_csv('sales.csv', index=False)

In [51]:
# Preview the first rows of the sales data.
sales_df.head()

Unnamed: 0,item,sale_amount,sale_date,sale_id,store
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,1
1,1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",2,1
2,1,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",3,1
3,1,13.0,"Fri, 04 Jan 2013 00:00:00 GMT",4,1
4,1,10.0,"Sat, 05 Jan 2013 00:00:00 GMT",5,1


In [24]:
# Join items, sales, and stores.
# Start by previewing the items table that gets merged into sales below.
item_df.head()

Unnamed: 0,item_brand,item_id,item_name,item_price,item_upc12,item_upc14
0,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
1,Caress,2,Caress Velvet Bliss Ultra Silkening Beauty Bar...,6.44,11111065925,11111065925
2,Earths Best,3,Earths Best Organic Fruit Yogurt Smoothie Mixe...,2.43,23923330139,23923330139
3,Boars Head,4,Boars Head Sliced White American Cheese - 120 Ct,3.14,208528800007,208528800007
4,Back To Nature,5,Back To Nature Gluten Free White Cheddar Rice ...,2.61,759283100036,759283100036


In [84]:
# Left-join the item details onto every sales row. The items table is indexed
# by item_id so the sales 'item' column can be matched against that index.
sales_items = sales_df.merge(
    item_df.set_index('item_id'),
    how='left',
    left_on='item',
    right_index=True,
)

In [85]:
# Spot-check five random rows of the sales/items join.
sales_items.sample(5)

Unnamed: 0,item,sale_amount,sale_date,sale_id,store,item_brand,item_name,item_price,item_upc12,item_upc14
239461,14,70.0,"Fri, 13 Sep 2013 00:00:00 GMT",239462,2,Pamprin,Pamprin Maximum Strength Multi-symptom Menstru...,7.54,41167300121,41167300121
49445,3,47.0,"Fri, 24 May 2013 00:00:00 GMT",49446,8,Earths Best,Earths Best Organic Fruit Yogurt Smoothie Mixe...,2.43,23923330139,23923330139
398297,22,89.0,"Sun, 18 Aug 2013 00:00:00 GMT",398298,9,Betty Crocker,Betty Crocker Twin Pack Real Potatoes Scallope...,7.31,16000288829,16000288829
628516,35,36.0,"Wed, 08 Jan 2014 00:00:00 GMT",628517,5,Natures Way,Natures Way Forskohlii - 60 Ct,5.19,33674100066,33674100066
568438,32,45.0,"Mon, 07 Jul 2014 00:00:00 GMT",568439,2,Barefoot,Barefoot Pinot Grigio 187,0.68,8500004528,8500004528


In [87]:
# Left-join the store details onto the sales/items rows. The stores table is
# indexed by store_id so the 'store' column can be matched against that index.
sales_full = sales_items.merge(
    stores_df.set_index('store_id'),
    how='left',
    left_on='store',
    right_index=True,
)

In [88]:
# Look at the first rows of the fully joined dataframe (sales + items + stores).
sales_full.head()

Unnamed: 0,item,sale_amount,sale_date,sale_id,store,item_brand,item_name,item_price,item_upc12,item_upc14,store_address,store_city,store_state,store_zipcode
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
1,1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",2,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
2,1,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",3,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
3,1,13.0,"Fri, 04 Jan 2013 00:00:00 GMT",4,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
4,1,10.0,"Sat, 05 Jan 2013 00:00:00 GMT",5,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253


In [189]:
# Exercise 6: read opsd_germany_daily.csv straight from its raw GitHub URL
# into a DataFrame.
ops_germany = pd.read_csv('https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv')

In [163]:
def get_sales_data(sales_url = 'https://python.zach.lol/api/v1/sales',
                    endpoint = 'sales'):
    '''
    This function returns the sales data from the zach api as a df.

    If a local sales.csv exists it is read and returned; otherwise the data
    is downloaded with get_api_pages, written to sales.csv, and returned.

    sales_url: url of the sales endpoint, passed on to get_api_pages
    endpoint:  payload key holding the records, passed on to get_api_pages
    '''
    cache_file = 'sales.csv'

    if os.path.isfile(cache_file):

        # If csv file exists read in data from csv file.
        df = pd.read_csv(cache_file)

    else:

        # Read fresh data from the api into a DataFrame
        df = get_api_pages(sales_url, endpoint)

        # Cache data. index=False keeps the DataFrame index out of the file,
        # so a later cached read has the same columns as this fresh download
        # instead of gaining an extra 'Unnamed: 0' column.
        df.to_csv(cache_file, index=False)

    return df

In [143]:
# Try out get_sales_data with its default url and endpoint.
sales_data2 = get_sales_data()

In [144]:
def get_full_zach_data():
    '''
    This function gets the sales data, the items data, and the stores
    data and returns them as three separate dataframes.

    Sales come from get_sales_data; items and stores are fetched with
    get_api_pages. No joining happens here (see join_zach_data for that).

    returns: the tuple (sales_df, items_df, stores_df)
    '''
    base_url = 'https://python.zach.lol/api/v1/'

    sales_df = get_sales_data()

    items_df = get_api_pages(base_url + 'items', 'items')

    stores_df = get_api_pages(base_url + 'stores', 'stores')

    return sales_df, items_df, stores_df

In [145]:
# Unpack the three dataframes returned by get_full_zach_data.
sales_df, items_df, stores_df = get_full_zach_data()

In [155]:
def join_zach_data(df1, df2, df3):
    '''
    Left-joins three dataframes into one and returns the result.

    Each argument is a tuple that starts with a dataframe, followed by the
    key column(s) to join on:
      df1: (df, key_for_df2, key_for_df3) - the left table needs two keys,
           one for the first join and one for the second
      df2: (df, key)
      df3: (df, key)
    df1 is joined to df2 first, then that result is joined to df3.
    i.e. join_zach_data((sales_df, 'item', 'store'), (items_df, 'item_id'), (stores_df, 'store_id'))
    '''
    # First join: index the second table by its key, then match the left
    # table's first key column against that index.
    first_lookup = df2[0].set_index(df2[1])
    first_join = df1[0].merge(
        first_lookup,
        how='left',
        left_on=df1[1],
        right_index=True,
    )

    # Second join: same approach, using the left table's second key column.
    second_lookup = df3[0].set_index(df3[1])
    return first_join.merge(
        second_lookup,
        how='left',
        left_on=df1[2],
        right_index=True,
    )

In [158]:
# Join sales to items on item/item_id, then to stores on store/store_id.
joined_full = join_zach_data((sales_df, 'item', 'store'), (items_df, 'item_id'), (stores_df, 'store_id'))

#### This function combines all the other functions into one

In [188]:
def the_whole_shebang():
    '''
    Gets the zach sales, items, and stores data with get_full_zach_data and
    returns them joined into one dataframe by join_zach_data.
    '''
    sales, items, stores = get_full_zach_data()

    # sales carries both join keys: 'item' for items and 'store' for stores
    return join_zach_data(
        (sales, 'item', 'store'),
        (items, 'item_id'),
        (stores, 'store_id'),
    )
    

#### Trying to get the loop over the endpoint list to work

In [167]:


def get_api_pages(base_url, endpoint):
    '''
    This function downloads every page of a paginated api endpoint and
    returns all of the records in a single dataframe. It also prints
    base_url so it is visible which endpoint is being downloaded.

    base_url: full url of the endpoint, without a page query string
    endpoint: the "thing string", i.e. the key inside the response's
              'payload' that holds the list of records for a page
    returns:  dataframe built from the records of pages 1 through max_page
    raises:   requests.HTTPError if any page comes back with an error status

    ex. get_api_pages('https://python.zach.lol/api/v1/sales', 'sales')
    '''
    # Ask for page 1 explicitly. Its payload has both max_page and the first
    # batch of records, so page 1 is only downloaded once.
    first_response = requests.get(base_url + '?page=1')
    first_response.raise_for_status()
    first_payload = first_response.json()['payload']
    print(base_url)

    maxpage = first_payload['max_page'] #set maxpage to the payload's max_page

    items_list = list(first_payload[endpoint]) # start with page 1's records

    # remaining pages; maxpage + 1 because range excludes its upper bound
    for page in range(2, maxpage + 1):
        response = requests.get(base_url + '?page=' + str(page))
        response.raise_for_status()
        items_list += response.json()['payload'][endpoint]

    return pd.DataFrame(items_list)

In [180]:
def get_api_data(base_url, endpoint):
    '''
    Returns an api endpoint's data as a dataframe, using a local csv cache.

    The cache file is named after the endpoint ('<endpoint>.csv'). When that
    file exists it is read and returned. Otherwise the data is downloaded
    with get_api_pages, written to the cache file, and returned.
    '''
    cache_file = f'{endpoint}.csv'

    # Cache hit: nothing to download.
    if os.path.isfile(cache_file):
        return pd.read_csv(cache_file)

    # Cache miss: download every page, then save a copy for next time.
    fresh_df = get_api_pages(base_url, endpoint)
    fresh_df.to_csv(cache_file, index=False)
    return fresh_df

In [181]:
def get_all_dataframes_from_api(base_url, endpoint_list):
    '''
    This function takes in a url and an endpoint list and returns a tuple
    with one dataframe per endpoint, in the same order as endpoint_list.

    base_url:      url prefix; each endpoint name is appended to it
    endpoint_list: list of endpoint name strings
    returns:       tuple of dataframes from get_api_data, one per endpoint

    ex. get_all_dataframes_from_api('https://python.zach.lol/api/v1/', ['sales', 'items', 'stores'])
    '''
    # Call get_api_data directly for each endpoint. The earlier version built
    # "<endpoint>_df = ..." statements as strings and ran them with exec().
    # That depends on exec() assignments surviving in a function's locals
    # between calls (not the case on Python 3.13+), and it fails for any
    # endpoint name that is not a valid Python identifier.
    return tuple(
        get_api_data(base_url + endpoint, endpoint)
        for endpoint in endpoint_list
    )

In [170]:
base_url = 'https://python.zach.lol/api/v1/'
endpoint_list = ['sales', 'items', 'stores']

for endpoint in endpoint_list: # loop through endpoint list that was entered

    # Bind sales_df / items_df / stores_df at notebook (global) scope.
    # Assigning through globals() creates the same variables the exec()'d
    # assignment string did, without building and running source code from
    # strings.
    globals()[f'{endpoint}_df'] = get_api_data(base_url + endpoint, endpoint)

https://python.zach.lol/api/v1/items
https://python.zach.lol/api/v1/stores


In [186]:
# Unpack the tuple of dataframes; the order matches endpoint_list.
sales_df, items_df, stores_df = get_all_dataframes_from_api(base_url, endpoint_list)

In [187]:
# Preview the sales dataframe returned by get_all_dataframes_from_api.
sales_df.head()

Unnamed: 0,item,sale_amount,sale_date,sale_id,store
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,1
1,1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",2,1
2,1,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",3,1
3,1,13.0,"Fri, 04 Jan 2013 00:00:00 GMT",4,1
4,1,10.0,"Sat, 05 Jan 2013 00:00:00 GMT",5,1


In [161]:
base_url = 'https://python.zach.lol/api/v1/'

endpoint_list = ['items', 'stores']

# Each pass of the loop is the same as writing, by hand for that endpoint,
#   items_df = get_api_pages(base_url + 'items', 'items')
for endpoint in endpoint_list:

    # Bind items_df / stores_df at notebook (global) scope through globals()
    # rather than exec()-ing an assignment statement built from strings.
    globals()[f'{endpoint}_df'] = get_api_pages(base_url + endpoint, endpoint)

# Show the first rows of both dataframes as the cell output.
items_df.head(), stores_df.head()

(       item_brand  item_id                                          item_name  \
 0        Riceland        1                     Riceland American Jazmine Rice   
 1          Caress        2  Caress Velvet Bliss Ultra Silkening Beauty Bar...   
 2     Earths Best        3  Earths Best Organic Fruit Yogurt Smoothie Mixe...   
 3      Boars Head        4   Boars Head Sliced White American Cheese - 120 Ct   
 4  Back To Nature        5  Back To Nature Gluten Free White Cheddar Rice ...   
 
    item_price    item_upc12    item_upc14  
 0        0.84   35200264013   35200264013  
 1        6.44   11111065925   11111065925  
 2        2.43   23923330139   23923330139  
 3        3.14  208528800007  208528800007  
 4        2.61  759283100036  759283100036  ,
              store_address   store_city  store_id store_state store_zipcode
 0   12125 Alamo Ranch Pkwy  San Antonio         1          TX         78253
 1         9255 FM 471 West  San Antonio         2          TX         78251
 2  

In [None]:
# Build a variable name such as 'stores_url' from the current endpoint and
# bind it to that endpoint's full url.
a = f'{endpoint}_' + 'url'
c = base_url + endpoint
# The earlier attempt, eec('%s = %s', %(a, c)), could not run: 'eec' is a typo
# for exec, ', %(' is a syntax error, and the generated statement would leave
# the url unquoted. Assigning through globals() creates the variable directly.
globals()[a] = c

%s = 