## Exercises

Create a new local git repository and a remote repository on GitHub named `time-series-exercises`. Save your work for this module in your `time-series-exercises` repo.

The end result of this exercise should be a file named `acquire.py`.



In [2]:
import pandas as pd
import requests

1. Using the code from the lesson as a guide and the REST API from https://python.zgulde.net/api/v1/items as we did in the lesson, create a dataframe named `items` that has all of the data for items.


In [3]:
# setup
domain = 'https://python.zgulde.net'


In [4]:
# Ask the API root where its help document lives, then print that document.
# The route is stored as `help_route` (not `help`) so the builtin `help()`
# stays usable for the rest of the notebook session.
help_route = requests.get(domain).json()['help']
print(requests.get(domain + help_route).json()['payload'])


The API accepts GET requests for all endpoints, where endpoints are prefixed
with

    /api/{version}

Where version is "v1"

Valid endpoints:

- /stores[/{store_id}]
- /items[/{item_id}]
- /sales[/{sale_id}]

All endpoints accept a `page` parameter that can be used to navigate through
the results.



In [5]:
requests.get(domain).json()['api']

'/api/v1'

In [6]:
def get_json(url, timeout=30):
    """Send a GET request to ``url`` and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Full URL to request.
    timeout : float or tuple, optional
        Seconds to wait on the server, passed straight to ``requests.get``.
        Without a timeout a stalled connection would block indefinitely.

    Returns
    -------
    Whatever ``response.json()`` produces for the response body.
    """
    response = requests.get(url, timeout=timeout)
    return response.json()

In [7]:
api = get_json(domain)['api']
get_json(domain+api)

{'payload': {'routes': ['/stores',
   '/stores/{store_id}',
   '/items',
   '/items/{item_id}',
   '/sales',
   '/sales/{sale_id}']},
 'status': 'ok'}

In [34]:
api = get_json(domain)['api']
routes = get_json(domain+api)['payload']['routes']
routes

['/stores',
 '/stores/{store_id}',
 '/items',
 '/items/{item_id}',
 '/sales',
 '/sales/{sale_id}']

In [24]:
api = get_json(domain)['api']
routes = get_json(domain+api)['payload']['routes']
# Keep only the collection routes ('/stores', '/items', '/sales'). Filtering on
# the '{...}' placeholder does not depend on the order in which the server
# happens to list its routes, unlike taking every second entry.
valid_enpoints = [route for route in routes if '{' not in route]
valid_enpoints

['/stores', '/items', '/sales']

In [55]:
endpoint = api+'/items'
def walk_pages(domain, endpoint):
    """Collect the records from every page of a paginated endpoint.

    Parameters
    ----------
    domain : str
        Scheme and host, e.g. ``'https://python.zgulde.net'``.
    endpoint : str
        Path of the first page, e.g. ``'/api/v1/items'``. Its last path
        segment is also the payload key the records are read from.

    Returns
    -------
    list
        All records gathered by following ``payload['next_page']`` until it
        is empty. An empty list is returned when the first response does not
        report ``status == 'ok'``.

    Raises
    ------
    RuntimeError
        If a page after the first one does not report ``status == 'ok'``;
        returning silently at that point would hand back truncated data.
    """
    records = []
    # '/api/v1/items' -> 'items': the payload stores its rows under this key.
    key = endpoint.split('/')[-1]
    page = endpoint
    is_first_page = True
    while page:
        response = get_json(domain + page)
        if response['status'] != 'ok':
            if is_first_page:
                return records
            raise RuntimeError(
                f'Request for {page} returned status {response["status"]!r}'
            )
        payload = response['payload']
        records.extend(payload[key])
        page = payload['next_page']
        is_first_page = False
    return records

In [54]:
endpoint.split('/')[-1]

'items'

In [74]:
pd.DataFrame(walk_pages(domain, endpoint)).head()

In [50]:
valid_enpoints

['/stores', '/items', '/sales']

1. Using the code from the lesson as a guide and the REST API from https://python.zgulde.net/api/v1/items as we did in the lesson, create a dataframe named `items` that has all of the data for items.

2. Do the same thing, but for `stores` (https://python.zgulde.net/api/v1/stores)

3. Extract the data for `sales` (https://python.zgulde.net/api/v1/sales). There are a lot of pages of data here, so your code will need to be a little more complex. Your code should continue fetching data from the next page until all of the data is extracted.


In [57]:
# Download every page of each collection endpoint; `out` keeps the results in
# the same order as `valid_enpoints`.
out = []
for endpoint in valid_enpoints:
    out.append(walk_pages(domain, api+endpoint))


In [62]:

# `out` holds the raw records in endpoint order: stores, items, sales.
stores_frame = pd.DataFrame(out[0])
items_frame = pd.DataFrame(out[1])
sales_frame = pd.DataFrame(out[2])

table = {'stores': stores_frame, 'items': items_frame, 'sales': sales_frame}

4. Save the data from your dataframes to local csv files so that it will be faster to access in the future.


In [64]:
import os
path = './data'
# exist_ok=True creates the folder when it is missing and is a no-op when it
# is already there, without a separate (race-prone) existence check.
os.makedirs(path, exist_ok=True)
# Write one csv per table, named after its key in `table`.
for k, v in table.items():
    file = f'{path}/{k}.csv'
    print(file)
    v.to_csv(file)

./data/stores.csv
./data/items.csv
./data/sales.csv


5. Combine the data from your three separate dataframes into one large dataframe.


In [68]:
table['stores'].head()

Unnamed: 0,store_address,store_city,store_id,store_state,store_zipcode
0,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253
1,9255 FM 471 West,San Antonio,2,TX,78251
2,2118 Fredericksburg Rdj,San Antonio,3,TX,78201
3,516 S Flores St,San Antonio,4,TX,78204
4,1520 Austin Hwy,San Antonio,5,TX,78218


In [69]:
table['items'].head()

Unnamed: 0,item_brand,item_id,item_name,item_price,item_upc12,item_upc14
0,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
1,Caress,2,Caress Velvet Bliss Ultra Silkening Beauty Bar...,6.44,11111065925,11111065925
2,Earths Best,3,Earths Best Organic Fruit Yogurt Smoothie Mixe...,2.43,23923330139,23923330139
3,Boars Head,4,Boars Head Sliced White American Cheese - 120 Ct,3.14,208528800007,208528800007
4,Back To Nature,5,Back To Nature Gluten Free White Cheddar Rice ...,2.61,759283100036,759283100036


In [70]:
table['sales'].head()

Unnamed: 0,item,sale_amount,sale_date,sale_id,store
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,1
1,1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",2,1
2,1,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",3,1
3,1,13.0,"Fri, 04 Jan 2013 00:00:00 GMT",4,1
4,1,10.0,"Sat, 05 Jan 2013 00:00:00 GMT",5,1


In [71]:
# Attach item details, then store details, to every sale. Left joins keep
# each sales row even when no matching item or store exists.
sales_with_items = table['sales'].join(
    table['items'].set_index('item_id'),
    on='item',
    how='left',
)
df = sales_with_items.join(
    table['stores'].set_index('store_id'),
    on='store',
    how='left',
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   item           913000 non-null  int64  
 1   sale_amount    913000 non-null  float64
 2   sale_date      913000 non-null  object 
 3   sale_id        913000 non-null  int64  
 4   store          913000 non-null  int64  
 5   item_brand     913000 non-null  object 
 6   item_name      913000 non-null  object 
 7   item_price     913000 non-null  float64
 8   item_upc12     913000 non-null  object 
 9   item_upc14     913000 non-null  object 
 10  store_address  913000 non-null  object 
 11  store_city     913000 non-null  object 
 12  store_state    913000 non-null  object 
 13  store_zipcode  913000 non-null  object 
dtypes: float64(2), int64(3), object(9)
memory usage: 97.5+ MB


6. Acquire the Open Power Systems Data for Germany, which has been rapidly expanding its renewable energy production in recent years. The data set includes country-wide totals of electricity consumption, wind power production, and solar power production for 2006-2017. You can get the data here: https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv


In [73]:
# Read the OPSD Germany daily csv straight from its raw GitHub URL.
power = pd.read_csv('https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv')
# Preview the first rows.
power.head()

Unnamed: 0,Date,Consumption,Wind,Solar,Wind+Solar
0,2006-01-01,1069.184,,,
1,2006-01-02,1380.521,,,
2,2006-01-03,1442.533,,,
3,2006-01-04,1457.217,,,
4,2006-01-05,1477.131,,,


7. Make sure all the work that you have done above is reproducible. That is, you should put the code above into separate functions in the `acquire.py` file and be able to re-run the functions and get the same data.

In [90]:
PATH = './data'

In [95]:
import pandas as pd
import os
import requests


PATH = './data'

def get_json(url, timeout=30):
    """Send a GET request to ``url`` and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Full URL to request.
    timeout : float or tuple, optional
        Seconds to wait on the server, passed straight to ``requests.get``.
        Without a timeout a stalled connection would block indefinitely.

    Returns
    -------
    Whatever ``response.json()`` produces for the response body.
    """
    response = requests.get(url, timeout=timeout)
    return response.json()


def walk_pages(domain, endpoint):
    """Collect the records from every page of a paginated endpoint.

    Parameters
    ----------
    domain : str
        Scheme and host, e.g. ``'https://python.zgulde.net'``.
    endpoint : str
        Path of the first page, e.g. ``'/api/v1/items'``. Its last path
        segment is also the payload key the records are read from.

    Returns
    -------
    list
        All records gathered by following ``payload['next_page']`` until it
        is empty. An empty list is returned when the first response does not
        report ``status == 'ok'``.

    Raises
    ------
    RuntimeError
        If a page after the first one does not report ``status == 'ok'``;
        returning silently at that point would hand back truncated data.
    """
    records = []
    # '/api/v1/items' -> 'items': the payload stores its rows under this key.
    key = endpoint.split('/')[-1]
    page = endpoint
    is_first_page = True
    while page:
        response = get_json(domain + page)
        if response['status'] != 'ok':
            if is_first_page:
                return records
            raise RuntimeError(
                f'Request for {page} returned status {response["status"]!r}'
            )
        payload = response['payload']
        records.extend(payload[key])
        page = payload['next_page']
        is_first_page = False
    return records


def new_zgulde_data():
    """Download every collection exposed by the zgulde API.

    Returns
    -------
    dict
        Maps each collection name (the last path segment of its route, e.g.
        ``'items'``) to a DataFrame built from all pages of that route.

    Raises
    ------
    Exception
        If the API root advertises a prefix other than ``'/api/v1'``.
    """
    domain = 'https://python.zgulde.net'
    api = get_json(domain)['api']
    if api != '/api/v1':
        raise Exception(f'API version has been changed and may not work with this script.  Expected "/api/v1", instead got {api}')

    routes = get_json(domain+api)['payload']['routes']
    # Keep only the collection routes. Filtering on the '{...}' placeholder
    # does not rely on the order in which the server lists its routes.
    collection_routes = [route for route in routes if '{' not in route]

    return {
        route.split('/')[-1]: pd.DataFrame(walk_pages(domain, api+route))
        for route in collection_routes
    }


def cache_dict(dict, path = PATH ):
    """Write each DataFrame in ``dict`` to ``<path>/<key>.csv``.

    Parameters
    ----------
    dict : mapping of str to DataFrame
        Tables to save; each key becomes the file name (without extension).
        The parameter name shadows the builtin but is kept so existing
        keyword callers continue to work.
    path : str, optional
        Target folder; created if it does not exist.
    """
    # exist_ok=True avoids a separate (race-prone) existence check.
    os.makedirs(path, exist_ok=True)
    for k, v in dict.items():
        v.to_csv(f'{path}/{k}.csv')


def read_folder(path = PATH ):
    """Load every csv file in ``path`` into a dict of DataFrames.

    Parameters
    ----------
    path : str, optional
        Folder to read from.

    Returns
    -------
    dict or None
        Maps each file's name up to its first ``'.'`` to the DataFrame read
        from it, using the first column as the index. ``None`` is returned
        when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return None

    out = {}
    for file in os.listdir(path):
        full_path = f'{path}/{file}'
        # Skip sub-directories and non-csv entries (e.g. hidden OS or editor
        # files); handing those to read_csv would fail or yield a junk table.
        if not file.lower().endswith('.csv') or not os.path.isfile(full_path):
            continue
        name = file.split('.')[0]
        out[name] = pd.read_csv(full_path, index_col=0)
    return out


def get_zgulde(path = PATH):
    """Return the zgulde tables, preferring a local cache over the API.

    Parameters
    ----------
    path : str, optional
        Folder used as the cache.

    Returns
    -------
    dict
        The result of ``read_folder(path)`` when ``path`` exists and is not
        empty; otherwise the freshly downloaded tables, which are also
        written to ``path`` for later calls.
    """
    # A non-empty folder is treated as an existing cache.
    if os.path.exists(path) and os.listdir(path):
        return read_folder(path)

    d = new_zgulde_data()
    # Write to the folder the caller asked for, not the module default.
    cache_dict(d, path)
    return d


def join_zgulde(dict):
    """Combine the sales, items and stores tables into one DataFrame.

    ``dict`` must provide the keys ``'sales'``, ``'items'`` and ``'stores'``.
    Each sales row is left-joined to its item (``item`` -> ``item_id``) and
    then to its store (``store`` -> ``store_id``), so every sales row is kept.
    """
    sales = dict['sales']
    items_by_id = dict['items'].set_index('item_id')
    sales_with_items = sales.join(items_by_id, on='item', how='left')

    stores_by_id = dict['stores'].set_index('store_id')
    return sales_with_items.join(stores_by_id, on='store', how='left')


def wrangle_zgulde(path=PATH):
    """Load the zgulde tables via ``get_zgulde(path)`` and return them joined
    into a single DataFrame by ``join_zgulde``.
    """
    return join_zgulde(get_zgulde(path))


In [None]:
df = wrangle_zgulde()
df.head()

In [99]:
import acquire

xf = acquire.wrangle_zgulde()
xf.shape

(913000, 14)