In [29]:
# standard library
import logging  # logging warnings and errors
import os
import re
import sys
import threading
import time
from io import StringIO

# third-party
import numpy as np
import pandas as pd
import pgeocode
import quandl

In [17]:
# Send every record (DEBUG and above) handled by the root logger to
# 'log_file.log', appending across runs. Each line is
# "<timestamp> <message>" with a 12-hour US-style timestamp.
logging.basicConfig(
    filename='log_file.log',
    filemode='a',
    level=logging.DEBUG,
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
)

# Root logger; the ingestion functions below log through this object.
logger = logging.getLogger()

In [18]:
# Two-letter postal codes used by check_state to recognise the state token
# of a region string: the 50 states plus DC.
states = (
    "AL AK AZ AR CA CO CT DC DE FL GA "
    "HI ID IL IN IA KS KY LA ME MD "
    "MA MI MN MS MO MT NE NV NH NJ "
    "NM NY NC ND OH OK OR PA RI SC "
    "SD TN TX UT VT VA WA WV WI WY"
).split()

In [26]:
# API rate-limit budget passed to api_call.
max_calls_per_day = 50_000          # total calls allowed per day
max_calls_per_10_seconds = 300      # burst limit per 10-second window
max_calls_per_10_minutes = 2_000    # sustained limit per 10-minute window
concurrency_limit = 1               # number of simultaneous requests

def api_call(max_calls_per_day, max_calls_per_10_seconds, max_calls_per_10_minutes, concurrency_limit):
    """Simulate a rate-limited call loop, printing a running call counter.

    No real request is made: each loop iteration stands for one call. The
    loop stops after ``max_calls_per_day`` iterations and pauses whenever a
    10-second or 10-minute budget is used up before its window has elapsed.

    Parameters
    ----------
    max_calls_per_day : int
        Total number of simulated calls before the loop stops.
    max_calls_per_10_seconds : int
        Calls allowed inside one 10-second window.
    max_calls_per_10_minutes : int
        Calls allowed inside one 10-minute window.
    concurrency_limit : int
        Accepted for interface compatibility; the loop is sequential and
        does not use it.

    Returns
    -------
    None
    """
    current_calls = 0
    calls_in_10s = 0
    calls_in_10m = 0
    # Windows are measured from their own start, not from the previous call;
    # otherwise the elapsed time is always ~0 and every pause is a full window.
    window_10s_start = window_10m_start = time.time()

    while current_calls < max_calls_per_day:
        # 10-minute budget
        elapsed_10m = time.time() - window_10m_start
        if elapsed_10m >= 600:
            calls_in_10m, window_10m_start = 0, time.time()
        elif calls_in_10m >= max_calls_per_10_minutes:
            time.sleep(600 - elapsed_10m)
            calls_in_10m, window_10m_start = 0, time.time()
            # A 10-minute pause necessarily clears the 10-second window too.
            calls_in_10s, window_10s_start = 0, window_10m_start

        # 10-second budget
        elapsed_10s = time.time() - window_10s_start
        if elapsed_10s >= 10:
            calls_in_10s, window_10s_start = 0, time.time()
        elif calls_in_10s >= max_calls_per_10_seconds:
            time.sleep(10 - elapsed_10s)
            calls_in_10s, window_10s_start = 0, time.time()

        current_calls += 1
        calls_in_10s += 1
        calls_in_10m += 1
        print('current_calls ',current_calls)

In [5]:
def check_state(search_str):
    """Return the first ';'-separated token of ``search_str`` found in ``states``.

    Tokens are whitespace-stripped before the lookup. Returns ``None`` when
    no token is a known state code.
    """
    tokens = (piece.strip() for piece in search_str.split(';'))
    return next((token for token in tokens if token in states), None)
        

def check_county(search_str):
    """Return the first ';'-separated token containing 'county' (any case).

    The token is returned whitespace-stripped with its original casing.
    Returns ``None`` when no token mentions a county.
    """
    for piece in search_str.split(';'):
        candidate = piece.strip()
        if 'county' in candidate.lower():
            return candidate
    return None
        
        
def check_city(search_str):
    """Return the last ';'-separated token of ``search_str`` as the city.

    Returns ``np.nan`` when the string has a single token, or when the last
    token contains 'county' (any case). Every no-match path returns
    ``np.nan`` so the resulting column holds one missing-value sentinel
    rather than a mix of NaN and None.
    """
    parts = [piece.strip() for piece in search_str.split(';')]
    if len(parts) == 1:
        return np.nan
    last = parts[-1]
    if 'county' in last.lower():
        return np.nan
    return last
    
def check_metro(search_str):
    """Return the third ';'-separated token of ``search_str`` as the metro.

    Returns ``np.nan`` when the string has three tokens or fewer, or when
    the third token contains 'county' (any case). Every no-match path
    returns ``np.nan`` so the resulting column holds one missing-value
    sentinel rather than a mix of NaN and None.
    """
    parts = [piece.strip() for piece in search_str.split(';')]
    if len(parts) <= 3:
        return np.nan
    third = parts[2]
    if 'county' in third.lower():
        return np.nan
    return third
    
def get_data(indicator_id,region_id=None,start_date=None):
    """Fetch rows of the 'ZILLOW/DATA' table for one indicator and region.

    Parameters
    ----------
    indicator_id : str
        Passed to the table query as the ``indicator_id`` filter.
    region_id : optional
        Passed to the table query as the ``region_id`` filter.
    start_date : optional
        When given, only rows whose ``date`` is on or after this value are
        returned. The filter is applied to the returned frame; anything
        ``pd.Timestamp`` accepts is valid. Previously this argument was
        accepted but ignored.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``quandl.get_table``, optionally date-filtered.
    """
    data = quandl.get_table('ZILLOW/DATA', indicator_id=indicator_id, region_id=region_id)
    if start_date is not None and 'date' in data.columns:
        # to_datetime keeps the comparison valid even if dates arrive as strings.
        on_or_after = pd.to_datetime(data['date']) >= pd.Timestamp(start_date)
        data = data[on_or_after]
    return data

In [6]:
# Read the API key from the environment so the credential is not stored in
# the notebook. Set QUANDL_API_KEY before starting the kernel. The value is
# None when the variable is unset.
# NOTE(review): a literal key was previously committed on this line; treat
# it as compromised and rotate it.
api_key = os.environ.get('QUANDL_API_KEY')

In [7]:
# Hand the key to the quandl client configuration used by the get_table calls below.
quandl.ApiConfig.api_key = api_key

In [8]:
# get indicators: download the ZILLOW/INDICATORS table, requesting all pages (paginate=True)
df_ind = quandl.get_table("ZILLOW/INDICATORS",paginate=True)

In [9]:
# Download the ZILLOW/REGIONS table (all pages), then keep only the rows
# whose region_type is 'zip'.
df_regions = quandl.get_table("ZILLOW/REGIONS", paginate=True)
is_zip_region = df_regions['region_type'] == 'zip'
df_regions_zip = df_regions.loc[is_zip_region]

In [10]:
# Derive location columns from the ';'-separated `region` string.
# assign() returns a new frame, so this cell can be re-run safely and does
# not write into a slice of df_regions.
df_regions_zip = df_regions_zip.assign(
    # number of ';'-separated tokens in the region string
    region_str_len=lambda d: d['region'].str.split(';').str.len(),
    # first run of five digits; NaN (rather than an AttributeError) when absent
    zip=lambda d: d['region'].str.extract(r'(\d{5})', expand=False),
    state=lambda d: d['region'].apply(check_state),
    county=lambda d: d['region'].apply(check_county),
    city=lambda d: d['region'].apply(check_city),
    metro=lambda d: d['region'].apply(check_metro),
)

In [11]:
# Id columns driving the ingestion loops: region ids of the zip-type regions
# and indicator ids of every row in the indicators table.
region_ids = df_regions_zip['region_id']
indicator_ids = df_ind['indicator_id']

In [12]:
# save indicator: write the indicators table to 'zillow_indicators.csv'
# (pandas writes the DataFrame index as the first column by default)
df_ind.to_csv('zillow_indicators.csv')

In [28]:
# Select the indicators to ingest: rows whose `ingest` column equals 'Y'.
# NOTE(review): this reads 'zillow_indicators_ingest.csv', not the
# 'zillow_indicators.csv' written by the previous cell. Presumably it is a
# hand-annotated copy; confirm where it comes from.
indicators = pd.read_csv('zillow_indicators_ingest.csv')
flagged_for_ingest = indicators['ingest'] == 'Y'
tracked_indicators = indicators.loc[flagged_for_ingest, 'indicator_id']

In [14]:
# create empty dataframe with the four columns kept from each ZILLOW/DATA response
df_data = pd.DataFrame(columns=['indicator_id', 'region_id','date','value'])

In [15]:
def append_data(tracked_indicators,region_ids):
    """Fetch every indicator/region pair and return the combined rows.

    This is the unthrottled version. NOTE: a later cell in this notebook
    defines ``append_data`` again with rate limiting; when both cells are
    run, that later definition replaces this one.

    Parameters
    ----------
    tracked_indicators : iterable
        Indicator ids to request.
    region_ids : iterable
        Region ids to request for each indicator.

    Returns
    -------
    pandas.DataFrame
        Columns ``indicator_id``, ``region_id``, ``date``, ``value``; empty
        when nothing was ingested. A failed request is logged and skipped.
    """
    columns = ['indicator_id', 'region_id','date','value']
    # Collect the pieces and concatenate once. The earlier draft assigned to
    # df_data inside the function, which made it an unbound local, so every
    # concat raised and was swallowed by the except below.
    frames = []
    for ind in tracked_indicators:
        for region in region_ids:
            try:
                data = get_data(ind,region)
                if not data.empty:
                    frames.append(data.reset_index()[columns])
                    logger.info(f'Indicator: {ind} - Region: {region} - Successfully Ingested')
            except Exception as e:
                logger.error(f'Region: {region} - Indicator: {ind} - {e}')
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, axis=0, ignore_index=True)

# # retrieve the captured output
# log_out = str_out.getvalue()
# log_err = str_err.getvalue()

# # close the captured output
# str_out.close()
# str_err.close() 

KeyboardInterrupt: 

In [80]:



def _throttle(calls_in_window, window_start, max_calls, window_seconds):
    """Enforce one rate window; return the updated (calls_in_window, window_start)."""
    elapsed = time.time() - window_start
    if elapsed >= window_seconds:
        # Window already over: start a fresh one without waiting.
        return 0, time.time()
    if calls_in_window >= max_calls:
        # Budget used up: wait out the remainder of the window.
        time.sleep(window_seconds - elapsed)
        return 0, time.time()
    return calls_in_window, window_start


def append_data(tracked_indicators, region_ids,
                max_calls_per_day=50000,
                max_calls_per_10_seconds=300,
                max_calls_per_10_minutes=2000):
    """Fetch every indicator/region pair under the API rate limits.

    Parameters
    ----------
    tracked_indicators : iterable
        Indicator ids to request.
    region_ids : iterable
        Region ids to request for each indicator.
    max_calls_per_day : int, default 50000
        Stop (and return what has been collected) once this many requests
        have been attempted.
    max_calls_per_10_seconds : int, default 300
        Requests allowed per 10-second window before pausing.
    max_calls_per_10_minutes : int, default 2000
        Requests allowed per 10-minute window before pausing.

    Returns
    -------
    pandas.DataFrame
        Columns ``indicator_id``, ``region_id``, ``date``, ``value``. Always
        returned, including when the daily limit stops the run early or
        nothing was ingested. A failed request is logged and skipped.
    """
    columns = ['indicator_id', 'region_id','date','value']
    frames = []  # concatenated once at the end instead of on every call

    calls_made = 0
    calls_last_10s = 0
    calls_last_10m = 0
    # Windows are timed from their own start; timing them from the previous
    # call made the elapsed time ~0 and every pause a full window.
    last_10s_time = last_10m_time = time.time()

    def _collected():
        if not frames:
            return pd.DataFrame(columns=columns)
        return pd.concat(frames, axis=0, ignore_index=True)

    for ind in tracked_indicators:
        for region in region_ids:
            if calls_made >= max_calls_per_day:
                logger.warning('API call limit reached for the day')
                return _collected()

            calls_last_10m, last_10m_time = _throttle(
                calls_last_10m, last_10m_time, max_calls_per_10_minutes, 600)
            calls_last_10s, last_10s_time = _throttle(
                calls_last_10s, last_10s_time, max_calls_per_10_seconds, 10)

            # Count the attempt up front: a request that errors has still
            # been sent and still consumes quota.
            calls_made += 1
            calls_last_10s += 1
            calls_last_10m += 1

            try:
                data = get_data(ind, region)
                if not data.empty:
                    frames.append(data.reset_index()[columns])
                    logger.info(f'Indicator: {ind} - Region: {region} - Successfully Ingested')
                    print(f'Indicator: {ind} - Region: {region} - Successfully Ingested')
            except Exception as e:
                logger.error(f'Region: {region} - Indicator: {ind} - {e}')

    return _collected()


In [None]:
# Keep the ingested rows: without an assignment the result is discarded and
# df_data stays the empty frame created earlier. The None guard stops an
# aborted run from replacing df_data with None.
ingested = append_data(tracked_indicators, region_ids)
if ingested is not None:
    df_data = ingested

Indicator: ZSFH - Region: 99999 - Successfully Ingested
Indicator: ZSFH - Region: 99998 - Successfully Ingested
Indicator: ZSFH - Region: 99997 - Successfully Ingested
Indicator: ZSFH - Region: 99996 - Successfully Ingested
Indicator: ZSFH - Region: 99995 - Successfully Ingested
Indicator: ZSFH - Region: 99994 - Successfully Ingested
Indicator: ZSFH - Region: 99993 - Successfully Ingested
Indicator: ZSFH - Region: 99992 - Successfully Ingested
Indicator: ZSFH - Region: 99991 - Successfully Ingested
Indicator: ZSFH - Region: 99990 - Successfully Ingested
Indicator: ZSFH - Region: 99989 - Successfully Ingested
Indicator: ZSFH - Region: 99988 - Successfully Ingested
Indicator: ZSFH - Region: 99987 - Successfully Ingested
Indicator: ZSFH - Region: 99986 - Successfully Ingested
Indicator: ZSFH - Region: 99985 - Successfully Ingested
Indicator: ZSFH - Region: 99984 - Successfully Ingested
Indicator: ZSFH - Region: 99983 - Successfully Ingested
Indicator: ZSFH - Region: 99982 - Successfully I

Indicator: ZSFH - Region: 99837 - Successfully Ingested
Indicator: ZSFH - Region: 99832 - Successfully Ingested
Indicator: ZSFH - Region: 99829 - Successfully Ingested
Indicator: ZSFH - Region: 99828 - Successfully Ingested
Indicator: ZSFH - Region: 99827 - Successfully Ingested
Indicator: ZSFH - Region: 99826 - Successfully Ingested
Indicator: ZSFH - Region: 99825 - Successfully Ingested
Indicator: ZSFH - Region: 99818 - Successfully Ingested
Indicator: ZSFH - Region: 99817 - Successfully Ingested
Indicator: ZSFH - Region: 99816 - Successfully Ingested
Indicator: ZSFH - Region: 99810 - Successfully Ingested
Indicator: ZSFH - Region: 99809 - Successfully Ingested
Indicator: ZSFH - Region: 99808 - Successfully Ingested
Indicator: ZSFH - Region: 99807 - Successfully Ingested
Indicator: ZSFH - Region: 99805 - Successfully Ingested
Indicator: ZSFH - Region: 99800 - Successfully Ingested
Indicator: ZSFH - Region: 99799 - Successfully Ingested
Indicator: ZSFH - Region: 99798 - Successfully I

Indicator: ZSFH - Region: 99633 - Successfully Ingested
Indicator: ZSFH - Region: 99631 - Successfully Ingested
Indicator: ZSFH - Region: 99629 - Successfully Ingested
Indicator: ZSFH - Region: 99627 - Successfully Ingested
Indicator: ZSFH - Region: 99626 - Successfully Ingested
Indicator: ZSFH - Region: 99625 - Successfully Ingested
Indicator: ZSFH - Region: 99624 - Successfully Ingested
Indicator: ZSFH - Region: 99623 - Successfully Ingested
Indicator: ZSFH - Region: 99618 - Successfully Ingested
Indicator: ZSFH - Region: 99614 - Successfully Ingested
Indicator: ZSFH - Region: 99613 - Successfully Ingested
Indicator: ZSFH - Region: 99608 - Successfully Ingested
Indicator: ZSFH - Region: 99607 - Successfully Ingested
Indicator: ZSFH - Region: 99601 - Successfully Ingested
Indicator: ZSFH - Region: 99598 - Successfully Ingested
Indicator: ZSFH - Region: 99597 - Successfully Ingested
Indicator: ZSFH - Region: 99595 - Successfully Ingested
Indicator: ZSFH - Region: 99591 - Successfully I

In [None]:
# Scratch cell repeating the reshape/concat steps used inside append_data.
# NOTE(review): `data` and `df` are not defined at notebook scope in the
# visible cells, and the concat result is not assigned to anything. This
# cell probably fails on a fresh kernel; confirm whether it can be deleted.
data.reset_index(inplace=True)
data = data[['indicator_id', 'region_id','date','value']]
pd.concat([df, data], axis=0,ignore_index=True)

In [None]:
# Display whatever df_data is currently bound to at notebook scope.
df_data