In [1]:
import os
import time
import logging
import send2trash
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [2]:
# Configure logging once for the notebook: INFO and above, rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger used by the helper functions defined in the following cells.
logger = logging.getLogger(__name__)

def rm_file(files):
    """Send previously downloaded source files to the trash.

    For each entry in ``files`` the target path is built from the entry's
    directory (index 0) and file name (index 1), with ``~`` expanded. An
    existing file is first renamed to ``<stem>_temp<ext>`` and the renamed
    file is then passed to ``send2trash``.

    Args:
        files: Mapping whose values are sequences with the directory at
            index 0 and the file name at index 1.

    Returns:
        None. A missing file is logged as a warning; any error raised while
        renaming or trashing is logged and the loop moves on to the next
        entry.
    """
    for spec in files.values():
        path = os.path.expanduser(os.path.join(spec[0], spec[1]))

        if not os.path.exists(path):
            logger.warning(f"{path} does not exist")
            continue

        logger.info(f"Removing existing file: {path}")
        try:
            # Rename the file to a temporary name
            stem, ext = os.path.splitext(path)
            temp_path = f"{stem}_temp{ext}"
            # os.replace overwrites a stale temp file left behind by an
            # earlier failed run; os.rename raises FileExistsError on Windows
            # in that situation, which would leave the original in place.
            os.replace(path, temp_path)

            # Delete the renamed file
            send2trash.send2trash(temp_path)
        except Exception as e:
            logger.error(f"Error removing file {path}: {e}")


In [3]:
def _set_download_dir(driver, directory):
    """Best-effort: ask Chrome to save downloads into ``directory``."""
    try:
        os.makedirs(directory, exist_ok=True)
        # Chrome DevTools Protocol command; makes the browser write to the
        # same directory that download_file polls for the finished file.
        driver.execute_cdp_cmd(
            "Browser.setDownloadBehavior",
            {"behavior": "allow", "downloadPath": directory},
        )
    except Exception as e:
        logger.warning(f"download_file: could not set download directory {directory}: {e}")


def download_file(files, timeout=300):
    """Download every file described in ``files`` with a headless Chrome.

    Each value of ``files`` is a sequence holding the target directory at
    index 0, the expected file name at index 1 and the download URL at
    index 3. The browser navigates to the URL and the function then waits
    until the expected file exists in the target directory.

    Args:
        files: Mapping of key -> ``[directory, filename, <unused>, url]``.
        timeout: Maximum number of seconds for each of the two wait phases
            (partial-download marker gone, final file present).

    Returns:
        None. Failures for an individual file (including a timed-out wait or
        a malformed entry) are logged and the remaining files are still
        attempted.
    """
    # set Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")  # Necessary for Windows, can be optional for other OS

    # initialize a new Chrome browser instance; the options must be passed
    # explicitly, otherwise the flags above are never applied
    driver = webdriver.Chrome(options=chrome_options)

    try:
        for name, spec in files.items():
            # Placeholders so the error log below never hits an unbound or
            # stale name when the entry itself is malformed.
            filename = url = f"<unknown:{name}>"
            try:
                url = spec[3]
                filename = spec[1]
                path = os.path.expanduser(os.path.join(spec[0], filename))

                # log start of download
                logger.info(f"download_file: started downloading file {filename} from URL {url}")

                _set_download_dir(driver, os.path.dirname(os.path.abspath(path)))

                # navigate to the download URL
                driver.get(url)

                # wait for the partial-download marker to disappear
                # (presumably Chrome's ".crdownload" suffix -- confirm naming)
                WebDriverWait(driver, timeout).until_not(
                    lambda _: os.path.exists(path + '.crdownload')
                )
                # bounded wait for the finished file; raises on timeout
                # instead of spinning forever
                WebDriverWait(driver, timeout, poll_frequency=1).until(
                    lambda _: os.path.exists(path),
                    message=f"{path} did not appear within {timeout} seconds",
                )

                # log end of download
                logger.info(f"download_file: finished downloading file {filename}")
            except Exception as e:
                logger.error(f"Error downloading file {filename} from URL {url}: {e}")
    finally:
        # close the browser even if the loop is interrupted
        driver.quit()


In [4]:
def data_flatten(output_path, path, filename, save_as):
    """Reshape a wide per-region CSV into long ``RegionID/Date/<save_as>`` form.

    The source CSV is expected to start with nine identifier columns (one of
    them ``RegionID``) followed by one column per date. Every date column is
    melted into rows, the value column is named ``save_as``, and the result
    is written to ``./zillow_data/<save_as>.csv`` and to
    ``<output_path>/<save_as>.csv``.

    Args:
        output_path: Directory for the second copy of the output (``~`` is
            expanded). Created if missing.
        path: Directory containing the source CSV (``~`` is expanded).
        filename: Name of the source CSV inside ``path``.
        save_as: Name of the value column and stem of the output file name.

    Returns:
        pandas.DataFrame with columns ``['RegionID', 'Date', save_as]``.
    """
    # log start of function
    logger.info(f"data_flatten: started processing file {filename}")

    # Join instead of concatenating so a missing trailing slash on `path`
    # still yields a valid file name (matches rm_file/download_file).
    source_file = os.path.join(os.path.expanduser(path), filename)
    df_data = pd.read_csv(source_file)

    # The first nine columns are identifiers; only RegionID is kept in the
    # output, so drop the other eight before melting rather than repeating
    # them for every date.
    id_column_count = 9
    unused_id_columns = [
        col for col in df_data.columns[:id_column_count] if col != 'RegionID'
    ]

    # melt DataFrame to pivot dates to become values
    df = (
        df_data
        .drop(columns=unused_id_columns)
        .melt(id_vars=['RegionID'], var_name='Date', value_name='Value')
        .rename(columns={'Value': save_as})
    )

    # write data to CSV file (local working copy + shared output folder)
    for directory in ('./zillow_data', os.path.expanduser(output_path)):
        os.makedirs(directory, exist_ok=True)
        df.to_csv(os.path.join(directory, f'{save_as}.csv'), index=False)

    logger.info(f"cleaned and saved {save_as}")

    return df

In [5]:
# Folder that receives a copy of every flattened CSV.
output_path = '~/OneDrive/Zillow_data'

# --- Home Value -------------------------------------------------------------
hv_url_endpoint = 'https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv?t=1683765775'
hv_source_filename = 'Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv'
hv_source_path = '~/Downloads/'
hv_data_name = 'HomeValue'

# --- Rent Value -------------------------------------------------------------
rv_url_endpoint = 'https://files.zillowstatic.com/research/public_csvs/zori/Zip_zori_uc_sfrcondomfr_sm_month.csv?t=1701182999'
rv_source_filename = 'Zip_zori_uc_sfrcondomfr_sm_month.csv'
rv_source_path = '~/Downloads/'
rv_data_name = 'RentValue'

# Each entry is laid out as:
#   [source directory, source file name, output data name, download URL]
# rm_file/download_file read indices 0, 1 and 3; data_flatten gets 0..2.
files = dict(
    home_value=[hv_source_path, hv_source_filename, hv_data_name, hv_url_endpoint],
    rent_value=[rv_source_path, rv_source_filename, rv_data_name, rv_url_endpoint],
)

In [6]:
# 1) trash any earlier downloads so the fresh files keep their expected names
rm_file(files)

# 2) fetch the current CSVs
download_file(files)

# 3) flatten each download; the first three items of an entry are
#    (source directory, source file name, output data name)
for name, spec in files.items():
    data_flatten(output_path, *spec[:3])

2023-11-28 14:23:41,255 - INFO - download_file: started downloading file Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv from URL https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv?t=1683765775
2023-11-28 14:23:48,797 - INFO - download_file: finished downloading file Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv
2023-11-28 14:23:48,798 - INFO - download_file: started downloading file Zip_zori_uc_sfrcondomfr_sm_month.csv from URL https://files.zillowstatic.com/research/public_csvs/zori/Zip_zori_uc_sfrcondomfr_sm_month.csv?t=1701182999
2023-11-28 14:23:50,322 - INFO - download_file: finished downloading file Zip_zori_uc_sfrcondomfr_sm_month.csv
2023-11-28 14:23:50,418 - INFO - data_flatten: started processing file Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv
2023-11-28 14:24:10,954 - INFO - cleaned and saved HomeValue
2023-11-28 14:24:11,043 - INFO - data_flatten: started processing file Zip_zori_uc_sfrcondomfr_sm