In [1]:
import pandas as pd
import numpy as np
from path import Path



## ETL - Extract Data

In [2]:
# Read the raw zillow data file
file_path = Path("../../data/raw/zillow_data/City_Zhvi_SingleFamilyResidence.csv")
zillow_df = pd.read_csv(file_path)
zillow_df.head()

Unnamed: 0.1,Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,Metro,CountyName,1996-01-31,...,2019-06-30,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31,2020-01-31,2020-02-29,2020-03-31
0,0,6181,0,New York,City,NY,NY,New York-Newark-Jersey City,Queens County,208545.0,...,672433.0,671924.0,671423.0,670719.0,669974.0,669118.0,668736.0,668740.0,668581.0,668030.0
1,1,12447,1,Los Angeles,City,CA,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,192855.0,...,745290.0,746729.0,748924.0,751756.0,755716.0,759279.0,764877.0,770853.0,779717.0,788751.0
2,2,39051,2,Houston,City,TX,TX,Houston-The Woodlands-Sugar Land,Harris County,95018.0,...,189803.0,190437.0,191052.0,191483.0,192124.0,192620.0,193202.0,193427.0,193991.0,194986.0
3,3,17426,3,Chicago,City,IL,IL,Chicago-Naperville-Elgin,Cook County,126867.0,...,226322.0,226635.0,226796.0,226645.0,226505.0,226430.0,226454.0,226727.0,227077.0,227605.0
4,4,6915,4,San Antonio,City,TX,TX,San Antonio-New Braunfels,Bexar County,94406.0,...,183622.0,184246.0,184831.0,185752.0,186401.0,187159.0,187339.0,187886.0,188055.0,188650.0


## ETL - Transform Process
Keep only FL and CA data and then drop unnecessary columns

In [3]:
# Filter rows with state FL and CA only
state = ["FL", "CA"]
State_data = zillow_df[zillow_df.State.isin(state)]
State_data.dropna(axis=0,how="any",inplace=True)
State_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  State_data.dropna(axis=0,how="any",inplace=True)


Unnamed: 0.1,Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,Metro,CountyName,1996-01-31,...,2019-06-30,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31,2020-01-31,2020-02-29,2020-03-31
1,1,12447,1,Los Angeles,City,CA,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,192855.0,...,745290.0,746729.0,748924.0,751756.0,755716.0,759279.0,764877.0,770853.0,779717.0,788751.0
8,8,54296,8,San Diego,City,CA,CA,San Diego-Carlsbad,San Diego County,214715.0,...,709024.0,711846.0,714119.0,716734.0,718446.0,722025.0,726601.0,733084.0,738722.0,745231.0
11,11,33839,11,San Jose,City,CA,CA,San Jose-Sunnyvale-Santa Clara,Santa Clara County,232040.0,...,1085372.0,1076216.0,1074787.0,1079039.0,1083387.0,1088611.0,1097565.0,1113006.0,1128436.0,1145150.0
12,12,25290,12,Jacksonville,City,FL,FL,Jacksonville,Duval County,85992.0,...,191089.0,191476.0,192127.0,192753.0,193752.0,194465.0,195560.0,197045.0,199119.0,201483.0
14,14,20330,14,San Francisco,City,CA,CA,San Francisco-Oakland-Hayward,San Francisco County,299060.0,...,1463199.0,1464488.0,1462393.0,1467249.0,1473085.0,1482111.0,1495964.0,1504169.0,1512624.0,1515959.0


In [4]:
# Drop the unnecessary columns
State_dropped=State_data.drop(columns=["Unnamed: 0", "RegionID", "SizeRank", "RegionType", "StateName"])
State_dropped.head()

Unnamed: 0,RegionName,State,Metro,CountyName,1996-01-31,1996-02-29,1996-03-31,1996-04-30,1996-05-31,1996-06-30,...,2019-06-30,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31,2020-01-31,2020-02-29,2020-03-31
1,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,192855.0,192899.0,192974.0,193133.0,193265.0,193453.0,...,745290.0,746729.0,748924.0,751756.0,755716.0,759279.0,764877.0,770853.0,779717.0,788751.0
8,San Diego,CA,San Diego-Carlsbad,San Diego County,214715.0,214359.0,214117.0,213632.0,213343.0,213140.0,...,709024.0,711846.0,714119.0,716734.0,718446.0,722025.0,726601.0,733084.0,738722.0,745231.0
11,San Jose,CA,San Jose-Sunnyvale-Santa Clara,Santa Clara County,232040.0,231403.0,231196.0,230868.0,230976.0,231234.0,...,1085372.0,1076216.0,1074787.0,1079039.0,1083387.0,1088611.0,1097565.0,1113006.0,1128436.0,1145150.0
12,Jacksonville,FL,Jacksonville,Duval County,85992.0,86107.0,86225.0,86436.0,86621.0,86815.0,...,191089.0,191476.0,192127.0,192753.0,193752.0,194465.0,195560.0,197045.0,199119.0,201483.0
14,San Francisco,CA,San Francisco-Oakland-Hayward,San Francisco County,299060.0,298066.0,297387.0,296175.0,295267.0,294811.0,...,1463199.0,1464488.0,1462393.0,1467249.0,1473085.0,1482111.0,1495964.0,1504169.0,1512624.0,1515959.0


In [5]:
## Convert the Date columns to rows
FL_CA_df = pd.melt(State_dropped, id_vars=["RegionName","State","Metro","CountyName"], 
                  var_name="Date", value_name="Avg_Price")

In [6]:
FL_CA_df.set_index("RegionName")

Unnamed: 0_level_0,State,Metro,CountyName,Date,Avg_Price
RegionName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,1996-01-31,192855.0
San Diego,CA,San Diego-Carlsbad,San Diego County,1996-01-31,214715.0
San Jose,CA,San Jose-Sunnyvale-Santa Clara,Santa Clara County,1996-01-31,232040.0
Jacksonville,FL,Jacksonville,Duval County,1996-01-31,85992.0
San Francisco,CA,San Francisco-Oakland-Hayward,San Francisco County,1996-01-31,299060.0
...,...,...,...,...,...
Homeland,FL,Lakeland-Winter Haven,Polk County,2020-03-31,128242.0
Okahumpka,FL,Orlando-Kissimmee-Sanford,Lake County,2020-03-31,93233.0
Hillcrest Heights,FL,Lakeland-Winter Haven,Polk County,2020-03-31,180944.0
Everglades,FL,Naples-Immokalee-Marco Island,Collier County,2020-03-31,264616.0


In [7]:
# Drop the Rows where any field is null or blank
FL_CA_df.dropna(axis=0)

Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
0,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,1996-01-31,192855.0
1,San Diego,CA,San Diego-Carlsbad,San Diego County,1996-01-31,214715.0
2,San Jose,CA,San Jose-Sunnyvale-Santa Clara,Santa Clara County,1996-01-31,232040.0
3,Jacksonville,FL,Jacksonville,Duval County,1996-01-31,85992.0
4,San Francisco,CA,San Francisco-Oakland-Hayward,San Francisco County,1996-01-31,299060.0
...,...,...,...,...,...,...
328243,Homeland,FL,Lakeland-Winter Haven,Polk County,2020-03-31,128242.0
328244,Okahumpka,FL,Orlando-Kissimmee-Sanford,Lake County,2020-03-31,93233.0
328245,Hillcrest Heights,FL,Lakeland-Winter Haven,Polk County,2020-03-31,180944.0
328246,Everglades,FL,Naples-Immokalee-Marco Island,Collier County,2020-03-31,264616.0


In [8]:
# Convert the date field to Datetime field
FL_CA_df["Date"] = pd.to_datetime(FL_CA_df["Date"])
FL_CA_df.head()

Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
0,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,1996-01-31,192855.0
1,San Diego,CA,San Diego-Carlsbad,San Diego County,1996-01-31,214715.0
2,San Jose,CA,San Jose-Sunnyvale-Santa Clara,Santa Clara County,1996-01-31,232040.0
3,Jacksonville,FL,Jacksonville,Duval County,1996-01-31,85992.0
4,San Francisco,CA,San Francisco-Oakland-Hayward,San Francisco County,1996-01-31,299060.0


In [9]:
# Ensure Avg_Price field is of type integer 
FL_CA_df["Avg_Price"].dtype
FL_CA_df['Avg_Price'] = FL_CA_df['Avg_Price'].astype('Int64')
FL_CA_df.head()

Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
0,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,1996-01-31,192855
1,San Diego,CA,San Diego-Carlsbad,San Diego County,1996-01-31,214715
2,San Jose,CA,San Jose-Sunnyvale-Santa Clara,Santa Clara County,1996-01-31,232040
3,Jacksonville,FL,Jacksonville,Duval County,1996-01-31,85992
4,San Francisco,CA,San Francisco-Oakland-Hayward,San Francisco County,1996-01-31,299060


## ETL - Load data 
write the processed data to CSV as well as load the data to POSTGRES data base

In [10]:
#Export the data to CSV
file_path_export=Path("../../data/processed/Housing_cleaned.csv")
FL_CA_df.to_csv(file_path_export)

In [11]:
import os,sys
parentdir = Path(os.path.abspath("../.."))
sys.path.insert(0,parentdir)

# Get the DB password
from config import db_password

from sqlalchemy import create_engine
import psycopg2
# import the psycopg2 database adapter for PostgreSQL
from psycopg2 import connect, extensions, sql

from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

In [14]:
## RUN THIS BLOCK OF CODE IF YOU HAVE NOT CREATED THE 'covid_propert_pandemic' DATABASE.
## IF YOU HAVE ALREADY CREATED THE DATABASE, SKIP TO THE NEXT CELL.

# First Create a Database

# declare a new PostgreSQL connection object
conn = connect(
user = "postgres",
host = "localhost",
password = db_password
)

# object type: psycopg2.extensions.connection
print ("\ntype(conn):", type(conn))

# string for the new database name to be created
DB_NAME = "covid_property_pandemic"

# get the isolation leve for autocommit
autocommit = extensions.ISOLATION_LEVEL_AUTOCOMMIT
print ("ISOLATION_LEVEL_AUTOCOMMIT:", extensions.ISOLATION_LEVEL_AUTOCOMMIT)

"""
ISOLATION LEVELS for psycopg2
0 = READ UNCOMMITTED
1 = READ COMMITTED
2 = REPEATABLE READ
3 = SERIALIZABLE
4 = DEFAULT
"""

# set the isolation level for the connection's cursors
# will raise ActiveSqlTransaction exception otherwise
conn.set_isolation_level( autocommit )

# instantiate a cursor object from the connection
cursor = conn.cursor()

# use the execute() method to make a SQL request
#cursor.execute('CREATE DATABASE ' + str(DB_NAME))

# use the sql module instead to avoid SQL injection attacks
####### cursor.execute(f'DROP DATABASE IF EXISTS {DB_NAME}') - Don't use.

cursor.execute(sql.SQL(
"CREATE DATABASE {}"
).format(sql.Identifier( DB_NAME )))

# close the cursor to avoid memory leaks
cursor.close()

# close the connection to avoid memory leaks
conn.close()



type(conn): <class 'psycopg2.extensions.connection'>
ISOLATION_LEVEL_AUTOCOMMIT: 0


In [14]:
# Add data to table
import time
db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/covid_property_pandemic"
engine = create_engine(db_string)

rows_imported = 0
# get the start_time from time.time()
start_time = time.time()
for data in pd.read_csv("../../data/processed/Housing_cleaned.csv", chunksize=15000):
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    data.to_sql(name='FL_CA_Housing_Data', con=engine, if_exists='append')
    rows_imported += len(data)

    # add elapsed time to final print out
    print(f'Done. {time.time() - start_time} total seconds elapsed')



importing rows 0 to 15000...Done. 1.9649240970611572 total seconds elapsed
importing rows 15000 to 30000...Done. 3.854435443878174 total seconds elapsed
importing rows 30000 to 45000...Done. 5.706032752990723 total seconds elapsed
importing rows 45000 to 60000...Done. 7.591034889221191 total seconds elapsed
importing rows 60000 to 75000...Done. 9.445930480957031 total seconds elapsed
importing rows 75000 to 90000...Done. 11.356061458587646 total seconds elapsed
importing rows 90000 to 105000...Done. 13.213990926742554 total seconds elapsed
importing rows 105000 to 120000...Done. 15.068191766738892 total seconds elapsed
importing rows 120000 to 135000...Done. 16.97353196144104 total seconds elapsed
importing rows 135000 to 150000...Done. 18.834017753601074 total seconds elapsed
importing rows 150000 to 165000...Done. 20.69613265991211 total seconds elapsed
importing rows 165000 to 180000...Done. 22.54523229598999 total seconds elapsed
importing rows 180000 to 195000...Done. 24.428927421