In [None]:
 # Dependencies: ipython-sql (SQL magics for Jupyter) and pandas (better table formatting); uncomment the lines below to install them

# !pip install ipython-sql
# !pip install pandas

Our data for this project is initially in 2 CSV files (the last question uses another dataset from the previous project, DSIP #3; more work is done with that data in that project). Let's first investigate what the new dataset looks like.

In [2]:
# Import the libraries we need, kept together in their own cell
import pandas as pd
import sqlite3

In [6]:
# Preview the home-value CSV: the first rows plus a column/dtype summary
homes_df = pd.read_csv('home_value_data.csv')
print(homes_df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line
homes_df.info()

# Also load the census data so both tables can be combined into a new database
census_df = pd.read_csv('census_data.csv')

   zip_code     city state                     metro       county     date  \
0     60657  Chicago    IL  Chicago-Naperville-Elgin  Cook County  1996-04   
1     60657  Chicago    IL  Chicago-Naperville-Elgin  Cook County  1996-05   
2     60657  Chicago    IL  Chicago-Naperville-Elgin  Cook County  1996-06   
3     60657  Chicago    IL  Chicago-Naperville-Elgin  Cook County  1996-07   
4     60657  Chicago    IL  Chicago-Naperville-Elgin  Cook County  1996-08   

      value  
0  334200.0  
1  335400.0  
2  336500.0  
3  337600.0  
4  338500.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4202944 entries, 0 to 4202943
Data columns (total 7 columns):
 #   Column    Dtype  
---  ------    -----  
 0   zip_code  int64  
 1   city      object 
 2   state     object 
 3   metro     object 
 4   county    object 
 5   date      object 
 6   value     float64
dtypes: float64(1), int64(1), object(5)
memory usage: 224.5+ MB
None


Looks like this dataset contains the specific area we have the data for, the date of the data, plus the median estimated value for that area (we know that this is what value means based off of the description for the data).

We also don't really have a primary key for this dataset, although a composite key made from the zip_code and date columns together should be unique.

In [7]:
# Combine both tables into a single SQLite database file

# Create (or open) the SQLite database
conn = sqlite3.connect('estate.db')
try:
    # Write each DataFrame to its own table, replacing any previous copy so the
    # cell can be re-run safely (the DataFrame indexes are not needed as columns)
    homes_df.to_sql('homes', conn, if_exists='replace', index=False)
    census_df.to_sql('census', conn, if_exists='replace', index=False)
finally:
    # Always close the connection, even if one of the writes fails, so the
    # database file is not left open/locked for the SQL magic used afterwards
    conn.close()

In [8]:
# Set up the SQL magic so the following cells can run SQL directly against the SQLite file:
#   1. load the `sql` IPython extension (provided by ipython-sql),
#   2. connect it to the estate.db database created above,
#   3. have query results returned as pandas DataFrames for nicer table formatting.
# The extension must be loaded first; the other two magics depend on it.

%load_ext sql
%sql sqlite:///estate.db
%config SqlMagic.autopandas=True

Let's just take a look and see if the zip_code and date combined really are our composite key (even though we won't adjust the table for this at all, just for investigation purposes)

In [10]:
%%sql
-- List every (zip_code, date) pair that occurs more than once;
-- an empty result means the pair is unique and can act as a composite key
SELECT
    zip_code,
    date,
    COUNT(*)
FROM homes
GROUP BY
    zip_code,
    date
HAVING COUNT(*) >= 2;

 * sqlite:///estate.db
Done.


Since nothing got returned, we know there are no duplicate zip_code/date combinations and they can work as our composite key (although again, we won't be adjusting or making a primary key column based on this).

Let's also check and make sure our data types are correct.

In [12]:
%%sql 
-- Show the column names and declared types that SQLite stored for the homes table
PRAGMA table_info(homes);

 * sqlite:///estate.db
Done.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,zip_code,INTEGER,0,,0
1,1,city,TEXT,0,,0
2,2,state,TEXT,0,,0
3,3,metro,TEXT,0,,0
4,4,county,TEXT,0,,0
5,5,date,TEXT,0,,0
6,6,value,REAL,0,,0


Everything looks good so we can start with the questions

1. How many distinct zip codes are in this dataset?

In [14]:
%%sql
-- Q1: number of different zip codes present in the homes table
SELECT
    COUNT(DISTINCT zip_code) AS "# ZIP Codes"
FROM homes;

 * sqlite:///estate.db
Done.


Unnamed: 0,# ZIP Codes
0,15452


2. How many zip codes are from each state?

In [18]:
%%sql
-- Q2: number of distinct zip codes per state.
-- Without ORDER BY the row order of a GROUP BY result is not guaranteed,
-- so sort explicitly to keep the listing alphabetical on every run.
SELECT state, COUNT(DISTINCT zip_code) AS "# ZIP Codes"
FROM homes
GROUP BY state
ORDER BY state;

 * sqlite:///estate.db
Done.


Unnamed: 0,state,# ZIP Codes
0,AK,28
1,AL,221
2,AR,119
3,AZ,233
4,CA,1230
5,CO,261
6,CT,122
7,DC,18
8,DE,39
9,FL,795


3. What range of years are represented in the data?

In [40]:
%%sql
-- Q3: which years are represented in the data.
-- The date column is stored as 'YYYY-MM' text, which is not a format the usual
-- date functions accept, so the year is taken as the first four characters.
-- The explicit ORDER BY guarantees the years come back in ascending order, so the
-- first and last rows really are the start and end of the range.
SELECT DISTINCT SUBSTR(date, 1, 4) AS "year"
FROM homes
ORDER BY "year";

 * sqlite:///estate.db
Done.


Unnamed: 0,year
0,1996
1,1997
2,1998
3,1999
4,2000
5,2001
6,2002
7,2003
8,2004
9,2005


4. Using the most recent month of data available, what is the range of estimated home values across the nation?

In [29]:
%%sql
-- Q4: highest, lowest and average estimated value in the most recent month.
-- Dates are 'YYYY-MM' text, so the largest string is also the latest month.
SELECT MAX(value), MIN(value), AVG(value)
FROM homes
WHERE date = (SELECT MAX(date) FROM homes);

 * sqlite:///estate.db
Done.


Unnamed: 0,MAX(value),MIN(value),AVG(value)
0,17757800.0,21600.0,290012.619726


5. Using the most recent month of data available, which states have the highest average home values? How about the lowest?

In [34]:
%%sql
-- Q5: average home value per state for the most recent month, highest first
WITH latest AS (
    -- 'YYYY-MM' text sorts chronologically, so MAX(date) is the newest month
    SELECT MAX(date) AS latest_date
    FROM homes
)
SELECT
    state,
    ROUND(AVG(value), 2) AS "Avg Value"
FROM homes
WHERE date = (SELECT latest_date FROM latest)
GROUP BY state
ORDER BY "Avg Value" DESC;

 * sqlite:///estate.db
Done.


Unnamed: 0,state,Avg Value
0,DC,826572.22
1,CA,750965.37
2,HI,711085.48
3,MA,475926.97
4,CO,442713.03
5,WA,414104.6
6,NJ,403476.05
7,NY,378121.09
8,NV,349834.65
9,UT,343776.86


6. Which states have the highest/lowest average home values for the year of 2017? What about for the year of 2007? 1997?

In [64]:
%%sql
-- Q6: average home value per state for 2017, 2007 and 1997.
-- Rows are sorted newest year first, then by average value (highest first) within each year.
SELECT
    state,
    SUBSTR(date, 1, 4) AS "year",
    ROUND(AVG(value), 2) AS "Avg Value"
FROM homes
WHERE SUBSTR(date, 1, 4) IN ('2017', '2007', '1997')
GROUP BY
    state,
    SUBSTR(date, 1, 4)
ORDER BY
    "year" DESC,
    "Avg Value" DESC;

 * sqlite:///estate.db
Done.


Unnamed: 0,state,year,Avg Value
0,DC,2017,778756.02
1,CA,2017,688609.97
2,HI,2017,653451.75
3,MA,2017,438244.63
4,CO,2017,394511.46
...,...,...,...
148,LA,1997,69803.65
149,TN,1997,68213.77
150,SD,1997,59295.00
151,OK,1997,56325.62


Took a bit of a different approach to the question, combined the different sub questions into 1 table and just gave all the states. The ordering makes it so each year is separated and then within those years the states are ordered by their average value.

7. What is the percent change in average home values from 2007 to 2017 by state? How about from 1997 to 2017?

In [84]:
%%sql
-- Q7 (part 1): percent change in each state's average home value from 2007 to 2017.
-- Each CTE holds one yearly average per state; only states present in both years are compared.
WITH avg_2007 AS (
    SELECT
        state,
        ROUND(AVG(value), 2) AS "2007avg"
    FROM homes
    WHERE SUBSTR(date, 1, 4) = '2007'
    GROUP BY state
),
avg_2017 AS (
    SELECT
        state,
        ROUND(AVG(value), 2) AS "2017avg"
    FROM homes
    WHERE SUBSTR(date, 1, 4) = '2017'
    GROUP BY state
)
-- The aliases start with a digit, so they must be double-quoted to be read as
-- column names (single quotes here would make them string literals)
SELECT
    avg_2007.state,
    "2007avg",
    "2017avg",
    (("2017avg" - "2007avg") / "2007avg") * 100 AS "Percent Change"
FROM avg_2007
INNER JOIN avg_2017
    ON avg_2007.state = avg_2017.state
ORDER BY "Percent Change" DESC;

 * sqlite:///estate.db
Done.


Unnamed: 0,state,2007avg,2017avg,Percent Change
0,ND,124564.08,183148.99,47.031945
1,DC,599893.98,778756.02,29.815608
2,SD,143045.61,184605.42,29.053538
3,TX,152363.82,192054.76,26.050108
4,CO,313130.9,394511.46,25.98931
5,OK,91773.39,112451.7,22.531923
6,IA,128333.03,156758.26,22.149582
7,NE,137831.83,166257.87,20.623712
8,TN,117068.67,138124.43,17.98582
9,WA,316000.91,369567.15,16.951293


In [85]:
%%sql
WITH avg_1997 AS (
    SELECT state, SUBSTR(date, 1, 4) AS year, ROUND(AVG(value), 2) AS '1997avg'
    FROM homes
    WHERE year = '1997'
    GROUP BY state
),
avg_2017 AS (
    SELECT state, SUBSTR(date, 1, 4) AS year, ROUND(AVG(value), 2) AS '2017avg'
    FROM homes
    WHERE year = '2017'
    GROUP BY state
)
-- Need to use "" here to make sure it uses the column alias properly
SELECT avg_1997.state, "1997avg", "2017avg", 
    (( "2017avg" - "1997avg" ) / "1997avg") * 100 AS 'Percent Change'
FROM avg_1997
JOIN avg_2017 ON avg_1997.state = avg_2017.state
ORDER BY 4 DESC;

 * sqlite:///estate.db
Done.


Unnamed: 0,state,1997avg,2017avg,Percent Change
0,DC,189769.44,778756.02,310.369562
1,CA,207479.51,688609.97,231.893
2,SD,59295.0,184605.42,211.333873
3,HI,216377.12,653451.75,201.996694
4,NY,126348.63,359342.49,184.405529
5,MA,168343.61,438244.63,160.327452
6,CO,154106.44,394511.46,155.999334
7,FL,108010.5,272582.98,152.367112
8,WA,147066.59,369567.15,151.292391
9,ME,96941.37,240862.11,148.461632


8. How would you describe the trend in home values for each state from 1997 to 2017? How about from 2007 to 2017? Which states would you recommend for making real estate investments?

The queries from the last question can be used to answer this question


From 1997-2017, all states had the average values increase (inflation). Only a few states had a huge increase, especially DC which was a lot higher than all the others.

From 2007-2017, there wasn't that much of an increase in the average values. In many states, the average actually decreased or barely increased. ND had the biggest increase this time instead of DC (although DC was second). ND may also have had a big improvement from 1997 to 2017, but we can't see it since we don't have data for ND in 1997 (although we could measure from 1998 instead).

For a recommendation, DC or SD would be great choices, as they have a strong history of going up in value that is still strong in the recent data in this dataset. CA, despite having a big increase from 1997-2017, has decreased recently.

9. Join the house value data with the table of zip-code level census data. Do there seem to be any correlations between the estimated house values and characteristics of the area, such as population count or median household income?

In [115]:
%%sql
-- Q9: compare home values with census characteristics at zip-code level.
-- Step 1: average each zip code's value over all months, then split the zip codes
-- into 5 equally sized buckets by that average (bucket 1 = lowest, 5 = highest).
WITH buckets AS (
    SELECT zip_code,
        ROUND(AVG(value), 2) AS "zip_avg",
        NTILE(5) OVER (ORDER BY ROUND(AVG(value), 2) ASC) AS "avg_bucket"
    FROM homes
    GROUP BY zip_code
)
-- Step 2: for each bucket, average the home value, population and median income
-- of the zip codes that also appear in the census table (inner join).
SELECT
    avg_bucket AS "Estate Value Bucket",
    ROUND(AVG(zip_avg), 2) AS "Average Estate Value",
    ROUND(AVG(pop_total), 2) AS "Average Population",
    ROUND(AVG(median_household_income), 2) AS "Average Median Income"
FROM buckets
JOIN census ON buckets.zip_code = census.zip_code
GROUP BY avg_bucket
-- Row order is not guaranteed without ORDER BY; sort so the table always reads
-- from the lowest-value bucket to the highest
ORDER BY avg_bucket;

 * sqlite:///estate.db
Done.


Unnamed: 0,Estate Value Bucket,Average Estate Value,Average Population,Average Median Income
0,1,71716.02,10986.33,42025.04
1,2,110142.58,14943.55,50365.56
2,3,148897.96,17828.57,57763.97
3,4,212056.63,20381.72,68882.57
4,5,475980.36,23422.51,89825.01


We can see that as the average estate value in a zip code increases, both the average population and the average median income in that area increase as well (a positive correlation, although not a strictly proportional one).
This does make sense, if a place costs more to live there, people's income needs to be higher. Plus, cities tend to cost more as well and will have a higher population.