# Merging All Dataframes
## 90-803 Machine Learning Foundations with Python (Spring 2024)
### Team 17: Yoko, Tessa, Hannah

In [89]:
import pandas as pd
import seaborn as sns

## Read Data

In [90]:
# Load the home value estimates and discard the 'Unnamed: 0' column in the
# same expression (presumably a row index written out when the CSV was
# saved -- confirm against the file).
home_values = pd.read_csv("homevalues.csv").drop(columns=['Unnamed: 0'])
home_values.head(n=5)

Unnamed: 0,Date,Year,Month,RegionID,SizeRank,RegionName,RegionType,StateName,State,StateCodeFIPS,MunicipalCodeFIPS,Bottom-Tier Average Home Value Estimate,Top-Tier Average Home Value Estimate
0,2000-01-31,2000,1,3101,0,Los Angeles County,county,CA,CA,6,37,120563.286413,389752.00715
1,2000-01-31,2000,1,139,1,Cook County,county,IL,IL,17,31,69971.821493,299183.838961
2,2000-01-31,2000,1,1090,2,Harris County,county,TX,TX,48,201,61348.078875,215869.028803
3,2000-01-31,2000,1,2402,3,Maricopa County,county,AZ,AZ,4,13,80020.541957,248448.883325
4,2000-01-31,2000,1,2841,4,San Diego County,county,CA,CA,6,73,127922.587074,380822.96161


In [92]:
# Load the cleaned climate records, convert 'Date' to datetimes, and derive
# 'Year' / 'Month' from it; those two columns are merge keys further down.
climate_data = pd.read_csv("climate_cleaned.csv").assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    Year=lambda frame: frame['Date'].dt.year,
    Month=lambda frame: frame['Date'].dt.month,
)
# Preview the rows named 'Cook County': the same county name shows up under
# several states, so a county name alone does not identify a row.
climate_data.loc[climate_data['Name'].eq('Cook County')]

Unnamed: 0,ID,Name,State,Value,Anomaly (1901-2000 base period),1901-2000 Mean,Date,Year,Month
393,GA-075,Cook County,Georgia,50.1,0.3,49.8,2000-01-31,2000,1
575,IL-031,Cook County,Illinois,24.9,2.4,22.5,2000-01-31,2000,1
1294,MN-031,Cook County,Minnesota,6.8,2.1,4.7,2000-01-31,2000,1
3500,GA-075,Cook County,Georgia,55.4,2.9,52.5,2000-02-29,2000,2
3682,IL-031,Cook County,Illinois,33.9,8.1,25.8,2000-02-29,2000,2
...,...,...,...,...,...,...,...,...,...
889177,IL-031,Cook County,Illinois,41.6,2.7,38.9,2023-11-30,2023,11
889896,MN-031,Cook County,Minnesota,27.8,2.8,25.0,2023-11-30,2023,11
892102,GA-075,Cook County,Georgia,52.7,1.9,50.8,2023-12-31,2023,12
892284,IL-031,Cook County,Illinois,38.8,11.8,27.0,2023-12-31,2023,12


In [93]:
# NOTE(review): `state_names` is a project-local module; `state_abbreviations`
# is presumably a dict of full state name -> two-letter code -- confirm.
from state_names import state_abbreviations

# Translate the full state names into abbreviations and store them in a new
# 'StateAbbrv' column, which is used as a merge key against the home value data.
# With a plain dict mapping, Series.map yields NaN for any state that has no entry.
climate_data['StateAbbrv'] = climate_data['State'].map(state_abbreviations)

# Fixed seed so this spot-check shows the same ten rows on every
# Restart & Run All instead of a different sample each time.
climate_data.sample(10, random_state=42)

Unnamed: 0,ID,Name,State,Value,Anomaly (1901-2000 base period),1901-2000 Mean,Date,Year,Month,StateAbbrv
46534,WI-051,Iron County,Wisconsin,23.8,0.3,23.5,2001-03-31,2001,3,WI
70247,NC-077,Granville County,North Carolina,53.1,5.1,48.0,2001-11-30,2001,11,NC
368483,NC-005,Alleghany County,North Carolina,46.3,4.3,42.0,2009-11-30,2009,11,NC
291576,TX-275,Knox County,Texas,68.3,3.6,64.7,2007-10-31,2007,10,TX
276783,CO-089,Otero County,Colorado,69.7,-0.9,70.6,2007-06-30,2007,6,CO
511555,ND-105,Williams County,North Dakota,61.6,6.6,55.0,2013-09-30,2013,9,ND
323581,GA-195,Madison County,Georgia,71.6,-0.6,72.2,2008-09-30,2008,9,GA
262387,MS-067,Jones County,Mississippi,48.4,0.6,47.8,2007-01-31,2007,1,MS
532478,MD-510,Baltimore City,Maryland,53.8,0.9,52.9,2014-04-30,2014,4,MD
261815,IA-147,Palo Alto County,Iowa,18.4,4.2,14.2,2007-01-31,2007,1,IA


In [94]:
# Load the monthly mortgage rate table and discard the 'Unnamed: 0' column
# in the same expression (presumably a saved row index -- confirm).
mortgage_data = pd.read_csv("mortgage_rates.csv").drop(columns=['Unnamed: 0'])
mortgage_data.head(n=5)

Unnamed: 0,Year,Month,Initial Fees and Charges (%),Effective Rate (%),Term to Maturity,Loan Amount ($thou),Purchase Price ($thou),Loan-to-Price Ratio (%),Share of Total Market (%)
0,1990,1,1.86,10.07,15,78.1,122.1,66.5,11.8
1,1990,2,1.94,10.14,15,83.3,128.7,66.5,10.8
2,1990,3,2.03,10.3,15,72.3,111.1,69.0,12.4
3,1990,4,2.0,10.44,15,74.9,115.8,68.4,10.7
4,1990,5,1.96,10.43,15,83.4,129.3,68.0,11.7


## Merge Dataframes

In [95]:
# Join each county-month of home values to the climate row with the same
# county name, state abbreviation, year and month. This is an inner join
# (the pandas default), so rows without a match on both sides are omitted.
# Identifier columns that are not needed downstream, and the duplicate county
# name from the climate side, are removed afterwards.
homevalues_v_climate = (
    home_values
    .merge(
        climate_data[['Value', 'Anomaly (1901-2000 base period)', '1901-2000 Mean', 'Name', 'Year', 'Month', 'StateAbbrv']],
        how='inner',
        left_on=['RegionName', 'State', 'Year', 'Month'],
        right_on=['Name', 'StateAbbrv', 'Year', 'Month'],
    )
    .drop(columns=['SizeRank', 'RegionID', 'RegionType', 'StateName', 'StateCodeFIPS', 'MunicipalCodeFIPS', 'Name'])
)
homevalues_v_climate.head(n=10)

Unnamed: 0,Date,Year,Month,RegionName,State,Bottom-Tier Average Home Value Estimate,Top-Tier Average Home Value Estimate,Value,Anomaly (1901-2000 base period),1901-2000 Mean,StateAbbrv
0,2000-01-31,2000,1,Los Angeles County,CA,120563.286413,389752.00715,52.3,4.5,47.8,CA
1,2000-01-31,2000,1,Cook County,IL,69971.821493,299183.838961,24.9,2.4,22.5,IL
2,2000-01-31,2000,1,Harris County,TX,61348.078875,215869.028803,57.4,5.6,51.8,TX
3,2000-01-31,2000,1,Maricopa County,AZ,80020.541957,248448.883325,55.3,4.5,50.8,AZ
4,2000-01-31,2000,1,San Diego County,CA,127922.587074,380822.96161,54.3,4.8,49.5,CA
5,2000-01-31,2000,1,Orange County,CA,156616.684239,432407.265612,57.9,3.9,54.0,CA
6,2000-01-31,2000,1,Kings County,NY,110107.2664,405157.195494,31.3,-0.1,31.4,NY
7,2000-01-31,2000,1,Miami-Dade County,FL,61121.785306,245604.646403,67.2,1.3,65.9,FL
8,2000-01-31,2000,1,Dallas County,TX,49695.161837,204073.628628,49.4,5.0,44.4,TX
9,2000-01-31,2000,1,Riverside County,CA,89602.13419,241029.07319,55.3,4.9,50.4,CA


In [96]:
# Attach the mortgage figures for the matching year and month to every
# county-month row (inner join, the pandas default).
# NOTE(review): in the displayed preview each county-month appears twice,
# once with 'Term to Maturity' 15 and once with 30, so the merge repeats the
# home value / climate columns once per term -- confirm this is intended
# before modelling on the result.
homevalues_v_climate2 = homevalues_v_climate.merge(
    mortgage_data,
    how='inner',
    on=['Year', 'Month'],
)
homevalues_v_climate2.head(n=10)

Unnamed: 0,Date,Year,Month,RegionName,State,Bottom-Tier Average Home Value Estimate,Top-Tier Average Home Value Estimate,Value,Anomaly (1901-2000 base period),1901-2000 Mean,StateAbbrv,Initial Fees and Charges (%),Effective Rate (%),Term to Maturity,Loan Amount ($thou),Purchase Price ($thou),Loan-to-Price Ratio (%),Share of Total Market (%)
0,2000-01-31,2000,1,Los Angeles County,CA,120563.286413,389752.00715,52.3,4.5,47.8,CA,0.93,8.01,15,103.0,173.6,64.7,7.0
1,2000-01-31,2000,1,Los Angeles County,CA,120563.286413,389752.00715,52.3,4.5,47.8,CA,0.81,8.2,30,129.9,166.8,81.1,61.1
2,2000-01-31,2000,1,Cook County,IL,69971.821493,299183.838961,24.9,2.4,22.5,IL,0.93,8.01,15,103.0,173.6,64.7,7.0
3,2000-01-31,2000,1,Cook County,IL,69971.821493,299183.838961,24.9,2.4,22.5,IL,0.81,8.2,30,129.9,166.8,81.1,61.1
4,2000-01-31,2000,1,Harris County,TX,61348.078875,215869.028803,57.4,5.6,51.8,TX,0.93,8.01,15,103.0,173.6,64.7,7.0
5,2000-01-31,2000,1,Harris County,TX,61348.078875,215869.028803,57.4,5.6,51.8,TX,0.81,8.2,30,129.9,166.8,81.1,61.1
6,2000-01-31,2000,1,Maricopa County,AZ,80020.541957,248448.883325,55.3,4.5,50.8,AZ,0.93,8.01,15,103.0,173.6,64.7,7.0
7,2000-01-31,2000,1,Maricopa County,AZ,80020.541957,248448.883325,55.3,4.5,50.8,AZ,0.81,8.2,30,129.9,166.8,81.1,61.1
8,2000-01-31,2000,1,San Diego County,CA,127922.587074,380822.96161,54.3,4.8,49.5,CA,0.93,8.01,15,103.0,173.6,64.7,7.0
9,2000-01-31,2000,1,San Diego County,CA,127922.587074,380822.96161,54.3,4.8,49.5,CA,0.81,8.2,30,129.9,166.8,81.1,61.1


### References

1. https://seaborn.pydata.org/generated/seaborn.lineplot.html
2. https://www.fhfa.gov/DataTools/Downloads/Pages/Monthly-Interest-Rate-Data.aspx