# Modeling Climate and Housing Data

In [21]:
# importing relevant libraries
from datetime import datetime
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import geopandas as gpd
%matplotlib inline

# Importing preprocessing and modeling libraries
from sklearn import preprocessing
from sklearn import neighbors
from sklearn import tree

# Importing metrics for model evaluation
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV 

# Importing class balance tools for handling imbalanced datasets
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from yellowbrick.target import ClassBalance

# Importing classifier models
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [22]:
data = pd.read_csv("all_data.csv", index_col=0)
data.head()

Unnamed: 0,Date,Year,Month,RegionName,State,Bottom-Tier Average Home Value Estimate,Top-Tier Average Home Value Estimate,Value,Anomaly (1901-2000 base period),1901-2000 Mean,StateAbbrv,Initial Fees and Charges (%),Effective Rate (%),Term to Maturity,Loan Amount ($thou),Purchase Price ($thou),Loan-to-Price Ratio (%),Share of Total Market (%)
0,2000-01-31,2000,1,Los Angeles County,CA,122826.545216,400192.738911,52.3,4.5,47.8,CA,0.93,8.01,15,103.0,173.6,64.7,7.0
1,2000-01-31,2000,1,Los Angeles County,CA,122826.545216,400192.738911,52.3,4.5,47.8,CA,0.81,8.2,30,129.9,166.8,81.1,61.1
2,2000-01-31,2000,1,Cook County,IL,71689.188026,302264.545103,24.9,2.4,22.5,IL,0.93,8.01,15,103.0,173.6,64.7,7.0
3,2000-01-31,2000,1,Cook County,IL,71689.188026,302264.545103,24.9,2.4,22.5,IL,0.81,8.2,30,129.9,166.8,81.1,61.1
4,2000-01-31,2000,1,Harris County,TX,61195.175569,217850.462915,57.4,5.6,51.8,TX,0.93,8.01,15,103.0,173.6,64.7,7.0


In [23]:
data.drop(columns=['Date'], inplace=True)
data.rename(columns={'Value': 'Average Temperature Value'}, inplace=True)

In [24]:
# Loading the shapefile in order to visualize clean data on the US map
us_counties = gpd.read_file('tl_2022_us_county/tl_2022_us_county.shp')
print(us_counties.columns)
us_counties.head()

Index(['STATEFP', 'COUNTYFP', 'COUNTYNS', 'GEOID', 'NAME', 'NAMELSAD', 'LSAD',
       'CLASSFP', 'MTFCC', 'CSAFP', 'CBSAFP', 'METDIVFP', 'FUNCSTAT', 'ALAND',
       'AWATER', 'INTPTLAT', 'INTPTLON', 'geometry'],
      dtype='object')


Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,31,39,835841,31039,Cuming,Cuming County,6,H1,G4020,,,,A,1477644346,10691216,41.9158651,-96.7885168,"POLYGON ((-96.55515 41.91587, -96.55515 41.914..."
1,53,69,1513275,53069,Wahkiakum,Wahkiakum County,6,H1,G4020,,,,A,680980770,61564427,46.2946377,-123.4244583,"POLYGON ((-123.72755 46.26449, -123.72755 46.2..."
2,35,11,933054,35011,De Baca,De Baca County,6,H1,G4020,,,,A,6016818946,29090018,34.3592729,-104.3686961,"POLYGON ((-104.89337 34.08894, -104.89337 34.0..."
3,31,109,835876,31109,Lancaster,Lancaster County,6,H1,G4020,,,,A,2169272978,22847034,40.7835474,-96.6886584,"POLYGON ((-96.68493 40.52330, -96.69219 40.523..."
4,31,129,835886,31129,Nuckolls,Nuckolls County,6,H1,G4020,,,,A,1489645185,1718484,40.1764918,-98.0468422,"POLYGON ((-98.27370 40.11840, -98.27374 40.122..."


In [25]:
# Merge the geospatial United States map with the data
data = data.merge(us_counties, left_on='RegionName', right_on='NAMELSAD')

data.head()

Unnamed: 0,Year,Month,RegionName,State,Bottom-Tier Average Home Value Estimate,Top-Tier Average Home Value Estimate,Average Temperature Value,Anomaly (1901-2000 base period),1901-2000 Mean,StateAbbrv,...,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,2000,1,Los Angeles County,CA,122826.545216,400192.738911,52.3,4.5,47.8,CA,...,G4020,,,,A,10515988166,1785003207,34.1963983,-118.2618616,"MULTIPOLYGON (((-118.67820 33.03973, -118.6756..."
1,2000,1,Los Angeles County,CA,122826.545216,400192.738911,52.3,4.5,47.8,CA,...,G4020,,,,A,10515988166,1785003207,34.1963983,-118.2618616,"MULTIPOLYGON (((-118.67820 33.03973, -118.6756..."
2,2000,1,Cook County,IL,71689.188026,302264.545103,24.9,2.4,22.5,IL,...,G4020,,,,A,591684404,12308449,31.1525157,-83.4294448,"POLYGON ((-83.51530 31.30235, -83.51487 31.302..."
3,2000,1,Cook County,IL,71689.188026,302264.545103,24.9,2.4,22.5,IL,...,G4020,,,,A,2447342616,1786339408,41.8942937,-87.6454546,"POLYGON ((-87.52500 41.64464, -87.52505 41.639..."
4,2000,1,Cook County,IL,71689.188026,302264.545103,24.9,2.4,22.5,IL,...,G4020,,,,A,3762130687,4887924296,47.7585562,-90.3443192,"POLYGON ((-90.75158 48.09099, -90.74146 48.094..."


In [31]:
data['Year'].unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019], dtype=int64)

In [36]:
test = data[data['Year'] == 2019].copy()
train = data[data['Year'] != 2019].copy()
y_test = test[['Bottom-Tier Average Home Value Estimate', 'Top-Tier Average Home Value Estimate']].copy()
y_train = train[['Bottom-Tier Average Home Value Estimate', 'Top-Tier Average Home Value Estimate']].copy()
X_train = train.drop(columns=['RegionName', 'State', 'StateAbbrv', 'Bottom-Tier Average Home Value Estimate', 'Top-Tier Average Home Value Estimate', \
                       'STATEFP', 'COUNTYFP', 'COUNTYNS', 'GEOID', 'NAME', 'NAMELSAD', 'LSAD',\
                       'CLASSFP', 'MTFCC', 'CSAFP', 'CBSAFP', 'METDIVFP', 'FUNCSTAT','geometry']).copy()
X_test = test.drop(columns=['RegionName', 'State', 'StateAbbrv', 'Bottom-Tier Average Home Value Estimate', 'Top-Tier Average Home Value Estimate', \
                       'STATEFP', 'COUNTYFP', 'COUNTYNS', 'GEOID', 'NAME', 'NAMELSAD', 'LSAD',\
                       'CLASSFP', 'MTFCC', 'CSAFP', 'CBSAFP', 'METDIVFP', 'FUNCSTAT','geometry']).copy()


In [42]:
# your code here
l_model = LinearRegression()
l_model.fit(X_train, y_train)
y_pred_test = l_model.predict(X_test)
y_pred_train = l_model.predict(X_train)

r_squared_train_linear = r2_score(y_train, y_pred_train)
r_squared_test_linear = r2_score(y_test, y_pred_test)

# sanity check for myself
r_squared_train_linear_2 = l_model.score(X_train, y_train)
r_squared_test_linear_2 = l_model.score(X_test, y_test)
# print('r_squared_test_2:',round(r_squared_test_linear_2, 3))
# print('r_squared_train_2:',round(r_squared_train_linear_2, 3))

mean_squared_error_train_linear = mean_squared_error(y_train, y_pred_train)
mean_squared_error_test_linear = mean_squared_error(y_test, y_pred_test)

# Printing results for training
print('r_squared_train:', round(r_squared_train_linear, 3))
print('mean_squared_error_train:', round(mean_squared_error_train_linear, 3))

# Printing results for testing
print('r_squared_test:',round(r_squared_test_linear, 3))
print('mean_squared_error_test:',round(mean_squared_error_test_linear, 3))


r_squared_train: 0.048
mean_squared_error_train: 10670562863.298
r_squared_test: 0.011
mean_squared_error_test: 15760329558.538


In [10]:
# Getting 300 values on a logarithmic scale
scaler = StandardScaler()
X_train_standarized = scaler.fit_transform(X_train)
X_test_standarized = scaler.fit_transform(X_test)
alphas = np.logspace(-5, 5, 30)
print(alphas)
# your code here
mse_train_lasso = {}
coefficients = []
l_model = Lasso(max_iter=1000, tol=0.1) 

for a in alphas:
    print(a)
    l_model.set_params(alpha=a)
    l_model.fit(X_train_standarized, y_train)
    y_pred_test = l_model.predict(X_test_standarized)
    print("Predicted")
    mse_train_lasso[a] = l_model.score(X_test_standarized, y_pred_test)
    coefficients.append(l_model.coef_)

lasso_best_alpha = min(mse_train_lasso, key=mse_train_lasso.get)
print("Alpha of min MSE: ", lasso_best_alpha)
print("Min MSE: ", mse_train_lasso[lasso_best_alpha])


[1.00000000e-05 2.21221629e-05 4.89390092e-05 1.08263673e-04
 2.39502662e-04 5.29831691e-04 1.17210230e-03 2.59294380e-03
 5.73615251e-03 1.26896100e-02 2.80721620e-02 6.21016942e-02
 1.37382380e-01 3.03919538e-01 6.72335754e-01 1.48735211e+00
 3.29034456e+00 7.27895384e+00 1.61026203e+01 3.56224789e+01
 7.88046282e+01 1.74332882e+02 3.85662042e+02 8.53167852e+02
 1.88739182e+03 4.17531894e+03 9.23670857e+03 2.04335972e+04
 4.52035366e+04 1.00000000e+05]
1e-05


In [None]:
# your code here
# Running Lasso model with best alpha
l_model = Lasso(max_iter=100000, tol=0.1)
l_model.set_params(alpha=lasso_best_alpha)
l_model.fit(X_train_standarized, y_train)

# Extracting the values of coefficients at the best alpha and adding to df
coef['Lasso_Coefficients'] = l_model.coef_

# Displaying results
coef


In [None]:
# your code here
plt.figure(figsize=(20, 6))

# Plot coefficients vs alphas
plt.subplot(121)
ax = plt.gca()
ax.plot(alphas, coefficients)
plt.axvline(x=lasso_best_alpha, color='r', linestyle='--')
ax.set_xscale('log')
ax.set_title("Alphas vs Coefficients")
ax.set_xlabel("Alphas")
ax.set_ylabel("Coefficients")

# Plot CV MSE vs alphas
plt.subplot(122)
ax = plt.gca()
ax.plot(alphas, mse_train_lasso.values())
plt.axvline(x=lasso_best_alpha, color='r', linestyle='--')
ax.set_xscale('log')
plt.xlabel('Alphas')
plt.ylabel('MSE')
plt.title('Alphas vs MSE')
