In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, r2_score
import joblib
from sklearn.linear_model import LinearRegression

# Step 1: Data collection and preparation
## Load the dataset


In [5]:
df = pd.read_csv('residential_properties.csv')


In [7]:
df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [12]:
# Merging the sqft Columns to size_sqft
df['size_sqft'] = df['sqft_living'] + df['sqft_lot'] + df['sqft_above'] + df['sqft_basement'] + df['sqft_living15'] + df['sqft_lot15']


In [14]:
# Drop the original columns 
df = df.drop(['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15'], axis=1)


KeyError: "['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15'] not found in axis"

In [16]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode,lat,long,size_sqft
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,0,3,7,1955,0,98178,47.5112,-122.257,15000
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,0,3,7,1951,1991,98125,47.721,-122.319,21711
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,0,3,6,1933,0,98028,47.7379,-122.233,22322
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,0,5,7,1965,0,98136,47.5208,-122.393,15280
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,0,3,8,1987,0,98074,47.6168,-122.045,20743


In [19]:
# Now dropping unwanted columns
df = df.drop(['date', 'bathrooms', 'condition', 'grade', 'lat', 'long'], axis=1)


In [20]:
df.head()

Unnamed: 0,id,price,bedrooms,floors,waterfront,view,yr_built,yr_renovated,zipcode,size_sqft
0,7129300520,221900.0,3,1.0,0,0,1955,0,98178,15000
1,6414100192,538000.0,3,2.0,0,0,1951,1991,98125,21711
2,5631500400,180000.0,2,1.0,0,0,1933,0,98028,22322
3,2487200875,604000.0,4,1.0,0,0,1965,0,98136,15280
4,1954400510,510000.0,3,1.0,0,0,1987,0,98074,20743


## Select relevant variables

In [21]:
Y = df['price']
X = df[['size_sqft', 'yr_built', 'zipcode', 'bedrooms']]
Z = df[['yr_renovated', 'floors', 'view', 'waterfront']]


# Splitting the dataset into train and test


In [22]:
X_train, X_test, Y_train, Y_test, Z_train, Z_test = train_test_split(X, Y, Z, test_size=0.2, train_size=0.8, random_state=123)


## First Stage regression

In [25]:
model_z = LinearRegression()
model_z.fit(Z_train, X_train)

In [26]:
## Predict X_train_hat
X_train_hat = model_z.predict(Z_train)

In [27]:
# Fit second stage regression
model_y = LinearRegression()
model_y.fit(X_train_hat, Y_train)

In [28]:
# Predict Y_train_hat
Y_train_hat = model_y.predict(X_train_hat)

 # Test the model

In [29]:
Y_test_hat = model_y.predict(model_z.predict(Z_test))

# Evalute the model
###  F1 score

In [31]:
f1 = f1_score(Y_test > np.median(Y_test), Y_test_hat > np.median(Y_test))
print('F1 score:', f1)

F1 score: 0.6666666666666666


### R-squared

In [32]:
r2 = r2_score(Y_test, Y_test_hat)
print('R-squared:', r2)

R-squared: 0.24017830998861556


# Save the Model

In [33]:
joblib.dump(model_y, 'residential_model_2sls.pkl')

['residential_model_2sls.pkl']

#  Load the Model

In [34]:
model_y = joblib.load('residential_model_2sls.pkl')

In [35]:
Y_test_hat = model_y.predict(model_z.predict(Z_test))
f1 = f1_score(Y_test > np.median(Y_test), Y_test_hat > np.median(Y_test))
print('F1 score after loading:', f1)
r2 = r2_score(Y_test, Y_test_hat)
print('R-squared after loading:', r2)

F1 score after loading: 0.6666666666666666
R-squared after loading: 0.24017830998861556
