In [1]:
# Import all the tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [65]:
df = pd.read_csv("miami-housing.csv")

In [66]:
df

Unnamed: 0,LATITUDE,LONGITUDE,PARCELNO,SALE_PRC,LND_SQFOOT,TOT_LVG_AREA,SPEC_FEAT_VAL,RAIL_DIST,OCEAN_DIST,WATER_DIST,CNTR_DIST,SUBCNTR_DI,HWY_DIST,age,avno60plus,month_sold,structure_quality
0,25.891031,-80.160561,622280070620,440000.0,9375,1753,0,2815.9,12811.4,347.6,42815.3,37742.2,15954.9,67,0,8,4
1,25.891324,-80.153968,622280100460,349000.0,9375,1715,0,4359.1,10648.4,337.8,43504.9,37340.5,18125.0,63,0,9,4
2,25.891334,-80.153740,622280100470,800000.0,9375,2276,49206,4412.9,10574.1,297.1,43530.4,37328.7,18200.5,61,0,2,4
3,25.891765,-80.152657,622280100530,988000.0,12450,2058,10033,4585.0,10156.5,0.0,43797.5,37423.2,18514.4,63,0,9,4
4,25.891825,-80.154639,622280100200,755000.0,12800,1684,16681,4063.4,10836.8,326.6,43599.7,37550.8,17903.4,42,0,7,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13927,25.783130,-80.259795,131320040990,275000.0,6780,967,6580,3844.5,20568.0,3252.4,22175.9,12150.1,917.4,16,0,4,4
13928,25.783585,-80.260354,131320040910,340000.0,7500,1854,2544,3593.6,20791.9,3077.7,22375.1,12316.8,738.2,26,0,5,4
13929,25.783793,-80.256126,131320040420,287500.0,8460,1271,2064,4143.2,20307.9,3588.4,20966.9,12433.0,743.7,16,0,7,4
13930,25.784007,-80.257542,131320040330,315000.0,7500,1613,3136,3986.9,20542.6,3589.1,21475.6,12458.0,626.1,16,0,8,4


In [67]:
df.shape

(13932, 17)

In [68]:
df.dtypes

LATITUDE             float64
LONGITUDE            float64
PARCELNO               int64
SALE_PRC             float64
LND_SQFOOT             int64
TOT_LVG_AREA           int64
SPEC_FEAT_VAL          int64
RAIL_DIST            float64
OCEAN_DIST           float64
WATER_DIST           float64
CNTR_DIST            float64
SUBCNTR_DI           float64
HWY_DIST             float64
age                    int64
avno60plus             int64
month_sold             int64
structure_quality      int64
dtype: object

In [69]:
df.isna().sum()

LATITUDE             0
LONGITUDE            0
PARCELNO             0
SALE_PRC             0
LND_SQFOOT           0
TOT_LVG_AREA         0
SPEC_FEAT_VAL        0
RAIL_DIST            0
OCEAN_DIST           0
WATER_DIST           0
CNTR_DIST            0
SUBCNTR_DI           0
HWY_DIST             0
age                  0
avno60plus           0
month_sold           0
structure_quality    0
dtype: int64

In [70]:
x_train = df.drop('SALE_PRC', axis=1)
y_train = df['SALE_PRC']
x = df.drop('SALE_PRC', axis=1)
y = df['SALE_PRC']

In [71]:
x_train

Unnamed: 0,LATITUDE,LONGITUDE,PARCELNO,LND_SQFOOT,TOT_LVG_AREA,SPEC_FEAT_VAL,RAIL_DIST,OCEAN_DIST,WATER_DIST,CNTR_DIST,SUBCNTR_DI,HWY_DIST,age,avno60plus,month_sold,structure_quality
0,25.891031,-80.160561,622280070620,9375,1753,0,2815.9,12811.4,347.6,42815.3,37742.2,15954.9,67,0,8,4
1,25.891324,-80.153968,622280100460,9375,1715,0,4359.1,10648.4,337.8,43504.9,37340.5,18125.0,63,0,9,4
2,25.891334,-80.153740,622280100470,9375,2276,49206,4412.9,10574.1,297.1,43530.4,37328.7,18200.5,61,0,2,4
3,25.891765,-80.152657,622280100530,12450,2058,10033,4585.0,10156.5,0.0,43797.5,37423.2,18514.4,63,0,9,4
4,25.891825,-80.154639,622280100200,12800,1684,16681,4063.4,10836.8,326.6,43599.7,37550.8,17903.4,42,0,7,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13927,25.783130,-80.259795,131320040990,6780,967,6580,3844.5,20568.0,3252.4,22175.9,12150.1,917.4,16,0,4,4
13928,25.783585,-80.260354,131320040910,7500,1854,2544,3593.6,20791.9,3077.7,22375.1,12316.8,738.2,26,0,5,4
13929,25.783793,-80.256126,131320040420,8460,1271,2064,4143.2,20307.9,3588.4,20966.9,12433.0,743.7,16,0,7,4
13930,25.784007,-80.257542,131320040330,7500,1613,3136,3986.9,20542.6,3589.1,21475.6,12458.0,626.1,16,0,8,4


In [72]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression()

In [73]:
print('R^2: {0}'.format(model.score(x_train, y_train)))

R^2: 0.7138503670137699


In [11]:
# intercept - b0

coef = pd.DataFrame(x_train.columns.tolist(), columns=[ 'category'])
coef["coef"] = model.coef_
coef.loc[len(coef.index)] = ['INTERCEPT',model.intercept_]
coef

Unnamed: 0,category,coef
0,LATITUDE,-441346.3
1,LONGITUDE,1415700.0
2,PARCELNO,-4.880139e-08
3,LND_SQFOOT,3.475073
4,TOT_LVG_AREA,190.7606
5,SPEC_FEAT_VAL,2.973597
6,RAIL_DIST,4.929402
7,OCEAN_DIST,1.148487
8,WATER_DIST,-1.028187
9,CNTR_DIST,1.044485


In [14]:
import statsmodels.api as sm
from scipy import stats
from sklearn import datasets

diabetes = datasets.load_diabetes()

x = x_train
y = y_train

#x = diabetes.data
#y = diabetes.target

X2 = sm.add_constant(x)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:               SALE_PRC   R-squared:                       0.711
Model:                            OLS   Adj. R-squared:                  0.710
Method:                 Least Squares   F-statistic:                     2280.
Date:                Sat, 06 Nov 2021   Prob (F-statistic):               0.00
Time:                        01:16:00   Log-Likelihood:            -1.8761e+05
No. Observations:               13932   AIC:                         3.752e+05
Df Residuals:                   13916   BIC:                         3.754e+05
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                77.8206     20.60

In [74]:
%%time 

# Let's bould the ml model
#np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_jobs=-1, #all cores of cpu
                             random_state=42) 
    
model.fit(x_train, y_train)

Wall time: 1.52 s


RandomForestRegressor(n_jobs=-1, random_state=42)

In [87]:
from sklearn.model_selection import train_test_split
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.001)

In [88]:
x_test

Unnamed: 0,LATITUDE,LONGITUDE,PARCELNO,LND_SQFOOT,TOT_LVG_AREA,SPEC_FEAT_VAL,RAIL_DIST,OCEAN_DIST,WATER_DIST,CNTR_DIST,SUBCNTR_DI,HWY_DIST,age,avno60plus,month_sold,structure_quality
6442,25.751585,-80.354429,3040080071300,8250,1920,750,11227.5,37268.4,7283.3,53965.6,26569.4,9971.9,55,0,12,4
8524,25.5392,-80.379349,3060190090670,4385,2857,1360,11073.6,16216.4,8882.5,105924.2,58379.5,2945.1,1,0,4,2
13210,25.644376,-80.386857,3059130241010,4983,2528,4136,902.6,28661.9,21366.4,79945.9,28959.4,444.6,14,0,10,4
2501,25.878742,-80.123984,1422350010781,5600,1767,3311,14846.0,1147.1,933.8,43390.1,32063.1,24226.2,31,0,11,4
168,25.908868,-80.184472,622190140850,7740,1700,0,7225.9,20624.6,4045.2,48060.4,46434.9,8336.5,41,0,6,4
12143,25.715862,-80.428979,3049220310490,5000,1673,26840,18498.1,54379.6,35036.2,80807.3,39273.1,14849.8,27,0,10,4
10761,25.453936,-80.488039,1678240190384,6650,1563,0,3428.4,49028.8,23561.9,152355.2,102781.3,4363.6,20,0,1,2
5156,25.632611,-80.28924,350240090040,7967,6014,26100,15231.7,3207.9,205.3,61162.9,21754.9,18940.1,6,0,1,5
1351,25.832839,-80.24393,3031160030050,8175,1884,33388,848.3,33565.2,10471.9,26414.5,26414.5,7411.6,46,0,7,2
10148,25.466139,-80.427346,1079150170810,4668,2944,3007,16911.7,28638.8,17257.7,136757.5,89047.7,5587.3,1,0,12,2


In [93]:
model.score(x_test,y_test)

0.986789635852924

In [90]:
y_preds = model.predict(x_test)

In [91]:
y_test

6442      380000.0
8524      312000.0
13210     330000.0
2501      805000.0
168       210000.0
12143     305000.0
10761     127000.0
5156     2075000.0
1351      215000.0
10148     324000.0
6497      655000.0
708       165000.0
11822     555000.0
7591      316000.0
Name: SALE_PRC, dtype: float64

In [92]:
y_preds

array([ 359098.,  311913.,  342721.,  763965.,  217971.,  309494.,
        136438., 1927595.,  202302.,  323519.,  525967.,  163335.,
        520296.,  313100.])

In [85]:
# Create evaluation function (RMSLE)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

def rmsle(y_test, y_preds):
    """
    Calculates root mean squared log error between predictions and true labels
    """
    return np.sqrt(mean_squared_log_error(y_test, y_preds))


# Create function to evaluate model on a few different levels
def show_scores(model):
    train_preds = model.predict(x_train)
    val_preds = model.predict(x_test)
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
              "Valid MAE": mean_absolute_error(y_test, val_preds),
              "Training RMSLE": rmsle(y_train,train_preds),
              "Valid RMSLE": rmsle(y_test,val_preds),
              "Training R^2": r2_score(y_train,train_preds),
              "Valid R^2": r2_score(y_test,val_preds)}
    return scores

In [86]:
show_scores(model)

{'Training MAE': 16431.851980241343,
 'Valid MAE': 10494.5,
 'Training RMSLE': 0.06045218543553618,
 'Valid RMSLE': 0.04000559510469145,
 'Training R^2': 0.9879717330354216,
 'Valid R^2': 0.8110289044117647}

In [37]:
x_test

Unnamed: 0,LATITUDE,LONGITUDE,PARCELNO,LND_SQFOOT,TOT_LVG_AREA,SPEC_FEAT_VAL,RAIL_DIST,OCEAN_DIST,WATER_DIST,CNTR_DIST,SUBCNTR_DI,HWY_DIST,age,avno60plus,month_sold,structure_quality
6442,25.751585,-80.354429,3040080071300,8250,1920,750,11227.5,37268.4,7283.3,53965.6,26569.4,9971.9,55,0,12,4
8524,25.539200,-80.379349,3060190090670,4385,2857,1360,11073.6,16216.4,8882.5,105924.2,58379.5,2945.1,1,0,4,2
13210,25.644376,-80.386857,3059130241010,4983,2528,4136,902.6,28661.9,21366.4,79945.9,28959.4,444.6,14,0,10,4
2501,25.878742,-80.123984,1422350010781,5600,1767,3311,14846.0,1147.1,933.8,43390.1,32063.1,24226.2,31,0,11,4
168,25.908868,-80.184472,622190140850,7740,1700,0,7225.9,20624.6,4045.2,48060.4,46434.9,8336.5,41,0,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4832,25.649643,-80.302194,2050140110790,35196,3315,24794,8979.4,9623.4,5313.1,58463.9,14475.7,11385.0,22,0,11,4
6914,25.852073,-80.362957,3530080050460,4000,2514,5514,5566.3,61110.2,8845.3,62311.2,50605.5,7977.7,1,0,5,5
13607,25.603632,-80.441932,3059330353410,5366,2412,3715,3768.5,43801.0,25196.8,103357.8,52321.8,21073.5,0,0,10,4
12622,25.696917,-80.457060,3049320370100,6634,3757,2640,8661.6,57858.7,46475.3,91676.5,47355.8,22480.1,0,0,2,4
