# Models

## The baseline model

In [34]:
# Imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    confusion_matrix, classification_report

from sklearn.metrics import roc_auc_score, plot_roc_curve, roc_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve,plot_precision_recall_curve

from sklearn.model_selection import learning_curve

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

from sklearn.inspection import permutation_importance
#import warnings
#warnings.simplefilter(action="ignore")

In [3]:
# load my clean dataset
df = pd.read_csv('clean.csv')

## Model 1 : Dummy

Iteration 1

R2 score : 0.0

In [8]:
X = df.drop(['median_house_value'], axis=1)
y = df['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.3, random_state=1)

# Dummy
dummy_reg = DummyRegressor(strategy="mean")

# fit the model
dummy_reg.fit(X_train, y_train)

# Iteration
print("First iteration R2 = {}".format(dummy_reg.score(X_train, y_train)))

First iteration R2 = 0.0


## Model 2 : Linear Regression

In [12]:
# Replace the missing values for 0 

df_zero = df.fillna(0)

Iteration 2

R2 score : 0.652

In [13]:
# linear regression with all features 
X = df_zero.drop(['median_house_value'], axis=1)
y = df_zero['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.3)

model = LinearRegression()

model.fit(X_train, y_train)

score = model.score(X_train, y_train)

#coef and intercept results
print('Coef:',model.coef_)
print('Intercept:',model.intercept_)
print('Score:',score)


Coef: [ 9.99026076e-02 -2.61484713e+04 -2.55114783e+04  1.01155009e+03
 -3.15159159e+00  5.10972096e+01 -4.61827539e+01  1.05737125e+02
  3.83590379e+04 -2.78476409e+04 -6.62765002e+04  1.45685948e+05
 -3.18703662e+04 -1.96914405e+04]
Intercept: -2154189.8826179905
Score: 0.652373078349147


In [14]:
# Remove the column with the missing values

df_no_bed = df.drop(columns=['total_bedrooms'])


Iteration 3

R2 score : 0.643

In [22]:
# linear regression with no bedrooms

X = df_no_bed.drop(['median_house_value'], axis=1)
y = df_no_bed['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.3)

model = LinearRegression()

model.fit(X_train, y_train)

score = model.score(X_train, y_train)

#coef and intercept results
print('Coef:',model.coef_)
print('Intercept:',model.intercept_)
print('Score:',score)

Coef: [ 1.82997027e-01 -2.53359471e+04 -2.41165985e+04  1.05474789e+03
  3.14679997e-01 -4.96443228e+01  1.55147731e+02  3.75319151e+04
 -4.22387514e+04 -8.71951289e+04  2.22320970e+05 -5.01218194e+04
 -4.27652698e+04]
Intercept: -2090270.0746615273
Score: 0.6431667741821034


In [17]:
# Replace the column with the missing values for the median

df_median = df.fillna(df.median())

Iteration 4

R2 score : 0.629

In [23]:
# linear regression with MEDIAN
X = df_median.drop(['median_house_value'], axis=1)
y = df_median['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.3)

model = LinearRegression()

model.fit(X_train, y_train)

score = model.score(X_train, y_train)

#coef and intercept results
print('Coef:',model.coef_)
print('Intercept:',model.intercept_)
print('Score:',score)

Coef: [ 2.09165852e-02 -2.62983638e+04 -2.56552506e+04  1.07739104e+03
 -4.05750499e+00  8.87084342e+01 -3.78885579e+01  4.89100911e+01
  3.76171173e+04  8.99151230e+03 -3.08151315e+04 -1.81898940e-12
  7.75135218e+03  1.40722670e+04]
Intercept: -2205761.01929846
Score: 0.6299222626299458


In [20]:
# Replace the column with the missing values for the median

df_mean = df.fillna(df.mean())

Iteration 5

R2 score : 0.659

In [26]:
# linear regression with the MEAN
X = df_mean.drop(['median_house_value'], axis=1)
y = df_mean['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.3)

model = LinearRegression()

model.fit(X_train, y_train)

score = model.score(X_train, y_train)

#coef and intercept results
print('Coef:',model.coef_)
print('Intercept:',model.intercept_)
print('Score:',score)

Coef: [-3.66765690e-01 -2.21949186e+04 -2.10540812e+04  9.54767078e+02
 -5.32263604e+00  8.56098653e+01 -4.43309301e+01  7.51950059e+01
  3.94076984e+04  1.64240953e+03 -4.38228810e+04  4.43372596e+04
 -7.56521563e+03  5.40842744e+03]
Intercept: -1867593.189341713
Score: 0.659826084721391


## Model 3 : Random Forest


Iteration 6
R2 score : 0.75

In [36]:
X = df_zero.drop(['median_house_value'], axis=1)
y = df_zero['median_house_value']

# standardizing the features (scaling)
X = RobustScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.3)

rf = RandomForestRegressor(n_estimators=10, random_state=20)

rf.fit(X_train, y_train)

pred = rf.predict(X_test)

# R2 
print("R2={}".format(rf.score(X_test,y_test)))

# MAE
errors = abs(pred - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2))

# MAPE
mape = 100 * (errors / y_test)
print('Mean Absolute Percentage Error :', round(np.mean(mape), 2), '%.')

R2=0.7495616122362954
Mean Absolute Error: 38848.45
Mean Absolute Percentage Error : 21.86 %.


Iteration 7
R2 score : 0.74

In [38]:
X = df_mean.drop(['median_house_value'], axis=1)
y = df_mean['median_house_value']

# standardizing the features (scaling)
X = RobustScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.3)

rf = RandomForestRegressor(n_estimators=10, random_state=20)

rf.fit(X_train, y_train)

pred = rf.predict(X_test)

# R2 
print("R2={}".format(rf.score(X_test,y_test)))

# MAE
errors = abs(pred - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2))

# MAPE
mape = 100 * (errors / y_test)
print('Mean Absolute Percentage Error :', round(np.mean(mape), 2), '%.')

R2=0.748775850158302
Mean Absolute Error: 38950.63
Mean Absolute Percentage Error : 21.96 %.
