In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

In [2]:
df2 = pd.read_csv('../datasets/Richard_392_dem.csv')

In [3]:
df2.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 1,1,2,3,4,GEN_ALIAS,1p,2p,3p,4p,MEDIAN_GROSS_RENT,HU_VALUE_MEDIAN_DOLLARS,MEDIAN_HH_INC_PAST_12MO_DOLLAR
0,0,0,-122.318147,47.535777,30,24,0,0,Georgetown,0.555556,0.444444,0.0,0.0,988,341900,58611
1,1,1,-122.378989,47.646236,18,18,0,0,Interbay,0.5,0.5,0.0,0.0,1490,571300,74679
2,2,2,-122.321063,47.627981,38,110,4,3,North Capitol Hill,0.245161,0.709677,0.025806,0.019355,1576,896200,96220
3,3,3,-122.285281,47.691914,6,9,0,0,Wedgwood/View Ridge,0.4,0.6,0.0,0.0,1596,628275,114723
4,4,4,-122.316971,47.633354,23,60,1,2,North Capitol Hill,0.267442,0.697674,0.011628,0.023256,1576,896200,96220


In [5]:
X = df2.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 1', '4', 'GEN_ALIAS', '4p', 'MEDIAN_GROSS_RENT', 'HU_VALUE_MEDIAN_DOLLARS', 'MEDIAN_HH_INC_PAST_12MO_DOLLAR'], axis = 1)
y = df2['HU_VALUE_MEDIAN_DOLLARS']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 40)

## Median Home Value Correlations

## Linear Regression

In [15]:
lr = linear_model.LinearRegression()

In [16]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
y_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

In [18]:
lr_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print('Linear Regression X Train: ', lr_rmse)

lr_rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('Linear Regression X Test:', lr_rmse_test)

Linear Regression X Train:  130627.626745373
Linear Regression X Test: 138310.9098050184


In [19]:
def r2(y, y_pred):
    res = y - y_pred
    return 1 - res @ res / np.var(y) / len(y)

In [102]:
r2(y_train, y_pred)

0.6152386176314169

In [20]:
r2(y_test, y_test_pred)

0.43608837833163516

## Adaboost

In [21]:
ab = AdaBoostRegressor()

In [22]:
ab.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None)

In [23]:
y_pred = ab.predict(X_train)
y_test_pred = ab.predict(X_test)

In [24]:
ab_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print('Adaboost X Train:', ab_rmse)

ab_rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('Adaboost X Test:', ab_rmse_test)

Adaboost X Train: 95421.9888428901
Adaboost X Test: 130403.14544158432


In [25]:
r2(y_train, y_pred)

0.6836058012316124

In [26]:
r2(y_test, y_test_pred)

0.4987270113734644

## LASSO

In [27]:
ss = StandardScaler()

In [28]:
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.fit_transform(X_test)

In [29]:
lasso_alphas = np.logspace(3, 22, 100)

lasso_model = LassoCV(alphas=lasso_alphas, cv=5, max_iter=5000)

lasso_model = lasso_model.fit(X_train_ss, y_train)

In [30]:
lasso_model.alpha_

2420.128264794381

In [31]:
print(lasso_model.score(X_train_ss, y_train))
print(lasso_model.score(X_test_ss, y_test))

0.40409871413255194
0.4356988443257326


In [32]:
y_pred = lasso_model.predict(X_train)
y_test_preds = lasso_model.predict(X_test)

In [33]:
r2(y_test, y_test_preds)

-8.488128658913967

In [34]:
lasso = Lasso()

parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}

lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv = 5)

lasso_regressor.fit(X_train, y_train)
#https://github.com/marcopeix/ISL-Ridge-Lasso/blob/master/Lasso%20and%20Ridge%20Regression.ipynb

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [35]:
lasso_regressor.best_params_

{'alpha': 1}

In [36]:
lasso_regressor.best_score_

-18507008816.58909

In [37]:
y_pred = lasso_regressor.predict(X_train)
y_test_pred = lasso_regressor.predict(X_test)

In [38]:
r2(y_train, y_pred)

0.406991231222138

In [39]:
r2(y_test, y_test_pred)

0.4363477208403159

## KNN

In [40]:
knn_params = {
    'n_neighbors': range(1, 51, 10),
    'metric': ['euclidean', 'manhattan']
}

In [41]:
knn_gridsearch = GridSearchCV(KNeighborsRegressor(),
                              knn_params,
                              cv=5,
                              verbose=1)
#4.06 lesson for code

In [42]:
knn_gridsearch.fit(X_train, y_train);

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.2s finished


In [43]:
knn_gridsearch.best_score_

0.325080260877538

In [44]:
knn_gridsearch.best_params_

{'metric': 'manhattan', 'n_neighbors': 11}

In [45]:
best_knn = knn_gridsearch.best_estimator_
best_knn.score(X_test, y_test)

0.4735234822261768

In [46]:
best_knn.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
                    metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                    weights='uniform')

In [47]:
y_pred = best_knn.predict(X_train)
y_test_pred = best_knn.predict(X_test)

In [48]:
r2(y_train, y_pred)

0.45786644964309176

In [49]:
r2(y_test, y_test_pred)

0.473523482226177

## Decision Tree

In [50]:
grid = GridSearchCV(estimator = DecisionTreeRegressor(),
                    param_grid = {'max_depth': [3, 5, 7, 10],
                                  'min_samples_split': [5, 10, 15, 20],
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv = 5,
                    verbose = 1)

In [51]:
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:    1.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 5, 7, 10],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7],
                         'min_samples_split': [5, 10, 15, 20]},
             pre_dispatch='2*n

In [52]:
grid.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [53]:
grid.best_score_

0.40660596874106736

In [54]:
dt = grid.best_estimator_


In [55]:
print(dt.score(X_train, y_train))
print(dt.score(X_test, y_test))

0.5879373540821817
0.29813402556380475


In [56]:
y_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

In [57]:
r2(y_train, y_pred)

0.5879373540821822

In [58]:
r2(y_test, y_test_pred)

0.29813402556380497

## Median Income Correlations 

In [7]:
X = df2.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 1', '4', 'GEN_ALIAS', '4p', 'MEDIAN_GROSS_RENT', 'HU_VALUE_MEDIAN_DOLLARS', 'MEDIAN_HH_INC_PAST_12MO_DOLLAR'], axis = 1)
y = df2['HU_VALUE_MEDIAN_DOLLARS']

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 100)

### Linear Regression 

In [61]:
lr = linear_model.LinearRegression()

In [62]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [63]:
y_preds = lr.predict(X_train)
y_test_preds = lr.predict(X_test)

In [64]:
lr_rmse = np.sqrt(mean_squared_error(y_train, y_preds))
print('Linear Regression Income:', lr_rmse)

lr_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_preds))
print('Linear Regression Income:', lr_test_rmse)

Linear Regression Income: 125646.03285355256
Linear Regression Income: 158767.7670790529


In [65]:
r2(y_test, y_test_preds)

0.15540598741701983

### Adaboost 

In [66]:
ab = AdaBoostRegressor()

In [67]:
ab.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None)

In [68]:
y_preds = ab.predict(X_train)
y_test_preds = ab.predict(X_test)

In [69]:
ab_rmse = np.sqrt(mean_squared_error(y_train, y_preds))
print('Adaboost Income:', ab_rmse)

ab_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_preds))
print('Adaboost Income:', ab_test_rmse)

Adaboost Income: 94469.52116557774
Adaboost Income: 121286.82448141111


In [100]:
r2(y_train, y_preds)

0.7080073920914206

In [70]:
r2(y_test, y_test_preds)

0.507109354654379

### LASSO

In [71]:
ss = StandardScaler()

In [72]:
X_train_ss = ss.fit_transform(X_train)
x_test_ss = ss.fit_transform(X_test)

In [73]:
lasso_alphas = np.logspace(3, 24, 95)
laso_model = LassoCV(alphas=lasso_alphas, cv =5, max_iter = 5000)
lasso_model = lasso_model.fit(X_train_ss, y_train)

In [74]:
lasso_model.alpha_

1555.6761439304723

In [75]:
print(lasso_model.score(X_train_ss, y_train))
print(lasso_model.score(X_test_ss, y_test))

0.48264942430408075
-0.6973795878115349


In [76]:
lasso = Lasso()

parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}

lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv = 5)

lasso_regressor.fit(X_train, y_train)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [77]:
lasso_regressor.best_params_

{'alpha': 20}

In [78]:
lasso_regressor.best_score_

-17029304025.764471

In [79]:
y_pred = lasso_regressor.predict(X_train)

In [80]:
y_test_preds = lasso_regressor.predict(X_test)

In [81]:
r2(y_test, y_test_preds)

0.15904311135921656

### KNN 

In [82]:
knn_params = {
    'n_neighbors': range(1, 51, 10),
    'metric': ['euclidean', 'manhattan']
}

In [83]:
knn_gridsearch = GridSearchCV(KNeighborsRegressor(),
                              knn_params,
                              cv=5,
                              verbose=1)


In [84]:
knn_gridsearch.fit(X_train, y_train);

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.2s finished


In [85]:
knn_gridsearch.best_params_

{'metric': 'manhattan', 'n_neighbors': 11}

In [86]:
knn_gridsearch.best_score_

0.43218381796698024

In [87]:
best_knn = knn_gridsearch.best_estimator_
best_knn.score(X_test, y_test)

0.37213211381511513

In [88]:
y_pred = knn_gridsearch.predict(X_train)
y_test_preds = knn_gridsearch.predict(X_test)

In [89]:
r2(y_test, y_test_preds)

0.3721321138151146

### Decision Tree

In [90]:
grid = GridSearchCV(estimator = DecisionTreeRegressor(),
                    param_grid = {'max_depth': [3, 5, 7, 10],
                                  'min_samples_split': [5, 10, 15, 20],
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv = 5,
                    verbose = 1)

In [91]:
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:    1.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 5, 7, 10],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7],
                         'min_samples_split': [5, 10, 15, 20]},
             pre_dispatch='2*n

In [92]:
grid.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=5, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [93]:
grid.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=5, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [94]:
dt = grid.best_estimator_


In [95]:
print(dt.score(X_train, y_train))
print(dt.score(X_test, y_test))

0.6152386176314171
0.45500200703870997


In [96]:
y_pred = dt.predict(X_train)
y_test_preds = dt.predict(X_test)

In [97]:
r2(y_test, y_test_preds)

0.4550020070387095

In [98]:
r2(y_train, y_preds)

0.7080073920914206

In [99]:
r2_score(y_test, y_test_preds)

0.45500200703870997