In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('./datasets/generated_gps_price_radius.csv')

In [3]:
df.head()

Unnamed: 0,index,latitude,longitude,neighborhood,0.5mi 1 dollar,1.0mi 1 dollar,0.5mi 2 dollar,1.0mi 2 dollar,0.5mi 3 dollar,1.0mi 3 dollar,0.5mi 4 dollar,1.0mi 4 dollar,median income,median rent,median home value
0,0,47.698699,-122.359579,Greenwood/Phinney Ridge,0.5,0.25,0.5,0.6875,0.0,0.0625,0.0,0.0,92464,1398,556916
1,1,47.629,-122.29701,Montlake/Portage Bay,0.0,0.269231,0.75,0.653846,0.25,0.076923,0.0,0.0,132573,1723,821250
2,2,47.603136,-122.301123,Central Area/Squire Park,0.3,0.352941,0.7,0.588235,0.0,0.058824,0.0,0.0,88722,1401,517525
3,4,47.627629,-122.31755,North Capitol Hill,0.258065,0.227941,0.709677,0.720588,0.0,0.029412,0.032258,0.022059,96220,1576,896200
4,5,47.701419,-122.290185,Wedgwood/View Ridge,0.0,0.666667,0.0,0.333333,0.0,0.0,0.0,0.0,114723,1596,628275


In [4]:
X = df.drop(['index', 'latitude', 'longitude', 'neighborhood', 'median income', 'median rent', 'median home value'], axis = 1)
y = df['median home value']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 40)

## Median Home Value Correlations

## Linear Regression

In [6]:
lr = linear_model.LinearRegression()

In [7]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
y_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

In [9]:
lr_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print('Linear Regression X Train: ', lr_rmse)

lr_rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('Linear Regression X Test:', lr_rmse_test)

Linear Regression X Train:  144096.18873723855
Linear Regression X Test: 153034.36127390555


In [10]:
def r2(y, y_pred):
    res = y - y_pred
    return 1 - res @ res / np.var(y) / len(y)

In [11]:
r2(y_test, y_test_pred)

0.18416543924588502

## Adaboost

In [12]:
ab = AdaBoostRegressor()

In [13]:
ab.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None)

In [14]:
y_pred = ab.predict(X_train)
y_test_pred = ab.predict(X_test)

In [15]:
ab_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print('Adaboost X Train:', ab_rmse)

ab_rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('Adaboost X Test:', ab_rmse_test)

Adaboost X Train: 132493.72771567552
Adaboost X Test: 137110.59463388784


In [108]:
r2(y_train, y_pred)

0.45035764332273287

In [16]:
r2(y_test, y_test_pred)

0.34511322935875444

## LASSO

In [17]:
ss = StandardScaler()

In [18]:
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.fit_transform(X_test)

In [19]:
lasso_alphas = np.logspace(3, 22, 100)

lasso_model = LassoCV(alphas=lasso_alphas, cv=5, max_iter=5000)

lasso_model = lasso_model.fit(X_train_ss, y_train)

In [20]:
lasso_model.alpha_

5857.020818056667

In [21]:
print(lasso_model.score(X_train_ss, y_train))
print(lasso_model.score(X_test_ss, y_test))

0.24730589103670128
0.2048680578154617


In [23]:
y_pred = lasso_model.predict(X_train)
y_test_preds = lasso_model.predict(X_test)

In [24]:
r2(y_test, y_test_preds)

0.03478146353352951

In [28]:
lasso = Lasso()

parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}

lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv = 5)

lasso_regressor.fit(X_train, y_train)
#https://github.com/marcopeix/ISL-Ridge-Lasso/blob/master/Lasso%20and%20Ridge%20Regression.ipynb

  positive)
  positive)
  positive)
  positive)
  positive)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [29]:
lasso_regressor.best_params_

{'alpha': 20}

In [30]:
lasso_regressor.best_score_

-22785833787.561592

In [31]:
y_pred = lasso_regressor.predict(X_train)
y_test_pred = lasso_regressor.predict(X_test)

In [107]:
r2(y_train, y_pred)

0.45035764332273287

In [32]:
r2(y_test, y_test_pred)

0.20613406943564672

## KNN

In [33]:
knn_params = {
    'n_neighbors': range(1, 51, 10),
    'metric': ['euclidean', 'manhattan']
}

In [34]:
knn_gridsearch = GridSearchCV(KNeighborsRegressor(),
                              knn_params,
                              cv=5,
                              verbose=1)
#4.06 lesson for code

In [35]:
knn_gridsearch.fit(X_train, y_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.2s finished


In [36]:
knn_gridsearch.best_score_

0.1415514599686293

In [37]:
knn_gridsearch.best_params_

{'metric': 'manhattan', 'n_neighbors': 21}

In [38]:
best_knn = knn_gridsearch.best_estimator_
best_knn.score(X_test, y_test)

0.2623877363485988

In [39]:
best_knn.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
                    metric_params=None, n_jobs=None, n_neighbors=21, p=2,
                    weights='uniform')

In [40]:
y_pred = best_knn.predict(X_train)
y_test_pred = best_knn.predict(X_test)

In [106]:
r2(y_train, y_pred)

0.45035764332273287

In [41]:
r2(y_test, y_test_pred)

0.2623877363485996

## Decision Tree

In [50]:
grid = GridSearchCV(estimator = DecisionTreeRegressor(),
                    param_grid = {'max_depth': [3, 5, 7, 10],
                                  'min_samples_split': [5, 10, 15, 20],
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv = 5,
                    verbose = 1)

In [51]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:    1.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 5, 7, 10],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7],
                         'min_samples_split': [5, 10, 15, 20]},
             pre_dispatch='2*n

In [52]:
grid.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=7, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=7,
                      min_samples_split=5, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [53]:
grid.best_score_

0.20946748308218974

In [54]:
dt = grid.best_estimator_


In [55]:
print(dt.score(X_train, y_train))
print(dt.score(X_test, y_test))

0.4420411676476901
0.2892815519765969


In [56]:
y_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

In [105]:
r2(y_train, y_pred)

0.45035764332273287

In [57]:
r2(y_test, y_test_pred)

0.28928155197659755

## Median Income Correlations 

In [58]:
X = df.drop(['index', 'latitude', 'longitude', 'neighborhood', 'median income', 'median rent', 'median home value'], axis = 1)
y = df['median income']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 100)

### Linear Regression 

In [60]:
lr = linear_model.LinearRegression()

In [61]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [62]:
y_preds = lr.predict(X_train)
y_test_preds = lr.predict(X_test)

In [63]:
lr_rmse = np.sqrt(mean_squared_error(y_train, y_preds))
print('Linear Regression Income:', lr_rmse)

lr_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_preds))
print('Linear Regression Income:', lr_test_rmse)

Linear Regression Income: 21135.635041765316
Linear Regression Income: 21567.54334168376


In [64]:
r2(y_test, y_test_preds)

0.1616993320655643

### Adaboost 

In [65]:
ab = AdaBoostRegressor()

In [66]:
ab.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None)

In [67]:
y_preds = ab.predict(X_train)
y_test_preds = ab.predict(X_test)

In [68]:
ab_rmse = np.sqrt(mean_squared_error(y_train, y_preds))
print('Adaboost Income:', ab_rmse)

ab_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_preds))
print('Adaboost Income:', ab_test_rmse)

Adaboost Income: 19897.494608811954
Adaboost Income: 21843.777835011442


In [69]:
r2(y_test, y_test_preds)

0.14008810808335992

### LASSO

In [70]:
ss = StandardScaler()

In [71]:
X_train_ss = ss.fit_transform(X_train)
x_test_ss = ss.fit_transform(X_test)

In [72]:
lasso_alphas = np.logspace(3, 24, 95)
laso_model = LassoCV(alphas=lasso_alphas, cv =5, max_iter = 5000)
lasso_model = lasso_model.fit(X_train_ss, y_train)

In [73]:
lasso_model.alpha_

1000.0

In [74]:
print(lasso_model.score(X_train_ss, y_train))
print(lasso_model.score(X_test_ss, y_test))

0.16182590667555696
-0.16300370390767394


In [75]:
lasso = Lasso()

parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}

lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv = 5)

lasso_regressor.fit(X_train, y_train)

  positive)
  positive)
  positive)
  positive)
  positive)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [76]:
lasso_regressor.best_params_

{'alpha': 1}

In [77]:
lasso_regressor.best_score_

-469586133.6125855

In [78]:
y_pred = lasso_regressor.predict(X_train)

In [79]:
y_test_preds = lasso_regressor.predict(X_test)

In [80]:
r2(y_test, y_test_preds)

0.14008810808335992

### KNN 

In [81]:
knn_params = {
    'n_neighbors': range(1, 51, 10),
    'metric': ['euclidean', 'manhattan']
}

In [82]:
knn_gridsearch = GridSearchCV(KNeighborsRegressor(),
                              knn_params,
                              cv=5,
                              verbose=1)


In [83]:
knn_gridsearch.fit(X_train, y_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.2s finished


In [84]:
knn_gridsearch.best_params_

{'metric': 'manhattan', 'n_neighbors': 11}

In [85]:
knn_gridsearch.best_score_

0.16590694548518048

In [86]:
best_knn = knn_gridsearch.best_estimator_
best_knn.score(X_test, y_test)

0.23277465219634064

In [89]:
y_pred = knn_gridsearch.predict(X_train)
y_test_preds = knn_gridsearch.predict(X_test)

In [90]:
r2(y_test, y_test_preds)

0.14008810808335992

### Decision Tree

In [91]:
grid = GridSearchCV(estimator = DecisionTreeRegressor(),
                    param_grid = {'max_depth': [3, 5, 7, 10],
                                  'min_samples_split': [5, 10, 15, 20],
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv = 5,
                    verbose = 1)

In [92]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:    1.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 5, 7, 10],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7],
                         'min_samples_split': [5, 10, 15, 20]},
             pre_dispatch='2*n

In [93]:
grid.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=7,
                      min_samples_split=20, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [94]:
grid.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=7,
                      min_samples_split=20, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [95]:
dt = grid.best_estimator_


In [96]:
print(dt.score(X_train, y_train))
print(dt.score(X_test, y_test))

0.4503576433227322
0.20832188544619165


In [97]:
y_pred = dt.predict(X_train)
y_test_preds = dt.predict(X_test)

In [103]:
r2(y_test, y_test_preds)

0.2083218854461918

In [104]:
r2(y_train, y_preds)

0.2644179727926703

In [102]:
r2_score(y_test, y_test_preds)

0.20832188544619168