In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV


In [3]:
df = pd.read_csv('./datasets/generated_gps_price_radius.csv')

In [4]:
df.head()

Unnamed: 0,index,latitude,longitude,neighborhood,0.5mi 1 dollar,1.0mi 1 dollar,0.5mi 2 dollar,1.0mi 2 dollar,0.5mi 3 dollar,1.0mi 3 dollar,0.5mi 4 dollar,1.0mi 4 dollar,median income,median rent,median home value
0,0,47.698699,-122.359579,Greenwood/Phinney Ridge,0.5,0.25,0.5,0.6875,0.0,0.0625,0.0,0.0,92464,1398,556916
1,1,47.629,-122.29701,Montlake/Portage Bay,0.0,0.269231,0.75,0.653846,0.25,0.076923,0.0,0.0,132573,1723,821250
2,2,47.603136,-122.301123,Central Area/Squire Park,0.3,0.352941,0.7,0.588235,0.0,0.058824,0.0,0.0,88722,1401,517525
3,4,47.627629,-122.31755,North Capitol Hill,0.258065,0.227941,0.709677,0.720588,0.0,0.029412,0.032258,0.022059,96220,1576,896200
4,5,47.701419,-122.290185,Wedgwood/View Ridge,0.0,0.666667,0.0,0.333333,0.0,0.0,0.0,0.0,114723,1596,628275


In [7]:
X = df.drop(['index', 'latitude', 'longitude', 'neighborhood', 'median income', 'median rent', 'median home value'], axis = 1)
y = df['median home value']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 40)

## Linear Regression

In [11]:
lr = linear_model.LinearRegression()

In [12]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
y_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

In [30]:
lr_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print('Linear Regression X Train: ', lr_rmse)

lr_rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('Linear Regression X Test:', lr_rmse_test)

Linear Regression X Train:  135712.73879178907
Linear Regression X Test: 140152.17748746774


## Adaboost

In [23]:
ab = AdaBoostRegressor()

In [24]:
ab.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None)

In [26]:
y_pred = ab.predict(X_train)
y_test_pred = ab.predict(X_test)

In [28]:
ab_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print('Adaboost X Train:', ab_rmse)

ab_rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('Adaboost X Test:', ab_rmse_test)

Adaboost X Train: 135712.73879178907
Adaboost X Test: 140152.17748746774


## LASSO

In [32]:
ss = StandardScaler()

In [33]:
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.fit_transform(X_test)

In [36]:
lasso_alphas = np.logspace(3, 22, 100)

lasso_model = LassoCV(alphas=lasso_alphas, cv=5, max_iter=5000)

lasso_model = lasso_model.fit(X_train_ss, y_train)

In [37]:
lasso_model.alpha_

5857.020818056667

In [38]:
print(lasso_model.score(X_train_ss, y_train))
print(lasso_model.score(X_test_ss, y_test))

0.24730589103670128
0.2048680578154617


In [43]:
lasso = Lasso()

parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}

lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv = 5)

lasso_regressor.fit(X_train, y_train)
#https://github.com/marcopeix/ISL-Ridge-Lasso/blob/master/Lasso%20and%20Ridge%20Regression.ipynb

  positive)
  positive)
  positive)
  positive)
  positive)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [44]:
lasso_regressor.best_params_

{'alpha': 20}

In [46]:
lasso_regressor.best_score_

-22785833787.561592

## KNN

In [54]:
knn_params = {
    'n_neighbors': range(1, 51, 10),
    'metric': ['euclidean', 'manhattan']
}

In [55]:
knn_gridsearch = GridSearchCV(KNeighborsRegressor(),
                              knn_params,
                              cv=5,
                              verbose=1)
#4.06 lesson for code

In [56]:
knn_gridsearch.fit(X_train, y_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.2s finished


In [57]:
knn_gridsearch.best_score_

0.1415514599686293

In [58]:
knn_gridsearch.best_params_

{'metric': 'manhattan', 'n_neighbors': 21}

In [60]:
best_knn = knn_gridsearch.best_estimator_
best_knn.score(X_test, y_test)

0.2623877363485988

## Decision Tree

In [62]:
grid = GridSearchCV(estimator = DecisionTreeRegressor(),
                    param_grid = {'max_depth': [3, 5, 7, 10],
                                  'min_samples_split': [5, 10, 15, 20],
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv = 5,
                    verbose = 1)

In [63]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:    1.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 5, 7, 10],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7],
                         'min_samples_split': [5, 10, 15, 20]},
             pre_dispatch='2*n

In [64]:
grid.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=7, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=7,
                      min_samples_split=15, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [65]:
grid.best_score_

0.2084513892020269

In [66]:
dt = grid.best_estimator_
dt.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=7, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=7,
                      min_samples_split=15, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [None]:
print('Traindt.score(X_train, y_train))
print(dt.score(X_train, y_train))