In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge,RidgeCV, LassoCV, BayesianRidge,ElasticNet,ElasticNetCV

from sklearn import preprocessing  # imputing and scale

from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection,stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score

import itertools 

from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error   #RMSE

# Feature columns fed to every model in this notebook.
# NOTE(review): 'Price_Sqft' together with 'House_size' may effectively encode
# the listing price (target leakage) — confirm how Price_Sqft was derived.
predictors = [
    'Bed', 'Bath', 'House_size', 'Year_Built', 'Parking',
    'Lot', 'HOA', 'Price_Sqft', 'walk_score', 'transit_score',
]
# Column the models are trained to predict.
outcome = 'List_price'

In [None]:
# Feature matrix (predictor columns) and target vector taken from zillow_reg.
X, y = zillow_reg[predictors], zillow_reg[outcome]

In [None]:
# Two-stage split with a fixed seed: half of the rows become the training set,
# then the remaining half is divided 70/30 into validation and test sets.
train_X, temp_X, train_y, temp_y = train_test_split(
    X, y, test_size=0.5, random_state=1)
valid_X, test_X, valid_y, test_y = train_test_split(
    temp_X, temp_y, test_size=0.3, random_state=1)

# Report the (rows, columns) shape of each feature split.
for split_label, split_frame in (('Training : ', train_X),
                                 ('Validation : ', valid_X),
                                 ('Test : ', test_X)):
    print(split_label, split_frame.shape)

In [None]:
# Display the training predictors (the feature rows the models are fitted on).
# No model is trained in this cell.
train_X

In [None]:
# Display the training target: the actual outcome values for the training rows.
train_y

# Linear regression

In [None]:
# Ordinary least-squares regression fitted on the training split.
lm = LinearRegression()
lm.fit(X=train_X, y=train_y)

In [None]:
# Tabulate the fitted coefficient next to each predictor column name.
coef_table = pd.DataFrame({'Predictor': X.columns, 'coefficient': lm.coef_})
print(coef_table)

In [None]:
# Regression summary (RMSE, MAE, MPE, MAPE) of the linear model evaluated on the
# same training rows it was fitted on, i.e. the in-sample error.
regressionSummary(train_y, lm.predict(train_X))

In [None]:
# Predictions of the fitted linear model for the validation rows.
lm_pred = lm.predict(valid_X)

In [None]:
# Side-by-side view of predicted value, actual value and residual
# (actual minus predicted) for the validation rows; only the first 20 are printed.
result = pd.DataFrame({
    'Predicted': lm_pred,
    'Actual': valid_y,
    'Residual': valid_y - lm_pred,
})
print(result.head(20))

In [None]:
# Same regression summary, now on the validation split (out-of-sample error).
regressionSummary(valid_y, lm_pred)

In [None]:
# Residuals (actual minus predicted) of the linear model on the validation split.
lm_pred = lm.predict(valid_X)
all_residuals = valid_y - lm_pred

# Fraction of validation rows whose residual lies strictly inside (-1406, 1406).
# NOTE(review): the earlier comment expected roughly 75% here; confirm that this
# bound is meaningful for the scale of the List_price target.
within_bound = all_residuals.abs() < 1406
print(int(within_bound.sum()) / len(all_residuals))

# Distribution of the validation residuals.
pd.DataFrame(all_residuals).hist(bins=25)
plt.show()

In [None]:
def findsubsets(s, n):
    """Return every ``n``-element combination of the items of ``s`` as a list of sets.

    Combinations are produced by ``itertools.combinations`` and kept in the
    order it yields them; the result is empty when ``n`` exceeds the number
    of items in ``s``.
    """
    return [set(combo) for combo in itertools.combinations(s, n)]

In [None]:
# NOTE(review): if train_X holds only the 10 columns listed in `predictors`,
# asking for 49-element subsets yields an empty list — confirm the intended size.
findsubsets(train_X.columns,49)

In [None]:
# Fresh ordinary least-squares model fitted on all training predictors.
lm = LinearRegression()
lm.fit(X=train_X, y=train_y)

In [None]:
# Fit a plain linear regression on the training split and compare its error on
# the training and validation splits.
model = LinearRegression()
reg = model.fit(train_X, train_y)  # features -> train_X, outcome -> train_y

pred_train = reg.predict(train_X)  # predictions for the rows the model was fitted on
pred_valid = reg.predict(valid_X)  # predictions for the held-out validation rows

# mean_squared_error returns the MSE; take the square root so the printed
# value actually is the RMSE that the label announces.
print("train RMSE: " + str(mean_squared_error(train_y, pred_train) ** .5))

print("validation RMSE: " + str(mean_squared_error(valid_y, pred_valid) ** .5))

In [None]:
def exhaustive(model,train_X,train_y,valid_X,valid_y):
    """Fit ``model`` on every non-empty subset of the training columns and report improvements.

    Subsets are tried from the largest size down to single columns. Each time a
    subset reaches a lower validation MSE than every subset tried before it, its
    feature list, train RMSE, validation MSE / RMSE / MAE are printed and a
    histogram of the absolute validation residuals is shown.

    Parameters
    ----------
    model : estimator whose ``fit(X, y)`` returns an object with ``predict(X)``.
        The same instance is refitted for every subset.
    train_X, valid_X : DataFrames indexed by the candidate feature columns.
    train_y, valid_y : targets matching ``train_X`` / ``valid_X``.

    Returns
    -------
    None
        Results are only printed and plotted.
    """
    # Start at +inf so the first model evaluated is always reported, whatever
    # the scale of the target.
    low_err = float("inf")
    all_columns = list(train_X.columns)

    for n in range(len(all_columns), 0, -1):
        ss = findsubsets(all_columns, n)
        print("_"*50)
        print(str(len(ss))+" model(s) using "+ str(n)+ " features")
        for subset in ss:
            # Recent pandas versions reject a set as a column indexer, so turn
            # the subset into a list that follows the original column order.
            features = [col for col in all_columns if col in subset]

            reg = model.fit(train_X[features], train_y)
            pred_valid = reg.predict(valid_X[features])
            err_valid = mean_squared_error(valid_y, pred_valid)

            if err_valid < low_err:
                low_err = err_valid
                low_err_features = features
                # Train error and MAE are only needed for the report, so they
                # are computed for improving subsets only.
                err_train = mean_squared_error(train_y, reg.predict(train_X[features]))
                mae_valid = mean_absolute_error(valid_y, pred_valid)
                print("_"*50)
                print(low_err_features)
                print("RMSE train: "+str(err_train ** .5))
                print("MSE validation: "+str(low_err))
                print("RMSE validation: "+str(low_err ** .5))
                print("MAE validation: "+str(mae_valid))
                abs(valid_y-pred_valid).hist()
                plt.show()
                print("_"*50)

In [None]:
# Exhaustive subset search with ordinary least squares.
# The `normalize` argument is not passed: it was deprecated in scikit-learn 1.0
# and removed in 1.2, and for an unpenalised least-squares fit it does not
# change the predictions (up to floating-point error).
exhaustive(model=LinearRegression(n_jobs=-1), train_X=train_X, train_y=train_y,valid_X=valid_X,valid_y=valid_y )

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters, fitted on the training split.
# A fixed random_state (1, the seed used by the other seeded steps here)
# makes the forest, and therefore the printed predictions, reproducible.
random_reg = RandomForestRegressor(random_state=1)
random_reg.fit(train_X,train_y)

# Select the predictor columns at prediction time instead of rebinding test_X,
# so re-running this cell never changes the test frame.
predicted_prices = random_reg.predict(test_X[predictors])
print(predicted_prices)

In [None]:
# A cell renders only its last bare expression, so display both the test
# features and the test target explicitly.
display(test_X, test_y)

### GridSearchCV using RandomForestRegressor


In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pylab as plt 


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from dmba import plotDecisionTree

from sklearn.metrics import mean_squared_error



# Use grid search to find an optimized random forest.
# Every hyper-parameter is currently pinned to a single value; the range that
# was explored before is noted next to it. Left out of the grid:
# min_samples_leaf (2..5, did not improve MSE), max_features and n_estimators.
param_grid = dict(
    max_depth=[5],            # previously: list(np.arange(6,12))
    min_samples_split=[5],    # previously: list(np.arange(1,8))
    max_leaf_nodes=[52],      # previously: list(np.arange(35,54))
)
gridSearch = GridSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
gridSearch.fit(train_X, train_y)
print('Improved parameters: ', gridSearch.best_params_)

# Best forest found by the 5-fold search.
rf = gridSearch.best_estimator_

# Validation error of the tuned forest (square root of the MSE).
rf_valid_mse = mean_squared_error(valid_y, rf.predict(valid_X))
print("RMSE: "+str(rf_valid_mse**.5))


In [None]:
# Variable importance of the tuned random forest, plotted as horizontal bars
# sorted by importance.
importance = pd.DataFrame({'features': train_X.columns,
                           'importance': rf.feature_importances_})
# Use the feature names as row labels while keeping the 'features' column.
importance = importance.set_index('features', drop=False)
importance.sort_values(by='importance', ascending=True).plot.barh()

### GridSearchCV using GradientBoostingRegressor

In [None]:
# Use grid search to find optimized gradient-boosting hyper-parameters.
# Left out of the grid: min_samples_leaf (2..4, did not reduce error),
# max_depth, subsample and max_features.
param_grid = {
    'learning_rate': list(np.arange(0.01,0.22,.01)),  # candidate rates in 0.01 steps
    'min_samples_split': list(np.arange(3,7)),
    'max_leaf_nodes': list(np.arange(4,7)),
}

gridSearch = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=1),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
gridSearch.fit(train_X, train_y)
print('Improved parameters: ', gridSearch.best_params_)

# Best boosted model found by the 5-fold search.
gbm = gridSearch.best_estimator_

# Validation error of the tuned model (square root of the MSE).
gbm_valid_mse = mean_squared_error(valid_y, gbm.predict(valid_X))
print("RMSE: "+str(gbm_valid_mse**.5))

In [None]:
# Variable importance of the tuned gradient-boosting model, plotted as
# horizontal bars sorted by importance.
importance = pd.DataFrame({'features': train_X.columns,
                           'importance': gbm.feature_importances_})
# Use the feature names as row labels while keeping the 'features' column.
importance = importance.set_index('features', drop=False)
importance.sort_values(by='importance', ascending=True).plot.barh()