In [1]:
import pandas as pd
import geopandas as gp
import numpy as np
import matplotlib.pyplot as plt

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_seq_items = None
pd.options.display.max_rows = None

In [3]:
model_df = pd.read_csv('/home/semipro321/Documents/Workspace/Geospatial-Analysis-on-House-Price-Prediction/Inputs/mastermodel_filledv3.csv')

In [4]:
model_df.shape

(15234, 167)

In [5]:
model_df.head()

Unnamed: 0,sqft,parking,mean_district_income,bedrooms_bg,bedrooms_ag,bathrooms,final_price,type_Att/Row/Twnhouse,type_Co-Op Apt,type_Co-Ownership Apt,...,neighbourhood_Yonge-St.Clair,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,Auto Theft500m,Auto Theft1000m,violent_crime1000m,violent_crime500m,Airbnb500m_YOY1519,Airbnb1000m_YOY1519,dist_nearest_park
0,850.0,1,56526,1,2,2,855000,0,0,0,...,0,0,0,55,230,7409,2994,26,19,130.27
1,1295.319092,6,52787,0,3,2,885000,0,0,0,...,0,0,0,40,61,381,570,10,18,134.63
2,550.0,0,57039,0,1,1,550000,0,0,0,...,0,0,0,29,63,830,1241,10,18,134.63
3,650.0,1,70623,1,1,1,665000,0,0,0,...,0,0,0,37,82,1732,683,9,16,149.17
4,977.184326,1,44101,0,2,2,825513,0,0,0,...,0,0,0,19,37,642,215,9,16,149.17


## *Creating an ensemble model* 

In [14]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression

*create train test split*

In [7]:
predictors = model_df.drop(columns=['final_price'], axis=1).columns

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(model_df[predictors], model_df['final_price'], test_size=0.30, random_state=42)

*model function*

In [44]:
def modelfit(pip, x_train, y_train, x_test, y_test, predictors, cv_folds=5):
    
    
    #Fit the algorithm on the data
    pip.fit(x_train, y_train)
    
    y_val = y_test.values
    
    #Perform cross-validation:
    #if performCV:
    #cv_score = cross_val_score(pip, x_train, y_train, cv=cv_folds, scoring='neg_mean_squared_log_error')
    
    #Predict test set:
    pred = pip.predict(x_test)
    #Print model report:
    print('Model Report')
    print(f'Accuracy : {round(np.sqrt(mean_squared_log_error(y_val, pred)),6)}')
    
    #if performCV:
    #print(f'CV Score : Mean - {np.mean(cv_score)} | Std - {np.std(cv_score)} | Min - {np.min(cv_score)} | Max - {np.max(cv_score)}')
                
    #Print Feature Importance:
    #if printFeatureImportance:
    #feat_imp = pd.Series(pip.steps[1][1].feature_importances_, predictors).sort_values(ascending=False)
    #feat_imp.head(20).plot(kind='bar', title='Feature Importances')
    #plt.ylabel('Feature Importance Score')
    data = {'Actual': y_val,
           'Predictions': pred}
    return pd.DataFrame(data)

In [11]:
def create_pipeline(scale, model):
    steps = [('std',scale), ('model', model)]
    return Pipeline(steps=steps)

#### Random Forest Regressor

In [45]:
rf = RandomForestRegressor()
scale = StandardScaler()

pipeline_rf = create_pipeline(scale, rf)

op_rf = modelfit(pipeline_rf, X_train, Y_train, X_test, Y_test,predictors)

Model Report
Accuracy : 0.142559


In [41]:
op_rf[op_rf['Predictions'] <= 0]

Unnamed: 0,Actual,Predictions


In [37]:
lm = LinearRegression()

pipeline_lm = create_pipeline(StandardScaler(), lm)

op_lm = modelfit(lm, X_train, Y_train, X_test, Y_test, predictors)

In [39]:
op_lm[op_lm['Predictions'] <= 0]

Unnamed: 0,Actual,Predictions
305,299000,-2041.760486
371,265000,-59695.34425
1052,165000,-106126.549783
1497,217000,-66166.964646
2938,325000,-47760.348314


In [24]:
scaled = scale.fit_transform(X_train)
lm.fit(scaled, Y_train)
pred = lm.predict(X_test)

In [25]:
pred

array([2.52243327e+21, 1.33113991e+21, 2.51420454e+21, ...,
       1.17510161e+21, 2.01298911e+21, 1.90818494e+21])

#### Light GBM

In [13]:
lgb = LGBMRegressor()

pipeline_lgb = create_pipeline(scale, lgb)

op_lgb = modelfit(pipeline_lgb, X_train, Y_train, X_test, Y_test, predictors)

Model Report
Accuracy : 0.025


#### ExtraTrees Regressor

In [46]:
ext = ExtraTreesRegressor()

pipeline_ext = create_pipeline(scale, ext)

op_ext = modelfit(ext, X_train, Y_train, X_test, Y_test, predictors)

Model Report
Accuracy : 0.132272


#### *Parameter Tunning in ExtraTreesRegresor*

In [27]:
ExtraTreesRegressor().get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [28]:
from sklearn.model_selection import GridSearchCV

In [95]:
g_ext = ExtraTreesRegressor(n_estimators=140, max_depth=15, min_samples_split=4, criterion = 'mse', random_state = 10)
                            

pipelineGSV = create_pipeline(scale, g_ext)

param_test = {'model__bootstrap':[False, True]}

g_search = GridSearchCV(pipelineGSV, param_test,scoring='neg_mean_squared_log_error',n_jobs=None, cv=5,refit=False)

In [96]:
g_search.fit(X_train, Y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('std', StandardScaler()),
                                       ('model',
                                        ExtraTreesRegressor(max_depth=15,
                                                            min_samples_split=4,
                                                            n_estimators=140,
                                                            random_state=10))]),
             param_grid={'model__bootstrap': [False, True]}, refit=False,
             scoring='neg_mean_squared_log_error')

In [97]:
g_search.cv_results_, g_search.best_params_, g_search.best_score_

({'mean_fit_time': array([9.64880872, 7.02350698]),
  'std_fit_time': array([0.76887289, 0.53993951]),
  'mean_score_time': array([0.07289228, 0.07857413]),
  'std_score_time': array([0.00786048, 0.03250506]),
  'param_model__bootstrap': masked_array(data=[False, True],
               mask=[False, False],
         fill_value='?',
              dtype=object),
  'params': [{'model__bootstrap': False}, {'model__bootstrap': True}],
  'split0_test_score': array([-0.0215091 , -0.02183852]),
  'split1_test_score': array([-0.0212592 , -0.02200616]),
  'split2_test_score': array([-0.01982995, -0.02059338]),
  'split3_test_score': array([-0.02065457, -0.02120457]),
  'split4_test_score': array([-0.02294893, -0.02371861]),
  'mean_test_score': array([-0.02124035, -0.02187225]),
  'std_test_score': array([0.00103191, 0.00104971]),
  'rank_test_score': array([1, 2], dtype=int32)},
 {'model__bootstrap': False},
 -0.021240351507526477)

## *Stacking Models*

In [107]:
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

In [108]:
# get a stacking ensemble of models
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('rf', RandomForestRegressor(random_state=10)))
    level0.append(('ext', ExtraTreesRegressor(n_estimators=140, max_depth=15, min_samples_split=4, random_state = 10)))
    level0.append(('lgb', LGBMRegressor(random_state=10)))
    # define meta learner model
    level1 = LinearRegression()
    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    return model

In [110]:
# get a list of models to evaluate
def get_models():
    models = dict()
    #models['rf'] = RandomForestRegressor(random_state=10)
    #models['ext'] = ExtraTreesRegressor(n_estimators=140, max_depth=15, min_samples_split=4, random_state = 10)
    #models['lgb'] = LGBMRegressor(random_state=10)
    models['stacking'] = get_stacking()
    return models

In [99]:
# evaluate a given model using cross-validation
def evaluate_model(pipeline, X, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_log_error', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [111]:
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    pip = create_pipeline(scale, model)
    scores = evaluate_model(pip, X_train, Y_train)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

# plot model performance for comparison
#pyplot.boxplot(results, labels=names, showmeans=True)
#pyplot.show()

>stacking -0.021 (0.002)
