In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor

In [2]:
# Reading in TRAIN (df) and TEST (fd) data
df = pd.read_csv('./data/df_select_features.csv')
fd = pd.read_csv('./data/fd_select_features.csv')
# To facilitate Kaggle submissions
submission_index = list(range(1461,2920))
# For referncing feature names
cols = list(fd.columns.values)

In [3]:
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42)

# Decision Tree Regression
    - Bagged Decision Trees Regressor
    - Random Forests Regressor
    - Extremely Random Trees Regressor

### Bagged Decision Tree Regressors

In [5]:
dtree = DecisionTreeRegressor()

param_grid = {'max_depth':[None,5,10,15,20],
              'min_samples_split':[2,4,6,8],
              'max_features':[None,5,10,15],
              'max_leaf_nodes':[None,5,10,15]}

gs_dtree = GridSearchCV(dtree, param_grid)

gs_dtree.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [None, 5, 10, 15, 20],
                         'max_features': [None, 5, 10, 15],
                         'max_leaf_nodes': [None, 5, 10, 15],
                        

In [6]:
# Generating predictions based on best model evaluated by GridSearch
dtree_preds = (gs_dtree.best_estimator_).predict(X_test)

print(f'(DTree) Mean cross validation score: {round(gs_dtree.best_score_, 4)}')
print(f'(DTree) Root mean squared log error: {round(((y_test-dtree_preds)**2).mean()**(1/2),4)}')
print()
print(f'(DTree) Best parameters, GridSearch: {gs_dtree.best_params_}')

(DTree) Mean cross validation score: 0.7726
(DTree) Root mean squared log error: 35365.9878

(DTree) Best parameters, GridSearch: {'max_depth': 10, 'max_features': 15, 'max_leaf_nodes': None, 'min_samples_split': 4}


In [10]:
# Bootstrapping and bagging trees

bag_df = pd.DataFrame(index=X_test.index)

for i in range(1000):
    # Random sampling with replacement
    X_sample = X_train.sample(n=X_train.shape[0],
                              replace=True)
    # Ensuring our random samples align with their label
    y_sample = y[X_sample.index]
    
    # Model configured according to GridSearch best_params_
    tree = DecisionTreeRegressor(max_depth=10,
                                 max_features=15,
                                 max_leaf_nodes=None,
                                 min_samples_split=8)
    # Fitting models
    tree.fit(X_sample, y_sample)
    
    # Appending predictions to DataFrame
    bag_df[f'Tree {i}'] = tree.predict(X_test)
    
bag_df.shape

(362, 1000)

In [29]:
bag_df['SalePrice'] = bag_df.mean(axis='columns')

RMSLE = ((np.log(y_test)-np.log(bag_df['SalePrice']))**2).mean()**(1/2)
print(f'Root mean squared logarithmic error: {round(RMSLE,4)}')
print(f'Mean cross validation score: {cross_val_score(dtree, X_train, y_train).mean()}')

Root mean squared logarithmic error: 0.1221
Mean cross validation score: 0.7278833957249772




In [None]:
cross_val_score(dtree, X, y)

In [20]:
# Creating DataFrame for submission
tree_bag = pd.DataFrame(index=fd.index)

for i in range(1000):
    X_sample = X.sample(n=X.shape[0],
                         replace=True)
    y_sample = y[X_sample.index]
    
    tree = DecisionTreeRegressor(max_depth=10,
                                 max_features=15,
                                 max_leaf_nodes=None,
                                 min_samples_split=8)
    tree.fit(X_sample,y_sample)
    
    tree_bag[f'Tree {i}'] = tree.predict(fd)

In [21]:
tree_bag['SalePrice'] = tree_bag.mean(axis='columns')

In [23]:
# Creating submission DataFrame
tree_bag_submission = pd.DataFrame(columns=['Id','SalePrice'])
tree_bag_submission['Id'] = submission_index
tree_bag_submission['SalePrice'] = tree_bag['SalePrice']

# Exporting to .csv
tree_bag_submission.to_csv('./data/tree_bag.csv',index=False)

#### Kaggle score:
    - RMSLE: 0.1834

### Random Forest Regressor

In [43]:
# Instantiating Random Forrest Regressor
rf = RandomForestRegressor(random_state=42)

param_grid = {'n_estimators':[200,300,400],
              'max_depth':[None,15],
              'min_samples_split':[2,3,4],
              'max_features':[3,4,5],
              'max_leaf_nodes':[None,15]}

gs_ranfo = GridSearchCV(rf, param_grid, verbose=1)

gs_ranfo.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=1)]: Done 324 out of 324 | elapsed:  2.4min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [None, 15], 'ma

In [26]:
rf_preds = (gs_ranfo.best_estimator_).predict(X_test)

print(f'(RandF) Mean cross validation score: {round(gs_ranfo.best_score_,4)}')
print(f'(RandF) Root mean squared log error: {round(((np.log(y_test)-np.log(rf_preds))**2).mean()**(1/2),4)}')

print()
print(f'Best parameters: {gs_ranfo.best_params_}')

(RandF) Mean cross validation score: 0.8717
(RandF) Root mean squared log error: 0.1177

Best parameters: {'max_depth': 15, 'max_features': 5, 'max_leaf_nodes': None, 'min_samples_split': 3, 'n_estimators': 200}


In [33]:
# Instantiating model according to GridSearch parameters
rf = RandomForestRegressor(max_depth=15,
                           max_features=5,
                           max_leaf_nodes=None,
                           min_samples_split=3,
                           n_estimators=200)

# Fitting model to entire training set for submission
rf.fit(X,y)

# Generating predictions for submission
rf_preds = rf.predict(fd)

# Creating DataFrame for submission
rf_df = pd.DataFrame(columns=['Id','SalePrice'])
rf_df['Id'] = submission_index
rf_df['SalePrice'] = rf_preds

In [34]:
rf_df.to_csv('./data/rand_for.csv', index=False)

#### Kaggle score:
    - RMSLE: 0.1727

### Extemely Random Trees Regressor

In [37]:
et = ExtraTreesRegressor(random_state=42)

param_grid = {'n_estimators':[200, 300, 400],
              'max_depth':[None,15],
              'min_samples_split':[2,3,4],
              'max_features':[3,4,5],
              'max_leaf_nodes':[None,15]}

gs_extra = GridSearchCV(et, param_grid, verbose=1)

gs_extra.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=1)]: Done 324 out of 324 | elapsed:  1.9min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=ExtraTreesRegressor(bootstrap=False, criterion='mse',
                                           max_depth=None, max_features='auto',
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators='warn', n_jobs=None,
                                           oob_score=False, random_state=42,
                                           verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [None, 15], 'max_features': [3, 4, 5],
                         'max_leaf_nodes':

In [39]:
et_preds = (gs_extra.best_estimator_).predict(X_test)

print(f'(Extra) Mean cross validation score: {round(gs_extra.best_score_,4)}')
print(f'(Extra) Root mean squared log error: {round(((np.log(y_test)-np.log(et_preds))**2).mean()**(1/2),4)}')
print()
print(f'(Extra) Best parameters: {gs_extra.best_params_}')

(Extra) Mean cross validation score: 0.8635
(Extra) Root mean squared log error: 0.1219

(Extra) Best parameters: {'max_depth': 15, 'max_features': 5, 'max_leaf_nodes': None, 'min_samples_split': 2, 'n_estimators': 300}


In [41]:
# Instantiating model according to GridSearch parameters
et = ExtraTreesRegressor(max_depth=15,
                         max_features=5,
                         max_leaf_nodes=None,
                         min_samples_split=2,
                         n_estimators=300)

# Fitting model to entire training set for submission
et.fit(X,y)

# Generating predictions for submission
et_preds = et.predict(fd)

# Creating DataFrame for submission
et_df = pd.DataFrame(columns=['Id','SalePrice'])
et_df['Id'] = submission_index
et_df['SalePrice'] = et_preds

In [42]:
et_df.to_csv('./data/extra_trees.csv', index=False)

#### Kaggle score:
    - RMSLE: 0.1644