# **(Predict House Price Notebook)**

## Objectives

* Develop and assess a predictive model for estimating the sale values of inherited properties.

## Inputs

* outputs/datasets/collection/HousePrices.csv

## Outputs

* Train set (features and target)
* Test set (features and target)
* ML pipeline to predict house prices
* Feature Importance Plot
* Model performance plot

## Additional Comments

* At the beginning of the project we formulated a hypothesis. After the steps taken in this notebook we can conclude that the hypothesis was true: the size, the quality and the year the house was built all affect the price. Credit goes to Code Institute and https://github.com/Amareteklay/, whose work I followed.


---

# Change working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [1]:
import os
# Record the directory the notebook is currently running from.
current_dir = os.getcwd()
current_dir

'/workspace/housepricepred2/jupyter_notebooks'

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [2]:
# Move one level up so that relative paths used later resolve against the parent folder.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [3]:
# Re-read the working directory to confirm the change took effect.
current_dir = os.getcwd()
current_dir

'/workspace/housepricepred2'

## Load Data

Start by loading data

In [4]:
import numpy as np
import pandas as pd
# Load the house-price dataset; the relative path resolves against the current working directory.
df = pd.read_csv("outputs/datasets/collection/HousePrices.csv") 

# Show the dimensions and the first rows as a quick sanity check.
print(df.shape)
df.head()

(1460, 24)


Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,EnclosedPorch,GarageArea,GarageFinish,...,LotFrontage,MasVnrArea,OpenPorchSF,OverallCond,OverallQual,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,SalePrice
0,856,854.0,3.0,No,706,GLQ,150,0.0,548,RFn,...,65.0,196.0,61,5,7,856,0.0,2003,2003,208500
1,1262,0.0,3.0,Gd,978,ALQ,284,,460,RFn,...,80.0,0.0,0,8,6,1262,,1976,1976,181500
2,920,866.0,3.0,Mn,486,GLQ,434,0.0,608,RFn,...,68.0,162.0,42,5,7,920,,2001,2002,223500
3,961,,,No,216,ALQ,540,,642,Unf,...,60.0,0.0,35,5,7,756,,1915,1970,140000
4,1145,,4.0,Av,655,GLQ,490,0.0,836,RFn,...,84.0,350.0,84,5,8,1145,,2000,2000,250000


---

## ML Pipeline: Regressor

In [5]:
from sklearn.pipeline import Pipeline

# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Feature Selection
from sklearn.feature_selection import SelectFromModel

# Models
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import Pipeline

# Data Cleaning
from feature_engine.imputation import MeanMedianImputer
from feature_engine.selection import DropFeatures
from feature_engine.imputation import CategoricalImputer

# Feature Engineering
from feature_engine.encoding import OrdinalEncoder
from feature_engine.selection import SmartCorrelatedSelection
from feature_engine import transformation as vt
from feature_engine.outliers import Winsorizer

def OptimizeModelPipeline(model):
    """Build the full cleaning, engineering, selection and modelling pipeline.

    The same ``model`` instance is handed to ``SelectFromModel`` (feature
    selection) and used as the final ``'model'`` step.

    Args:
        model: Regressor to select features with and to fit as the last step.

    Returns:
        An unfitted ``Pipeline`` with twelve named steps.
    """
    # Columns handled by each preprocessing stage.
    mean_imputed = ['LotFrontage', 'BedroomAbvGr']
    median_imputed = ['2ndFlrSF', 'MasVnrArea']
    mode_imputed = ['GarageFinish', 'BsmtFinType1', 'BsmtExposure']
    dropped = ['EnclosedPorch', 'GarageYrBlt', 'WoodDeckSF']
    encoded = ['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']
    log_scaled = ['GrLivArea', 'LotArea', 'LotFrontage']
    power_scaled = ['GarageArea', 'MasVnrArea', 'OpenPorchSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF']
    capped = ['GarageArea', 'LotArea', 'LotFrontage', 'MasVnrArea', 'OpenPorchSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF']

    steps = [
        # Data cleaning
        ('impute_mean', MeanMedianImputer(imputation_method='mean', variables=mean_imputed)),
        ('impute_median', MeanMedianImputer(imputation_method='median', variables=median_imputed)),
        ('impute_categorical', CategoricalImputer(imputation_method='frequent', variables=mode_imputed)),
        ('drop_features', DropFeatures(features_to_drop=dropped)),
        # Feature engineering
        ('encoder', OrdinalEncoder(encoding_method='arbitrary', variables=encoded)),
        ('log_transformer', vt.LogTransformer(variables=log_scaled)),
        ('power_transformer', vt.PowerTransformer(variables=power_scaled)),
        ('outlier_handler', Winsorizer(capping_method='iqr', tail='both', fold=1.5, variables=capped)),
        ('smart_corr_sel', SmartCorrelatedSelection(variables=None, method="spearman", threshold=0.6, selection_method="variance")),
        # Scaling, model-based feature selection and the estimator itself
        ('feat_scaling', StandardScaler()),
        ('feat_selection', SelectFromModel(model)),
        ('model', model),
    ]
    return Pipeline(steps)

  from pandas import MultiIndex, Int64Index


* Taken from Code Institute

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

class HyperparameterOptimizationSearch:
    """Run one GridSearchCV per candidate model and summarise the fold scores.

    Every estimator in ``models`` is wrapped with ``OptimizeModelPipeline`` and
    tuned over the grid stored under the same key in ``params``.
    """

    def __init__(self, models, params):
        """Store the candidates and their grids.

        Args:
            models: Mapping of model name -> estimator instance.
            params: Mapping of model name -> parameter grid for GridSearchCV.
        """
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Filled by fit(): model name -> fitted GridSearchCV.
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a grid search for every model and keep the successful ones.

        A failure for one model is printed and does not stop the remaining
        models; that model is simply absent from ``grid_searches``.

        Args:
            X: Training features.
            y: Training target.
            cv: Cross-validation setting forwarded to GridSearchCV.
            n_jobs: Parallelism setting forwarded to GridSearchCV.
            verbose: Verbosity forwarded to GridSearchCV.
            scoring: Scoring forwarded to GridSearchCV.
            refit: Accepted for backward compatibility but NOT forwarded;
                GridSearchCV is created without a ``refit`` argument, so its
                own default applies.
        """
        for key, estimator in self.models.items():
            try:
                print(f"\nRunning GridSearchCV for {key}\n")

                pipeline = OptimizeModelPipeline(estimator)
                grid = self.params[key]
                gs = GridSearchCV(pipeline, grid, cv=cv, n_jobs=n_jobs, verbose=verbose, scoring=scoring, error_score='raise')
                gs.fit(X, y)
                self.grid_searches[key] = gs
            except Exception as e:
                # Best effort: report the failure and continue with the next model.
                print(f"Error encountered for model {key}: {e}")

    def score_summary(self, sort_by='mean_score'):
        """Tabulate min/mean/max/std of the per-fold test scores.

        Args:
            sort_by: Column used to sort the summary in descending order.

        Returns:
            A tuple ``(summary, grid_searches)``: the summary DataFrame with one
            row per (model, parameter combination) and the dict of fitted
            searches.

        Raises:
            ValueError: If no grid search has been fitted successfully.
        """
        if not self.grid_searches:
            raise ValueError(
                "No fitted grid searches to summarise; run fit() first and "
                "check the errors it reported."
            )

        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for name, gs in self.grid_searches.items():
            results = gs.cv_results_
            params = results['params']
            # Use the fold count recorded by the fitted search, so that cv=None
            # or a splitter object works too; fall back to the raw ``cv`` value.
            n_splits = getattr(gs, 'n_splits_', gs.cv)
            # One column per fold, one row per parameter combination.
            all_scores = np.hstack([
                results[f"split{i}_test_score"].reshape(len(params), 1)
                for i in range(n_splits)
            ])
            for p, s in zip(params, all_scores):
                rows.append(row(name, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns], self.grid_searches


## Split Train and Test Set

In [7]:


from sklearn.model_selection import train_test_split
# Separate the predictors from the SalePrice target, then hold out 20% of the
# rows as a test set with a fixed seed.
features = df.drop(['SalePrice'], axis=1)
target = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

print("* Train set:", X_train.shape, y_train.shape, "\n* Test set:", X_test.shape, y_test.shape)

* Train set: (1168, 23) (1168,) 
* Test set: (292, 23) (292,)


## Grid Search CV - Sklearn

In [8]:
# Candidate regressors for the quick comparison; every estimator that takes a
# seed is created with random_state=0.
initial_models = {
    "Linear_Reg": LinearRegression(),
    "Decision_Tree": DecisionTreeRegressor(random_state=0),
    "Random_Forest": RandomForestRegressor(random_state=0),
    "Extra_Trees": ExtraTreesRegressor(random_state=0),
    "AdaBoost": AdaBoostRegressor(random_state=0),
    "Gradient_Boosting": GradientBoostingRegressor(random_state=0),
    "XGBoost": XGBRegressor(random_state=0),
}

# Value lists reused by several of the tree-based grids below.
_split_sizes = [2, 50]
_leaf_sizes = [1, 50]
_leaf_node_caps = [None, 50]

# One hyperparameter grid per candidate; the 'model__' prefix addresses the
# pipeline step named 'model'.
model_hyperparams = {
    "Linear_Reg": {},
    "Decision_Tree": {
        'model__max_depth': [None, 4, 15],
        'model__min_samples_split': _split_sizes,
        'model__min_samples_leaf': _leaf_sizes,
        'model__max_leaf_nodes': _leaf_node_caps,
    },
    "Random_Forest": {
        'model__n_estimators': [100, 50, 140],
        'model__max_depth': [None, 4, 15],
        'model__min_samples_split': _split_sizes,
        'model__min_samples_leaf': _leaf_sizes,
        'model__max_leaf_nodes': _leaf_node_caps,
    },
    "Extra_Trees": {
        'model__n_estimators': [100, 50, 150],
        'model__max_depth': [None, 3, 15],
        'model__min_samples_split': _split_sizes,
        'model__min_samples_leaf': _leaf_sizes,
    },
    "AdaBoost": {
        'model__n_estimators': [50, 25, 80, 150],
        'model__learning_rate': [1, 0.1, 2],
        'model__loss': ['linear', 'square', 'exponential'],
    },
    "Gradient_Boosting": {
        'model__n_estimators': [100, 50, 140],
        'model__learning_rate': [0.1, 0.01, 0.001],
        'model__max_depth': [3, 15, None],
        'model__min_samples_split': _split_sizes,
        'model__min_samples_leaf': _leaf_sizes,
        'model__max_leaf_nodes': _leaf_node_caps,
    },
    "XGBoost": {
        'model__n_estimators': [30, 80, 200],
        'model__max_depth': [None, 3, 15],
        'model__learning_rate': [0.01, 0.1, 0.001],
        'model__gamma': [0, 0.1],
    },
}

In [9]:
# Grid-search every candidate model with 5-fold CV, scored by R2 (n_jobs=-1).
search = HyperparameterOptimizationSearch(models=initial_models, params=model_hyperparams)
search.fit(X_train, y_train, cv=5, n_jobs=-1, verbose=1, scoring='r2')



Running GridSearchCV for Linear_Reg

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).inde


Running GridSearchCV for Decision_Tree

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).inde


Running GridSearchCV for Random_Forest

Fitting 5 folds for each of 72 candidates, totalling 360 fits


  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).inde


Running GridSearchCV for Extra_Trees

Fitting 5 folds for each of 36 candidates, totalling 180 fits


  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).inde


Running GridSearchCV for AdaBoost

Fitting 5 folds for each of 36 candidates, totalling 180 fits


  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).inde


Running GridSearchCV for Gradient_Boosting

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).index[0]
  f = X[feature_group].std().sort_values(ascending=False).inde

We run a summary and check results

In [None]:
# Collect the per-fold score table (sorted by mean score, best first) and the fitted searches.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

## Do an extensive search on the most suitable model to find the best hyperparameter configuration.


* The first step we take is to create a model with parameters

In [None]:
# Restrict the extensive search to a single candidate: Gradient Boosting.
models_search = {"Gradient_Boosting": GradientBoostingRegressor(random_state=0)}

# Grid explored for that candidate; keys address the pipeline's 'model' step.
params_search = {
    "Gradient_Boosting": {
        'model__n_estimators': [100, 50, 140],
        'model__learning_rate': [0.1, 0.01, 0.001],
        'model__max_depth': [3, 15, None],
        'model__min_samples_split': [2, 50],
        'model__min_samples_leaf': [1, 50],
        'model__max_leaf_nodes': [None, 50],
    },
}

* Then, as before, we run an extensive GridSearchCV

In [None]:
# Run the grid search again, this time only for the model(s) in models_search.
search = HyperparameterOptimizationSearch(models=models_search, params= params_search)
search.fit(X_train, y_train, scoring = 'r2', n_jobs=-1, cv=5)

In [None]:
# Summarise the extensive search: score table (best mean score first) plus the fitted searches.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

* We look for the best-performing model

In [None]:
# The summary is sorted by mean score in descending order, so row 0 names the top estimator.
optimal_model = grid_search_summary.iloc[0]['estimator']
optimal_model

* Extract the best parameters for the top-performing model

In [None]:
# Look up the winning parameter combination on that estimator's fitted grid search.
optimal_parameters = grid_search_pipelines[optimal_model].best_params_
optimal_parameters

* Assign the most effective regression model from the grid search results

In [None]:
# Keep the pipeline that the grid search stored as its best estimator.
optimal_regression_pipeline = grid_search_pipelines[optimal_model].best_estimator_
optimal_regression_pipeline

### Assess feature importance

In [None]:
X_train.head(3)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Setting the style for the plot
sns.set_theme(style="whitegrid")

# Number of leading pipeline steps that clean and engineer features.
# Derived from the position of the 'feat_scaling' step instead of a
# hard-coded count, so it stays correct if preprocessing steps are added or removed.
pipeline_step_names = [name for name, _ in optimal_regression_pipeline.steps]
initial_pipeline_steps = pipeline_step_names.index('feat_scaling')

# Extract the feature names after the initial steps of the pipeline
featured_columns_post_processing = (
    Pipeline(optimal_regression_pipeline.steps[:initial_pipeline_steps])
    .transform(X_train)
    .columns
)

# Identifying the significant features based on the selection from the model
selected_mask = optimal_regression_pipeline['feat_selection'].get_support()
important_features = featured_columns_post_processing[selected_mask]

# Creating a DataFrame for the importance of each feature
df_feature_importance = pd.DataFrame({
    'Feature': important_features,
    'Importance': optimal_regression_pipeline.named_steps['model'].feature_importances_
}).sort_values(by='Importance', ascending=False)

# Displaying the key features and their importance
print(f"The model focuses on these {len(important_features)} key features, listed in order of importance: \n{df_feature_importance['Feature'].to_list()}")

# Plotting the feature importance
df_feature_importance.set_index('Feature').plot(kind='bar')
plt.title("Feature Importance in the Model")
plt.ylabel("Importance")
plt.show()

### Evaluate on Train and Test Sets

* BR2 sets a target R2 score of at least 0.75 on both the train and the test set, so we need to check the model's performance against it

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

def regression_performance(X_train, y_train, X_test, y_test, pipeline):
    """Print the evaluation metrics for both splits, then check the R2 goal.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target.
        pipeline: Fitted pipeline used for prediction.
    """
    print("Model Evaluation \n")

    scores = []
    for heading, features, target in (
        ("* Train Set", X_train, y_train),
        ("* Test Set", X_test, y_test),
    ):
        print(heading)
        scores.append(regression_evaluation(features, target, pipeline))

    # Performance Check (train score first, test score second)
    check_performance_goal(*scores)

def regression_evaluation(X, y, pipeline):
    """Print R2, MAE, MSE and RMSE for ``pipeline`` on ``(X, y)``.

    Args:
        X: Features to predict on.
        y: True target values.
        pipeline: Fitted pipeline exposing ``predict``.

    Returns:
        The R2 score of the predictions.
    """
    prediction = pipeline.predict(X)
    r2 = r2_score(y, prediction)
    mae = mean_absolute_error(y, prediction)
    # Computed once and reused for the RMSE below.
    mse = mean_squared_error(y, prediction)

    # Built-in round() is used instead of the ``.round()`` method: the method
    # only exists on NumPy scalars, whereas round() also accepts plain floats.
    print('R2 Score:', round(r2, 3))
    print('Mean Absolute Error:', round(mae, 3))
    print('Mean Squared Error:', round(mse, 3))
    print('Root Mean Squared Error:', round(np.sqrt(mse), 3))
    print("\n")
    return r2

def check_performance_goal(r2_train, r2_test, goal=0.75):
    """Print a warning for each split whose R2 score is below ``goal``.

    Args:
        r2_train: R2 score on the train set.
        r2_test: R2 score on the test set.
        goal: Minimum acceptable R2 score; defaults to 0.75.
    """
    for split, score in (("train", r2_train), ("test", r2_test)):
        if score < goal:
            print(f"Warning: R2 score on the {split} set is below the goal of {goal}.")

def regression_evaluation_plots(X_train, y_train, X_test, y_test, pipeline, alpha_scatter=0.5):
    """Plot actual vs. predicted values for both splits and save the figure.

    Draws one panel per split (train on the left, test on the right) with a
    red identity line, writes the figure to
    ``docs/plots/regression_performance.png`` and then shows it.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target.
        pipeline: Fitted pipeline exposing ``predict``.
        alpha_scatter: Transparency of the scatter points.
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

    panels = (("Train", X_train, y_train), ("Test", X_test, y_test))
    for ax, (label, features, actual) in zip(axes, panels):
        predicted = pipeline.predict(features)
        score = r2_score(actual, predicted)

        sns.scatterplot(x=actual, y=predicted, alpha=alpha_scatter, ax=ax)
        # Identity line: points on it are perfect predictions.
        sns.lineplot(x=actual, y=actual, color='red', ax=ax)
        ax.set_title(f"{label} Set - R2: {score:.3f}")
        ax.set_xlabel("Actual")
        ax.set_ylabel("Predictions")

    # exist_ok avoids the separate existence check before creating the folder.
    directory = 'docs/plots'
    os.makedirs(directory, exist_ok=True)

    plt.savefig(f'{directory}/regression_performance.png', bbox_inches='tight')
    plt.show()

In [None]:
# Print the metrics and draw the actual-vs-predicted plots for the selected pipeline.
regression_performance(X_train, y_train, X_test, y_test, optimal_regression_pipeline)
regression_evaluation_plots(X_train, y_train, X_test, y_test, optimal_regression_pipeline)

### Pipeline

* We achieved BR2

In [None]:
optimal_regression_pipeline

## Refit pipeline with best features

In [None]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import OrdinalEncoder
from feature_engine.transformation import LogTransformer, PowerTransformer
from feature_engine.outliers import Winsorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

def OptimizeModelPipeline(model):
    """Build the reduced refit pipeline around ``model``.

    Compared with the earlier full pipeline, this version has no correlation
    filter and no model-based feature selection.

    Two defects of the previous version of this function are fixed here:

    * ``model`` is now used as the final step. Before, the argument was
      ignored and ``GradientBoostingRegressor(min_samples_leaf=50,
      n_estimators=140, random_state=0)`` was always used.
    * 'EnclosedPorch', 'GarageYrBlt' and 'WoodDeckSF' are dropped again, as in
      the earlier pipeline. They are not imputed by any step here, so their
      missing values would otherwise reach the scaler and the model.

    Args:
        model: Regressor to fit as the last pipeline step.

    Returns:
        An unfitted ``Pipeline``.
    """
    # Imported locally so this cell does not depend on an earlier cell's import.
    from feature_engine.selection import DropFeatures

    pipeline = Pipeline([
        # Data cleaning
        ('impute_mean',
         MeanMedianImputer(imputation_method='mean',
                           variables=['LotFrontage', 'BedroomAbvGr'])),
        ('impute_median',
         MeanMedianImputer(imputation_method='median',
                           variables=['2ndFlrSF', 'MasVnrArea'])),
        ('impute_categorical',
         CategoricalImputer(imputation_method='frequent',
                            variables=['GarageFinish', 'BsmtFinType1',
                                       'BsmtExposure'])),
        ('drop_features',
         DropFeatures(features_to_drop=['EnclosedPorch', 'GarageYrBlt',
                                        'WoodDeckSF'])),

        # Feature engineering
        ('encoder',
         OrdinalEncoder(encoding_method='arbitrary',
                        variables=['BsmtExposure', 'BsmtFinType1',
                                   'GarageFinish', 'KitchenQual'])),
        ('lt', LogTransformer(variables=['GrLivArea'])),
        ('pt', PowerTransformer(variables=['TotalBsmtSF'])),
        ("Winsoriser_iqr",
         Winsorizer(capping_method='iqr', fold=1.5, tail='both',
                    variables=['TotalBsmtSF', 'GarageArea'])),

        ('feat_scaling', StandardScaler()),

        # Model
        ('model', model),
    ])

    return pipeline


### Split Train & Test Set with the optimized features

In [None]:

from sklearn.model_selection import train_test_split
# Re-create the 80/20 split with random_state=0, using every column except SalePrice as a feature.
# NOTE(review): the section heading says the split uses the optimized features,
# but no column filtering is applied here — confirm whether X should be subset.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['SalePrice'], axis=1),
    df['SalePrice'],
    test_size=0.2,
    random_state=0,
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

### Grid Search CV – Sklearn

In [None]:
initial_models

In [None]:
model_hyperparams

In [None]:
# Rebind the candidate set to Gradient Boosting only for the refit search.
initial_models = {"Gradient_Boosting": GradientBoostingRegressor(random_state=0)}

# Grid explored for that candidate; keys address the pipeline's 'model' step.
model_hyperparams = {
    "Gradient_Boosting": {
        'model__n_estimators': [100, 50, 140],
        'model__learning_rate': [0.1, 0.01, 0.001],
        'model__max_depth': [3, 15, None],
        'model__min_samples_split': [2, 50],
        'model__min_samples_leaf': [1, 50],
        'model__max_leaf_nodes': [None, 50],
    },
}

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Grid-search the refit pipeline with 5-fold CV, scored by R2.
search = HyperparameterOptimizationSearch(models=initial_models, params=model_hyperparams)
search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)

In [None]:
# Summarise the refit search: score table (best mean score first) plus the fitted searches.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

In [None]:
# Row 0 of the descending-sorted summary names the top estimator.
optimal_model = grid_search_summary.iloc[0]['estimator']
optimal_model

In [None]:
# Replace the earlier pipeline with the best estimator from the refit search.
optimal_regression_pipeline = grid_search_pipelines[optimal_model].best_estimator_
optimal_regression_pipeline

---

# Push files to Repo

In [None]:
import joblib
import os

# Versioned folder that receives the datasets, the pipeline and the plots.
version = 'v4'
file_path = 'outputs/ml_pipeline/predict_price/' + version

# Create the folder; any failure (for example, it already exists) is only printed.
try:
    os.makedirs(file_path)
except Exception as err:
    print(err)

## Train Set : features and target

In [None]:
X_train.head()

In [None]:
# Write the training features to the versioned output folder, without the index column.
X_train.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
y_train.head()

In [None]:
# Write the training target to the versioned output folder, without the index column.
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

## Test Set: features and target

In [None]:
X_test.head()

In [None]:
# Write the test features to the versioned output folder, without the index column.
X_test.to_csv(f"{file_path}/X_test.csv", index=False)

In [None]:
y_test.head()

In [None]:
# Write the test target to the versioned output folder, without the index column.
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

### Modelling pipeline

In [None]:
optimal_regression_pipeline

In [None]:
# Serialise the selected pipeline next to the saved datasets.
joblib.dump(value=optimal_regression_pipeline, filename=f"{file_path}/regression_pipeline.pkl")

## Feature importance plot

In [None]:
# Preview the feature-importance bar chart before saving it.
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

In [None]:
# NOTE(review): df_feature_importance is the frame built in the earlier
# "Assess feature importance" cell; nothing visible in this notebook recomputes
# it after the refit — confirm it still matches the pipeline being saved.
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')

# Save next to the versioned pipeline artefacts
plt.savefig(f'{file_path}/features_importance.png', bbox_inches='tight')

# Save to docs folder for documentation; create the folder first so this cell
# does not depend on the evaluation-plot function having been run.
os.makedirs('docs/plots', exist_ok=True)
plt.savefig('docs/plots/features_importance.png', bbox_inches='tight')