# Predict Sale Price 

## Objectives

* Use data cleaning, feature engineering to implement a ML pipeline. 
* Fit, train and evaluate a regression model to predict the sale price of houses in Ames, Iowa. 

## Inputs

* outputs/datasets/collection/house_prices_records.csv
* Instructions on which variables to use for data cleaning and feature engineering that were discovered from previous notebooks. 

## Outputs

* Train set (features and target)
* Test set (features and target)
* ML pipeline to predict sale price
* Feature Importance Plot

## CRISP-DM

* This notebook relates to the Modelling and Evaluation step of Crisp-DM methodology


---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
# Capture the directory the notebook is currently running from.
import os
current_dir = os.getcwd()
# Bare expression: displayed as the cell output in a notebook.
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Move one level up: the parent of `current_dir` becomes the working directory.
# NOTE(review): `current_dir` is refreshed in the next cell, so re-running this
# cell afterwards climbs one more level each time.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory to confirm the change took effect.
current_dir = os.getcwd()
current_dir

# Load Data 

In [None]:
import numpy as np
import pandas as pd
# Load the collected house-price records into a DataFrame.
records_path = "outputs/datasets/collection/house_prices_records.csv"
df = pd.read_csv(records_path)

# Quick sanity check: dimensions and the first three rows.
print(df.shape)
df.head(3)

---

## ML Pipeline with all Data

### Data Cleaning and Feature Engineering 

In [None]:
from sklearn.pipeline import Pipeline

### Data Cleaning
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import MeanMedianImputer

### Feature Engineering
from feature_engine import transformation as vt
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import OrdinalEncoder
from feature_engine.selection import SmartCorrelatedSelection


### Feature Scaling
from sklearn.preprocessing import StandardScaler

### Feature Selection 
from sklearn.feature_selection import SelectFromModel

### ML algorithms
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor



def PipelineOptimization(model):
    """Build the cleaning + feature-engineering + modelling pipeline.

    Parameters
    ----------
    model : estimator
        Regressor used twice: wrapped in ``SelectFromModel`` for feature
        selection and as the final ``"model"`` step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with twelve named steps, in this order: three
        imputers, ordinal encoding, three numerical transformations,
        winsorizing, correlated-feature selection, scaling, model-based
        feature selection and the model itself.
    """
    # --- Data cleaning: columns handled by each imputer ---
    zero_fill_vars = ['2ndFlrSF', 'EnclosedPorch', 'MasVnrArea', 'WoodDeckSF']
    unfinished_fill_vars = ['BsmtFinType1', 'GarageFinish']
    median_fill_vars = ['BedroomAbvGr', 'GarageYrBlt', 'LotFrontage']

    # --- Feature engineering: columns handled by each transformer ---
    ordinal_vars = ['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']
    log_vars = ['1stFlrSF', 'GrLivArea', 'LotArea', 'LotFrontage']
    power_vars = ['BsmtFinSF1', 'BsmtUnfSF', 'GarageArea', 'GrLivArea', 'MasVnrArea', 'OpenPorchSF']
    yeo_johnson_vars = ['1stFlrSF', 'BsmtUnfSF', 'GarageArea', 'GrLivArea', 'OpenPorchSF', 'TotalBsmtSF']
    winsorized_vars = ['GrLivArea']

    steps = [
        ("ArbitraryNumberImputer",
         ArbitraryNumberImputer(arbitrary_number=0, variables=zero_fill_vars)),
        # Step name kept as-is even though the transformer is an imputer:
        # grid-search parameter keys depend on step names.
        ("CategoricalEncoder",
         CategoricalImputer(imputation_method='missing', fill_value='Unf',
                            variables=unfinished_fill_vars)),
        ("MeanMedianImputer",
         MeanMedianImputer(imputation_method='median', variables=median_fill_vars)),
        ("Ordinalencoder",
         OrdinalEncoder(encoding_method='arbitrary', variables=ordinal_vars)),
        ("LogTransformer", vt.LogTransformer(variables=log_vars)),
        ("PowerTransformer", vt.PowerTransformer(variables=power_vars)),
        ("YeoJohnsonTransformer", vt.YeoJohnsonTransformer(variables=yeo_johnson_vars)),
        ("Winsorizer",
         Winsorizer(capping_method='iqr', tail='both', fold=1.5,
                    variables=winsorized_vars)),
        ("SmartCorrelatedSelection",
         SmartCorrelatedSelection(variables=None, method="spearman",
                                  threshold=0.8, selection_method="variance")),
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)

## ML Pipeline for Modelling and Hyperparameter Optimization 

Custom Class for Hyperparameter Optimization

In [None]:
from sklearn.model_selection import GridSearchCV

class HyperparameterOptimizationSearch:
    """Run one ``GridSearchCV`` per candidate model and summarise the scores.

    ``models`` maps a name to an estimator; ``params`` maps the same name to
    the parameter grid searched for that estimator. Each estimator is wrapped
    with ``PipelineOptimization`` before being searched.
    """

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Fitted GridSearchCV objects, keyed by model name (filled by fit()).
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a grid search for every registered model.

        NOTE: ``refit`` is accepted but not forwarded to ``GridSearchCV``,
        so the searches run with GridSearchCV's own ``refit`` default.
        """
        for name in self.keys:
            print(f"\nRunning GridSearchCV for {name} \n")

            candidate_pipeline = PipelineOptimization(self.models[name])
            grid = self.params[name]
            fitted_search = GridSearchCV(
                candidate_pipeline,
                grid,
                cv=cv,
                n_jobs=n_jobs,
                verbose=verbose,
                scoring=scoring,
            )
            fitted_search.fit(X, y)
            self.grid_searches[name] = fitted_search

    def score_summary(self, sort_by='mean_score'):
        """Return ``(summary_frame, grid_searches)``.

        The frame has one row per (model, parameter combination) with the
        min / mean / max / std of the per-split test scores, sorted by
        ``sort_by`` in descending order. Reads ``split{i}_test_score`` for
        ``i in range(search.cv)``, so ``cv`` must be an integer.
        """
        records = []
        for name, fitted_search in self.grid_searches.items():
            candidates = fitted_search.cv_results_['params']

            # One column per CV split, one row per parameter combination.
            per_split = [
                fitted_search.cv_results_["split{}_test_score".format(fold)]
                .reshape(len(candidates), 1)
                for fold in range(fitted_search.cv)
            ]
            score_matrix = np.hstack(per_split)

            for candidate, fold_scores in zip(candidates, score_matrix):
                stats = {
                    'estimator': name,
                    'min_score': min(fold_scores),
                    'max_score': max(fold_scores),
                    'mean_score': np.mean(fold_scores),
                    'std_score': np.std(fold_scores),
                }
                records.append(pd.Series({**candidate, **stats}))

        summary = (
            pd.concat(records, axis=1)
            .T
            .sort_values([sort_by], ascending=False)
        )
        # Fixed columns first, then whatever hyperparameter columns exist.
        leading = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        ordered = leading + [col for col in summary.columns if col not in leading]
        return summary[ordered], self.grid_searches


### Split Train and Test Set 

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; SalePrice is the target.
features = df.drop(['SalePrice'], axis=1)
target = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)


## Grid Search CV - Sklearn

We will use standard hyper parameters to find most suitable algorithm

In [None]:
# Candidate regressors, all with default hyperparameters. Every estimator that
# accepts a seed gets random_state=0; LinearRegression takes none.
seeded_regressors = (
    DecisionTreeRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
    XGBRegressor,
)
models_quick_search = {'LinearRegression': LinearRegression()}
models_quick_search.update(
    {regressor.__name__: regressor(random_state=0) for regressor in seeded_regressors}
)

# Empty grids: the quick search only compares algorithms at their defaults.
params_quick_search = {name: {} for name in models_quick_search}



### Quick GridSearch CV - Regressor

In [None]:
# Compare all candidate algorithms with 5-fold CV, scored by R2.
search = HyperparameterOptimizationSearch(
    models=models_quick_search,
    params=params_quick_search,
)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring='r2')

Check results 

In [None]:

# Score table (sorted by mean CV score, best first) plus the fitted searches.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary 

### Do extensive search on most suitable algorithm to find best hyperparameter configuration

Define model and parameters, for Extensive Search

In [None]:
# Single algorithm carried forward to the extensive search.
models_search = {
    "ExtraTreesRegressor":ExtraTreesRegressor(random_state=0),
}

# Documentation used to understand and assist with parameters: 
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html

# n_estimators: int, default=100
# The number of trees in the forest.


# max_depth: int, default=None
# The maximum depth of each tree; None lets the trees grow without a depth limit.


# min_samples_split: int or float, default=2
# The minimum number of samples required to split an internal node.




# Keys are prefixed with "model__" so they target the pipeline's "model" step.
params_search = {
    "ExtraTreesRegressor":{
        'model__n_estimators': [100,400],
        'model__max_depth': [4,10,None],
        'model__min_samples_split': [7]
    }
}


Extensive GridSearch CV 

In [None]:

# Extensive search: tune only the selected algorithm over its hyperparameter
# grid. The original cell passed the quick-search dictionaries here, which
# silently repeated the default-parameter comparison instead.
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train,
           scoring =  'r2',
           n_jobs=-1, cv=5)

Check results 

In [None]:

# Score table for the extensive search, best mean CV score first.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary 


Check best model 

In [None]:

# First row / first column of the sorted summary is the top estimator's name.
best_model = grid_search_summary.iloc[0,0]
best_model

Parameters for best model

In [None]:
# Hyperparameter combination that achieved the best CV score for that estimator.
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

Define the best regressor

In [None]:
# Pipeline refitted with the best hyperparameters. Every later cell reads
# `best_regressor_pipeline`, so that is the name defined here;
# `regressor_pipeline` is kept as an alias for the name originally used.
best_regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
regressor_pipeline = best_regressor_pipeline
best_regressor_pipeline

### Assess Feature Importance 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# After data cleaning and feature engineering the feature space may change,
# so recover the column names by running X_train through those fitted steps.
data_cleaning_feat_eng_steps = 9  # number of steps that precede "feat_scaling"
preprocessing = Pipeline(best_regressor_pipeline.steps[:data_cleaning_feat_eng_steps])
columns_after_data_cleaning_feat_eng = preprocessing.transform(X_train).columns

# Boolean mask of the columns kept by the model-based feature selection.
selected_mask = best_regressor_pipeline['feat_selection'].get_support()
best_features = columns_after_data_cleaning_feat_eng[selected_mask].to_list()

# Feature / importance table, most important first.
importance_table = pd.DataFrame(data={
    'Feature': columns_after_data_cleaning_feat_eng[selected_mask],
    'Importance': best_regressor_pipeline['model'].feature_importances_,
})
df_feature_importance = importance_table.sort_values(by='Importance', ascending=False)

# Most important features statement and plot
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

### Evaluate Pipeline on Train and Test Sets

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error 
import numpy as np

def regression_performance(X_train, y_train, X_test, y_test,pipeline):
    """Print regression metrics for the train set, then for the test set.

    Delegates the metric computation and printing to ``regression_evaluation``.
    """
    print("Model Evaluation \n")
    subsets = (
        ("* Train Set", X_train, y_train),
        ("* Test Set", X_test, y_test),
    )
    for heading, features, target in subsets:
        print(heading)
        regression_evaluation(features, target, pipeline)

def regression_evaluation(X,y,pipeline):
  """Print R2, MAE, MSE and RMSE of ``pipeline`` predictions on ``X`` vs ``y``.

  Values are rounded to 3 decimals with the built-in ``round`` on a plain
  float, so this works whether the metric functions return a NumPy scalar
  or a Python ``float`` (which has no ``.round()`` method).
  """
  prediction = pipeline.predict(X)
  # Computed once and reused for both the MSE and RMSE lines.
  mse = mean_squared_error(y, prediction)
  print('R2 Score:', round(float(r2_score(y, prediction)), 3))
  print('Mean Absolute Error:', round(float(mean_absolute_error(y, prediction)), 3))
  print('Mean Squared Error:', round(float(mse), 3))
  print('Root Mean Squared Error:', round(float(np.sqrt(mse)), 3))
  print("\n")


def regression_evaluation_plots(X_train, y_train, X_test, y_test,pipeline, alpha_scatter=0.5):
    """Plot actual vs predicted values for the train and test sets side by side.

    Each panel is a scatter of predictions against actual values, with a red
    actual-vs-actual line marking where perfect predictions would fall.
    """
    pred_train = pipeline.predict(X_train)
    pred_test = pipeline.predict(X_test)

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
    panels = (
        ("Train Set", y_train, pred_train),
        ("Test Set", y_test, pred_test),
    )
    for ax, (title, actual, predicted) in zip(axes, panels):
        sns.scatterplot(x=actual, y=predicted, alpha=alpha_scatter, ax=ax)
        sns.lineplot(x=actual, y=actual, color='red', ax=ax)
        ax.set_xlabel("Actual")
        ax.set_ylabel("Predictions")
        ax.set_title(title)

    plt.show()

Evaluate performance 

In [None]:

# Print the metrics, then draw the actual-vs-predicted plots, for the best pipeline.
regression_performance(X_train, y_train, X_test, y_test,best_regressor_pipeline)
regression_evaluation_plots(X_train, y_train, X_test, y_test, best_regressor_pipeline)


Evaluate classifier

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

def confusion_matrix_and_report(X,y,pipeline,label_map):
    """Print a labelled confusion matrix and a classification report.

    ``label_map`` is the list of class names, used for both the matrix labels
    and the report's ``target_names``.
    """
    prediction = pipeline.predict(X)

    print('---  Confusion Matrix  ---')
    # Predictions are passed as ``y_true`` on purpose: that puts predictions
    # on the rows and actual values on the columns, matching the labels below.
    matrix = confusion_matrix(y_true=prediction, y_pred=y)
    actual_labels = ["Actual " + sub for sub in label_map]
    prediction_labels = ["Prediction " + sub for sub in label_map]
    print(pd.DataFrame(matrix, columns=[actual_labels], index=[prediction_labels]))
    print("\n")

    print('---  Classification Report  ---')
    report = classification_report(y, prediction, target_names=label_map)
    print(report, "\n")


def clf_performance(X_train,y_train,X_test,y_test,pipeline,label_map):
    """Print the confusion matrix and report for the train set, then the test set."""
    subsets = (
        ("#### Train Set #### \n", X_train, y_train),
        ("#### Test Set ####\n", X_test, y_test),
    )
    for banner, features, target in subsets:
        print(banner)
        confusion_matrix_and_report(features, target, pipeline, label_map)

### Regressor with PCA

In [None]:

# Build the input for the PCA analysis: every step up to AND including the
# scaler. PCA is driven by variance, so unscaled columns with large ranges
# would dominate the components; the original slice (`steps[:9]`) stopped
# just before "feat_scaling".
pipeline = PipelineOptimization(model=LinearRegression())
step_names = [name for name, _ in pipeline.steps]
n_steps_through_scaling = step_names.index("feat_scaling") + 1
pipeline_pca = Pipeline(pipeline.steps[:n_steps_through_scaling])
df_pca = pipeline_pca.fit_transform(df.drop(['SalePrice'],axis=1))

print(df_pca.shape,'\n', type(df_pca))

Apply PCA separately to the scaled data

In [None]:
import numpy as np
from sklearn.decomposition import PCA

n_components = 21


def pca_components_analysis(df_pca,n_components):
    """Fit a PCA and report how much variance its components explain.

    Prints the total percentage explained by ``n_components`` components and
    plots the per-component and accumulated explained-variance ratios.

    Parameters
    ----------
    df_pca : array-like
        Data to decompose.
    n_components : int
        Number of principal components to fit.
    """
    # Only the fitted variance ratios are needed, so the data is not
    # projected (the original computed an unused `pca.transform` result).
    pca = PCA(n_components=n_components).fit(df_pca)

    ComponentsList = ["Component " + str(number) for number in range(n_components)]
    dfExplVarRatio = pd.DataFrame(
        data= np.round(100 * pca.explained_variance_ratio_ ,3),
        index=ComponentsList,
        columns=['Explained Variance Ratio (%)'])

    dfExplVarRatio['Accumulated Variance'] = dfExplVarRatio['Explained Variance Ratio (%)'].cumsum() 

    PercentageOfDataExplained = dfExplVarRatio['Explained Variance Ratio (%)'].sum()

    print(f"* The {n_components} components explain {round(PercentageOfDataExplained,2)}% of the data \n")
    plt.figure(figsize=(12,5))
    sns.lineplot(data=dfExplVarRatio,  marker="o")
    plt.xticks(rotation=90)
    plt.yticks(np.arange(0, 110, 10))
    plt.show()


pca_components_analysis(df_pca=df_pca,n_components=n_components)


In [None]:

# Repeat the analysis keeping only the first four components.
n_components = 4
pca_components_analysis(df_pca=df_pca,n_components=n_components)

### Rewrite ML Pipeline for Modelling

In [None]:
### PCA
from sklearn.decomposition import PCA

def PipelineOptimization(model):
    """Build the cleaning + feature-engineering + modelling pipeline.

    Parameters
    ----------
    model : estimator
        Regressor used twice: wrapped in ``SelectFromModel`` for feature
        selection and as the final ``"model"`` step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with twelve named steps. It contains no PCA step.
    """
    # --- Data cleaning: columns handled by each imputer ---
    zero_fill_vars = ['2ndFlrSF', 'EnclosedPorch', 'MasVnrArea', 'WoodDeckSF']
    unfinished_fill_vars = ['BsmtFinType1', 'GarageFinish']
    median_fill_vars = ['BedroomAbvGr', 'GarageYrBlt', 'LotFrontage']

    # --- Feature engineering: columns handled by each transformer ---
    ordinal_vars = ['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']
    log_vars = ['1stFlrSF', 'GrLivArea', 'LotArea', 'LotFrontage']
    power_vars = ['BsmtFinSF1', 'BsmtUnfSF', 'GarageArea', 'GrLivArea', 'MasVnrArea', 'OpenPorchSF']
    yeo_johnson_vars = ['1stFlrSF', 'BsmtUnfSF', 'GarageArea', 'GrLivArea', 'OpenPorchSF', 'TotalBsmtSF']
    winsorized_vars = ['GrLivArea']

    steps = [
        ("ArbitraryNumberImputer",
         ArbitraryNumberImputer(arbitrary_number=0, variables=zero_fill_vars)),
        # Step name kept as-is even though the transformer is an imputer:
        # grid-search parameter keys depend on step names.
        ("CategoricalEncoder",
         CategoricalImputer(imputation_method='missing', fill_value='Unf',
                            variables=unfinished_fill_vars)),
        ("MeanMedianImputer",
         MeanMedianImputer(imputation_method='median', variables=median_fill_vars)),
        ("Ordinalencoder",
         OrdinalEncoder(encoding_method='arbitrary', variables=ordinal_vars)),
        ("LogTransformer", vt.LogTransformer(variables=log_vars)),
        ("PowerTransformer", vt.PowerTransformer(variables=power_vars)),
        ("YeoJohnsonTransformer", vt.YeoJohnsonTransformer(variables=yeo_johnson_vars)),
        ("Winsorizer",
         Winsorizer(capping_method='iqr', tail='both', fold=1.5,
                    variables=winsorized_vars)),
        ("SmartCorrelatedSelection",
         SmartCorrelatedSelection(variables=None, method="spearman",
                                  threshold=0.8, selection_method="variance")),
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)

### Grid Search CV – Sklearn

In [None]:
# Reminder of the train/test dimensions before searching again.
print("* Train set:", X_train.shape, y_train.shape, "\n* Test set:",  X_test.shape, y_test.shape)

### Use standard hyperparameters to find most suitable model

In [None]:
# Candidate regressors at default hyperparameters; seedable ones use random_state=0.
seeded_regressors = (
    DecisionTreeRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
    XGBRegressor,
)
models_quick_search = {'LinearRegression': LinearRegression()}
models_quick_search.update(
    {regressor.__name__: regressor(random_state=0) for regressor in seeded_regressors}
)

# One empty parameter grid per model: nothing is tuned in the quick search.
params_quick_search = {name: {} for name in models_quick_search}

Do a quick optimization search

In [None]:
# Compare all candidate algorithms again with 5-fold CV, scored by R2.
quick_search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
quick_search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)

Check results

In [None]:
# Score table and fitted searches for this quick search (overwrites the earlier ones).
grid_search_summary, grid_search_pipelines = quick_search.score_summary(sort_by='mean_score')
grid_search_summary

Define the best regressor

In [None]:

# Actually define the best regressor from the search above: take the
# top-ranked estimator in the summary and its refitted pipeline. The original
# cell only displayed a name that had not been assigned at this point.
best_model = grid_search_summary.iloc[0, 0]
best_regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
best_regressor_pipeline

### Refit pipeline with best features

Rewrite ML pipeline for Data Cleaning, Feature Engineering and Modeling

In [None]:

# Display the features selected in the earlier feature-importance cell.
best_features

In [None]:
### ADD ONLY THE BEST FEATURES TO THE PIPELINE

def PipelineOptimization(model):
    """Build the cleaning + feature-engineering + modelling pipeline.

    Parameters
    ----------
    model : estimator
        Regressor used twice: wrapped in ``SelectFromModel`` for feature
        selection and as the final ``"model"`` step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with twelve named steps.

    NOTE(review): this is the same full pipeline as before. It has not been
    reduced to the best features, and its transformers still name columns
    explicitly. Confirm every listed column survives the ``best_features``
    filter applied to X_train/X_test, or trim the variable lists accordingly.
    """
    # --- Data cleaning: columns handled by each imputer ---
    zero_fill_vars = ['2ndFlrSF', 'EnclosedPorch', 'MasVnrArea', 'WoodDeckSF']
    unfinished_fill_vars = ['BsmtFinType1', 'GarageFinish']
    median_fill_vars = ['BedroomAbvGr', 'GarageYrBlt', 'LotFrontage']

    # --- Feature engineering: columns handled by each transformer ---
    ordinal_vars = ['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']
    log_vars = ['1stFlrSF', 'GrLivArea', 'LotArea', 'LotFrontage']
    power_vars = ['BsmtFinSF1', 'BsmtUnfSF', 'GarageArea', 'GrLivArea', 'MasVnrArea', 'OpenPorchSF']
    yeo_johnson_vars = ['1stFlrSF', 'BsmtUnfSF', 'GarageArea', 'GrLivArea', 'OpenPorchSF', 'TotalBsmtSF']
    winsorized_vars = ['GrLivArea']

    steps = [
        ("ArbitraryNumberImputer",
         ArbitraryNumberImputer(arbitrary_number=0, variables=zero_fill_vars)),
        # Step name kept as-is even though the transformer is an imputer:
        # grid-search parameter keys depend on step names.
        ("CategoricalEncoder",
         CategoricalImputer(imputation_method='missing', fill_value='Unf',
                            variables=unfinished_fill_vars)),
        ("MeanMedianImputer",
         MeanMedianImputer(imputation_method='median', variables=median_fill_vars)),
        ("Ordinalencoder",
         OrdinalEncoder(encoding_method='arbitrary', variables=ordinal_vars)),
        ("LogTransformer", vt.LogTransformer(variables=log_vars)),
        ("PowerTransformer", vt.PowerTransformer(variables=power_vars)),
        ("YeoJohnsonTransformer", vt.YeoJohnsonTransformer(variables=yeo_johnson_vars)),
        ("Winsorizer",
         Winsorizer(capping_method='iqr', tail='both', fold=1.5,
                    variables=winsorized_vars)),
        ("SmartCorrelatedSelection",
         SmartCorrelatedSelection(variables=None, method="spearman",
                                  threshold=0.8, selection_method="variance")),
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)

Then filter using the best features

### Split Train and Test sets with only the best features

In [None]:

from sklearn.model_selection import train_test_split
# Same 80/20 split and seed as before, so the rows in each set are unchanged.
all_features = df.drop(['SalePrice'], axis=1)
sale_price = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    sale_price,
    test_size=0.2,
    random_state=0,
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Keep only the columns listed in `best_features` in both sets.
# NOTE(review): the pipeline's transformers reference columns by name; confirm
# none of them is dropped by this filter.
X_train = X_train.filter(best_features)
X_test = X_test.filter(best_features)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
X_train.head(3)

### Grid Search CV - SKLearn

In [None]:

# Display the model dictionary defined for the extensive search.
models_search

In [None]:
# Display the best hyperparameters found earlier, for reference.
best_parameters

In [None]:

# Single-point grid: each hyperparameter is pinned to one value, so the search
# only refits and cross-validates this one configuration.
# NOTE(review): these values are typed by hand; confirm they match `best_parameters`.
extra_trees_grid = {
    'model__max_depth': [None],
    'model__min_samples_split': [8],
    'model__n_estimators': [300],
}
params_search = {'ExtraTreesRegressor': extra_trees_grid}
params_search

In [None]:
from sklearn.metrics import make_scorer, recall_score
# Refit and cross-validate the chosen model on the reduced feature set.
search = HyperparameterOptimizationSearch(
    models=models_search,
    params=params_search,
)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring='r2')


Check results

In [None]:
# Score table and fitted searches for the best-features run.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

Check the Best Model

In [None]:
# Name of the top-ranked estimator (first row, first column of the summary).
best_model = grid_search_summary.iloc[0,0]
best_model

Define the best regression pipeline

In [None]:
# Pipeline refitted on the training data with the best hyperparameters.
best_regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
best_regressor_pipeline

### Assess Feature Importance

In [None]:

# Recover the names of the columns the model was actually trained on.
# The step count is derived from the pipeline itself rather than hard-coded:
# every step before "feat_scaling" is data cleaning / feature engineering.
# (The original used 1, which ignores the columns dropped by later steps and
# gives a column list whose length differs from `feature_importances_`.)
step_names = [name for name, _ in best_regressor_pipeline.steps]
data_cleaning_feat_eng_steps = step_names.index('feat_scaling')

if data_cleaning_feat_eng_steps:
    columns_after_data_cleaning_feat_eng = (
        Pipeline(best_regressor_pipeline.steps[:data_cleaning_feat_eng_steps])
        .transform(X_train)
        .columns)
else:
    # No preprocessing steps: the model sees the input columns as they are.
    columns_after_data_cleaning_feat_eng = X_train.columns

# If the pipeline has a model-based selection step, keep only the columns it retained.
if 'feat_selection' in best_regressor_pipeline.named_steps:
    selected_mask = best_regressor_pipeline['feat_selection'].get_support()
    columns_after_data_cleaning_feat_eng = columns_after_data_cleaning_feat_eng[selected_mask]

best_features = columns_after_data_cleaning_feat_eng.to_list()

# create DataFrame to display feature importance
df_feature_importance = (pd.DataFrame(data={
          'Feature': columns_after_data_cleaning_feat_eng,
          'Importance': best_regressor_pipeline['model'].feature_importances_})
  .sort_values(by='Importance', ascending=False)
  )

# Most important features statement and plot
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

df_feature_importance.plot(kind='bar',x='Feature',y='Importance')
plt.show()


# Push files to Repo

In [None]:
import os

# Versioned folder that receives the pipeline and the feature-importance plot.
# `file_path` is read by the save cells below. The original `try:` body held
# only comments, which is a syntax error and left `file_path` undefined.
version = 'v1'
file_path = f'outputs/ml_pipeline/predict_sale_price/{version}'

try:
    # exist_ok=True makes re-running the notebook harmless.
    os.makedirs(name=file_path, exist_ok=True)
except OSError as e:
    print(e)

### Save datasets and models

We will generate the following files

* Train set
* Test set
* Modeling pipeline
* Features importance plot

### Train Set : features and target

### Test Set: features and target

### Modelling pipeline

ML pipeline for predicting Sale Price

In [None]:
# Display the pipeline about to be saved. The variable defined earlier in the
# notebook is `best_regressor_pipeline`; `best_pipeline_regressor` never existed.
best_regressor_pipeline

In [None]:
# joblib is not imported anywhere else in the notebook, so import it here.
import joblib

# Persist the fitted pipeline (the notebook's variable is `best_regressor_pipeline`).
joblib.dump(value=best_regressor_pipeline, filename=f"{file_path}/regression_pipeline.pkl")

### Feature importance Plot

In [None]:

# Preview the feature-importance bar chart before saving it.
df_feature_importance.plot(kind='bar',x='Feature',y='Importance')
plt.show()


In [None]:
# Redraw the chart and write it to disk; the figure is saved instead of shown.
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.savefig(f'{file_path}/features_importance.png', bbox_inches='tight')

---

NOTE

* You may add as many sections as you want, as long as they support your project workflow.
* All notebook's cells should be run top-down (you can't create a dynamic wherein a given point you need to go back to a previous cell to execute some task, like go back to a previous cell and refresh a variable content)

---

* If you do not need to push files to Repo, you may replace this section with "Conclusions and Next Steps" and state your conclusions and next steps.

In [None]:
import os

# Template placeholder for creating an extra folder. A `try:` whose body holds
# only comments is a syntax error, so an explicit no-op keeps the cell runnable
# until a real path is filled in.
try:
    # create here your folder
    # os.makedirs(name='')
    pass
except Exception as e:
    print(e)
