# **Regression**

## Objectives

*   Fit and evaluate a regression model to predict the sales price, of inherited properties, in Ames, Iowa


## Inputs

* outputs/datasets/collection/HousePricesRecords.csv
* Instructions on which variables to use for data cleaning and feature engineering. These are found in their respective notebooks.

## Outputs

* Train set (features and target)
* Test set (features and target)
* ML pipeline to predict sale price
* Labels Map
* Feature Importance Plot

---

# Change working directory

Accessing the current directory

In [2]:
import os
current_dir = os.getcwd()
current_dir

'/workspaces/milestone-project-heritage-housing-issues/jupyter_notebooks'

Making sure working in the child of the workspace directory

In [3]:
os.chdir('/workspaces/milestone-project-heritage-housing-issues')
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [4]:
current_dir = os.getcwd()
current_dir

'/workspaces/milestone-project-heritage-housing-issues'

---

# Load Data

In [16]:
import numpy as np
import pandas as pd
df = (pd.read_csv("outputs/datasets/collection/HousePricesRecords.csv")
        .drop(labels=['2ndFlrSF', 'BedroomAbvGr', 'BsmtExposure',
                    'BsmtFinSF1', 'BsmtFinType1', 'BsmtUnfSF',
                    'GarageFinish', 'LotArea', 'LotFrontage',
                    'MasVnrArea', 'OpenPorchSF', 'OverallCond',
                    'YearRemodAdd', 'WoodDeckSF', 'EnclosedPorch'], axis=1))
df["GarageYrBlt"] = np.where(df["GarageArea"] == 0, df['GarageYrBlt'].fillna(0), None)
df['GarageYrBlt'] = np.where(df['YearBuilt'] > 1995, df['GarageYrBlt'].fillna(df['YearBuilt']), df['GarageYrBlt'].fillna(1979))
df['KitchenQual'] = df['KitchenQual'].replace({'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4})
df['GarageYrBlt'] = df['GarageYrBlt'].astype(int)
print(df.shape)
df.head()

(1460, 9)


Unnamed: 0,1stFlrSF,GarageArea,GarageYrBlt,GrLivArea,KitchenQual,OverallQual,TotalBsmtSF,YearBuilt,SalePrice
0,856,548,2003,1710,3,7,856,2003,208500
1,1262,460,1979,1262,2,6,1262,1976,181500
2,920,608,2001,1786,3,7,920,2001,223500
3,961,642,1979,1717,3,7,756,1915,140000
4,1145,836,2000,2198,3,8,1145,2000,250000


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   1stFlrSF     1460 non-null   int64
 1   GarageArea   1460 non-null   int64
 2   GarageYrBlt  1460 non-null   int64
 3   GrLivArea    1460 non-null   int64
 4   KitchenQual  1460 non-null   int64
 5   OverallQual  1460 non-null   int64
 6   TotalBsmtSF  1460 non-null   int64
 7   YearBuilt    1460 non-null   int64
 8   SalePrice    1460 non-null   int64
dtypes: int64(9)
memory usage: 102.8 KB


# MP Pipeline: Regressor

## Create ML pipeline

In [1]:
from sklearn.pipeline import Pipeline

# Feature Engineering
from feature_engine.transformation import LogTransformer
from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.transformation import BoxCoxTransformer
from feature_engine.outliers import Winsorizer
from feature_engine.selection import SmartCorrelatedSelection
from feature_engine import transformation as vt

# Feat Scaling
from sklearn.preprocessing import StandardScaler

# Feat Selection
from sklearn.feature_selection import SelectFromModel

# ML algorithms
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor


def PipelineOptimization(model):
    pipeline = Pipeline([
    ('log_transformer', vt.LogTransformer(
        variables=['FirFlrSF', 'GrLivArea', 'KitchenQual', 'YearBuilt'], base='e')),
    ('yeo_johnson_transformer', vt.YeoJohnsonTransformer(variables=['GarageYrBlt'])),
    ('box_cox_transformer', vt.BoxCoxTransformer(variables=['OverallQual'])),
    ('winsorizer_iqr', Winsorizer(capping_method='iqr', fold=1.5, tail='both', 
        variables = ['FirFlrSF', 'GarageArea', 'GarageYrBlt',
                    'GrLivArea', 'OverallQual', 'TotalBsmtSF',
                    'YearBuilt'])),
    ('SmartCorrelatedSelection', SmartCorrelatedSelection(variables=None,
        method="spearman", threshold=0.6, selection_method="variance")),

    ('feat_scaling', StandardScaler()),

    ('feat_selection',  SelectFromModel(model)),

    ('model', model),
    ])

    return pipeline_base


  from pandas import MultiIndex, Int64Index


Custom Class for hyperparameter optimisation

* Custom class from the Code Institute Wakthrough Project 02

In [2]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")
            model = PipelineOptimization(self.models[key])

            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k in self.grid_searches:
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]
                scores.append(r.reshape(len(params), 1))

            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score',
                   'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns], self.grid_searches