In [None]:
import lib._util.visualplot as vp

# Pre-processing
from lib._class.DFSimpleImputer import DFSimpleImputer
from lib._class.DFDuplicateRemoval import DFDuplicateRemoval
from lib._class.DFPositiveTransformer import DFPositiveTransformer
from lib._class.DFBoxCoxTransformer import DFBoxCoxTransformer

# Feature selection
from lib._class.DFRegressionThreshold import DFRegressionThreshold

# Feature encoding
from lib._class.DFOrdinalEncoder import DFOrdinalEncoder

# Feature scaling
from lib._class.DFStandardScaler import DFStandardScaler
from lib._class.DFMinMaxScaler import DFMinMaxScaler

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

# Scikit-Learn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Plotly
import plotly.express as px

# Constants

In [None]:
# Directory holding the input CSVs (train.csv, test.csv, sample_submission.csv).
# The trailing slash matters: file names are appended with plain f-strings.
SOURCE_PATH_DATA = 'resources/data/'
# Directory handed to the `vp` plotting helpers as `out_path`.
OUT_PATH_GRAPH   = 'resources/output/graph/'

# Phase 1 - Data Loading
- Reference: https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
- MSSubClass: The building class
- MSZoning: The general zoning classification
- LotFrontage: Linear feet of street connected to property
- LotArea: Lot size in square feet
- Street: Type of road access
- Alley: Type of alley access
- LotShape: General shape of property
- LandContour: Flatness of the property
- Utilities: Type of utilities available
- LotConfig: Lot configuration
- LandSlope: Slope of property
- Neighborhood: Physical locations within Ames city limits
- Condition1: Proximity to main road or railroad
- Condition2: Proximity to main road or railroad (if a second is present)
- BldgType: Type of dwelling
- HouseStyle: Style of dwelling
- OverallQual: Overall material and finish quality
- OverallCond: Overall condition rating
- YearBuilt: Original construction date
- YearRemodAdd: Remodel date
- RoofStyle: Type of roof
- RoofMatl: Roof material
- Exterior1st: Exterior covering on house
- Exterior2nd: Exterior covering on house (if more than one material)
- MasVnrType: Masonry veneer type
- MasVnrArea: Masonry veneer area in square feet
- ExterQual: Exterior material quality
- ExterCond: Present condition of the material on the exterior
- Foundation: Type of foundation
- BsmtQual: Height of the basement
- BsmtCond: General condition of the basement
- BsmtExposure: Walkout or garden level basement walls
- BsmtFinType1: Quality of basement finished area
- BsmtFinSF1: Type 1 finished square feet
- BsmtFinType2: Quality of second finished area (if present)
- BsmtFinSF2: Type 2 finished square feet
- BsmtUnfSF: Unfinished square feet of basement area
- TotalBsmtSF: Total square feet of basement area
- Heating: Type of heating
- HeatingQC: Heating quality and condition
- CentralAir: Central air conditioning
- Electrical: Electrical system
- 1stFlrSF: First Floor square feet
- 2ndFlrSF: Second floor square feet
- LowQualFinSF: Low quality finished square feet (all floors)
- GrLivArea: Above grade (ground) living area square feet
- BsmtFullBath: Basement full bathrooms
- BsmtHalfBath: Basement half bathrooms
- FullBath: Full bathrooms above grade
- HalfBath: Half baths above grade
- BedroomAbvGr: Number of bedrooms above basement level
- KitchenAbvGr: Number of kitchens
- KitchenQual: Kitchen quality
- TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
- Functional: Home functionality rating
- Fireplaces: Number of fireplaces
- FireplaceQu: Fireplace quality
- GarageType: Garage location
- GarageYrBlt: Year garage was built
- GarageFinish: Interior finish of the garage
- GarageCars: Size of garage in car capacity
- GarageArea: Size of garage in square feet
- GarageQual: Garage quality
- GarageCond: Garage condition
- PavedDrive: Paved driveway
- WoodDeckSF: Wood deck area in square feet
- OpenPorchSF: Open porch area in square feet
- EnclosedPorch: Enclosed porch area in square feet
- 3SsnPorch: Three season porch area in square feet
- ScreenPorch: Screen porch area in square feet
- PoolArea: Pool area in square feet
- PoolQC: Pool quality
- Fence: Fence quality
- MiscFeature: Miscellaneous feature not covered in other categories
- MiscVal: Dollar value of miscellaneous feature
- MoSold: Month Sold
- YrSold: Year Sold
- SaleType: Type of sale
- SaleCondition: Condition of sale
- SalePrice: The property's sale price in dollars. This is the target variable that you're trying to predict.

In [None]:
def load_trainset(path=None, chunksize=50_000):
    """Load the training set into a single DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to ``train.csv`` under ``SOURCE_PATH_DATA``
        (resolved at call time, so the constant may be changed beforehand).
    chunksize : int, default 50_000
        Number of rows parsed per chunk; the chunks are concatenated before
        returning.

    Returns
    -------
    pandas.DataFrame
        Every row of the CSV, in file order.
    """
    if path is None:
        path = f'{SOURCE_PATH_DATA}train.csv'

    df_chunks = pd.read_csv(path, sep=',', chunksize=chunksize)
    return pd.concat(df_chunks)

In [None]:
# Raw training data as read from disk; later cells reassign/mutate `data_df`
# stage by stage, so re-run this cell first to start over from the raw frame.
data_df = load_trainset()

data_df.shape

In [None]:
data_df.head()

In [None]:
vp.faststat(data_df)

In [None]:
# Fill values taken from the complete training frame (before any column or row
# is removed): column mean for numeric features, first mode for object
# features. They are used later to fill test-set gaps the fitted imputers
# do not cover.
mean_dict = {
    name: series.mean()
    for name, series in data_df.select_dtypes('number').items()
}
mode_dict = {
    name: series.mode()[0]
    for name, series in data_df.select_dtypes('object').items()
}

len(mean_dict), len(mode_dict)

In [None]:
# Drop ID feature (reassign rather than mutate the frame in place)
data_df = data_df.drop(columns=['Id'])

# Feature stats: missing-value count, missing-value ratio and dtype per column
stat_df = data_df.isna().sum().to_frame(name='N/A Count')
stat_df['N/A Ratio'] = stat_df['N/A Count'] / len(data_df)
stat_df = stat_df.merge(data_df.dtypes.to_frame(name='Type'), left_index=True, right_index=True, how='left')

# Drop features more than 40% N/A
sparse_columns = stat_df.index[stat_df['N/A Ratio'] > .4].tolist()
data_df = data_df.drop(columns=sparse_columns)

# Data imputation: numeric columns with gaps use the 'mean' strategy, object
# columns with gaps use 'most_frequent'. Both imputers stay in notebook scope
# because the test set is transformed with them in Phase 8.
# `.isna().any()` is the vectorised form of the check (no Python-level loop
# over every cell).
columns      = [x for x in data_df.select_dtypes('number').columns if data_df[x].isna().any()]
mean_imputer = DFSimpleImputer(columns=columns, strategy='mean')

columns      = [x for x in data_df.select_dtypes('object').columns if data_df[x].isna().any()]
mode_imputer = DFSimpleImputer(columns=columns, strategy='most_frequent')

steps = [
    ('mean_imputer', mean_imputer),
    ('mode_imputer', mode_imputer),
]
data_df = Pipeline(steps).fit_transform(data_df)

In [None]:
vp.faststat(data_df)

###### Histogram

In [None]:
vp.histogram(data_df,
             bin_algo='count',
             max_col=4,
             title='Phase 1 - Histogram',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 4000})

###### Box

In [None]:
vp.box(data_df,
       max_col=4,
       title='Phase 1 - Box',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={
           'height': 2000,
           'legend_orientation': 'h'
       })

# Phase 2 - Data Preparation
- Remove outliers

###### Correlation Matrix

In [None]:
vp.corrmat(data_df,
           absolute=True,
           matrix_type='upper',
           title='Phase 2 - Correlation Matrix',
           out_path=OUT_PATH_GRAPH,
           heatmap_kwargs={
               'reversescale': True
           })

###### Scatter

In [None]:
vp.scatter(data_df,
           xy_tuples=[(x, 'SalePrice') for x in data_df.select_dtypes('number').columns if x != 'SalePrice'],
           max_col=4,
           title='Phase 2 - Scatter',
           out_path=OUT_PATH_GRAPH,
           scatter_kwargs={
               'trendline': 'ols',
               'trendline_color_override': 'red'
           },
           layout_kwargs={
               'height': 2000,
               'legend_orientation': 'h'
           })

In [None]:
# Remove outliers: a row is dropped when its value in a listed feature equals
# one of the points given for that feature.
# NOTE(review): the values look hand-picked from the Phase 2 scatter plots —
# confirm if the data source changes.
outlier_points = {
    'LotFrontage':   [313],
    'LotArea':       [115_149, 159_000, 164_660, 215_245],
    'MasVnrArea':    [1600],
    'BsmtFinSF1':    [5644],
    'TotalBsmtSF':   [6110],
    '1stFlrSF':      [4692],
    'GrLivArea':     [4676, 5642],
    'GarageArea':    [1248, 1356, 1390, 1418],
    'OpenPorchSF':   [523],
    'EnclosedPorch': [552],
}

for outlier_column, outlier_values in outlier_points.items():
    keep_mask = ~data_df[outlier_column].isin(outlier_values)
    data_df = data_df[keep_mask]

###### Scatter

In [None]:
vp.scatter(data_df,
           xy_tuples=[(x, 'SalePrice') for x in data_df.select_dtypes('number').columns if x != 'SalePrice'],
           max_col=4,
           title='Phase 2 - Scatter - No Outlier',
           out_path=OUT_PATH_GRAPH,
           scatter_kwargs={
               'trendline': 'ols',
               'trendline_color_override': 'red'
           },
           layout_kwargs={
               'height': 2000,
               'legend_orientation': 'h'
           })

###### Correlation Matrix

In [None]:
vp.corrmat(data_df,
           absolute=True,
           matrix_type='upper',
           title='Phase 2 - Correlation Matrix - No Outlier',
           out_path=OUT_PATH_GRAPH,
           heatmap_kwargs={
               'reversescale': True
           })

# Phase 3 - Data Preparation
- Categorical encoding

###### Box

In [None]:
vp.box_categorical(data_df,
                   y='SalePrice',
                   title='Phase 3 - Box',
                   out_path=OUT_PATH_GRAPH,
                   max_col=4,
                   layout_kwargs={'height': 2500})

###### Bar

In [None]:
vp.meandist(data_df,
            y='SalePrice',
            title='Phase 3 - Bar - Mean Distribution',
            out_path=OUT_PATH_GRAPH,
            max_col=2,
            layout_kwargs={'height': 6500})

In [None]:
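# Ordinal encoding based on mean: each category is mapped to an integer level,
# with categories of similar mean sharing a level (presumably the mean SalePrice
# from the Phase 3 mean-distribution charts — confirm when updating the levels).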
mapper_dict = {
    'MSZoning': {
        'C (all)': 1,
        'RM': 2,
        'RH': 2,
        'RL': 3,
        'FV': 3,
    },
    'Street': {
        'Grvl': 1,
        'Pave': 2,
    },
    'LotShape': {
        'Reg': 1,
        'IR1': 2,
        'IR3': 2,
        'IR2': 3,
    },
    'LandContour': {
        'Bnk': 1,
        'Lvl': 2,
        'Low': 2,
        'HLS': 3,
    },
    'Utilities': {
        'NoSeWa': 1,
        'AllPub': 2,
    },
    'LotConfig': {
        'Inside': 1,
        'FR2': 1,
        'Corner': 1,
        'FR3': 2,
        'CulDSac': 2,
    },
    'LandSlope': {
        'Sev': 1,
        'Gtl': 2,
        'Mod': 3,
    },
    'Neighborhood': {
        'MeadowV': 1,
        'IDOTRR': 1,
        'BrDale': 1,
        'BrkSide': 2,
        'Edwards': 2,
        'OldTown': 2,
        'Sawyer': 2,
        'Blueste': 2,
        'SWISU': 2,
        'NPkVill': 2,
        'NAmes': 2,
        'Mitchel': 2,
        'SawyerW': 3,
        'NWAmes': 3,
        'Gilbert': 3,
        'Blmngtn': 3,
        'CollgCr': 3,
        'ClearCr': 3,
        'Crawfor': 3,
        'Somerst': 3,
        'Veenker': 3,
        'Timber': 3,
        'StoneBr': 4,
        'NridgHt': 4,
        'NoRidge': 4,
    },
    'Condition1': {
        'Artery': 1,
        'RRAe': 1,
        'Feedr': 1,
        'Norm': 2,
        'RRAn': 2,
        'RRNe': 3,
        'RRNn': 3,
        'PosN': 3,
        'PosA': 3,
    },
    'Condition2': {
        'RRNn': 1,
        'Artery': 1,
        'Feedr': 1,
        'RRAn': 1,
        'Norm': 2,
        'RRAe': 2,
        'PosA': 3,
        'PosN': 3,
    },
    'BldgType': {
        '2fmCon': 1,
        'Duplex': 1,
        'Twnhs': 1,
        'TwnhsE': 2,
        '1Fam': 2,
    },
    'HouseStyle': {
        '1.5Unf': 1,
        'SFoyer': 1,
        '1.5Fin': 1,
        '2.5Unf': 1,
        'SLvl': 2,
        '1Story': 2,
        '2Story': 3,
        '2.5Fin': 3,
    },
    'RoofStyle': {
        'Gambrel': 1,
        'Gable': 2,
        'Mansard': 2,
        'Flat': 2,
        'Hip': 3,
        'Shed': 3,
    },
    'RoofMatl': {
        'Roll': 1,
        'CompShg': 2,
        'Tar&Grv': 2,
        'Metal': 2,
        'WdShake': 3,
        'Membran': 3,
        'WdShngl': 4,
    },
    'Exterior1st': {
        'BrkComm': 1,
        'AsphShn': 2,
        'CBlock': 2,
        'AsbShng': 2,
        'Wd Sdng': 3,
        'MetalSd': 3,
        'WdShing': 3,
        'Stucco': 4,
        'HdBoard': 4,
        'Plywood': 4,
        'BrkFace': 5,
        'VinylSd': 5,
        'CemntBd': 5,
        'Stone': 6,
        'ImStucc': 6,
    },
    'Exterior2nd': {
        'CBlock': 1,
        'AsbShng': 1,
        'Brk Cmn': 2,
        'AsphShn': 3,
        'Wd Sdng': 3,
        'MetalSd': 3,
        'Stucco': 3,
        'Stone': 4,
        'Wd Shng': 4,
        'Plywood': 4,
        'HdBoard': 4,
        'BrkFace': 5,
        'VinylSd': 6,
        'CmentBd': 6,
        'ImStucc': 6,
        'Other': 7,
    },
    'MasVnrType': {
        'BrkCmn': 1,
        'None': 1,
        'BrkFace': 2,
        'Stone': 3,
    },
    'ExterQual': {
        'Fa': 1,
        'TA': 2,
        'Gd': 3,
        'Ex': 4,
    },
    'ExterCond': {
        'Po': 1,
        'Fa': 1,
        'Gd': 2,
        'TA': 2,
        'Ex': 2,
    },
    'Foundation': {
        'Slab': 1,
        'BrkTil': 2,
        'CBlock': 2,
        'Stone': 3,
        'Wood': 3,
        'PConc': 4,
    },
    'BsmtQual': {
        'Fa': 1,
        'TA': 1,
        'Gd': 2,
        'Ex': 3,
    },
    'BsmtCond': {
        'Po': 1,
        'Fa': 2,
        'TA': 3,
        'Gd': 4,
    },
    'BsmtExposure': {
        'No': 1,
        'Mn': 2,
        'Av': 2,
        'Gd': 3,
    },
    'BsmtFinType1': {
        'Rec': 1,
        'BLQ': 1,
        'LwQ': 1,
        'ALQ': 2,
        'Unf': 2,
        'GLQ': 3,
    },
    'BsmtFinType2': {
        'BLQ': 1,
        'Rec': 2,
        'LwQ': 2,
        'GLQ': 3,
        'Unf': 3,
        'ALQ': 4,
    },
    'Heating': {
        'Floor': 1,
        'Grav': 1,
        'Wall': 2,
        'OthW': 3,
        'GasW': 4,
        'GasA': 5,
    },
    'HeatingQC': {
        'Po': 1,
        'Fa': 2,
        'TA': 2,
        'Gd': 2,
        'Ex': 3,
    },
    'CentralAir': {
        'N': 1,
        'Y': 2,
    },
    'Electrical': {
        'Mix': 1,
        'FuseP': 2,
        'FuseF': 2,
        'FuseA': 2,
        'SBrkr': 3,
    },
    'KitchenQual': {
        'Fa': 1,
        'TA': 1,
        'Gd': 2,
        'Ex': 3,
    },
    'Functional': {
        'Maj2': 1,
        'Sev': 2,
        'Min2': 3,
        'Min1': 3,
        'Maj1': 3,
        'Mod': 4,
        'Typ': 4,
    },
    'GarageType': {
        'CarPort': 1,
        'Detchd': 2,
        '2Types': 2,
        'Basment': 2,
        'Attchd': 3,
        'BuiltIn': 4,
    },
    'GarageFinish': {
        'Unf': 1,
        'RFn': 2,
        'Fin': 3,
    },
    'GarageQual': {
        'Po': 1,
        'Fa': 1,
        'TA': 2,
        'Gd': 3,
        'Ex': 3,
    },
    'GarageCond': {
        'Po': 1,
        'Fa': 1,
        'Ex': 1,
        'Gd': 2,
        'TA': 2,
    },
    'PavedDrive': {
        'N': 1,
        'P': 1,
        'Y': 2,
    },
    'SaleType': {
        'Oth': 1,
        'ConLw': 1,
        'COD': 1,
        'ConLD': 1,
        'WD': 2,
        'ConLI': 3,
        'CWD': 3,
        'Con': 4,
        'New': 4,
    },
    'SaleCondition': {
        'AdjLand': 1,
        'Abnorml': 1,
        'Family': 1,
        'Alloca': 1,
        'Normal': 1,
        'Partial': 2,
    }
}

# Fit the encoder on the training frame and keep the fitted instance: the same
# object is applied to the test set in the Phase 8 pre-process pipeline.
ordinal_encoder = DFOrdinalEncoder(mapper_dict)
data_df         = ordinal_encoder.fit_transform(data_df)

In [None]:
vp.faststat(data_df)

###### Scatter

In [None]:
vp.scatter(data_df,
           xy_tuples=[(x, 'SalePrice') for x in mapper_dict.keys()],
           max_col=4,
           title='Phase 3 - Scatter',
           out_path=OUT_PATH_GRAPH,
           scatter_kwargs={
               'trendline': 'ols',
               'trendline_color_override': 'red'
           },
           layout_kwargs={
               'height': 2000,
               'legend_orientation': 'h'
           })

# Phase 4 - Data Preparation
- Feature selection

In [None]:
# Remove features with low r2 or high p-value
# 'SalePrice' is passed as the target; the cut-offs are whatever
# DFRegressionThreshold uses by default (not visible here — TODO confirm).
# The fitted selector is not kept: the test set is aligned later by selecting
# `data_df.columns`.
data_df = DFRegressionThreshold('SalePrice').fit_transform(data_df)

data_df.shape

In [None]:
vp.faststat(data_df)

###### Scatter

In [None]:
vp.scatter(data_df,
           xy_tuples=[(x, 'SalePrice') for x in data_df.columns if x != 'SalePrice'],
           max_col=4,
           title='Phase 4 - Scatter',
           out_path=OUT_PATH_GRAPH,
           scatter_kwargs={
               'trendline': 'ols',
               'trendline_color_override': 'red'
           },
           layout_kwargs={
               'height': 1000,
               'legend_orientation': 'h'
           })

###### Correlation Matrix

In [None]:
vp.corrmat(data_df,
           absolute=True,
           matrix_type='upper',
           title='Phase 4 - Correlation Matrix',
           out_path=OUT_PATH_GRAPH,
           heatmap_kwargs={
               'reversescale': True
           })

In [None]:
# Remove high correlated features
# Non-mutating drop with errors='ignore': the cell can be re-run, and it does
# not fail if the upstream feature selection already removed one of these.
data_df = data_df.drop(
    columns=['TotRmsAbvGrd', '1stFlrSF', 'GarageArea', 'GarageYrBlt'],
    errors='ignore',
)

data_df.shape

In [None]:
vp.faststat(data_df)

###### Correlation Matrix

In [None]:
vp.corrmat(data_df,
           absolute=True,
           matrix_type='upper',
           title='Phase 4 - Correlation Matrix - Low Correlated',
           out_path=OUT_PATH_GRAPH,
           heatmap_kwargs={
               'reversescale': True
           })

# Phase 5 - Data Preparation
- Remove duplicated data

In [None]:
# Fit only (no transform yet) so the detected duplicates can be inspected via
# `duplicate_removal.duplicate_df` before they are removed.
# NOTE(review): keep='mean' with target='SalePrice' presumably collapses rows
# with identical features into one row carrying the mean target — confirm.
duplicate_removal = DFDuplicateRemoval(keep='mean', target='SalePrice')
duplicate_removal.fit(data_df)

In [None]:
# Observe duplicated data
# `duplicate_df` is exposed by the transformer fitted in the previous cell.
duplicate_df = duplicate_removal.duplicate_df

duplicate_df

In [None]:
# Observe target's mean of duplicated data
duplicate_df.groupby([x for x in duplicate_df.columns if x != 'SalePrice']).agg(
    SalePrice=('SalePrice', 'mean')
).reset_index()

In [None]:
# Remove duplicated data & take target's mean
# Uses the transformer fitted above; the trailing expression shows the new
# (rows, columns) so the number of removed rows can be checked.
data_df = duplicate_removal.transform(data_df)

data_df.shape

In [None]:
vp.faststat(data_df)

###### Scatter

In [None]:
vp.scatter(data_df,
           xy_tuples=[(x, 'SalePrice') for x in data_df.columns if x != 'SalePrice'],
           max_col=4,
           title='Phase 5 - Scatter',
           out_path=OUT_PATH_GRAPH,
           scatter_kwargs={
               'trendline': 'ols',
               'trendline_color_override': 'red'
           },
           layout_kwargs={
               'height': 1000,
               'legend_orientation': 'h'
           })

###### Correlation Matrix

In [None]:
vp.corrmat(data_df,
           absolute=True,
           matrix_type='upper',
           title='Phase 5 - Correlation Matrix',
           out_path=OUT_PATH_GRAPH,
           heatmap_kwargs={
               'reversescale': True
           })

# Phase 6 - Data Preparation
- Handle skewness

###### Histogram

In [None]:
vp.histogram(data_df,
             bin_algo='count',
             max_col=4,
             title='Phase 6 - Histogram',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 1000})

###### Probability

In [None]:
vp.prob(data_df,
        max_col=4,
        title='Phase 6 - Probability',
        out_path=OUT_PATH_GRAPH,
        layout_kwargs={'height': 1000})

In [None]:
# Observe skewness & kurtosis
# Candidate columns for the Box-Cox transform. A throw-away transformer is
# fitted only to display its `stat_df`; `data_df` is not modified in this cell.
boxcox_columns = ['GrLivArea', 'TotalBsmtSF', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'SalePrice']
DFBoxCoxTransformer(columns=boxcox_columns).fit(data_df).stat_df

In [None]:
# Exclude approximately normal distributed features
# Leaves 'GrLivArea', 'MasVnrArea' and 'SalePrice' (note: the target is included).
boxcox_columns = [x for x in boxcox_columns if x not in ['TotalBsmtSF', 'YearBuilt', 'YearRemodAdd']]
boxcox_columns

In [None]:
# Features with negative or zero value
# Box-Cox is only defined for strictly positive input, so any selected column
# whose minimum is <= 0 is routed through DFPositiveTransformer first.
negative_columns = [x for x in boxcox_columns if data_df[x].min() <= 0]
negative_columns

In [None]:
# Handle skewness
# Both transformers are kept in notebook scope: they are re-applied to the
# test set in Phase 8.
# NOTE: `boxcox_columns` contains 'SalePrice', so the target itself is
# transformed here; the model is trained on, and the Phase 7/8 metrics are
# reported in, transformed units rather than dollars.
positive_transformer = DFPositiveTransformer(columns=negative_columns)
boxcox_transformer   = DFBoxCoxTransformer(columns=boxcox_columns)

steps = [
    ('positive_transformer', positive_transformer),
    ('boxcox_transformer', boxcox_transformer),
]
data_df = Pipeline(steps).fit_transform(data_df)

In [None]:
vp.faststat(data_df)

In [None]:
boxcox_transformer.stat_df

###### Histogram

In [None]:
vp.histogram(data_df,
             bin_algo='count',
             max_col=4,
             title='Phase 6 - Histogram - Box Cox',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 1000})

###### Probability

In [None]:
vp.prob(data_df,
        max_col=4,
        title='Phase 6 - Probability - Box Cox',
        out_path=OUT_PATH_GRAPH,
        layout_kwargs={'height': 1000})

# Phase 7 - Regression
- Separate features & target
- Feature scaling
- Regression

In [None]:
# Separate features & target: the target is the 'SalePrice' column, the
# features are every remaining column in their existing order.
y_train = data_df['SalePrice']
X_train = data_df.drop(columns=['SalePrice'])

X_train.shape, y_train.shape

In [None]:
# Feature scaling
# The scalers are fitted on the training features only and kept in notebook
# scope so the test features can be transformed with the same parameters.
# NOTE(review): if these wrap sklearn's StandardScaler/MinMaxScaler, min-max
# after standardisation gives the same result as min-max alone, and a random
# forest is insensitive to monotonic feature scaling — confirm whether both
# steps are needed.
standard_scaler = DFStandardScaler()
minmax_scaler   = DFMinMaxScaler()

steps = [
    ('standard_scaler', standard_scaler),
    ('minmax_scaler', minmax_scaler),
]
X_train = Pipeline(steps).fit_transform(X_train)

In [None]:
def eval_regress(X, y, model):
    """Score a fitted regressor on (X, y) and return truth vs. prediction.

    Prints MAE, MSE and R2 (five decimals each) computed between `y` and
    `model.predict(X)`.

    Parameters
    ----------
    X : features passed unchanged to `model.predict`.
    y : true target values matching the rows of `X`.
    model : object exposing `predict(X)`.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'y_true' (the given `y`) and 'y_pred' (the predictions).
    """
    predictions = model.predict(X)

    # Reference: https://medium.com/acing-ai/how-to-evaluate-regression-models-d183b4f5853d
    # Compute every score before printing so a failing metric prints nothing.
    scorers = (
        ('MAE', mean_absolute_error),
        ('MSE', mean_squared_error),
        ('R2 ', r2_score),
    )
    scores = [(label, scorer(y, predictions)) for label, scorer in scorers]

    for label, score in scores:
        print(f'{label}: {score :.5f}')

    return pd.DataFrame({'y_true': y, 'y_pred': predictions})

In [None]:
# Regression
# 500-tree random forest; random_state is fixed so repeated runs build the
# same forest. `y_train` is the transformed SalePrice from Phase 6.
model = RandomForestRegressor(n_estimators=500, random_state=0)
model.fit(X_train, y_train)

###### Scatter

In [None]:
vp.scatter(eval_regress(X_train, y_train, model),
           xy_tuples=[('y_true', 'y_pred')],
           max_col=1,
           title='Phase 7 - Scatter - Train',
           out_path=OUT_PATH_GRAPH,
           scatter_kwargs={
               'trendline': 'ols',
               'trendline_color_override': 'red'
           })

# Phase 8 - Regression (Test)

In [None]:
def load_testset(feature_path=None, target_path=None, chunksize=50_000):
    """Load the test features and attach a 'target' column by `Id`.

    NOTE(review): the target comes from ``sample_submission.csv``. On Kaggle
    that file normally holds placeholder predictions, not real sale prices,
    so metrics computed against it may be meaningless — confirm the file's
    origin before trusting any test-set score.

    Parameters
    ----------
    feature_path : str or path-like, optional
        CSV with the test features. Defaults to ``test.csv`` under
        ``SOURCE_PATH_DATA`` (resolved at call time).
    target_path : str or path-like, optional
        CSV with an ``Id`` column plus the target. Defaults to
        ``sample_submission.csv`` under ``SOURCE_PATH_DATA``.
    chunksize : int, default 50_000
        Number of rows parsed per chunk for both files.

    Returns
    -------
    pandas.DataFrame
        The feature rows left-joined with the target columns on ``Id``;
        feature rows without a matching target get NaN.

    Raises
    ------
    pandas.errors.MergeError
        If ``Id`` is duplicated in either file.
    """
    if feature_path is None:
        feature_path = f'{SOURCE_PATH_DATA}test.csv'
    if target_path is None:
        target_path = f'{SOURCE_PATH_DATA}sample_submission.csv'

    feature_df = pd.concat(pd.read_csv(feature_path, sep=',', chunksize=chunksize))
    target_df  = pd.concat(pd.read_csv(target_path, sep=',', chunksize=chunksize))

    # One target row per Id is expected; `validate` makes a duplicated Id fail
    # loudly instead of silently multiplying feature rows.
    return feature_df.merge(target_df, on='Id', how='left', validate='one_to_one')

In [None]:
# Test features joined with the 'SalePrice' column from sample_submission.csv.
test_df = load_testset()

test_df.shape

In [None]:
vp.faststat(test_df)

In [None]:
# Data imputation (based on train dataset)
# Reuses the imputers fitted in Phase 1 — `transform` only, nothing is refitted
# on the test data. They cover only the columns that had gaps in the training
# set; remaining gaps are handled in a later cell.
steps = [
    ('mean_imputer', mean_imputer),
    ('mode_imputer', mode_imputer),
]
test_df = Pipeline(steps).transform(test_df)

In [None]:
vp.faststat(test_df)

In [None]:
# Data imputation (not imputed on train dataset)
# Columns that were complete in the training set but have gaps in the test set
# are filled with the training mean (numeric) or training mode (object)
# captured in `mean_dict` / `mode_dict`.
# The filled column is assigned back explicitly: `test_df[col].fillna(...,
# inplace=True)` operates on a temporary Series, which warns in pandas 2.x and
# leaves `test_df` unchanged when Copy-on-Write is enabled.
columns = [x for x in test_df.select_dtypes('number').columns if test_df[x].isna().any()]
for column in columns:
    test_df[column] = test_df[column].fillna(mean_dict[column])

columns = [x for x in test_df.select_dtypes('object').columns if test_df[x].isna().any()]
for column in columns:
    test_df[column] = test_df[column].fillna(mode_dict[column])

In [None]:
vp.faststat(test_df)

In [None]:
# Pre-process pipeline
# Applies the transformers fitted on the training data, in the order they were
# fitted there (encoding in Phase 3, skewness handling in Phase 6).
# `boxcox_transformer` was configured with 'SalePrice' among its columns, so
# the test target is put on the same transformed scale as `y_train`.
steps = [
    ('ordinal_encoder', ordinal_encoder),
    ('positive_transformer', positive_transformer),
    ('boxcox_transformer', boxcox_transformer),
]
test_df = Pipeline(steps).transform(test_df)

In [None]:
vp.faststat(test_df)

In [None]:
# Feature selection
# Keep exactly the columns (including 'SalePrice') that remain in the final
# training frame, in the same order; `.copy()` detaches the result from the
# wider frame.
test_df = test_df[data_df.columns].copy()

test_df.shape

In [None]:
vp.faststat(test_df)

In [None]:
# Separate features & target: the target is the 'SalePrice' column, the
# features are every remaining column in their existing order.
y_test = test_df['SalePrice']
X_test = test_df.drop(columns=['SalePrice'])

X_test.shape, y_test.shape

In [None]:
# Feature scaling
# Same scaler instances that were fitted on `X_train` in Phase 7; only
# `transform` is called, so the test data does not influence the scaling.
steps = [
    ('standard_scaler', standard_scaler),
    ('minmax_scaler', minmax_scaler),
]
X_test = Pipeline(steps).transform(X_test)

###### Scatter

In [None]:
vp.scatter(eval_regress(X_test, y_test, model),
           xy_tuples=[('y_true', 'y_pred')],
           max_col=1,
           title='Phase 8 - Scatter - Test',
           out_path=OUT_PATH_GRAPH,
           scatter_kwargs={
               'trendline': 'ols',
               'trendline_color_override': 'red'
           })