# The analysis of data and ideas how it can be transformed in case of predicting *SalePrice* is present in *analysis.ipynb* notebook.

In [1]:
import pandas as pd 
import numpy as np 
from scipy.stats import spearmanr
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split



c:\Python38\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Python38\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [1]:
CV_NO_FOLDS = 5
RANDOM_STATE = 17

In [2]:
def scrollable_dataframe(df):
    
    table_html = df.to_html(classes='table',
                            table_id='scrollable_table', 
                            escape=False)

    html = f'''
    <div style="width: 100%; height:400px; overflow:scroll;">
            <style>
             td, th {{
                white-space: nowrap;
                width: 100px;
                padding: 10px;
            }}
        </style>
        {table_html}
    </div>
    '''

    return display(HTML(html))

In [3]:

data_train = pd.read_csv('train.csv')

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data_train.drop(columns=['SalePrice', 'Id']), data_train[['SalePrice']], test_size=0.25, random_state=RANDOM_STATE)

## Data cleaning.
### As stated in *analysis* notebook:
##### - I will fill numerical features with median
##### - I will fill categorical features with 'Unknown' label or with majority class depending on how numerous a particular feature is
##### - I will drop columns with too numeours majority class or missing values

In [5]:
# helper function to decide how to deal with missing values for categorical columns
def unknownOrMajoritySplit(data: pd.DataFrame) -> (list, list):
    unknownCols = []
    majorityCols = []
    desc = data.describe().T
    length = len(data)
    for index, row in desc.iterrows():
        freq = row['freq']
        if freq / length >= 0.8:
            majorityCols.append(index)
        else:
            unknownCols.append(index)
    
    return (unknownCols, majorityCols)

In [6]:
cats = X_train.select_dtypes(include=[object, bool])
unknownCols, majorityCols = unknownOrMajoritySplit(cats)

### The plan is following 
#### clean data -> fill_missing values separately -> add new features and remove some of the existing ones

### Similar functions like below I created in *analysis.ipynb* notebook

In [7]:
import math

def adjustNumerical(data: pd.DataFrame) -> pd.DataFrame:
    copy = data.copy()
    # convert MSSubClass to categorical as it is probably not ordinal
    copy['CatMSSubClass'] = copy['MSSubClass'].astype(object)
    # convert OveralCond to categorical as it is probably not ordinal (it should be ordinal but but SalePrice does not increase as OveralQual increase)
    copy['CatOverallCond'] = copy['OverallCond'].astype(object)
    copy['CatMoSold'] = copy['MoSold'].astype(object)
    copy['CatYrSold'] = copy['YrSold'].astype(object)

    # drop columns which I saved under another column name  
    copy.drop(columns=['MoSold', 'YrSold', 'OverallCond', 'MSSubClass'], inplace=True)
    #copy.drop(columns='BedroomAbvGr', inplace=True)
    
    copy['TotalSqrFt'] = copy['GrLivArea'] + copy['GarageArea']
    copy['IsGarage'] = copy['GarageArea'].apply(lambda x: x if math.isnan(x) else (1 if x > 0 else 0)).astype(object)
    copy['Is2ndFlr'] = copy['2ndFlrSF'].apply(lambda x: x if math.isnan(x) else (1 if x > 0 else 0)).astype(object)

    # drop redundant features
    copy.drop(columns=['1stFlrSF', '2ndFlrSF', 'GrLivArea', 'TotalBsmtSF'], inplace=True)

    copy['WasRemod'] = copy['YearRemodAdd'].apply(lambda x: x if math.isnan(x) else (1 if x > 0 else 0)).astype(object)

    # drop redundant features
    copy.drop(columns=['YearRemodAdd'], inplace=True)

    return copy

new_cat = ['IsGarage', 'Is2ndFlr', 'WasRemod', 'CatMSSubClass', 'CatOverallCond', 'CatMoSold']
new_num = ['TotalSqrFt']
ordinal = ['OverallQual']

In [8]:
# helper functions
def qual_mapper(value):
    if value == 'Po':
        return 1
    if value == 'Fa':
        return 2
    if value == 'TA':
        return 3
    if value == 'Gd':
        return 4
    if value == 'Ex':
        return 5
    # I treat nan as 1
    return 0

def MasVnrType_mapper(value):
    if value == 'None' or value == 'BrkCmn':
        return 1
    if value == 'BrkFace':
        return 2
    if value == 'Stone':
        return 3
    # I treat nan equally as None
    return 1

def garage_mapper(value):
    if value == 'Unf':
        return 1
    if value == 'Rfn':
        return 2
    if value == 'Fin':
        return 3
    # I treat nan equally as Unf
    return 1

def toOrdinalFn(col: str):
    fn = 0
    if col == 'GarageFinish':
        fn = garage_mapper
    elif col == 'MasVnrType':
        fn = MasVnrType_mapper
    else:
        fn = qual_mapper
    return fn

In [9]:
potencial_ordinal = ["ExterQual", "FireplaceQu", "GarageFinish", "MasVnrType", "BsmtQual", "HeatingQC", "KitchenQual", "GarageQual"]

def adjustCategorical(df: pd.DataFrame, drop_some_cols=False) -> pd.DataFrame:
    cpy = df.copy()
    # transform some columns from categorical to ordinal
    for i in potencial_ordinal:
        cpy[i] = cpy[i].apply(toOrdinalFn(i))

    if drop_some_cols:
        too_many_majority = ['Utilities', 'Street', 'Condition2', 'RoofMatl', 'Heating', 
                        'LandSlope', 'CentralAir', 'Functional', 'PavedDrive', 'Electrical', 'GarageCond']
        too_many_missing = ['Alley', 'PoolQC', 'MiscFeature']
        cols_to_remove = too_many_majority + too_many_missing
        cpy.drop(columns=cols_to_remove, inplace=True)

    return cpy

ordinal = list(set(ordinal + potencial_ordinal))

In [10]:
def adjustFeatures(df:pd.DataFrame) -> pd.DataFrame:
    cpy = df.copy()
    cpy = adjustNumerical(cpy)
    cpy = adjustCategorical(cpy)

    return cpy

In [15]:
from sklearn.impute import SimpleImputer
unknownImputer = SimpleImputer(strategy='constant', fill_value='Unknown')
majorityImputer = SimpleImputer(strategy='most_frequent')

# I do it by hand because something was going wrong when I tried to such action in Pipeline

X_train_prep = X_train.copy()
X_test_prep = X_test.copy()

X_train_prep[unknownCols] = unknownImputer.fit_transform(X_train_prep[unknownCols])
X_test_prep[unknownCols] = unknownImputer.transform(X_test[unknownCols])

X_train_prep[majorityCols] = majorityImputer.fit_transform(X_train_prep[majorityCols])
X_test_prep[majorityCols] = majorityImputer.transform(X_test_prep[majorityCols])

In [16]:
X_train_prep = adjustFeatures(X_train_prep)
X_test_prep = adjustFeatures(X_test_prep)

In [17]:
cat_cols = list(X_train_prep.select_dtypes(include=[object, 'category']).columns)
num_cols = list(X_train_prep.select_dtypes(include=np.number).columns)

In [None]:
print(cat_cols)
print(num_cols)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterCond', 'Foundation', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'GarageType', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition', 'CatMSSubClass', 'CatOverallCond', 'CatMoSold', 'CatYrSold', 'IsGarage', 'Is2ndFlr', 'WasRemod']
['LotFrontage', 'LotArea', 'OverallQual', 'YearBuilt', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'BsmtQual', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'HeatingQC', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'WoodDeckSF', 'OpenPorchSF', 'Enclos

In [None]:
# from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
# from sklearn.pipeline import Pipeline

# cat_unknown_transformer = Pipeline(
#     steps=[('cat_unknown', SimpleImputer(missing_values='Unknown'))]
# )
# cat_majority_transformer = Pipeline(
#     steps=[('cat_majority', SimpleImputer(strategy='most_frequent'))]
# )
# cat_general_transformer = Pipeline(
#     steps=[('cat_general', OneHotEncoder(handle_unknown='ignore'))]
# )
# num_transformer = Pipeline(
#     steps=[('num', SimpleImputer(strategy='median'), ('scaler' ,StandardScaler()))]
# )
# new_features_transformer = FunctionTransformer(adjustFeatures)

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', num_transformer, num_cols),
#         ('cat_majority', cat_majority_transformer, majorityCols),
#         ('cat_unknown', cat_unknown_transformer, unknownCols),
#         ('new_features', new_features_transformer, cat_cols + num_cols),
#         ('cat_general_transformer', cat_general_transformer, cat_cols)
#     ],remainder = 'passthrough'
# )


# def getPipeline(model):
#     pipe = Pipeline(
#     steps=[("preprocessor", preprocessor), ("model", model)]
#     )

#     return pipe

In [18]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

def getPipeline(model, num, cat):
    cat_transformer = Pipeline(
        steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))]
    )
    num_transformer = Pipeline(
        steps=[('num', SimpleImputer(strategy='median')), ('scaler' ,StandardScaler())]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num),
            ('cat', cat_transformer, cat),
        ]
    )

    pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", model)]
    )

    return pipe

### How good can be LinearModel with many iterations on RandomizedSearchCV?

In [19]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'model__alpha': np.linspace(0, 1000, 1000),
    'model__l1_ratio': np.linspace(0, 1, 100)
}

lm = getPipeline(ElasticNet(), num_cols, cat_cols)

rsCV = RandomizedSearchCV(lm, param_distributions=param_grid, cv=CV_NO_FOLDS, n_iter=250, n_jobs=-1, verbose=3, scoring='r2', random_state=RANDOM_STATE)
rsCV.fit(X_train_prep, y_train)


Fitting 5 folds for each of 250 candidates, totalling 1250 fits


In [None]:
from sklearn.metrics import r2_score

preds_rsCV_lm = rsCV.predict(X_test_prep)
r2_rsCV_lm = r2_score(y_test, preds_rsCV_lm)

print(f"score on test: {r2_rsCV_lm}")
print(f"CV best score: {rsCV.best_score_}")
print(f"CV best params: {rsCV.best_params_}")

CV score on train: 0.860508471874778
CV best params: 0.8070493300615867


In [31]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

models = {
    'ElasticNet': ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'RandomForestRegressor': RandomForestRegressor(random_state=RANDOM_STATE)
}

param_grid = {
    'ElasticNet': {
        'model__alpha': np.linspace(0, 1000, 1000),
        'model__l1_ratio': np.linspace(0, 1, 100)
    },
    'RandomForestRegressor': {
        'model__n_estimators': [100, 200, 300, 400, 500, 700],
        'model__max_depth': [None, 5, 10, 15, 20, 25, 30, 40, 60],
        'model__min_samples_split': [2, 5, 7, 10],
        'model__min_samples_leaf': [1, 2, 3, 4],
    },
    'KNeighborsRegressor': {
        'model__n_neighbors': [i for i in range(4, 51)],
        'model__weights': ['uniform', 'distance'],
        'model__p': [1,2]  
    },
    'GradientBoostingRegressor': {
        'model__n_estimators': [100, 200, 300, 400, 500, 700],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
        'model__max_depth': [3,4, 5, 6, 8, 10],
        'model__subsample': [0.5, 0.75, 1]
    }
}

#### Below I use BayesSearchCV and RandomsedSearchCV to check whether Bayes will overcome Random. 
#### When it comes to bayes: the advantange is that Bayes use information from previous searches to get to know where to look for optimal parameters. It creates a probability space in which hyperparameters with better score are more probable. The disadvantages are longer learning time compared to RandomizedSearchCV and it can get stuck around some hyperparameters which are considered to be "good" and totally omit area with better hyperparameters because, as the algorithm focuses on "good" area, there may be no opportunity to check better area.
#### But on average (according to some articles on the internet) Bayesian searching is better.

In [40]:
from skopt import BayesSearchCV
import time

grid_bayes = {}
grid_bayes_times = {}
for model_name, model in models.items():
    start_time = time.time()

    grid_bayes[model_name] = BayesSearchCV(getPipeline(model, num_cols, cat_cols), search_spaces=param_grid[model_name], cv=CV_NO_FOLDS, scoring='r2', n_jobs=-1, random_state=RANDOM_STATE, n_iter=40)
    grid_bayes[model_name].fit(X_train_prep, y_train)

    best_params = grid_bayes[model_name].best_params_
    best_score = grid_bayes[model_name].best_score_

    fit_time = round(time.time() - start_time, 3)
    grid_bayes_times[model_name] = fit_time

  y = column_or_1d(y, warn=True)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [41]:
from sklearn.model_selection import RandomizedSearchCV

grid_random = {}
grid_random_times = {}
for model_name, model in models.items():
    start_time = time.time()

    grid_random[model_name] = RandomizedSearchCV(getPipeline(model, num_cols, cat_cols), param_distributions=param_grid[model_name], cv=CV_NO_FOLDS, scoring='r2', n_jobs=-1, n_iter=40, random_state=RANDOM_STATE)
    grid_random[model_name].fit(X_train_prep, y_train)

    best_params = grid_random[model_name].best_params_
    best_score = grid_random[model_name].best_score_

    fit_time = round(time.time() - start_time, 3)
    grid_random_times[model_name] = fit_time

  y = column_or_1d(y, warn=True)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


### When it comes to ElasticNet:  RandomGridSearch did better CV than BayesSearch. If I would quess why: the optimal parameter 'l1_ration' is on the edge of considered values (l1_ratio close to 1 is optimal value in my opinion) and Bayes search did not guess it and jumped around this value but not close enough to reach it.

In [42]:
print("Learning Comparison")
print(f'{10*"*"}\n')

for model_name, model in models.items():
    print(model_name)
    print(f"Learning time for Bayes: {grid_bayes_times[model_name]}")
    print(f"Learning time for Random: {grid_random_times[model_name]}")
    print(10*'-')
    print(f"Best score for Bayes: {grid_bayes[model_name].best_score_}")
    print(f"Best score for Random: {grid_random[model_name].best_score_}")
    print(10*'-')
    print(f"Best params for Bayes: {grid_bayes[model_name].best_params_}")
    print(f"Best params for Random: {grid_random[model_name].best_params_}")


Learning Comparison
**********

ElasticNet
Learning time for Bayes: 57.57
Learning time for Random: 16.861
----------
Best score for Bayes: 0.7918660708966769
Best score for Random: 0.8070493300615867
----------
Best params for Bayes: OrderedDict([('model__alpha', 53.053053053053056), ('model__l1_ratio', 0.98989898989899)])
Best params for Random: {'model__l1_ratio': 1.0, 'model__alpha': 465.4654654654655}
GradientBoostingRegressor
Learning time for Bayes: 744.138
Learning time for Random: 454.937
----------
Best score for Bayes: 0.8713929574583649
Best score for Random: 0.8662121500518773
----------
Best params for Bayes: OrderedDict([('model__learning_rate', 0.1), ('model__max_depth', 3), ('model__n_estimators', 700), ('model__subsample', 1.0)])
Best params for Random: {'model__subsample': 0.75, 'model__n_estimators': 500, 'model__max_depth': 4, 'model__learning_rate': 0.1}
KNeighborsRegressor
Learning time for Bayes: 56.23
Learning time for Random: 17.243
----------
Best score for B

### As we can see:
- Bayes has a little better score on GradintBoosting, KNearest and Forest
- Bayes is worse on ElasticNet
- Bayes is much time consuming

### Save model for the future

In [47]:
from joblib import dump
for key in models.keys():
    dump(grid_bayes[key].best_estimator_, f'./models/bayes_{key}.joblib') 
    dump(grid_random[key].best_estimator_, f'./models/rand_{key}.joblib') 

### Later on I will build base models without any hyperparams to compare them with tuned models

### One more thing to LinearModel: if optimum is near 'l1_ration'=1, can Bayes find as least as good params as Random Search given area of 'l1_ratio' close to 1?

In [53]:
param_grid_2 = {
    'model__alpha': np.linspace(0, 1000, 1000),
    'model__l1_ratio': np.linspace(0.98, 1, 100)
}

lm2 = getPipeline(ElasticNet(), num_cols, cat_cols)

BayCV = BayesSearchCV(lm2, search_spaces=param_grid_2, cv=CV_NO_FOLDS, n_iter=100, n_jobs=-1, scoring='r2', random_state=RANDOM_STATE)
BayCV.fit(X_train_prep, y_train)

In [56]:

preds_bayCV_lm = rsCV.predict(X_test_prep)
r2_bayCV_lm = r2_score(y_test, preds_bayCV_lm)

print("BAYES from now:")
print(f"score on test: {r2_bayCV_lm}")
print(f"CV best params: {BayCV.best_score_}")
print(f"CV best params: {BayCV.best_params_}")

print("RANDOMIZED from earlier search:")
print(f"score on test: {r2_rsCV_lm}")
print(f"CV best score: {rsCV.best_score_}")
print(f"CV best params: {rsCV.best_params_}")

BAYES from now:
score on test: 0.860508471874778
CV best params: 0.8094135321615644
CV best params: OrderedDict([('model__alpha', 9.00900900900901), ('model__l1_ratio', 0.9959595959595959)])
RANDOMIZED from earlier search:
score on test: 0.860508471874778
CV best score: 0.8070493300615867
CV best params: {'model__l1_ratio': 1.0, 'model__alpha': 465.4654654654655}


In [61]:
grid_base = {}
for model_name, model in models.items():
    grid_base[model_name] = getPipeline(model, num_cols, cat_cols)
    grid_base[model_name].fit(X_train_prep, y_train)

  y = column_or_1d(y, warn=True)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


### Below is summarize of scores of GridModels

In [69]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

for key in models.keys():
    bay_score = r2_score(y_test, grid_bayes[key].best_estimator_.predict(X_test_prep))
    rand_score = r2_score(y_test, grid_random[key].best_estimator_.predict(X_test_prep))
    base_score = r2_score(y_test, grid_base[key].predict(X_test_prep))

    print(f'\n{10*"*"}\n')
    print(f'Model name: {key} \n')
    print(f"CV score for Bayes: {grid_bayes[key].best_score_}")
    print(f"CV score for Random: {grid_random[key].best_score_}")
    print(f"CV score for Base: {np.mean(cross_val_score(grid_base[key], X_train_prep, y_train, cv=CV_NO_FOLDS))}")
    print('/n')
    print(f'R2 score on test set Bayes: {bay_score}')
    print(f'R2 score on test set Rand: {rand_score}')
    print(f'R2 score on test set Base: {base_score}')


**********

Model name: ElasticNet 

CV score for Bayes: 0.7918660708966769
CV score for Random: 0.8070493300615867
CV score for Base: 0.7935956376281847
/n
R2 score on test set Bayes: 0.8378847274858741
R2 score on test set Rand: 0.860508471874778
R2 score on test set Base: 0.840521988124535

**********

Model name: GradientBoostingRegressor 

CV score for Bayes: 0.8713929574583649
CV score for Random: 0.8662121500518773
CV score for Base: 0.8635687736988018
/n
R2 score on test set Bayes: 0.8618257781573444
R2 score on test set Rand: 0.8870172386227485
R2 score on test set Base: 0.8525163775354825

**********

Model name: KNeighborsRegressor 

CV score for Bayes: 0.7704320420775022
CV score for Random: 0.7683552540900713
CV score for Base: 0.7354797662515736
/n
R2 score on test set Bayes: 0.831367577534079
R2 score on test set Rand: 0.8267267175578178
R2 score on test set Base: 0.8065983200732874

**********

Model name: RandomForestRegressor 

CV score for Bayes: 0.8382043107836903


### Two things I did not expect (before starting building models):
- base model won on RandomForest, that is base model (having such train and test) generalised data better than tuned models
- Bayes is the poorest on ElasticNet

### Note:
##### The results given might be biased because CV split is not necessarily the same for each method of finding / not finding hyperparams

### I could also compare how the best ElasticNets work on test set. In addition a good idea would to build model and make predictions especially for kaggle competition