In [None]:
import pandas as pd
import numpy as np


def PrepX(X:pd.DataFrame):
    """Build a model-ready feature frame from a raw feature frame.

    Columns are processed in their original order:

    * integer and float columns (any width, including pandas nullable
      numeric dtypes) are copied through unchanged -- no scaling is applied
      because tree-based methods are scale invariant;
    * object, string and categorical columns are one-hot encoded with
      ``pd.get_dummies`` (``prefix=<column name>``, ``drop_first=True``,
      integer 0/1 output);
    * every other dtype (e.g. bool, datetime) is skipped and a notice is
      printed.

    Parameters
    ----------
    X : pd.DataFrame
        Raw features. It is not modified.

    Returns
    -------
    pd.DataFrame
        Transformed features sharing ``X``'s index. Empty (index only) when
        no column could be used.
    """
    types = pd.api.types

    X_prep = pd.DataFrame(index=X.index)
    X_types = X.dtypes.to_dict()

    #Iterates over X and applies transformations based on column type; Adds transformed columns
    new_cols = []
    for col, t in X_types.items():

        #Leaves numeric columns unchanged. Dtype predicates are used instead of
        #comparing against 'int'/'float', which only match the platform default
        #widths and would silently drop e.g. int32 or float32 columns.
        if types.is_integer_dtype(t) or types.is_float_dtype(t):
            new_cols.append(X[col].copy())
            continue

        #Create dummies for categorical (str) columns; covers object dtype as
        #well as pandas' dedicated string and category dtypes.
        if types.is_string_dtype(t) or isinstance(t, pd.CategoricalDtype):
            new_col = pd.get_dummies(X[col], prefix=col, drop_first=True, dtype=int)
            new_cols.append(new_col)
            continue

        print(f'Type \"{t}\" not in standard types!')

    if new_cols:
        X_prep = pd.concat([X_prep] + new_cols, axis=1)

    return X_prep

#Split data into train and validation set. 
def Split(X:pd.DataFrame, Y:pd.DataFrame, TestSize:float, random_state:int=42):
    """Randomly partition X and Y into a held-out part and a training part.

    Each row is assigned to the held-out part independently with probability
    ``TestSize``, using a ``RandomState`` seeded with ``random_state``. The
    held-out share is therefore only approximately ``TestSize``, but the
    partition is reproducible for a given seed and length.

    X and Y are both indexed with the same boolean mask, whose length is
    ``len(Y)``.

    Returns
    -------
    tuple
        ``(X_test, X_train, Y_test, Y_train)`` -- note the test parts come
        first.
    """
    rng = np.random.RandomState(random_state)
    in_test = rng.choice([True, False], size=len(Y), p=[TestSize, 1-TestSize])
    in_train = ~in_test

    X_test, X_train = X[in_test], X[in_train]
    Y_test, Y_train = Y[in_test], Y[in_train]

    return X_test, X_train, Y_test, Y_train



# Name of the target column. Declared first so that both the target
# extraction and the feature-matrix drop below refer to the same name.
VAR_DEPENDENT = 'SalePrice'

with open('../../data/DatasetCleaned.csv', 'r') as f:
    DATA = pd.read_csv(f)


# The target is modelled on the log scale.
Y_DATA = np.log(DATA[VAR_DEPENDENT].to_numpy())

# Features: everything except the target and the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv -- TODO confirm).
# drop() already returns a new frame, so DATA itself is left untouched.
X_DATA = DATA.drop(columns=[VAR_DEPENDENT, 'Unnamed: 0'])
X_DATA = PrepX(X_DATA)
VAR_NAMES = X_DATA.columns

# When enabled, 5% of the rows (approximately, seed 1) are held out for
# validation. X_VAL / Y_VAL stay None otherwise so later code can test for
# them instead of hitting a NameError.
VALIDATION_SET = True
X_VAL = Y_VAL = None
if VALIDATION_SET:
    X_VAL, X_DATA, Y_VAL, Y_DATA = Split(X_DATA, Y_DATA, 0.05, 1)
    X_VAL = X_VAL.to_numpy()

# Plain 2-D arrays (rows x features) from here on.
X_DATA = X_DATA.to_numpy()

In [None]:
#GridSearch performed as .py script for efficiency
import pickle
import matplotlib.pyplot as plt

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load model files produced by this project's own grid-search script.
with open('../models/GridSearchedRF.sav', 'rb') as f: GridSearchedRF = pickle.load(f)
GridSearchedRF_Overview = pd.DataFrame(GridSearchedRF.cv_results_)


# One panel per hyper-parameter, all sharing the R^2 axis.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,5), sharey=True)

ax1.scatter(GridSearchedRF_Overview['param_min_samples_leaf'], GridSearchedRF_Overview['mean_test_R2'])
ax1.set_xlabel('Min. Leaf Samples')

# max_features is not numeric, so map it to an x position for plotting:
# 'sqrt' -> 0, 'log2' -> 1, anything else -> 2.
GridSearchedRF_Overview['disp_param_max_features'] = np.select(
        [
            GridSearchedRF_Overview['param_max_features'] == 'sqrt',
            GridSearchedRF_Overview['param_max_features'] == 'log2',
        ],
        [0, 1],
        default=2,
    )
ax2.scatter(GridSearchedRF_Overview['disp_param_max_features'],
                GridSearchedRF_Overview['mean_test_R2'])

# Label each x position once: passing the full columns would create one
# (identical) tick per cross-validation row.
max_features_ticks = pd.DataFrame({
        'position': GridSearchedRF_Overview['disp_param_max_features'],
        'label': GridSearchedRF_Overview['param_max_features'].astype(str),
    }).drop_duplicates()
ax2.set_xticks(max_features_ticks['position'], max_features_ticks['label'])
ax2.set_xlabel('Features considered per Node')

ax3.scatter(GridSearchedRF_Overview['param_max_depth'], GridSearchedRF_Overview['mean_test_R2'])
ax3.set_xlabel('Tree Depth')

# The y axis is shared, so only the left-most panel carries the label.
ax1.set_ylabel('Test R^2')