# In [14]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import PowerTransformer, RobustScaler

# Load the cleaned dataset from disk into a DataFrame.
with open('../data/DatasetCleaned.csv', 'r') as csv_handle:
    DATA = pd.read_csv(csv_handle)

# Regression target: natural log of the sale price, held as a plain ndarray.
VAR_DEPENDENT = 'SalePrice_log'
Y_DATA = np.log(DATA['SalePrice']).to_numpy()

# Feature table: every column except the target and the 'Unnamed: 0' column
# (presumably an index column written out by an earlier to_csv -- TODO confirm).
X_DATA = DATA.drop(columns=['SalePrice', 'Unnamed: 0'])


def PrepX(X: pd.DataFrame, degrees: int, scaler=None, only_dummies: bool = False):
    """Build a model-ready feature table from ``X``.

    Each column of ``X`` is handled according to its dtype:

    * object / string / category columns are one-hot encoded with
      ``pd.get_dummies(..., drop_first=True, dtype=int)``;
    * integer, float or bool columns whose values all lie in ``[-1, 1]``
      (and whose name does not start with ``'BsmtFinish_'``) are kept as a
      single column with ``-1`` replaced by ``0``;
    * every other integer or float column is expanded into the columns
      ``f'{col}_{d}'`` for ``d = 1 .. degrees``.  Rows equal to ``-1`` are
      treated as "not applicable" and stay ``0.0``; the remaining rows are
      raised to the power ``d`` and passed through ``scaler.fit_transform``.

    With ``only_dummies=True`` only the one-hot encoding is performed and
    every other column is copied through unchanged.

    Parameters
    ----------
    X : pd.DataFrame
        Input features.
    degrees : int
        Highest polynomial degree generated for continuous columns.
    scaler : object with ``fit_transform``, optional
        Transformer that is re-fit on every generated polynomial column.
        Defaults to a fresh ``PowerTransformer(method='yeo-johnson')`` per
        call.
    only_dummies : bool, default False
        If True, only encode categorical columns.

    Returns
    -------
    pd.DataFrame
        Indexed like ``X``.  Passed-through columns come first, followed by
        the generated columns in the order of the source columns.
    """
    # Build the default scaler per call instead of sharing one instance that
    # was created when the function was defined.
    if scaler is None and not only_dummies:
        scaler = PowerTransformer(method='yeo-johnson')

    dtype_checks = pd.api.types
    passthrough = []
    new_cols = []

    for col, t in X.dtypes.items():
        is_categorical = (
            dtype_checks.is_object_dtype(t)
            or dtype_checks.is_string_dtype(t)
            or isinstance(t, pd.CategoricalDtype)
        )
        if is_categorical:
            new_cols.append(
                pd.get_dummies(X[col].copy(), prefix=col, drop_first=True, dtype=int)
            )
            continue

        if only_dummies:
            passthrough.append(X[col].copy())
            continue

        # Width-independent check: int32/float32/uint columns count as well.
        is_number = dtype_checks.is_integer_dtype(t) or dtype_checks.is_float_dtype(t)
        is_flag_candidate = (
            (is_number or dtype_checks.is_bool_dtype(t))
            and not str(col).startswith('BsmtFinish_')
        )

        if is_flag_candidate and -1 <= X[col].min() and X[col].max() <= 1:
            # Indicator-like column: fold the -1 sentinel into 0 and keep it.
            new_cols.append(X[col].copy().replace({-1: 0}))
            continue

        if is_number:
            # Work in float so that integer columns cannot silently wrap
            # around when raised to higher powers.
            values = X[col].astype(float).to_numpy()
            applicable = values != -1

            powers = {}
            for d in range(1, degrees + 1):
                column = np.zeros(len(values), dtype=float)
                if applicable.any():
                    # The scaler is only fit on applicable rows; skipping the
                    # call avoids handing it an empty array.
                    powered = (values[applicable] ** d).reshape(-1, 1)
                    column[applicable] = np.asarray(
                        scaler.fit_transform(powered)
                    ).ravel()
                powers[f'{col}_{d}'] = column

            new_cols.append(pd.DataFrame(powers, index=X.index))
            continue

        # Unsupported dtype: the column is reported and left out of the result.
        print(f'Type "{t}" not in standard types!')

    pieces = passthrough + new_cols
    if not pieces:
        return pd.DataFrame(index=X.index)
    return pd.concat(pieces, axis=1)

# Highest polynomial degree generated for continuous features.
DEGREES = 5
# Switch for carving out a hold-out set. NOTE: when enabled, this name is
# rebound below to the boolean row mask that selects the validation rows.
VALIDATION_SET = True

if VALIDATION_SET:
    # First pass on the full table only one-hot encodes the categorical
    # columns (presumably so both splits end up with the same dummy columns).
    X_DATA = PrepX(X_DATA, degrees=DEGREES, only_dummies=True)

    # Reproducible random mask: each row is held out with probability 0.05.
    holdout_rng = np.random.RandomState(42)
    VALIDATION_SET = holdout_rng.choice([True, False], size=len(DATA), p=[0.05, 0.95])

    X_VAL, X_DATA = X_DATA[VALIDATION_SET], X_DATA[~VALIDATION_SET]
    Y_VAL, Y_DATA = Y_DATA[VALIDATION_SET], Y_DATA[~VALIDATION_SET]

    # NOTE(review): PrepX calls fit_transform on whatever rows it receives, so
    # the validation rows are scaled independently of the training rows --
    # confirm this is intended.
    X_VAL = PrepX(X=X_VAL, degrees=DEGREES)
    # Disabled: X_VAL = X_VAL.values.reshape(-1, X_VAL.shape[1])

# Second pass: polynomial expansion and scaling of the training features.
X_DATA = PrepX(X=X_DATA, degrees=DEGREES)
VAR_NAMES = X_DATA.columns
# Disabled: X_DATA = X_DATA.values.reshape(-1, X_DATA.shape[1])