In [92]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from feature_engineering import main as add_feature

In [105]:
# Training features, as previously pickled to output/x_train.pkl.
df = joblib.load('output/x_train.pkl')

import yaml

# Column-group configuration (read below via the NUM_COLUMN / CAT_COLUMN /
# LABEL_COLUMN keys). The context manager closes the file even if parsing fails.
with open("params.yaml", "r") as f:
    params = yaml.load(f, Loader=yaml.SafeLoader)

In [102]:
def one_hot_encoder(x_cat,
                    state='fit'):
    """One-hot encode the columns of ``x_cat``.

    Parameters
    ----------
    x_cat : pandas.DataFrame
        Frame containing only the columns to encode.
    state : {'fit', 'transform'}, default 'fit'
        'fit' fits a new ``OneHotEncoder`` on ``x_cat`` and saves it to
        ``output/onehotencoder.pkl``; 'transform' loads the encoder that was
        previously saved at that path.

    Returns
    -------
    pandas.DataFrame
        The encoded columns, carrying the index of ``x_cat`` and column names
        taken from ``encoder.get_feature_names_out``.

    Raises
    ------
    ValueError
        If ``state`` is neither 'fit' nor 'transform'.
    """
    # Fail early with a clear message instead of hitting an unbound
    # ``encoder`` further down.
    if state not in ('fit', 'transform'):
        raise ValueError(
            "state must be 'fit' or 'transform', got {!r}".format(state)
        )

    if state == 'fit':
        try:
            # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older releases only accept the original `sparse` keyword.
            encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoder.fit(x_cat)
        joblib.dump(encoder,
                    "output/onehotencoder.pkl")
    else:
        encoder = joblib.load("output/onehotencoder.pkl")

    encoded = encoder.transform(x_cat)
    feat_names = encoder.get_feature_names_out(x_cat.columns)
    # Re-attach the original row index so the result can be concatenated
    # column-wise with the other feature groups.
    return pd.DataFrame(encoded, index=x_cat.index, columns=feat_names)

def normalization(x_all,
                  state = 'fit'):
    """Standardize every column of ``x_all`` with a ``StandardScaler``.

    Parameters
    ----------
    x_all : pandas.DataFrame
        Frame whose columns are all passed to the scaler.
    state : {'fit', 'transform'}, default 'fit'
        'fit' fits a new ``StandardScaler`` on ``x_all`` and saves it to
        ``output/normalizer.pkl``; 'transform' loads the scaler that was
        previously saved at that path.

    Returns
    -------
    pandas.DataFrame
        The scaled values with the same index and column names as ``x_all``.

    Raises
    ------
    ValueError
        If ``state`` is neither 'fit' nor 'transform'.
    """
    # Fail early with a clear message instead of hitting an unbound
    # ``normalizer`` further down.
    if state not in ('fit', 'transform'):
        raise ValueError(
            "state must be 'fit' or 'transform', got {!r}".format(state)
        )

    if state == 'fit':
        normalizer = StandardScaler()
        normalizer.fit(x_all)
        joblib.dump(normalizer,
                    "output/normalizer.pkl")
    else:
        normalizer = joblib.load("output/normalizer.pkl")

    normalized = normalizer.transform(x_all)
    # transform() output is wrapped back into a frame labelled like the input.
    return pd.DataFrame(normalized, index=x_all.index, columns=x_all.columns)

def run(params, xpath, ypath, dump_path, state='fit'):
    """Run feature engineering and preprocessing, then pickle the result.

    Steps: load the pickles at ``xpath`` and ``ypath``, pass the features
    through ``add_feature``, split the columns into the groups listed under
    ``params['NUM_COLUMN']``, ``params['CAT_COLUMN']`` and
    ``params['LABEL_COLUMN']``, one-hot encode the categorical group, join
    the groups column-wise, replace ``AgeBin`` with its category codes,
    normalize everything and dump the frame to ``dump_path``.

    ``state`` is forwarded unchanged to ``one_hot_encoder`` and
    ``normalization``. The function returns ``None``.
    """
    raw_features = joblib.load(xpath)
    # The target pickle is loaded here but not used further in this function.
    _target = joblib.load(ypath)

    engineered = add_feature(raw_features)

    numeric_part = engineered[params['NUM_COLUMN']]
    categorical_part = engineered[params['CAT_COLUMN']]
    label_part = engineered[params['LABEL_COLUMN']]

    encoded_part = one_hot_encoder(categorical_part, state=state)

    # Column order of the final frame: encoded categoricals, label-style
    # columns, then numericals.
    combined = pd.concat(
        [encoded_part, label_part, numeric_part],
        axis=1,
    )
    # Swap the categorical AgeBin column for its integer codes so the whole
    # frame can go through the scaler.
    combined = combined.assign(AgeBin=combined['AgeBin'].cat.codes)

    joblib.dump(normalization(combined, state=state), dump_path)

In [99]:
# Step-by-step version of the pipeline inside run(), applied to the `df`
# loaded from output/x_train.pkl, so each intermediate frame can be inspected.
# NOTE: both helpers are called with state='fit', so this cell re-fits and
# overwrites output/onehotencoder.pkl and output/normalizer.pkl.
house_variables_feat = add_feature(df)
# Split the engineered frame into the column groups named in params.yaml.
house_numerical = house_variables_feat[params['NUM_COLUMN']]
house_categorical = house_variables_feat[params['CAT_COLUMN']]
house_label = house_variables_feat[params['LABEL_COLUMN']]
df_categorical_encoded = one_hot_encoder(house_categorical, state='fit')
# Join the groups column-wise: encoded categoricals, label columns, numericals.
df_joined = pd.concat([df_categorical_encoded, house_label, house_numerical], axis=1)
# Replace the categorical AgeBin column with its integer category codes.
df_joined['AgeBin'] = df_joined['AgeBin'].cat.codes
df_normalized = normalization(df_joined, state='fit')


In [103]:
# Run feature engineering and preprocessing on the training pickles.
# state='fit' fits and saves the encoder and scaler; the processed frame is
# written to output/prepro.pkl.
run(params, 'output/x_train.pkl', 'output/y_train.pkl', 'output/prepro.pkl', state='fit')

In [104]:
# Final Data X
# Forward slash matches the dump path passed to run() and works on every
# platform; the previous 'output\prepro.pkl' relied on the invalid escape
# sequence '\p' and on a Windows-only path separator.
joblib.load('output/prepro.pkl')

Unnamed: 0_level_0,SEX_1,SEX_2,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3,EDUCATION,LIMIT_BAL,AGE,AgeBin,PAY_0,...,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Closeness_6,Closeness_5,Closeness_4,Closeness_3,Closeness_2,Closeness_1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2759,-0.809615,0.809615,-0.916372,0.939913,-0.113014,-1.123305,0.249536,-1.355094,-1.495237,-1.763793,...,-0.258958,-0.305323,-0.183944,-0.288013,0.894133,0.945030,0.961602,0.970866,0.999632,1.016427
11338,1.235155,-1.235155,-0.916372,0.939913,-0.113014,-1.123305,-0.367928,0.271764,0.449998,0.015819,...,-0.202922,0.564163,-0.053800,-0.133181,0.828573,0.610004,0.762261,0.562707,0.196878,-0.029690
23150,-0.809615,0.809615,-0.916372,0.939913,-0.113014,0.220093,0.867001,0.163306,0.449998,-0.873987,...,-0.182103,-0.188812,-0.186752,0.284414,0.902517,0.926000,0.950103,0.960235,0.986555,1.006132
19040,-0.809615,0.809615,-0.916372,0.939913,-0.113014,-1.123305,-1.062576,-1.029722,-0.846825,0.905625,...,-0.285979,-0.270295,-0.234815,-0.237341,-0.694929,-0.561034,-0.434772,-0.365691,-0.251564,-0.096678
10803,-0.809615,0.809615,-0.916372,0.939913,-0.113014,-1.123305,2.179114,-0.595894,-0.846825,0.015819,...,-0.175237,-0.243217,-0.178638,-0.257216,0.792502,0.755607,0.791510,0.805178,0.833206,0.773958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29803,1.235155,-1.235155,-0.916372,0.939913,-0.113014,0.220093,-0.908210,-0.378979,-0.198414,0.015819,...,-0.197883,-0.300852,-0.232693,3.840136,0.567618,-0.715806,-2.052608,-1.804946,-1.629477,-1.514161
5391,1.235155,-1.235155,-0.916372,0.939913,-0.113014,-1.123305,0.249536,0.163306,0.449998,1.795431,...,-0.285979,0.160473,0.071039,-0.068434,-1.568647,-1.455352,-1.247622,-1.124057,-1.040790,-0.876487
861,1.235155,-1.235155,-0.916372,0.939913,-0.113014,-1.123305,-0.908210,-1.029722,-0.846825,-1.763793,...,-0.285979,-0.305323,-0.303476,-0.293643,0.921848,0.945030,0.968188,0.976955,1.010442,1.022324
15796,-0.809615,0.809615,-0.916372,0.939913,-0.113014,0.220093,-0.753844,-1.138180,-1.495237,0.015819,...,-0.036809,-0.230796,-0.303476,-0.226080,-0.237569,-0.216734,-0.386122,-0.846592,-1.469337,-1.530526
