In [20]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer
from feature_engineering import main_feat as add_feature
from tqdm import tqdm

# Register tqdm's pandas integration (adds the `progress_*` methods to
# pandas objects).
tqdm.pandas()

from utils import read_yaml

# Path of the YAML file holding the preprocessing settings; it is handed to
# `read_yaml` below. The path is relative, so it depends on the working
# directory the notebook/script is started from.
PREPROCESSING_CONFIG_PATH = "../config/preprocessing_config.yaml"

def load_featured_data(params):
    """
    Read the pickled, feature-engineered splits back from disk.

    Args:
    - params(dict): configuration; ``params['out_path']`` is used as the
      prefix of every file name (``<out_path>x_<split>_featured.pkl``).

    Returns:
    - list: the loaded objects, ordered train, valid, test.
    """
    return [
        joblib.load(f"{params['out_path']}x_{split}_featured.pkl")
        for split in ('train', 'valid', 'test')
    ]

def one_hot_encoder(params,
                    x_cat,
                    state=None):
    """
    One-hot encode the categorical columns of one data split.

    With ``state=None`` a new ``OneHotEncoder`` is fitted on ``x_cat`` and
    pickled to ``params['out_path'] + 'onehotencoder.pkl'``. With any other
    ``state`` value the previously pickled encoder is loaded from that same
    path and reused, so categories stay consistent across splits.

    Args:
    - params(dict): configuration; only ``params['out_path']`` is read.
    - x_cat(pd.DataFrame): categorical columns to encode.
    - state: ``None`` to fit and persist a new encoder, anything else to
      load the persisted one.

    Returns:
    - pd.DataFrame: dense encoded matrix carrying the index of ``x_cat`` and
      the column names reported by ``encoder.get_feature_names_out``.
    """
    encoder_path = params["out_path"] + "onehotencoder.pkl"

    if state is None:
        try:
            # scikit-learn >= 1.2 spells the dense-output switch
            # ``sparse_output``; the old ``sparse`` keyword was removed in 1.4.
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older scikit-learn releases only accept ``sparse``.
            encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoder.fit(x_cat)

        joblib.dump(encoder, encoder_path)
    else:
        # joblib.load unpickles the file: only point it at trusted artefacts.
        encoder = joblib.load(encoder_path)

    return pd.DataFrame(encoder.transform(x_cat),
                        index=x_cat.index,
                        columns=encoder.get_feature_names_out(x_cat.columns))

def normalization(params,
                  x_all,
                  state=None):
    """
    Normalize the numerical columns of one data split.

    With ``state=None`` a new ``Normalizer`` is fitted on ``x_all`` and
    pickled to ``params['out_path'] + 'normalizer.pkl'``. With any other
    ``state`` value the pickled normalizer is loaded from that path instead.

    Args:
    - params(dict): configuration; only ``params['out_path']`` is read.
    - x_all(pd.DataFrame): numerical columns to transform.
    - state: ``None`` to fit and persist a new normalizer, anything else to
      load the persisted one.

    Returns:
    - tuple: ``(normalized, normalizer)`` where ``normalized`` is a
      DataFrame with the index and columns of ``x_all`` and ``normalizer``
      is the transformer that produced it.
    """
    # NOTE(review): scikit-learn documents ``Normalizer`` as stateless and
    # row-wise (each sample is rescaled to unit norm). If per-column scaling
    # was intended (``StandardScaler`` is imported but unused), that is a
    # modelling change to confirm, so the transformer is left as-is.
    normalizer_path = params["out_path"] + "normalizer.pkl"

    if state is None:
        normalizer = Normalizer().fit(x_all)

        joblib.dump(normalizer, normalizer_path)
    else:
        # joblib.load unpickles the file: only point it at trusted artefacts.
        normalizer = joblib.load(normalizer_path)

    normalized = pd.DataFrame(normalizer.transform(x_all),
                              index=x_all.index,
                              columns=x_all.columns)
    return normalized, normalizer

def preprocessing(house_variables_feat, params, state=None):
    """
    Encode, normalize and re-assemble one featured data split.

    The frame is split into numerical, categorical and label columns using
    the column lists in ``params``. Numerical columns go through
    ``normalization``, categorical columns through ``one_hot_encoder``, and
    the pieces are concatenated column-wise.

    Args:
    - house_variables_feat(pd.DataFrame): featured data containing every
      column named in ``params['NUM_COLUMN']``, ``params['CAT_COLUMN']`` and
      ``params['LABEL_COLUMN']``.
    - params(dict): preprocessing configuration (column lists, plus the
      ``out_path`` read by the helpers).
    - state: forwarded to both helpers. ``None`` fits and persists new
      transformers; any other value makes them load the persisted ones.

    Returns:
    - tuple: ``(df_joined, normalizer)``: the preprocessed frame (encoded
      categoricals, label, normalized numericals) and the normalizer used.
    """
    house_numerical = house_variables_feat[params['NUM_COLUMN']]
    house_categorical = house_variables_feat[params['CAT_COLUMN']]
    house_label = house_variables_feat[params['LABEL_COLUMN']]

    # Forward the caller's ``state``. Hard-coding ``None`` here made every
    # split (including validation and test) refit the transformers on its
    # own data and overwrite the pickles fitted on train.
    df_num_normalized, normalizer = normalization(params, house_numerical, state=state)

    df_categorical_encoded = one_hot_encoder(params, house_categorical, state=state)

    df_joined = pd.concat([df_categorical_encoded, house_label, df_num_normalized], axis=1)

    return df_joined, normalizer

def main_preprocessing(x_featured_list, params):
    """
    Preprocess the train, valid and test splits and pickle the results.

    The train split is passed to ``preprocessing`` with ``state=None``; the
    validation and test splits are passed with ``state='normalizer'``. Each
    resulting frame is written to
    ``<params['out_path']>x_<split>_preprocessed.pkl``.

    Args:
    - x_featured_list(list): featured data ordered train, valid, test.
    - params(dict): preprocessing configuration.

    Returns:
    - tuple: the preprocessed train, valid and test frames.
    """
    train_feat, valid_feat, test_feat = x_featured_list

    # ``preprocessing`` returns (frame, normalizer); only the frame is kept.
    train_out, _ = preprocessing(train_feat, params, state=None)
    valid_out, _ = preprocessing(valid_feat, params, state='normalizer')
    test_out, _ = preprocessing(test_feat, params, state='normalizer')

    # Persist only after every split was processed successfully.
    outputs = (('train', train_out), ('valid', valid_out), ('test', test_out))
    for split, frame in outputs:
        joblib.dump(frame, f"{params['out_path']}x_{split}_preprocessed.pkl")

    return train_out, valid_out, test_out

# Entry point: read the preprocessing config, load the featured
# train/valid/test splits, then preprocess all three and pickle the results.
# The names bound here are module-level, so later cells can reuse them.
if __name__ == "__main__":
    params_prep = read_yaml(PREPROCESSING_CONFIG_PATH)
    x_featured_list = load_featured_data(params_prep)
    x_train_preprocessed, x_valid_preprocessed, x_test_preprocessed = main_preprocessing(x_featured_list, params_prep)

In [14]:
# Scratch cell: reloads the config and featured data, then runs
# `preprocessing` on the train split only (`state` defaults to None, i.e.
# the transformers are fitted and pickled again).
# NOTE(review): this cell's execution count (14) is lower than that of the
# definition cell (20), so it was last run out of order. It also rebinds
# `params_prep`, `x_featured_list` and `x_train_preprocessed` -- consider
# removing it before sharing the notebook.
params_prep = read_yaml(PREPROCESSING_CONFIG_PATH)
x_featured_list = load_featured_data(params_prep)
x_train_featured, x_valid_featured, x_test_featured = x_featured_list
x_train_preprocessed, normalizer = preprocessing(x_train_featured, params_prep)

In [19]:
# Leftover inspection cell: a bare name only echoes the object's repr and has
# no effect on the pipeline.
# NOTE(review): the table saved as this cell's output is a DataFrame, not a
# class repr, so the stored output looks stale -- re-run or delete the cell.
OneHotEncoder

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,PAY_AMT4,PAY_AMT5,PAY_AMT6,AgeBin,Closeness_6,Closeness_5,Closeness_4,Closeness_3,Closeness_2,Closeness_1
28831,28832,180000.0,2,3,1,31,0,0,0,0,...,2700.0,2800.0,3000.0,2,0.582856,0.590961,0.595950,0.603900,0.612122,0.619722
24438,24439,320000.0,1,1,2,32,-1,-1,-1,0,...,1082.0,8807.0,787.0,2,0.972478,0.996619,0.892381,0.937462,1.000000,0.996741
555,556,630000.0,2,2,1,47,0,0,0,-1,...,8654.0,0.0,4981.0,4,1.000000,0.986263,0.995822,0.998084,0.891767,0.939921
13346,13347,210000.0,2,1,1,23,-2,-2,-2,-1,...,1500.0,1500.0,1500.0,0,0.850686,0.855419,0.860300,1.000000,1.000000,1.000000
1575,1576,230000.0,2,2,2,25,0,0,0,0,...,4000.0,3000.0,3000.0,0,0.622391,0.607948,0.618896,0.595030,0.589178,0.585604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25780,25781,200000.0,2,2,1,32,0,0,-2,-2,...,0.0,0.0,0.0,2,1.000000,1.000000,1.000000,1.000000,1.000000,0.946495
13921,13922,120000.0,2,2,2,24,0,0,0,0,...,10000.0,4560.0,0.0,0,-0.016350,-0.017725,0.050433,0.037375,0.055408,0.063867
3794,3795,120000.0,2,1,2,24,0,0,0,0,...,2000.0,1200.0,1000.0,0,0.834783,0.723433,0.583967,0.440108,0.366633,0.368367
27565,27566,360000.0,1,1,1,57,1,-2,-1,-1,...,0.0,0.0,0.0,5,1.000128,1.000128,0.999317,0.997611,1.000000,1.000000
