In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
import gc

In [None]:
# Load the pickled train and test frames from the Data directory.
# NOTE(review): unpickling can execute arbitrary code; only read trusted files.
train_data, test_data = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('Data/train_data.pkl', 'Data/test_data.pkl')
)

# Show both shapes as a quick sanity check on what was loaded.
train_data.shape, test_data.shape

In [None]:
# Features to leave out of the numerical aggregation (none by default).
to_remove_features = []

# 'S_2' is dropped before aggregation. errors='ignore' keeps this cell
# re-runnable: without it a second run raises KeyError because the column
# has already been removed from the frames.
train_data.drop(columns=['S_2'], inplace=True, errors='ignore')
test_data.drop(columns=['S_2'], inplace=True, errors='ignore')

categorical_features = ['B_30', 'B_31', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# Every remaining column is treated as numerical. The list is built by walking
# train_data.columns so the feature order (and therefore the column order of
# every derived frame) is identical on every run; iterating a set of strings
# is not stable across interpreter sessions. 'customer_ID' is the grouping key
# used for aggregation and must never be aggregated itself, so it is excluded
# in case it is stored as a regular column rather than as the index.
non_numerical = set(categorical_features) | {'customer_ID'}
numerical_features = [col for col in train_data.columns if col not in non_numerical]

# Raises ValueError if a feature listed for removal is not a numerical feature.
for i in to_remove_features:
    numerical_features.remove(i)

len(numerical_features), len(categorical_features)

In [None]:
def aggregator(df, numerical=None, categorical=None):
    """Collapse per-row records into one row of summary statistics per customer.

    Rows are grouped by the 'customer_ID' key. Each numerical feature yields
    the columns '<feature>_num_<stat>' for mean, std, min, max and last; each
    categorical feature yields '<feature>_cat_<stat>' for count, last and
    nunique. Numerical columns come first, followed by categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that can be grouped by 'customer_ID' and contains every
        requested feature column.
    numerical : sequence of str, optional
        Numerical feature names. Defaults to the module-level
        ``numerical_features`` list.
    categorical : sequence of str, optional
        Categorical feature names. Defaults to the module-level
        ``categorical_features`` list.

    Returns
    -------
    pandas.DataFrame
        One row per customer, indexed by 'customer_ID'.

    Raises
    ------
    ValueError
        If both feature lists are empty.
    """
    if numerical is None:
        numerical = numerical_features
    if categorical is None:
        categorical = categorical_features

    # Build the grouping once and reuse it for both aggregations.
    # Note: 'last' depends on the row order of df within each customer.
    grouped = df.groupby('customer_ID')
    parts = []

    # An empty aggregation spec is not a valid input for agg, so a feature
    # family with no members is simply skipped.
    if len(numerical) > 0:
        num_spec = {col: ['mean', 'std', 'min', 'max', 'last'] for col in numerical}
        df_agg_num = grouped.agg(num_spec)
        # Flatten the (feature, stat) column MultiIndex into a single name.
        df_agg_num.columns = ['_num_'.join(x) for x in df_agg_num.columns]
        parts.append(df_agg_num)

    if len(categorical) > 0:
        cat_spec = {col: ['count', 'last', 'nunique'] for col in categorical}
        df_agg_cat = grouped.agg(cat_spec)
        df_agg_cat.columns = ['_cat_'.join(x) for x in df_agg_cat.columns]
        parts.append(df_agg_cat)

    if not parts:
        raise ValueError('aggregator needs at least one numerical or categorical feature')

    return pd.concat(parts, axis=1)

# Build the per-customer aggregates for both splits with the same function,
# train first and then test.
train_agg, test_agg = map(aggregator, (train_data, test_data))

In [None]:
# Display the (rows, columns) shape of the aggregated train and test frames.
tuple(agg_frame.shape for agg_frame in (train_agg, test_agg))

In [None]:
# Optional one-hot encoding of the '_cat_last' columns (disabled by default).
# When enabled, those columns are replaced by dense indicator columns named
# 'enc_0', 'enc_1', ... in both train_agg and test_agg.
APPLY_ONE_HOT_ENC = False
if APPLY_ONE_HOT_ENC:
    cat_feature_last = train_agg.columns[train_agg.columns.str.contains('_cat_last')]

    # Nothing left to encode when the cell is re-run after the columns were
    # already replaced, so the cell becomes a no-op instead of failing.
    if len(cat_feature_last) > 0:
        # Cast to str so missing values become an ordinary category. The casts
        # are kept in temporaries so train_agg/test_agg are only rebound once
        # the whole encoding has succeeded.
        train_cat_str = train_agg[cat_feature_last].astype(str)
        test_cat_str = test_agg[cat_feature_last].astype(str)

        # handle_unknown='ignore' encodes a category that appears only in the
        # test split as an all-zero row instead of raising during transform.
        enc = OneHotEncoder(handle_unknown='ignore')
        train_encoded_df = enc.fit_transform(train_cat_str)
        test_encoded_df = enc.transform(test_cat_str)

        # Both matrices come from the same fitted encoder and therefore have
        # the same number of columns, so one name list serves both frames.
        column_name = ['enc_' + str(i) for i in range(train_encoded_df.shape[1])]
        train_enc_df = pd.DataFrame(train_encoded_df.toarray(), columns=column_name, index=train_agg.index)
        test_enc_df = pd.DataFrame(test_encoded_df.toarray(), columns=column_name, index=test_agg.index)

        train_agg = pd.concat([train_agg.drop(columns=cat_feature_last), train_enc_df], axis=1)
        test_agg = pd.concat([test_agg.drop(columns=cat_feature_last), test_enc_df], axis=1)

In [None]:
# Ordinal (label) encoding of the '_cat_last' columns. Those columns are
# replaced by numeric code columns named 'enc_0', 'enc_1', ... in both
# train_agg and test_agg.
APPLY_LABEL_ENC = True
if APPLY_LABEL_ENC:
    cat_feature_last = train_agg.columns[train_agg.columns.str.contains('_cat_last')]

    # Nothing left to encode when the cell is re-run after the columns were
    # already replaced, so the cell becomes a no-op instead of failing.
    if len(cat_feature_last) > 0:
        # Cast to str so missing values become an ordinary category. The casts
        # are kept in temporaries so train_agg/test_agg are only rebound once
        # the whole encoding has succeeded.
        train_cat_str = train_agg[cat_feature_last].astype(str)
        test_cat_str = test_agg[cat_feature_last].astype(str)

        # The encoder is fitted on train only. A category that appears only in
        # the test split is mapped to -1 instead of raising during transform.
        enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        train_encoded_df = enc.fit_transform(train_cat_str)
        test_encoded_df = enc.transform(test_cat_str)

        # Both arrays come from the same fitted encoder and therefore have the
        # same number of columns, so one name list serves both frames.
        column_name = ['enc_' + str(i) for i in range(train_encoded_df.shape[1])]
        train_enc_df = pd.DataFrame(train_encoded_df, columns=column_name, index=train_agg.index)
        test_enc_df = pd.DataFrame(test_encoded_df, columns=column_name, index=test_agg.index)

        train_agg = pd.concat([train_agg.drop(columns=cat_feature_last), train_enc_df], axis=1)
        test_agg = pd.concat([test_agg.drop(columns=cat_feature_last), test_enc_df], axis=1)

In [None]:
# Stack train on top of test so scaling and PCA see the same feature space.
# NOTE(review): the scaler and the PCA are fitted on train and test together,
# so test-set statistics influence the train features — confirm this is intended.
train_test_agg = pd.concat([train_agg, test_agg], axis=0)

# The mask is built from train_test_agg's own columns so its length always
# matches the column index it filters, even if train and test columns differ.
num_feature_agg = train_test_agg.columns[train_test_agg.columns.str.contains('_num_')]

# Select the numerical aggregates once, impute missing values with the column
# median, then standardise. The temporary is released right after use.
num_block = train_test_agg[num_feature_agg]
scaler = StandardScaler()
train_test_scaled = scaler.fit_transform(num_block.fillna(num_block.median()))
del num_block

# PCA cannot extract more components than min(n_samples, n_features), so the
# requested 50 components are capped by the shape of the data.
pca = PCA(n_components=min(50, *train_test_scaled.shape), random_state=42)
train_test_scaled_PCA = pca.fit_transform(train_test_scaled)
print(pca.explained_variance_ratio_)

column_name = ['PCA_' + str(i) for i in range(train_test_scaled_PCA.shape[1])]

# Rows keep the stacked order: all train customers first, then all test customers.
train_test_PCA_df = pd.DataFrame(train_test_scaled_PCA, columns=column_name, index=train_test_agg.index)

In [None]:
# train_test_PCA_df was built from train_agg stacked on top of test_agg, so the
# first len(train_agg) rows belong to train and the rest to test. Splitting by
# position avoids a label lookup, which would return duplicated rows if an ID
# ever appeared in both splits.
n_train_rows = len(train_agg)
train_pca_part = train_test_PCA_df.iloc[:n_train_rows]
test_pca_part = train_test_PCA_df.iloc[n_train_rows:]

# Guard against stale kernel state: the PCA frame must line up with the
# aggregate frames it is about to be joined to.
assert train_pca_part.index.equals(train_agg.index), 'PCA rows do not match train_agg'
assert test_pca_part.index.equals(test_agg.index), 'PCA rows do not match test_agg'

train_agg_mo = pd.concat([train_agg, train_pca_part], axis=1)
test_agg_mo = pd.concat([test_agg, test_pca_part], axis=1)
train_agg_mo.shape, test_agg_mo.shape

In [None]:
# Persist the final train and test feature frames as pickles, train first.
for out_frame, out_path in (
    (train_agg_mo, 'Data/train_agg_mo.pkl'),
    (test_agg_mo, 'Data/test_agg_mo.pkl'),
):
    out_frame.to_pickle(out_path)