In [None]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

In [None]:
def feature_engineering(data, cat_columns, num_columns):
    """Build one row of per-customer features from row-level data.

    Every aggregation groups on ``'customer_ID'``, which the notebook sets as
    the index of ``data`` before calling this function.  Within a customer the
    rows are used in the order in which they appear in ``data``.

    The input frame is NOT modified.  The parsed ``S_2`` dates are kept in a
    local Series rather than written back as ``data['date']``; writing the
    column back made a second call on the same frame produce an additional
    ``NA_date`` feature, because the NA ratio is computed over all columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Row-level data grouped by ``'customer_ID'``.  Must contain an ``S_2``
        column that ``pd.to_datetime`` can parse.
    cat_columns : list of str
        Columns for which the "-2" and first values are extracted.
    num_columns : list of str
        Columns for which the last-minus-"-2" difference is computed.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation (one row per customer) of:

        * ``cat_-2_<col>``: value at position -2, or the only value when the
          customer has a single row;
        * ``cat_first_<col>``: value at position 0;
        * ``NA_<col>``: fraction of null values, for every column of ``data``;
        * ``diff_last_-2_<col>``: last value minus the "-2" value (a
          single-row customer is subtracted from itself);
        * ``date_<part>_<stat>``: ``mean``/``min``/``max``/``last``/``first``
          of the ``day``, ``weekday`` and ``month`` of ``S_2``.
    """

    def _second_to_last(rows):
        # Row at position -2; single-row groups fall back to their only row.
        return rows.iloc[-2] if len(rows) > 1 else rows.iloc[0]

    cat_minus2_value = data[cat_columns].groupby('customer_ID').apply(_second_to_last)
    cat_minus2_value.columns = ['cat_-2_' + c for c in cat_minus2_value]
    print('cat_-2 Done')

    cat_first_value = data[cat_columns].groupby('customer_ID').apply(lambda x: x.iloc[0])
    cat_first_value.columns = ['cat_first_' + c for c in cat_first_value]
    print('cat_first Done')

    # Share of missing values per column, computed over every column of `data`.
    na_fe = data.groupby('customer_ID').apply(lambda x: x.isnull().sum() / len(x))
    na_fe.columns = ['NA_' + c for c in na_fe]
    print('na Done')

    # For a single-row group this is the row minus itself, as in the
    # original `x.iloc[-1] - x.iloc[0]` branch.
    num_last_minus2_diff = data[num_columns].groupby('customer_ID').apply(
        lambda x: x.iloc[-1] - _second_to_last(x)
    )
    num_last_minus2_diff.columns = ['diff_last_-2_' + c for c in num_last_minus2_diff]
    print('diff_last_-2 Done')

    # Parse the dates into a local Series so the caller's frame is untouched.
    dates = pd.to_datetime(data['S_2'])
    date_df = pd.concat(
        [
            dates.dt.day.rename('day'),
            dates.dt.weekday.rename('weekday'),
            dates.dt.month.rename('month'),
        ],
        axis=1,
    )
    date = date_df.groupby('customer_ID').agg(['mean', 'min', 'max', 'last', 'first'])
    # Flatten the (part, stat) MultiIndex into 'date_<part>_<stat>'.
    date.columns = ['date_' + c[0] + '_' + c[1] for c in date.columns]
    print('date Done')

    return pd.concat([cat_minus2_value, cat_first_value, na_fe, num_last_minus2_diff, date], axis=1)

In [None]:
# Read the raw training rows and move customer_ID into the index.
train = pd.read_parquet('Data/train.parquet')
train.set_index('customer_ID', inplace=True)

columns = train.columns
cat_columns = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]
# Every remaining column except the S_2 date column is treated as numeric.
num_columns = [name for name in columns if name not in cat_columns]
num_columns.remove('S_2')
train.shape

In [None]:
# Per-customer features for the training rows.
tr_fe = feature_engineering(
    data=train,
    cat_columns=cat_columns,
    num_columns=num_columns,
)

In [None]:
# Drop the raw training rows before reading the existing train feature file.
del train
gc.collect()
train_agg = pd.read_parquet('Data/train_all_slopes_corr_pcaslope_lagv2_avediff.parquet')
# Place the new feature columns next to the existing ones and write the result.
pd.concat(
    [train_agg, tr_fe],
    axis=1,
).to_parquet('Data/train_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate.parquet')

In [None]:
# Read the raw test rows and move customer_ID into the index.
test = pd.read_parquet('Data/test.parquet')
test.set_index('customer_ID', inplace=True)

columns = test.columns
cat_columns = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]
# Every remaining column except the S_2 date column is treated as numeric.
num_columns = [name for name in columns if name not in cat_columns]
num_columns.remove('S_2')
test.shape

In [None]:
# Per-customer features for the test rows.
te_fe = feature_engineering(
    data=test,
    cat_columns=cat_columns,
    num_columns=num_columns,
)

In [None]:
# Checkpoint the test features to disk.
te_fe.to_parquet(path='Data/catLastLastNAdate.parquet')

In [None]:
# Release the raw test rows; the features were already written to disk.
del test
gc.collect()

In [None]:
# Reload the checkpointed test features.
te_fe = pd.read_parquet(path='Data/catLastLastNAdate.parquet')

In [None]:
# Show the first five rows of the reloaded features.
te_fe.head(n=5)

In [None]:
# Read the existing test feature file and report its dimensions.
test_agg = pd.read_parquet(
    path='Data/test_all_slopes_corr_pcaslope_lagv2_avediff.parquet',
)
test_agg.shape

In [None]:
# Cut the table into two positional halves (the first gets the floor of n/2 rows).
test_agg_first_half, test_agg_second_half = (
    test_agg.iloc[:len(test_agg) // 2],
    test_agg.iloc[len(test_agg) // 2:],
)
del test_agg
gc.collect()

In [None]:
# For each half, look up the matching rows of te_fe by index label and
# append them as extra columns.
test_new_first_half = pd.concat(
    [test_agg_first_half, te_fe.loc[test_agg_first_half.index]],
    axis='columns',
)
test_new_second_half = pd.concat(
    [test_agg_second_half, te_fe.loc[test_agg_second_half.index]],
    axis='columns',
)
del test_agg_first_half
del test_agg_second_half
gc.collect()

In [None]:
# Dimensions of the two combined halves.
(test_new_first_half.shape, test_new_second_half.shape)

In [None]:
# Write each combined half to its own parquet file.
test_new_first_half.to_parquet(
    path='Data/test_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate_part1.parquet',
)
test_new_second_half.to_parquet(
    path='Data/test_all_slopes_corr_pcaslope_lagv2_avediff_catLastLastNAdate_part2.parquet',
)