In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder 
import json 
import gc 

In [6]:
# Read the feature-name lists that were selected in the EDA notebook.
def _load_feature_list(path):
    """Return the parsed JSON content of the file at ``path``."""
    with open(path) as handle:
        return json.load(handle)


num_features = _load_feature_list("selected_features.json")
cat_features = _load_feature_list("selected_cat_features.json")

In [7]:
# Load train data (customer key, statement date S_2 and the selected features only)
train_data = pd.read_csv("train_data.csv", usecols=['customer_ID', 'S_2'] + num_features + cat_features)

# The per-customer 'last' aggregations computed later take the final row of
# each customer in frame order, so rows must be chronological per customer.
# Sort just the two key columns first (cheap) and reorder the full frame only
# when the file was not already ordered by customer_ID and S_2.
# S_2 is a YYYY-MM-DD string, so lexicographic order equals date order.
sorted_keys = train_data[['customer_ID', 'S_2']].sort_values(['customer_ID', 'S_2'])
if not sorted_keys.index.equals(train_data.index):
    train_data = train_data.loc[sorted_keys.index].reset_index(drop=True)
del sorted_keys

print(f"train_data shape: {train_data.shape}")
train_data.head(3)


train_data shape: (5531451, 97)


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,D_46,D_47,D_48,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,D_53,S_5,B_11,S_7,B_12,S_8,D_55,D_56,D_58,B_14,D_59,...,D_74,D_75,D_76,D_77,B_25,D_78,D_79,B_28,B_30,B_33,S_22,S_23,S_24,S_25,D_102,D_103,D_104,D_107,B_37,R_26,R_27,B_38,B_40,S_27,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_126,D_128,D_129,D_130,D_131,D_142
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,,,0.00063,0.080986,0.708906,0.1706,0.358587,0.525351,0.255736,0.063902,0.059416,0.006466,0.148698,1.335856,0.008207,0.001423,0.207334,0.736463,0.096219,,0.023381,0.002768,0.161345,0.148266,0.922998,0.354596,0.152025,0.158612,0.018385,0.063646,...,0.080422,0.069067,,,0.007729,0.001576,0.004239,0.084683,0.0,1.001101,0.89409,0.135561,0.911191,0.974539,0.766688,1.008691,1.004587,0.670041,0.004572,,1.008949,2.0,0.21006,0.676922,0.007871,1.0,0.23825,0.0,4.0,0.23212,0.236266,0.0,0.70228,0.434345,1.0,1.007819,1.00008,0.002052,0.005972,
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,,,0.002526,0.069419,0.712795,0.113239,0.35363,0.521311,0.223329,0.065261,0.057744,0.001614,0.149723,1.339794,0.008373,0.001984,0.202778,0.720886,0.099804,,0.030599,0.002749,0.140951,0.14353,0.919414,0.326757,0.156201,0.148459,0.013035,0.065501,...,0.081413,0.074166,,,0.001864,0.009896,0.007597,0.081843,0.0,1.006779,0.902135,0.136333,0.919876,0.975624,0.786007,1.000084,1.004118,0.668647,0.004654,,1.003205,2.0,0.184093,0.822281,0.003444,1.0,0.247217,0.0,4.0,0.243532,0.241885,0.0,0.707017,0.430501,1.0,1.004333,1.008344,0.001034,0.004838,
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,,,0.007605,0.068839,0.720884,0.060492,0.33465,0.524568,0.189424,0.066982,0.056647,0.005126,0.151955,1.337179,0.009355,0.007426,0.206629,0.738044,0.134073,,0.048367,0.010077,0.112229,0.137014,1.001977,0.304124,0.153795,0.139504,0.056653,0.070607,...,0.078891,0.07651,,,0.005419,0.009629,0.003094,0.081954,0.0,1.001014,0.939654,0.134938,0.958699,0.974067,0.80684,1.003014,1.009285,0.670901,0.019176,,1.000754,2.0,0.154837,0.853498,0.003269,1.0,0.239867,0.0,4.0,0.240768,0.23971,0.0,0.704843,0.434409,1.0,1.007831,1.006878,0.005681,0.005497,


In [8]:
# Encode categorical features as integer codes.
# Every fitted encoder is kept in `label_encoders` (keyed by column name) so the
# same category -> code mapping can later be re-applied to other data with
# `label_encoders[c].transform(...)` instead of being re-fitted from scratch.
label_encoders = {}
for c in cat_features:
    le = LabelEncoder()
    # Cast to string so every value, including missing ones (-> "nan"),
    # is treated as an ordinary label by the encoder.
    train_data[c] = train_data[c].astype(str).fillna("nan")
    train_data[c] = le.fit_transform(train_data[c])
    label_encoders[c] = le

print("Categorical features encoded.")

Categorical features encoded.


In [9]:
# Summarise every numerical feature per customer with five statistics.
stats = ('mean', 'std', 'min', 'max', 'last')
num_aggs = {feature: list(stats) for feature in num_features}

num_agg = train_data.groupby('customer_ID')[num_features].agg(num_aggs)
# Flatten the (feature, statistic) column pairs into "feature_statistic" names.
num_agg.columns = [f"{feature}_{stat}".strip() for feature, stat in num_agg.columns.values]
# Turn customer_ID back into a regular column.
num_agg = num_agg.reset_index()

print(f"Numeric aggregation done. Shape: {num_agg.shape}")

Numeric aggregation done. Shape: (458913, 426)


In [10]:
# Aggregate categorical features: last row's value and most frequent value per customer.
def _first_mode(values):
    """Return a single most-frequent value of ``values``.

    ``Series.mode`` returns every tied value, so a customer whose categories
    tie would not reduce to one scalar. Taking the first entry of the result
    always yields exactly one value. Returns NaN when the mode is empty
    (e.g. all values missing).
    """
    modes = values.mode()
    return modes.iat[0] if len(modes) else np.nan


# pandas names the output column after the callable; keep the "_mode" suffix.
_first_mode.__name__ = "mode"

cat_aggs = {col: ['last', _first_mode] for col in cat_features}

cat_agg = train_data.groupby('customer_ID')[cat_features].agg(cat_aggs)
# Flatten the (feature, statistic) column pairs into "feature_statistic" names.
cat_agg.columns = ['_'.join(col).strip() for col in cat_agg.columns.values]
cat_agg.reset_index(inplace=True)

print(f"Categorical aggregation done. Shape: {cat_agg.shape}")

Categorical aggregation done. Shape: (458913, 21)


In [11]:
# Join the numeric and categorical per-customer tables on the customer key.
train_agg = pd.merge(num_agg, cat_agg, how='left', on='customer_ID')
print(f"Combined training data shape: {train_agg.shape}")

# Attach the labels read from the labels file, again keyed by customer.
train_labels = pd.read_csv("train_labels.csv")
train_final = pd.merge(train_agg, train_labels, how="left", on="customer_ID")

print(f"Final training table shape: {train_final.shape}")
train_final.head(3)

Combined training data shape: (458913, 446)
Final training table shape: (458913, 447)


Unnamed: 0,customer_ID,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_48_mean,D_48_std,D_48_min,D_48_max,D_48_last,D_44_mean,D_44_std,D_44_min,D_44_max,D_44_last,B_9_mean,B_9_std,B_9_min,B_9_max,B_9_last,D_77_mean,D_77_std,D_77_min,D_77_max,D_77_last,B_3_mean,B_3_std,B_3_min,B_3_max,B_3_last,B_18_mean,B_18_std,B_18_min,B_18_max,B_18_last,B_7_mean,B_7_std,B_7_min,B_7_max,...,D_71_std,D_71_min,D_71_max,D_71_last,D_79_mean,D_79_std,D_79_min,D_79_max,D_79_last,D_130_mean,D_130_std,D_130_min,D_130_max,D_130_last,D_41_mean,D_41_std,D_41_min,D_41_max,D_41_last,B_30_last,B_30_mode,B_38_last,B_38_mode,D_114_last,D_114_mode,D_116_last,D_116_mode,D_117_last,D_117_mode,D_120_last,D_120_mode,D_126_last,D_126_mode,D_63_last,D_63_mode,D_64_last,D_64_mode,D_68_last,D_68_mode,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.933824,0.024194,0.86858,0.960384,0.934745,0.240978,0.076875,0.135586,0.403448,0.192376,0.004673,0.002822,0.00063,0.009397,0.003258,0.00622,0.00318,0.000519,0.009535,0.009535,0.419295,0.002884,0.417256,0.421334,0.421334,0.006456,0.002942,0.000783,0.009866,0.007174,0.842565,0.184835,0.645819,1.007897,1.007897,0.036624,0.023195,0.001681,0.060502,...,0.13233,0.075868,0.430954,0.377991,0.00417,0.002254,0.00116,0.008742,0.002193,0.005195,0.003004,0.001034,0.009851,0.004186,0.005021,0.003331,0.000685,0.009857,0.001604,0,0,1,1,1,1,0,0,4,4,0,0,2,2,2,2,1,1,6,6,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.89982,0.022119,0.861109,0.929122,0.880519,0.048203,0.031312,0.010117,0.105999,0.014696,0.004311,0.002976,0.000123,0.009873,0.008781,0.010298,0.011024,0.001722,0.045093,0.012926,0.223911,0.002707,0.218147,0.227755,0.227755,0.005663,0.003354,0.000861,0.012861,0.005068,1.004884,0.003183,1.000319,1.009827,1.003602,0.028049,0.013631,0.015836,0.068204,...,0.003392,0.005655,0.016488,0.007636,0.004831,0.003047,1.8e-05,0.009117,0.009117,0.003449,0.002149,0.000322,0.006812,0.002202,0.004993,0.002903,0.000728,0.009513,0.005552,0,0,1,1,1,1,0,0,0,0,0,0,2,2,1,1,1,1,6,6,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.878454,0.028911,0.79767,0.904482,0.880875,0.092284,0.060616,0.030227,0.255134,0.08037,0.013902,0.03443,0.000628,0.128216,0.000628,0.00473,0.003302,0.000422,0.009521,0.009392,0.386343,0.07167,0.14791,0.409137,0.407122,0.005493,0.002834,0.000626,0.009383,0.007196,0.933173,0.137122,0.689252,1.009126,1.00408,0.034433,0.015459,0.021261,0.079764,...,0.002854,0.007514,0.015802,0.015025,0.003859,0.002161,0.000266,0.006786,0.006786,0.005951,0.002992,0.001975,0.009955,0.002654,0.006842,0.002683,0.001653,0.00956,0.003796,0,0,0,0,1,1,0,0,0,0,0,0,2,2,1,1,2,2,6,6,0


In [12]:
# Halve the width of 64-bit numeric columns to reduce memory usage.
# TODO: placed here for now; check at the end whether removing this cell improves the score.
downcasts = (('float64', 'float32'), ('int64', 'int32'))
for name in train_final.columns:
    current = train_final[name].dtype
    # First matching rule wins; columns of any other dtype are left alone.
    narrower = next((dst for src, dst in downcasts if current == src), None)
    if narrower is not None:
        train_final[name] = train_final[name].astype(narrower)

print("Memory optimization complete.")

Memory optimization complete.


In [13]:
# The raw rows and both intermediate aggregates are no longer needed:
# drop the references and run a collection pass to try to lower RAM usage.
del train_data
del num_agg
del cat_agg
gc.collect()
print("Feature Engineering complete.")

Feature Engineering complete.
