## Pos-cash balance time series feature extraction
Train a GRU network on the POS-cash balance time series, then save its predictions so they can be used as a feature in the final model.

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
import gc

import os
# Turn on automatic garbage collection before the large tables are loaded.
gc.enable()

# Show which raw competition files are available next to this notebook.
print(os.listdir("../input"))

['download_command.txt', 'application_test.csv', 'HomeCredit_columns_description.csv', 'POS_CASH_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'application_train.csv', 'bureau.csv', 'previous_application.csv', 'bureau_balance.csv', 'sample_submission.csv']


Read pos-cash balance and create features.

In [2]:
# Load the monthly POS-cash snapshots (one row per previous loan per month).
pos = pd.read_csv('../input/POS_CASH_balance.csv')

# Swap the categorical contract status for one indicator column per status;
# the indicators are appended after the remaining original columns.
pos = pd.concat(
    [
        pos.drop(columns='NAME_CONTRACT_STATUS'),
        pd.get_dummies(pos['NAME_CONTRACT_STATUS'], prefix='NAME_CONTRACT_STATUS'),
    ],
    axis=1,
)

# Shrink the instalment counters by a factor of ten.
for instalment_col in ('CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE'):
    pos[instalment_col] = pos[instalment_col] / 10

Read target from main table.

In [3]:
# Only the client id (and, for train, the label) is needed from the main tables.
data_app = pd.read_csv(
    '../input/application_train.csv',
    usecols=['SK_ID_CURR', 'TARGET'],
)
data_test = pd.read_csv(
    '../input/application_test.csv',
    usecols=['SK_ID_CURR'],
)
(data_app.shape, data_test.shape)

((307511, 2), (48744, 1))

In [4]:
# Keep only the clients that have at least one row in the POS-cash table.
trn_id = data_app.loc[data_app['SK_ID_CURR'].isin(pos['SK_ID_CURR']), 'SK_ID_CURR']
test_id = data_test.loc[data_test['SK_ID_CURR'].isin(pos['SK_ID_CURR']), 'SK_ID_CURR']
(trn_id.shape, test_id.shape)

((289444,), (47808,))

Split the POS-cash rows into train and test sets, then aggregate by client ID (`SK_ID_CURR`) and month (`MONTHS_BALANCE`) to build one time series per client.

In [5]:
# Aggregations applied to every (client, month) group. The dict order fixes
# the order of the resulting feature columns.
num_aggregations = {
    'SK_ID_PREV': ['count'],
    'CNT_INSTALMENT': ['sum', 'max', 'mean'],
    'CNT_INSTALMENT_FUTURE': ['sum', 'max', 'mean'],
    'NAME_CONTRACT_STATUS_Approved': ['sum'],
    'NAME_CONTRACT_STATUS_Canceled': ['sum'],
    'NAME_CONTRACT_STATUS_Completed': ['sum'],
    'NAME_CONTRACT_STATUS_Demand': ['sum'],
    'NAME_CONTRACT_STATUS_Returned to the store': ['sum'],
    'NAME_CONTRACT_STATUS_Signed': ['sum'],
    'NAME_CONTRACT_STATUS_XNA': ['sum'],
    'SK_DPD': ['sum', 'mean'],
    'SK_DPD_DEF': ['sum', 'mean']
}


def _aggregate_by_client_month(frame):
    """Collapse POS-cash rows to one row per (SK_ID_CURR, MONTHS_BALANCE).

    The two-level column labels produced by ``agg`` are flattened to
    ``<column>_<STAT>`` (e.g. ``SK_DPD_MEAN``).
    """
    monthly = frame.groupby(['SK_ID_CURR', 'MONTHS_BALANCE']).agg(num_aggregations)
    monthly.columns = pd.Index(
        [column + "_" + stat.upper() for column, stat in monthly.columns.tolist()]
    )
    return monthly


# Restrict the POS-cash rows to each split's clients, then aggregate.
pos_trn = _aggregate_by_client_month(pos.loc[pos.SK_ID_CURR.isin(trn_id)])
pos_test = _aggregate_by_client_month(pos.loc[pos.SK_ID_CURR.isin(test_id)])
pos_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SK_ID_PREV_COUNT,CNT_INSTALMENT_SUM,CNT_INSTALMENT_MAX,CNT_INSTALMENT_MEAN,CNT_INSTALMENT_FUTURE_SUM,CNT_INSTALMENT_FUTURE_MAX,CNT_INSTALMENT_FUTURE_MEAN,NAME_CONTRACT_STATUS_Approved_SUM,NAME_CONTRACT_STATUS_Canceled_SUM,NAME_CONTRACT_STATUS_Completed_SUM,NAME_CONTRACT_STATUS_Demand_SUM,NAME_CONTRACT_STATUS_Returned to the store_SUM,NAME_CONTRACT_STATUS_Signed_SUM,NAME_CONTRACT_STATUS_XNA_SUM,SK_DPD_SUM,SK_DPD_MEAN,SK_DPD_DEF_SUM,SK_DPD_DEF_MEAN
SK_ID_CURR,MONTHS_BALANCE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
100001,-96,1,0.4,0.4,0.4,0.2,0.2,0.2,0,0,0,0,0,0,0,0,0.0,0,0.0
100001,-95,1,0.4,0.4,0.4,0.1,0.1,0.1,0,0,0,0,0,0,0,7,7.0,7,7.0
100001,-94,1,0.4,0.4,0.4,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0,0.0
100001,-93,1,0.4,0.4,0.4,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0.0,0,0.0
100001,-57,1,0.4,0.4,0.4,0.4,0.4,0.4,0,0,0,0,0,0,0,0,0.0,0,0.0


Convert each dataframe to a 3D array of shape (n_samples, n_time_steps, n_features) for GRU training.

In [6]:
# Sentinel written wherever a client has no value: both for NaN aggregates and
# for months in which the client has no POS-cash row at all.
PAD_VALUE = -9

pos_trn = pos_trn.fillna(float(PAD_VALUE))
pos_test = pos_test.fillna(float(PAD_VALUE))

# 1. Build ONE shared (feature, month) column layout for both splits.
#    Deriving the time axis from the train frame alone and reusing it for the
#    test frame breaks (or silently misaligns months) as soon as the two
#    splits do not contain exactly the same set of MONTHS_BALANCE values.
feature_names = pos_trn.columns
all_months = np.union1d(
    pos_trn.index.get_level_values('MONTHS_BALANCE').unique(),
    pos_test.index.get_level_values('MONTHS_BALANCE').unique(),
)
wide_columns = pd.MultiIndex.from_product(
    [feature_names, all_months], names=[None, 'MONTHS_BALANCE']
)


def _to_wide(long_frame):
    """Pivot MONTHS_BALANCE from the row index into the columns.

    The result has one row per SK_ID_CURR and feature-major, month-minor
    columns following ``wide_columns``; missing cells hold ``PAD_VALUE``.
    """
    wide = long_frame.unstack(level='MONTHS_BALANCE', fill_value=PAD_VALUE)
    # Reindex only when needed: the wide frame is large, so skip the copy
    # when the layout already matches.
    if not wide.columns.equals(wide_columns):
        wide = wide.reindex(columns=wide_columns, fill_value=PAD_VALUE)
    return wide


train_wide = _to_wide(pos_trn)
test_wide = _to_wide(pos_test)

# 2. Collect the dimensions needed for the reshape.
n_train_samples = len(train_wide.index)
n_test_samples = len(test_wide.index)
n_features = len(feature_names)
n_timesteps = len(all_months)

# 3. Reshape the 2D wide data to 3D and swap axes to the layout GRU/LSTM
#    layers expect: (samples, time steps, features).
#    float32 halves the memory of these arrays.
train_x = train_wide.to_numpy(dtype=np.float32).reshape(n_train_samples, n_features, n_timesteps)
train_x = np.swapaxes(train_x, 1, 2)

test_x = test_wide.to_numpy(dtype=np.float32).reshape(n_test_samples, n_features, n_timesteps)
test_x = np.swapaxes(test_x, 1, 2)

# 4. Look the labels up by client id so row i of train_y belongs to row i of
#    train_x. Rows of train_x follow train_wide.index (sorted SK_ID_CURR), which
#    matches the order of application_train.csv only if that file is sorted.
train_y = data_app.set_index('SK_ID_CURR')['TARGET'].reindex(train_wide.index)
assert train_y.notna().all(), "some training clients have no TARGET"
assert len(train_y) == n_train_samples

# 5. Print the shapes as a sanity check.
print("train_x shape:", train_x.shape)
print("test_x shape:", test_x.shape)
print("train_y shape:", train_y.shape)

train_x shape: (289444, 96, 18)
test_x shape: (47808, 96, 18)
train_y shape: (289444,)


Define the GRU model and a callback that reports the validation AUC during training.

In [7]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, GRU
from keras.regularizers import l2
from keras.optimizers import RMSprop, Adam

def build_model(time_step, n_features):
    """Create the uncompiled sequence classifier.

    Architecture: a single GRU layer with 8 units that reads inputs of shape
    ``(time_step, n_features)``, followed by a one-unit sigmoid output layer.

    Args:
        time_step: length of the time axis of each input sample.
        n_features: number of features per time step.

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    recurrent_layer = GRU(8, input_shape=(time_step, n_features))
    output_layer = Dense(1, activation='sigmoid')
    return Sequential([recurrent_layer, output_layer])

from keras.callbacks import Callback
from keras.callbacks import EarlyStopping
import logging

class IntervalEvaluation(Callback):
    """Keras callback that prints the validation ROC AUC every ``interval`` epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair to score. It must contain
            exactly two items; anything else (including the empty default)
            raises ``ValueError`` when unpacked.
        interval: score on every ``interval``-th epoch, i.e. on 0-based epochs
            ``interval-1``, ``2*interval-1``, ...
    """

    def __init__(self, validation_data=(), interval=1):
        # Zero-argument super() runs Callback.__init__. The previous
        # super(Callback, self) started the MRO lookup *after* Callback and
        # therefore skipped the base-class initialisation entirely.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None rather than a shared mutable dict; it is not
        # read here but is part of the callback signature.
        if epoch % self.interval == (self.interval-1):
            # The model outputs shape (n, 1); take the single probability column.
            y_pred = self.model.predict(self.X_val, verbose=0)[:,0]
            score = roc_auc_score(self.y_val, y_pred)
            print('roc score',score)

Training...

In [8]:
from tensorflow.keras.optimizers.legacy import Adam # for mac only

# Local import so this cell stays self-contained; the rest of the Keras
# imports live in the model-definition cell.
from keras import backend as K

# Run stratified 5-fold CV. Each training row gets one out-of-fold prediction;
# test predictions are averaged over the five fold models.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
oof_preds = np.zeros(train_x.shape[0])
sub_preds = np.zeros(test_x.shape[0])

# Loop-invariant: labels as a plain array for positional indexing.
train_labels = train_y.values

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_x, train_y)):
    trn_x, val_x = train_x[trn_idx], train_x[val_idx]
    trn_y, val_y = train_labels[trn_idx], train_labels[val_idx]
    # Prints the validation AUC every 5 epochs.
    ival = IntervalEvaluation(validation_data=(val_x, val_y), interval=5)

    model = build_model(trn_x.shape[1],trn_x.shape[2])
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001))
    model.fit(trn_x, trn_y,
              validation_data=(val_x, val_y),  # documented form is a tuple, not a list
              epochs=20, batch_size=2048,
              # Up-weight the positive class (label 1) ten-fold in the loss.
              class_weight = {0:1,1:10},
              callbacks=[ival], verbose=0)

    # Use a large inference batch and silence the per-call progress bars;
    # predict() otherwise falls back to its default batch size of 32.
    oof_preds[val_idx] = model.predict(val_x, batch_size=2048, verbose=0)[:,0]
    sub_preds += model.predict(test_x, batch_size=2048, verbose=0)[:,0] / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))

    del model, trn_x, trn_y, val_x, val_y
    # Dropping the Python reference does not release Keras' global state;
    # reset it so models from earlier folds do not accumulate.
    K.clear_session()
    gc.collect()

# Score the complete out-of-fold vector, i.e. the feature that gets saved.
print('Full OOF AUC : %.6f' % roc_auc_score(train_labels, oof_preds))

2025-10-15 15:18:34.387081: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-10-15 15:18:34.387307: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-10-15 15:18:34.387326: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.92 GB
2025-10-15 15:18:34.387501: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-10-15 15:18:34.387804: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-10-15 15:18:39.602486: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:18:39.774644:

roc score 0.4890230449932504
roc score 0.5307974923556896
roc score 0.5438424637111718
roc score 0.5515033564901897
Fold  1 AUC : 0.551503


2025-10-15 15:21:02.382573: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:21:02.538546: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:21:02.667984: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:21:06.569981: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:21:06.626213: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:21:22.103015: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:21:22.144178: I tensorflow/core/grappler/optimizers/cust

roc score 0.5294599282839128
roc score 0.5399941672646458
roc score 0.5449230098045219
roc score 0.5489119215154075
Fold  2 AUC : 0.548912


2025-10-15 15:23:45.491897: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:23:45.658068: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:23:45.826750: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:23:50.567737: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:23:50.662007: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:24:10.057707: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:24:10.101798: I tensorflow/core/grappler/optimizers/cust

roc score 0.51808879390946
roc score 0.5458377068562532
roc score 0.5518504929256632
roc score 0.5564795940266483
Fold  3 AUC : 0.556480


2025-10-15 15:26:18.837440: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:26:19.036854: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:26:19.218051: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:26:23.422648: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:26:23.477110: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:26:38.115295: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:26:38.154147: I tensorflow/core/grappler/optimizers/cust

roc score 0.5130906684755088
roc score 0.5289794433207754
roc score 0.5372363134978391
roc score 0.5434127597745201
Fold  4 AUC : 0.543413


2025-10-15 15:28:19.839633: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:28:20.015584: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:28:20.158835: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:28:24.430019: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:28:24.483141: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:28:38.837401: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-10-15 15:28:38.876097: I tensorflow/core/grappler/optimizers/cust

roc score 0.5248818662727912
roc score 0.5418667197012665
roc score 0.5511828053381509
roc score 0.5590604052285991
Fold  5 AUC : 0.559060


Save model prediction to disk.

In [10]:
# Index each score by the client id of the row it was predicted for.
# Rows of train_x / test_x follow train_wide.index / test_wide.index (sorted
# SK_ID_CURR from the groupby + unstack), so those indexes are the keys to use.
# trn_id / test_id keep CSV file order and match only if the files are sorted.
assert len(oof_preds) == len(train_wide.index)
assert len(sub_preds) == len(test_wide.index)
pos_score_train = pd.DataFrame({'pos_score':oof_preds}, index=train_wide.index)
pos_score_test = pd.DataFrame({'pos_score':sub_preds}, index=test_wide.index)

# Create the output directory first so a missing folder cannot discard the
# results of the whole training run.
os.makedirs('../output', exist_ok=True)
pos_score_train.to_csv('../output/pos_score_train.csv')
pos_score_test.to_csv('../output/pos_score_test.csv')