In [1]:
import pandas as pd 
import numpy as np
import joblib

In [2]:
# Load the POS/cash balance table; the path is relative to the notebook's working directory.
pos = pd.read_csv("POS_CASH_balance.csv")


In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
pos.head()
 


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [4]:
# Row count, column dtypes and memory footprint of the raw table.
pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001358 entries, 0 to 10001357
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   SK_ID_PREV             int64  
 1   SK_ID_CURR             int64  
 2   MONTHS_BALANCE         int64  
 3   CNT_INSTALMENT         float64
 4   CNT_INSTALMENT_FUTURE  float64
 5   NAME_CONTRACT_STATUS   object 
 6   SK_DPD                 int64  
 7   SK_DPD_DEF             int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 610.4+ MB


In [5]:
# List the object-dtype columns, i.e. the ones that need encoding before numeric aggregation.
pos.select_dtypes(include='object').columns

Index(['NAME_CONTRACT_STATUS'], dtype='object')

In [6]:
# Missing-value count per column, largest first (at most 20 columns shown).
pos.isnull().sum().sort_values(ascending=False).head(20)

CNT_INSTALMENT_FUTURE    26087
CNT_INSTALMENT           26071
SK_ID_PREV                   0
SK_ID_CURR                   0
MONTHS_BALANCE               0
NAME_CONTRACT_STATUS         0
SK_DPD                       0
SK_DPD_DEF                   0
dtype: int64

One-hot encode contract status

Example values of NAME_CONTRACT_STATUS: “Active”, “Completed”, “Signed”, etc.

In [7]:
# Replace NAME_CONTRACT_STATUS with indicator columns. drop_first=True omits
# the first category, which becomes the implicit baseline.
# NOTE(review): the per-loan aggregation visible in this notebook does not
# include these indicator columns - confirm whether they are needed.
pos = pd.get_dummies(
    data=pos,
    columns=["NAME_CONTRACT_STATUS"],
    drop_first=True,
)


Create key feature: delinquency flag

SK_DPD (days past due) > 0 → the payment for that month was late or missed.

In [8]:
# 1 when the row has any days past due (SK_DPD > 0), otherwise 0.
pos["LATE"] = pos["SK_DPD"].gt(0).astype(int)


Aggregate the monthly POS records → one row per previous loan (SK_ID_PREV)

In [9]:
# Collapse the per-row records into one row per previous loan (SK_ID_PREV).
# The result has a two-level column header: (source column, statistic).
pos_agg_prev = pos.groupby("SK_ID_PREV").agg(
    dict(
        MONTHS_BALANCE=["mean", "max", "min"],
        SK_DPD=["mean", "max"],
        SK_DPD_DEF=["mean", "max"],
        LATE=["sum", "mean"],  # count and share of late rows per loan
    )
)


In [10]:
# Flatten the two-level (source column, statistic) header produced by .agg()
# into single names such as POS_SK_DPD_MAX, then move SK_ID_PREV out of the
# index into a regular column so it can be used as a merge key.
#
# The MultiIndex guard keeps this cell safe to re-run: on an already-flattened
# frame, "_".join(col) would join the characters of each plain-string name and
# reset_index() would add a spurious "index" column.
if isinstance(pos_agg_prev.columns, pd.MultiIndex):
    pos_agg_prev.columns = ["POS_" + "_".join(col).upper() for col in pos_agg_prev.columns]
    pos_agg_prev = pos_agg_prev.reset_index()


In [13]:
# Load the per-loan table saved by an earlier pipeline step.
# joblib.load unpickles the file, so only load artifacts you produced yourself or otherwise trust.
prev = joblib.load("prev_with_inst_per_loan.pkl")


In [14]:
# Attach the per-loan POS aggregates to the previous-loan table. The left join
# keeps every row of prev, including loans with no POS records.
#
# Any POS aggregate columns left over from an earlier run of this cell are
# dropped first; merging a second time would otherwise produce POS_*_x /
# POS_*_y duplicates that silently flow into the later aggregation.
prev = prev.drop(
    columns=pos_agg_prev.columns.difference(["SK_ID_PREV"]),
    errors="ignore",
).merge(
    pos_agg_prev,
    on="SK_ID_PREV",
    how="left",
    validate="many_to_one",  # at most one aggregate row per loan; raise instead of duplicating rows
)
# NOTE(review): this zero-fills every missing value in the frame, not only the
# POS columns added above, so "no POS history" becomes indistinguishable from
# a real 0 (e.g. in the MONTHS_BALANCE statistics) - confirm this is intended.
prev = prev.fillna(0)


In [15]:
# Roll the per-loan table up to one row per client (SK_ID_CURR).
# Only float64/int64 columns are aggregated; columns of any other dtype
# (bool, narrower ints/floats, object) are left out by this selection.
num_cols = [*prev.select_dtypes(include=["float64", "int64"]).columns]
for id_col in ("SK_ID_PREV", "SK_ID_CURR"):
    # The id columns are keys, not features.
    num_cols.remove(id_col)

agg_dict = {feature: ["mean", "max", "min", "sum"] for feature in num_cols}

prev_pos_agg = prev.groupby("SK_ID_CURR").agg(agg_dict)
# Flatten (column, statistic) pairs into names like PREV_POS_<COLUMN>_<STAT>.
prev_pos_agg.columns = [
    f"PREV_POS_{'_'.join(pair).upper()}" for pair in prev_pos_agg.columns
]
prev_pos_agg = prev_pos_agg.reset_index()


In [16]:
# Persist both levels: the per-loan table (one row per SK_ID_PREV) ...
joblib.dump(prev, "prev_full_with_inst_cc_pos_per_loan.pkl")
# ... and the per-client aggregate (one row per SK_ID_CURR) that is merged into train/test later.
joblib.dump(prev_pos_agg, "prev_inst_cc_pos_clean.pkl")


['prev_inst_cc_pos_clean.pkl']

In [3]:
# Application-level train/test frames from an earlier step
# (file names suggest bureau features are already merged in - confirm).
mtraindf = joblib.load("train_with_bureau.pkl")
mtestdf  = joblib.load("test_with_bureau.pkl")

# Despite its name, prev_full is read from the file that the per-client
# (SK_ID_CURR) aggregate was dumped to, not from the per-loan table.
prev_full = joblib.load("prev_inst_cc_pos_clean.pkl")


In [4]:
# Left-join the per-client aggregates onto both application frames; every
# applicant row is kept whether or not its SK_ID_CURR appears in prev_full.
mtraindf, mtestdf = (
    frame.merge(prev_full, on="SK_ID_CURR", how="left")
    for frame in (mtraindf, mtestdf)
)


In [5]:
# Replace every remaining missing value in both frames with 0.
mtraindf, mtestdf = (frame.fillna(0) for frame in (mtraindf, mtestdf))


In [6]:
# Persist the final modelling frames.
joblib.dump(mtraindf, "train_FINAL.pkl")
joblib.dump(mtestdf, "test_FINAL.pkl")


['test_FINAL.pkl']

In [7]:
# Shape and dtype summary of the final training frame.
mtraindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 425 entries, SK_ID_CURR to PREV_POS_POS_LATE_MEAN_SUM
dtypes: bool(124), float64(261), int64(40)
memory usage: 742.5 MB


In [8]:
# Shape and dtype summary of the final test frame.
# NOTE(review): the recorded outputs show bool(124)/float64(261) for train but
# bool(121)/float64(265) for test, so the two frames do not appear to share the
# same feature columns - confirm alignment before modelling.
mtestdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Columns: 425 entries, SK_ID_CURR to PREV_POS_POS_LATE_MEAN_SUM
dtypes: bool(121), float64(265), int64(39)
memory usage: 118.7 MB
