In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
# Read the credit-card balance table from the working directory into `cc`.
cc = pd.read_csv(filepath_or_buffer="credit_card_balance.csv")


In [3]:
# Preview the first five rows (5 is also the default for head()).
cc.head(n=5)
 


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [4]:
# Print a structural summary of `cc`: index range, column names and dtypes.
cc.info()
 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3840312 entries, 0 to 3840311
Data columns (total 23 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   SK_ID_PREV                  int64  
 1   SK_ID_CURR                  int64  
 2   MONTHS_BALANCE              int64  
 3   AMT_BALANCE                 float64
 4   AMT_CREDIT_LIMIT_ACTUAL     int64  
 5   AMT_DRAWINGS_ATM_CURRENT    float64
 6   AMT_DRAWINGS_CURRENT        float64
 7   AMT_DRAWINGS_OTHER_CURRENT  float64
 8   AMT_DRAWINGS_POS_CURRENT    float64
 9   AMT_INST_MIN_REGULARITY     float64
 10  AMT_PAYMENT_CURRENT         float64
 11  AMT_PAYMENT_TOTAL_CURRENT   float64
 12  AMT_RECEIVABLE_PRINCIPAL    float64
 13  AMT_RECIVABLE               float64
 14  AMT_TOTAL_RECEIVABLE        float64
 15  CNT_DRAWINGS_ATM_CURRENT    float64
 16  CNT_DRAWINGS_CURRENT        int64  
 17  CNT_DRAWINGS_OTHER_CURRENT  float64
 18  CNT_DRAWINGS_POS_CURRENT    float64
 19  CNT_INSTALMENT_MATURE

In [5]:
# Show which columns are stored with object dtype (i.e. not numeric).
cc.select_dtypes(include=["object"]).columns
 


Index(['NAME_CONTRACT_STATUS'], dtype='object')

In [6]:
# Count missing values per column and show the 15 columns with the most.
(
    cc.isna()
    .sum()
    .sort_values(ascending=False)
    .head(15)
)

AMT_PAYMENT_CURRENT           767988
AMT_DRAWINGS_ATM_CURRENT      749816
CNT_DRAWINGS_POS_CURRENT      749816
AMT_DRAWINGS_OTHER_CURRENT    749816
AMT_DRAWINGS_POS_CURRENT      749816
CNT_DRAWINGS_OTHER_CURRENT    749816
CNT_DRAWINGS_ATM_CURRENT      749816
CNT_INSTALMENT_MATURE_CUM     305236
AMT_INST_MIN_REGULARITY       305236
SK_ID_PREV                         0
AMT_TOTAL_RECEIVABLE               0
SK_DPD                             0
NAME_CONTRACT_STATUS               0
CNT_DRAWINGS_CURRENT               0
AMT_PAYMENT_TOTAL_CURRENT          0
dtype: int64

In [7]:
# Replace every missing value in the table with 0 before deriving features.
cc = cc.fillna(value=0)


Credit Utilization Rate

"How much of their credit limit do they use?"

In [8]:
# Credit utilization per row: outstanding balance divided by the credit limit.
# A zero limit makes the ratio undefined: x / 0 yields +/-inf and 0 / 0 yields NaN.
# Previously only +/-inf was mapped to 0, so 0 / 0 rows stayed NaN and were
# silently skipped by the later mean/max aggregations. Map every undefined
# ratio to 0 so all zero-limit rows are treated the same way.
cc["UTILIZATION"] = (
    cc["AMT_BALANCE"]
    .div(cc["AMT_CREDIT_LIMIT_ACTUAL"])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)


Payment Difference: total amount paid in the month minus the minimum required installment (same logic as installments)

In [9]:
# Payment difference: total paid this period minus the minimum installment.
# Positive values mean more than the minimum was paid; negative means less.
cc["PAY_DIFF"] = cc["AMT_PAYMENT_TOTAL_CURRENT"].sub(cc["AMT_INST_MIN_REGULARITY"])


Overdue signal: 1 when SK_DPD > 0 for the row, otherwise 0

In [10]:
# Binary overdue flag: 1 where SK_DPD is strictly positive, else 0.
cc["OVERDUE"] = cc["SK_DPD"].gt(0).astype(int)


Aggregate monthly data → per credit card (SK_ID_PREV)

In [11]:
# Collapse the per-row records into one row per SK_ID_PREV.
# Each keyword names a source column; its list gives the statistics to compute.
# The result has a two-level column header: (source column, statistic).
cc_agg_prev = cc.groupby(by="SK_ID_PREV").agg(
    dict(
        UTILIZATION=["mean", "max"],
        AMT_BALANCE=["mean", "max", "sum"],
        PAY_DIFF=["mean", "sum"],
        OVERDUE=["sum", "mean"],
        SK_DPD=["max", "mean"],
        SK_DPD_DEF=["max", "mean"],
    )
)


Flatten column names

In [12]:
# Flatten the two-level (source column, statistic) header produced by the
# groupby/agg step into single upper-case names such as CC_UTILIZATION_MEAN,
# then turn the SK_ID_PREV group key back into an ordinary column.
# The MultiIndex guard makes this cell safe to re-run: on a second run the
# columns are already plain strings, and "_".join(col) would otherwise split
# each name into characters (and reset_index would add a stray "index" column).
if isinstance(cc_agg_prev.columns, pd.MultiIndex):
    cc_agg_prev.columns = [f"CC_{'_'.join(col).upper()}" for col in cc_agg_prev.columns]
    cc_agg_prev = cc_agg_prev.reset_index()


Merge into previous_application dataset

In [13]:
# Load the cleaned previous-application table (or your latest file), attach
# the per-card credit-card aggregates by SK_ID_PREV, and zero-fill the gaps.
# A left join keeps every row of the loaded table; rows without a match in
# cc_agg_prev get NaN in the CC_* columns, which fillna(0) turns into 0.
# Note: joblib.load unpickles the file, so only load files you trust.
prev = (
    joblib.load("prev_clean.pkl")
    .merge(cc_agg_prev, on="SK_ID_PREV", how="left")
    .fillna(0)
)


Aggregate everything per customer (SK_ID_CURR)

In [14]:
# Roll the `prev` table up to one row per customer (SK_ID_CURR).
# Every int64/float64 column except the two ID keys is summarised.
num_cols = list(prev.select_dtypes(include=["int64", "float64"]).columns)
num_cols.remove("SK_ID_CURR")
num_cols.remove("SK_ID_PREV")

# Same four statistics for each remaining numeric column.
agg_dict = {name: ["mean", "max", "min", "sum"] for name in num_cols}

prev_cc_agg = prev.groupby(by="SK_ID_CURR").agg(agg_dict)

# Flatten (column, statistic) pairs into PREV_CC_<COLUMN>_<STAT> names and
# restore SK_ID_CURR as a regular column for merging.
prev_cc_agg.columns = [
    f"PREV_CC_{'_'.join(parts).upper()}" for parts in prev_cc_agg.columns
]
prev_cc_agg = prev_cc_agg.reset_index()


In [15]:
# Load the table saved as prev_inst_clean.pkl; it is merged on SK_ID_CURR below.
# Note: joblib.load unpickles the file, so only load files you trust.
prev_inst = joblib.load(filename="prev_inst_clean.pkl")


In [16]:
# Left-join the per-customer aggregates onto prev_inst by SK_ID_CURR, then
# replace the NaNs left for customers without a match with 0.
prev_full = pd.merge(
    left=prev_inst,
    right=prev_cc_agg,
    on="SK_ID_CURR",
    how="left",
).fillna(0)


In [17]:
# Persist the combined feature table; the call returns the list of files written.
joblib.dump(value=prev_full, filename="prev_inst_cc_clean.pkl")


['prev_inst_cc_clean.pkl']