In [148]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cash-loans/prev_approved_cash_loans.csv
/kaggle/input/cash-loans/curr_cash_loans.csv
/kaggle/input/raw-data/POS_CASH_balance.csv
/kaggle/input/raw-data/installments_payments.csv
/kaggle/input/raw-data/previous_application-home-credit.csv


In [149]:
from pathlib import Path
# Kaggle dataset folders (inputs) and the working directory used for outputs.
raw_dir = Path('/kaggle/input/raw-data')
cash_loan_dir = Path('/kaggle/input/cash-loans')
output_dir = Path('/kaggle/working')

# File names, grouped by the folder they are read from:
# the first pair is loaded from raw_dir, the second pair from cash_loan_dir.
instal_csv, pos_csv = 'installments_payments.csv', 'POS_CASH_balance.csv'
curr_cash_csv, prev_cash_csv = 'curr_cash_loans.csv', 'prev_approved_cash_loans.csv'

In [150]:
# Read the previous / current cash-loan files and the POS_CASH balance file.
prev_cash_df = pd.read_csv(cash_loan_dir.joinpath(prev_cash_csv))
curr_cash_df = pd.read_csv(cash_loan_dir.joinpath(curr_cash_csv))
pos_df = pd.read_csv(raw_dir.joinpath(pos_csv))

In [151]:
print(f'num rows before filter: {prev_cash_df.shape[0]}')

# note: keep only the previous loans that belong to current applicants.
# Some current applicants do not have previous information and I do not use them.
prev_cash_df = prev_cash_df[prev_cash_df.SK_ID_CURR.isin(curr_cash_df.SK_ID_CURR)]
print(f'num rows after filter: {prev_cash_df.shape[0]}')

# Sanity check: applicant ids left in prev_cash_df that are NOT in curr_cash_df.
# Must print an empty set. (Comparing prev_cash_df against itself can never fail.)
print(set(prev_cash_df.SK_ID_CURR.unique()) - set(curr_cash_df.SK_ID_CURR.unique()))

num rows before filter: 312536
num rows after filter: 249345
set()


In [152]:
print(f'num rows before filter: {curr_cash_df.shape[0]}')
# Keep only the current applicants that have at least one row in prev_cash_df.
curr_cash_df = curr_cash_df[curr_cash_df.SK_ID_CURR.isin(prev_cash_df.SK_ID_CURR)]
print(f'num rows after filter: {curr_cash_df.shape[0]}')
# Sanity check: applicant ids left in curr_cash_df that are NOT in prev_cash_df.
# Must print an empty set. (Comparing prev_cash_df against itself can never fail.)
print(set(curr_cash_df.SK_ID_CURR.unique()) - set(prev_cash_df.SK_ID_CURR.unique()))

num rows before filter: 278232
num rows after filter: 124915
set()


In [153]:
# Load the instalment payments and order them by applicant, previous loan,
# instalment number and instalment version.
instal_df = pd.read_csv(raw_dir / instal_csv).sort_values(
    by=['SK_ID_CURR', 'SK_ID_PREV',
        'NUM_INSTALMENT_NUMBER', 'NUM_INSTALMENT_VERSION'])
print(instal_df.shape)

# note: Competition host mentioned previous application dataset contains up-to-date loan information;
# as a result, I decided to use instalments for up-to-date previous loans only,
# i.e. rows whose SK_ID_PREV appears in prev_cash_df.
is_up_to_date = instal_df.SK_ID_PREV.isin(prev_cash_df.SK_ID_PREV)
instal_df = instal_df.loc[is_up_to_date]
print(instal_df.shape)

# Applicant ids in the instalments that are missing from curr_cash_df.
print(set(instal_df.SK_ID_CURR.unique()) - set(curr_cash_df.SK_ID_CURR.unique()))

(13605401, 8)
(3000744, 8)
set()


### Remove Duplicates for POS_DF
See example below. <br>
I need the following columns from POS_DF:
1. **SK_ID_PREV**.
2. **CNT_INSTALMENT**.

Duplicates occur because **CNT_INSTALMENT_FUTURE** decreases over time. A loan can also have more than one **CNT_INSTALMENT** value, so I keep only **MAX(CNT_INSTALMENT)** per **SK_ID_PREV**. <br>

In [154]:
# Last few POS_CASH rows recorded for previous loan 1810518.
pos_df.loc[pos_df.SK_ID_PREV.eq(1_810_518)].tail()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
3969672,1810518,100003,-20,12.0,7.0,Active,0,0
4887765,1810518,100003,-19,12.0,6.0,Active,0,0
4887970,1810518,100003,-24,12.0,11.0,Active,0,0
5467865,1810518,100003,-18,7.0,0.0,Completed,0,0
6413902,1810518,100003,-22,12.0,9.0,Active,0,0


In [155]:
pos_cols = ['SK_ID_PREV', 'CNT_INSTALMENT']

# Collapse the balance history to one row per previous loan,
# holding the largest CNT_INSTALMENT seen for that loan.
pos_df = (
    pos_df[pos_cols]
    .drop_duplicates()
    .groupby('SK_ID_PREV')['CNT_INSTALMENT']
    .max()
    .reset_index()
)

# Keep only the loans that are present in prev_cash_df.
in_prev_cash = pos_df.SK_ID_PREV.isin(prev_cash_df.SK_ID_PREV)
pos_df = pos_df[in_prev_cash]
pos_df.shape

(230154, 2)

### Transforming Instalments Dataset

In [156]:
# Attach each previous loan's AMT_ANNUITY to its instalment rows (left join on SK_ID_PREV).
prev_cash_df_subset = prev_cash_df.loc[:, ['SK_ID_PREV', 'AMT_ANNUITY']].copy()
instal_df_combined = pd.merge(
    instal_df, prev_cash_df_subset, how='left', on='SK_ID_PREV')

In [157]:
# Total amount paid against each instalment of each previous loan.
# The sum runs over every row of the (SK_ID_PREV, NUM_INSTALMENT_NUMBER) group,
# so at this stage it still includes any duplicated rows.
# The aggregation is passed by name: handing the np.sum callable to transform()
# is deprecated in recent pandas releases.
instal_df_combined['TOTAL_REPAID_PER_INSTAL'] = (
    instal_df_combined
    .groupby(by=['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'])['AMT_PAYMENT']
    .transform('sum')
)

# Easier to work with whole numbers: round every amount column.
for amount_col in ['AMT_INSTALMENT', 'AMT_PAYMENT',
                   'TOTAL_REPAID_PER_INSTAL', 'AMT_ANNUITY']:
    instal_df_combined[amount_col] = instal_df_combined[amount_col].round()

## Examples of Duplicates
Consider the following **3** examples to understand when duplicates arise.

### Example 1
Consider **SK_ID_PREV == 12_393_48** and **NUM_INSTALMENT_NUMBER == 23**. <br>
The same instalment number has 2 versions: 1 and 2. <br>
Note that the same payments are repeated under both versions; on **DAYS_ENTRY_PAYMENT == -1196** the payment of 12979 equals **AMT_INSTALMENT** for **NUM_INSTALMENT_VERSION == 1** and exceeds it (12344) for **NUM_INSTALMENT_VERSION == 2**.

In [158]:
# Example 1: all rows for instalment 23 of previous loan 1239348.
filter_cond = (
    instal_df_combined.SK_ID_PREV.eq(1_239_348)
    & instal_df_combined.NUM_INSTALMENT_NUMBER.eq(23)
)
instal_df_combined.loc[filter_cond]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,AMT_ANNUITY,TOTAL_REPAID_PER_INSTAL
309347,1239348,136896,1.0,23,-1196.0,-1196.0,12979.0,12979.0,12979.0,50647.0
309348,1239348,136896,1.0,23,-1196.0,-1166.0,12979.0,5637.0,12979.0,50647.0
309349,1239348,136896,1.0,23,-1196.0,-1196.0,12979.0,857.0,12979.0,50647.0
309350,1239348,136896,1.0,23,-1196.0,-1174.0,12979.0,5850.0,12979.0,50647.0
309351,1239348,136896,2.0,23,-1196.0,-1196.0,12344.0,12979.0,12979.0,50647.0
309352,1239348,136896,2.0,23,-1196.0,-1166.0,12344.0,5637.0,12979.0,50647.0
309353,1239348,136896,2.0,23,-1196.0,-1196.0,12344.0,857.0,12979.0,50647.0
309354,1239348,136896,2.0,23,-1196.0,-1174.0,12344.0,5850.0,12979.0,50647.0


### Example 2
Consider **SK_ID_PREV == 28_434_84** and **NUM_INSTALMENT_NUMBER == 11, 12**. <br>
Each instalment number is split into 2 partial repayments. <br>
Note that **TOTAL_REPAID_PER_INSTAL == AMT_INSTALMENT**.


In [159]:
# Example 2: all rows for instalments 11 and 12 of previous loan 2843484.
filter_cond = (
    instal_df_combined.SK_ID_PREV.eq(2_843_484)
    & instal_df_combined.NUM_INSTALMENT_NUMBER.isin([11, 12])
)
instal_df_combined.loc[filter_cond]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,AMT_ANNUITY,TOTAL_REPAID_PER_INSTAL
1084062,2843484,229590,1.0,11,-1168.0,-1197.0,5040.0,334.0,5040.0,5040.0
1084063,2843484,229590,1.0,11,-1168.0,-1162.0,5040.0,4706.0,5040.0,5040.0
1084064,2843484,229590,1.0,12,-1138.0,-1162.0,5036.0,334.0,5040.0,5036.0
1084065,2843484,229590,1.0,12,-1138.0,-1136.0,5036.0,4702.0,5040.0,5036.0


### Example 3
Consider **SK_ID_PREV == 26_313_84** and **NUM_INSTALMENT_NUMBER == 1**. <br>
Note duplicates occur when we consider the following columns:
1. **SK_ID_PREV**.
2. **NUM_INSTALMENT_NUMBER**.
3. **DAYS_ENTRY_PAYMENT**.
4. **AMT_PAYMENT**.

Note: while **AMT_INSTALMENT** differs between versions 1 and 2, I think it is more crucial to avoid double-counting **AMT_PAYMENT**.

In [160]:
# Example 3: all rows for instalment 1 of previous loan 2631384.
filter_cond = (
    instal_df_combined.SK_ID_PREV.eq(2_631_384)
    & instal_df_combined.NUM_INSTALMENT_NUMBER.eq(1)
)
instal_df_combined.loc[filter_cond]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,AMT_ANNUITY,TOTAL_REPAID_PER_INSTAL
3000715,2631384,456255,1.0,1,-756.0,-768.0,54022.0,669252.0,54022.0,1338503.0
3000716,2631384,456255,2.0,1,-756.0,-768.0,615230.0,669252.0,54022.0,1338503.0


### Removing Duplicates
A row is kept if it satisfies at least one of these 2 conditions:
1. **AMT_INSTALMENT <= AMT_PAYMENT**: to account for exact payment or prepayment.
2. **TOTAL_REPAID_PER_INSTAL == AMT_INSTALMENT**: to account for multiple partial payments for 1 instalment.

Finally, drop duplicates except for the 1st occurrence based on the following columns:
1. **SK_ID_PREV**.
2. **NUM_INSTALMENT_NUMBER**.
3. **DAYS_ENTRY_PAYMENT**.
4. **AMT_PAYMENT**.

Note: I sorted the data by **NUM_INSTALMENT_NUMBER** and **NUM_INSTALMENT_VERSION** earlier, so the first occurrence that is kept is the one with the earlier instalment version.

In [161]:
# Condition 1: the payment covers at least the instalment amount
# (exact payment or prepayment).
filter_cond_1 = instal_df_combined.AMT_INSTALMENT.le(instal_df_combined.AMT_PAYMENT)

# Condition 2: the payments recorded for the instalment add up to exactly the
# instalment amount (one instalment split into multiple partial payments).
filter_cond_2 = instal_df_combined.TOTAL_REPAID_PER_INSTAL.eq(instal_df_combined.AMT_INSTALMENT)

In [162]:
# Keep rows meeting either condition, then keep only the first occurrence of each
# (loan, instalment number, payment day, payment amount) combination.
dedup_keys = ['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER',
              'DAYS_ENTRY_PAYMENT', 'AMT_PAYMENT']
cleaned_instal_df = (
    instal_df_combined[filter_cond_1 | filter_cond_2]
    .drop_duplicates(subset=dedup_keys)
    .copy()
)

## Check the 3 Cases again
Expectations:
1. For **SK_ID_PREV 12_393_48**, should see 1 row for **NUM_INSTALMENT_VERSION 1, NUM_INSTALMENT_NUMBER 23**.
2. For **SK_ID_PREV 28_434_84**, no change, the original rows are valid.
3. For **SK_ID_PREV 26_313_84**, should see 1 row for **NUM_INSTALMENT_VERSION 1, NUM_INSTALMENT_NUMBER 1**.

In [163]:
# Build the mask from cleaned_instal_df itself. A mask computed on instal_df_combined
# has a different index, so pandas has to reindex it and emits a UserWarning.
filter_cond = (
    (cleaned_instal_df.SK_ID_PREV == 12_393_48)
    & (cleaned_instal_df.NUM_INSTALMENT_NUMBER == 23)
)
cleaned_instal_df[filter_cond]

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,AMT_ANNUITY,TOTAL_REPAID_PER_INSTAL
309347,1239348,136896,1.0,23,-1196.0,-1196.0,12979.0,12979.0,12979.0,50647.0


In [164]:
# Build the mask from cleaned_instal_df itself. A mask computed on instal_df_combined
# has a different index, so pandas has to reindex it and emits a UserWarning.
filter_cond = (
    (cleaned_instal_df.SK_ID_PREV == 28_434_84)
    & (cleaned_instal_df.NUM_INSTALMENT_NUMBER.isin([11, 12]))
)
cleaned_instal_df[filter_cond]

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,AMT_ANNUITY,TOTAL_REPAID_PER_INSTAL
1084062,2843484,229590,1.0,11,-1168.0,-1197.0,5040.0,334.0,5040.0,5040.0
1084063,2843484,229590,1.0,11,-1168.0,-1162.0,5040.0,4706.0,5040.0,5040.0
1084064,2843484,229590,1.0,12,-1138.0,-1162.0,5036.0,334.0,5040.0,5036.0
1084065,2843484,229590,1.0,12,-1138.0,-1136.0,5036.0,4702.0,5040.0,5036.0


In [165]:
# Build the mask from cleaned_instal_df itself. A mask computed on instal_df_combined
# has a different index, so pandas has to reindex it and emits a UserWarning.
filter_cond = (
    (cleaned_instal_df.SK_ID_PREV == 26_313_84)
    & (cleaned_instal_df.NUM_INSTALMENT_NUMBER == 1)
)
cleaned_instal_df[filter_cond]

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,AMT_ANNUITY,TOTAL_REPAID_PER_INSTAL
3000715,2631384,456255,1.0,1,-756.0,-768.0,54022.0,669252.0,54022.0,1338503.0


In [166]:
# Check which instalment versions survived: count the distinct NUM_INSTALMENT_VERSION
# values per (loan, instalment number) and list the counts that occur.
test_df = (
    cleaned_instal_df
    .groupby(['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'])['NUM_INSTALMENT_VERSION']
    .nunique()
    .reset_index()
)
test_df['NUM_INSTALMENT_VERSION'].unique()

array([1])

### Preparing Data For Feature Extraction
1. Drop **TOTAL_REPAID_PER_INSTAL** column and recompute it again.
2. Drop **AMT_ANNUITY** and inner join with **prev_cash_df** on **SK_ID_PREV**.
3. Inner Join with **pos_df** on **SK_ID_PREV**.
4. Inner Join with **curr_cash_df** on **SK_ID_CURR**.


5. Drop the following columns as they will not be needed:
    1. **NUM_INSTALMENT_VERSION**.

In [167]:
# Recompute the per-instalment repayment total now that duplicate rows are gone.
# The stale column is dropped first, then rebuilt from the remaining rows.
cleaned_instal_df = cleaned_instal_df.drop(columns='TOTAL_REPAID_PER_INSTAL')
# The aggregation is passed by name: handing the np.sum callable to transform()
# is deprecated in recent pandas releases.
cleaned_instal_df['TOTAL_REPAID_PER_INSTAL'] = (
    cleaned_instal_df
    .groupby(by=['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'])['AMT_PAYMENT']
    .transform('sum')
)

In [168]:
# Show the column names and the (rows, columns) shape of the cleaned instalments.
print(f'cols: {cleaned_instal_df.columns}')
print(cleaned_instal_df.shape)

cols: Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
       'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
       'AMT_INSTALMENT', 'AMT_PAYMENT', 'AMT_ANNUITY',
       'TOTAL_REPAID_PER_INSTAL'],
      dtype='object')
(2948674, 10)


In [169]:
# Previous-loan attributes to carry forward, joined with the per-loan CNT_INSTALMENT from pos_df.
prev_cash_cols = ['SK_ID_PREV', 'NAME_PRODUCT_TYPE', 'NAME_YIELD_GROUP', 'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE']
pos_df_copy = pos_df.copy()
# Inner join: only loans present in both tables are kept.
combined_pos_prev_cash = pd.merge(
    prev_cash_df[prev_cash_cols], pos_df_copy, how='inner', on='SK_ID_PREV')

In [170]:
# NOTE(review): the drop of AMT_ANNUITY from cleaned_instal_df is disabled
# (was: cleaned_instal_df.drop('AMT_ANNUITY', axis=1, inplace=True)).
# Both frames therefore carry AMT_ANNUITY and the merge suffixes them as
# AMT_ANNUITY_x / AMT_ANNUITY_y - confirm which copy later cells expect.
joined_1 = pd.merge(
    cleaned_instal_df, combined_pos_prev_cash, how='inner', on='SK_ID_PREV')

In [171]:
# (rows, columns) after the inner join with the previous-loan / POS attributes.
joined_1.shape

(2948644, 16)

In [172]:
# Prefix the current application's credit / annuity columns with CURR_ before joining.
new_names = {'AMT_CREDIT': 'CURR_AMT_CREDIT', 'AMT_ANNUITY': 'CURR_AMT_ANNUITY'}
curr_cash_df = curr_cash_df.rename(columns=new_names)

# Inner join on the applicant id.
joined_2 = pd.merge(
    joined_1, curr_cash_df, how='inner', on='SK_ID_CURR')

In [173]:
# (rows, columns) after the inner join with the current applications.
joined_2.shape

(2948644, 37)