In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/raw-dataset/POS_CASH_balance.csv
/kaggle/input/raw-dataset/application_train.csv
/kaggle/input/raw-dataset/previous_application.csv
/kaggle/input/raw-dataset/installments_payments.csv
/kaggle/input/intermediate-dataset/prev_approved_cash_loans.csv
/kaggle/input/intermediate-dataset/curr_cash_loans.csv
/kaggle/input/intermediate-dataset/de_duplicated_data.csv


In [2]:
from pathlib import Path
input_dir = Path('/kaggle/input/raw-dataset')
intermediate_dir = Path('/kaggle/input/intermediate-dataset')
output_dir = Path('/kaggle/working')

instal_csv = 'installments_payments.csv'
curr_cash_csv = 'curr_cash_loans.csv'
prev_cash_csv = 'prev_approved_cash_loans.csv'

In [3]:
prev_cash_df = pd.read_csv(intermediate_dir / prev_cash_csv)
curr_cash_df = pd.read_csv(intermediate_dir / curr_cash_csv)
instal_df = pd.read_csv(input_dir / instal_csv)

In [4]:
## sort instal_df for easier visual inspection according to installment number.
instal_df_sorted = instal_df.sort_values(
    by=['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER', 'NUM_INSTALMENT_VERSION'])

### Duplicate Examples
Use 3 cases below to explain the causes of duplicates.

### Example 1
Consider **SK_ID_PREV == 12_393_48** and **NUM_INSTALMENT_NUMBER == 23**. <br>
The same instalment number has 2 versions: 1 and 2. <br>
Note that **AMT_INSTALMENT** == **AMT_PAYMENT** for **NUM_INSTALMENT_VERSION == 1/2** and **NUM_INSTALMENT_NUMBER == 23** on **DAYS_ENTRY_PAYMENT == -1196**.

In [5]:
filter_cond = (instal_df_sorted.SK_ID_PREV == 12_393_48) \
                & (instal_df_sorted.NUM_INSTALMENT_NUMBER == 23)
instal_df_sorted[filter_cond]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
642124,1239348,136896,1.0,23,-1196.0,-1196.0,12979.08,12979.08
1967766,1239348,136896,1.0,23,-1196.0,-1166.0,12979.08,5637.33
2815271,1239348,136896,1.0,23,-1196.0,-1196.0,12979.08,856.98
2869838,1239348,136896,1.0,23,-1196.0,-1174.0,12979.08,5850.0
642125,1239348,136896,2.0,23,-1196.0,-1196.0,12344.31,12979.08
1967767,1239348,136896,2.0,23,-1196.0,-1166.0,12344.31,5637.33
2815272,1239348,136896,2.0,23,-1196.0,-1196.0,12344.31,856.98
2869839,1239348,136896,2.0,23,-1196.0,-1174.0,12344.31,5850.0


### Example 2
Consider **SK_ID_PREV == 28_434_84** and **NUM_INSTALMENT_NUMBER == 11, 12**. <br>
Each instalment number is splitted into 2 partial repayments. <br>
Note that **TOTAL_REPAID_PER_INSTAL == AMT_INSTALMENT**.

In [6]:
filter_cond = (instal_df_sorted.SK_ID_PREV == 28_434_84) \
                & (instal_df_sorted.NUM_INSTALMENT_NUMBER.isin([11, 12]))
instal_df_sorted[filter_cond]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
4248106,2843484,229590,1.0,11,-1168.0,-1197.0,5039.73,333.63
5991645,2843484,229590,1.0,11,-1168.0,-1162.0,5039.73,4706.1
4093862,2843484,229590,1.0,12,-1138.0,-1162.0,5036.355,333.9
7553708,2843484,229590,1.0,12,-1138.0,-1136.0,5036.355,4702.455


### Example 3
Consider **SK_ID_PREV == 26_313_84** and **NUM_INSTALMENT_NUMBER == 1**. <br>
Note duplicates occur when we consider the following columns:
1. **SK_ID_PREV**.
2. **NUM_INSTALMENT_NUMBER**.
3. **DAYS_ENTRY_PAYMENT**.
4. **AMT_PAYMENT**.

Realise that **AMT_PAYMENT** > **AMT_INSTALMENT** for version 1. <br>
Hence, a new row with version 2 instalment is created to balance the excess **AMT_PAYMENT** made in version 1. <br>
To confirm: (54_022.140 + 61_522_9.515) == 66_925_1.655.

In [7]:
filter_cond = (instal_df_sorted.SK_ID_PREV == 26_313_84) \
                & (instal_df_sorted.NUM_INSTALMENT_NUMBER == 1)
instal_df_sorted[filter_cond]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
11721916,2631384,456255,1.0,1,-756.0,-768.0,54022.14,669251.655
11721915,2631384,456255,2.0,1,-756.0,-768.0,615229.515,669251.655


### A step to remove duplicates.
1. Group by **SK_ID_PREV**, **NUM_INSTALMENT_NUMBER**, then get **MIN(NUM_INSTALMENT_VERSION)**, assign to min_version_df.
2. Inner join min_version_df and instal_df_sorted, this eliminates the issue related to Example 3.
3. Removing floating points for **AMT_INSTALMENT** and **AMT_PAYMENT**. Do a groupby to compute **TOTAL_REPAID_PER_INSTAL**.
4. Filter to according to the following:
    1. **AMT_INSTALMENT <= AMT_PAYMENT**: to account for exact payment or prepayment.
    2. **TOTAL_REPAID_PER_INSTAL == AMT_INSTALMENT**: to account for multiple partial payments for 1 instalment.
    3. This removes the issue related to Example 1.

In [8]:
min_version_df = instal_df_sorted.groupby(['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'])['NUM_INSTALMENT_VERSION'].min().reset_index()

In [9]:
instal_df_cleaned = instal_df_sorted.merge(min_version_df, how='inner', on=['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER', 'NUM_INSTALMENT_VERSION'])

In [10]:
#Easier to work without floating points
instal_df_cleaned['AMT_INSTALMENT'] = instal_df_cleaned['AMT_INSTALMENT'].round()
instal_df_cleaned['AMT_PAYMENT'] = instal_df_cleaned['AMT_PAYMENT'].round()
instal_df_cleaned['TOTAL_REPAID_PER_INSTAL'] = instal_df_cleaned\
                                                .groupby(by=['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'])['AMT_PAYMENT'].transform(np.sum)

In [11]:
filter_cond_1 = (instal_df_cleaned.AMT_INSTALMENT \
                     <= instal_df_cleaned.AMT_PAYMENT) #to account for prepayment

filter_cond_2 = (instal_df_cleaned.TOTAL_REPAID_PER_INSTAL \
                     == instal_df_cleaned.AMT_INSTALMENT)

In [12]:
instal_df_no_dup = instal_df_cleaned[(filter_cond_1) | (filter_cond_2)].copy()
instal_df_no_dup.drop('TOTAL_REPAID_PER_INSTAL', axis=1, inplace=True)

## Check the 3 Cases again
Expectations:
1. For **SK_ID_PREV 12_393_48**, should see 1 row for **NUM_INSTALMENT_VERSION 1, NUM_INSTALMENT_NUMBER 23**.
2. For **SK_ID_PREV 28_434_84**, no change, the original rows are valid.
3. For **SK_ID_PREV 26_313_84**, should see 1 row for **NUM_INSTALMENT_VERSION 1, NUM_INSTALMENT_NUMBER 1**.

In [13]:
filter_cond = (instal_df_no_dup.SK_ID_PREV == 12_393_48) \
                & (instal_df_no_dup.NUM_INSTALMENT_NUMBER == 23)
instal_df_no_dup[filter_cond]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
1870473,1239348,136896,1.0,23,-1196.0,-1196.0,12979.0,12979.0


In [14]:
filter_cond = (instal_df_no_dup.SK_ID_PREV == 28_434_84) \
                & (instal_df_no_dup.NUM_INSTALMENT_NUMBER.isin([11, 12]))
instal_df_no_dup[filter_cond]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
13515122,2843484,229590,1.0,11,-1168.0,-1197.0,5040.0,334.0
13515123,2843484,229590,1.0,11,-1168.0,-1162.0,5040.0,4706.0
13515124,2843484,229590,1.0,12,-1138.0,-1162.0,5036.0,334.0
13515125,2843484,229590,1.0,12,-1138.0,-1136.0,5036.0,4702.0


In [15]:
filter_cond = (instal_df_no_dup.SK_ID_PREV == 26_313_84) \
                & (instal_df_no_dup.NUM_INSTALMENT_NUMBER == 1)
instal_df_no_dup[filter_cond]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
12019598,2631384,456255,1.0,1,-756.0,-768.0,54022.0,669252.0


### Preparing Data For Feature Extraction
1. Recompute **TOTAL_REPAID_PER_INSTAL** again.
2. Compute Interest Rate using Present Value of Cash Loans for **prev_cash_df**. Assume **AMT_CREDIT** is the present value of the Principal.
3. **instal_df_no_dup** inner join **prev_cash_df** on **SK_ID_PREV**, call this **joined_1**.
4. **joined_1** inner Join with **curr_cash_df** on **SK_ID_CURR**, call this **joined_2**.

In [16]:
instal_df_no_dup['TOTAL_REPAID_PER_INSTAL'] = instal_df_no_dup \
                                                .groupby(by=['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'])['AMT_PAYMENT'].transform(np.sum)

In [17]:
print(f'cols: {instal_df_no_dup.columns}')
print(instal_df_no_dup.shape)

cols: Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
       'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
       'AMT_INSTALMENT', 'AMT_PAYMENT', 'TOTAL_REPAID_PER_INSTAL'],
      dtype='object')
(13210964, 9)


In [18]:
prev_cash_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,NAME_PRODUCT_TYPE,NAME_CONTRACT_STATUS,NAME_YIELD_GROUP,DAYS_FIRST_DUE,DAYS_LAST_DUE,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,CNT_PAYMENT
0,2802425,108129,Cash loans,x-sell,Approved,low_action,-134.0,365243.0,25188.615,679671.0,607500.0,36.0
1,2523466,122040,Cash loans,x-sell,Approved,high,-271.0,365243.0,15060.735,136444.5,112500.0,12.0
2,2819243,176158,Cash loans,x-sell,Approved,middle,-482.0,-182.0,47041.335,470790.0,450000.0,12.0
3,1383531,199383,Cash loans,x-sell,Approved,low_normal,-654.0,-144.0,23703.93,340573.5,315000.0,18.0
4,1715995,447712,Cash loans,x-sell,Approved,low_normal,-705.0,-345.0,11368.62,335754.0,270000.0,54.0


In [19]:
#Excluding NAME_CONTRACT_TYPE/STATUS because we know prev_cash_df is for Approved Cash Loans
cols_to_keep = ['SK_ID_PREV', 'SK_ID_CURR', 'NAME_PRODUCT_TYPE', 
                'NAME_YIELD_GROUP', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE', 
                'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE', 'CNT_PAYMENT']

prev_cash_df = prev_cash_df.filter(items=cols_to_keep)

In [20]:
def compute_interest(num_payments, monthly_annuity, principal):
    #assume monthly compounding
    fv = num_payments * monthly_annuity
    compounding_factor = (fv / principal)**(1/num_payments) #this is the monthly compounding factor
    
    ear = (compounding_factor**12) - 1 #ear stands for effective annual rate of interest
    return round(ear,1) * 100 #convert to percentage

prev_cash_df['INTEREST_RATE'] = np.vectorize(compute_interest)(prev_cash_df['CNT_PAYMENT'], prev_cash_df['AMT_ANNUITY'], prev_cash_df['AMT_CREDIT'])

In [21]:
prev_cash_df['INTEREST_RATE'] = round(prev_cash_df['INTEREST_RATE'], 2)
prev_cash_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_PRODUCT_TYPE,NAME_YIELD_GROUP,DAYS_FIRST_DUE,DAYS_LAST_DUE,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,CNT_PAYMENT,INTEREST_RATE
0,2802425,108129,x-sell,low_action,-134.0,365243.0,25188.615,679671.0,607500.0,36.0,10.0
1,2523466,122040,x-sell,high,-271.0,365243.0,15060.735,136444.5,112500.0,12.0,30.0
2,2819243,176158,x-sell,middle,-482.0,-182.0,47041.335,470790.0,450000.0,12.0,20.0
3,1383531,199383,x-sell,low_normal,-654.0,-144.0,23703.93,340573.5,315000.0,18.0,20.0
4,1715995,447712,x-sell,low_normal,-705.0,-345.0,11368.62,335754.0,270000.0,54.0,10.0


In [22]:
instal_df_no_dup.drop('SK_ID_CURR', axis=1, inplace=True)
joined_1 = instal_df_no_dup.merge(
        prev_cash_df, how='inner', on=['SK_ID_PREV'])

In [23]:
joined_1.shape

(3576021, 18)

In [24]:
joined_1.columns

Index(['SK_ID_PREV', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER',
       'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT', 'AMT_INSTALMENT',
       'AMT_PAYMENT', 'TOTAL_REPAID_PER_INSTAL', 'SK_ID_CURR',
       'NAME_PRODUCT_TYPE', 'NAME_YIELD_GROUP', 'DAYS_FIRST_DUE',
       'DAYS_LAST_DUE', 'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE',
       'CNT_PAYMENT', 'INTEREST_RATE'],
      dtype='object')

In [25]:
new_names = {'AMT_CREDIT': 'CURR_AMT_CREDIT', 'AMT_ANNUITY': 'CURR_AMT_ANNUITY'}
curr_cash_df.rename(new_names, axis=1, inplace=True)

joined_2 = joined_1.merge(
    curr_cash_df, how='inner', on=['SK_ID_CURR'])

In [26]:
joined_2.shape

(2839760, 39)

In [27]:
joined_2.columns

Index(['SK_ID_PREV', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER',
       'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT', 'AMT_INSTALMENT',
       'AMT_PAYMENT', 'TOTAL_REPAID_PER_INSTAL', 'SK_ID_CURR',
       'NAME_PRODUCT_TYPE', 'NAME_YIELD_GROUP', 'DAYS_FIRST_DUE',
       'DAYS_LAST_DUE', 'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE',
       'CNT_PAYMENT', 'INTEREST_RATE', 'TARGET', 'CODE_GENDER',
       'DAYS_ID_PUBLISH', 'CURR_AMT_CREDIT', 'CURR_AMT_ANNUITY',
       'NAME_CONTRACT_TYPE', 'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE',
       'DAYS_EMPLOYED', 'FLAG_OWN_REALTY', 'NAME_HOUSING_TYPE',
       'NAME_EDUCATION_TYPE', 'LIVE_CITY_NOT_WORK_CITY',
       'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'FLAG_EMAIL',
       'FLAG_EMP_PHONE', 'FLAG_MOBIL', 'FLAG_CONT_MOBILE',
       'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE'],
      dtype='object')

In [28]:
joined_2[['SK_ID_CURR', 'TARGET']].drop_duplicates().groupby('TARGET')['TARGET'].count()

TARGET
0    110371
1      9752
Name: TARGET, dtype: int64

In [29]:
joined_2.isnull().sum(axis=0)

SK_ID_PREV                    0
NUM_INSTALMENT_VERSION        0
NUM_INSTALMENT_NUMBER         0
DAYS_INSTALMENT               0
DAYS_ENTRY_PAYMENT            4
AMT_INSTALMENT                0
AMT_PAYMENT                   4
TOTAL_REPAID_PER_INSTAL       0
SK_ID_CURR                    0
NAME_PRODUCT_TYPE             0
NAME_YIELD_GROUP              0
DAYS_FIRST_DUE              639
DAYS_LAST_DUE               639
AMT_ANNUITY                   0
AMT_CREDIT                    0
AMT_GOODS_PRICE               0
CNT_PAYMENT                   0
INTEREST_RATE                 0
TARGET                        0
CODE_GENDER                   0
DAYS_ID_PUBLISH               0
CURR_AMT_CREDIT               0
CURR_AMT_ANNUITY            153
NAME_CONTRACT_TYPE            0
AMT_INCOME_TOTAL              0
NAME_INCOME_TYPE              0
DAYS_EMPLOYED                 0
FLAG_OWN_REALTY               0
NAME_HOUSING_TYPE             0
NAME_EDUCATION_TYPE           0
LIVE_CITY_NOT_WORK_CITY       0
REG_CITY

## 4 Missing Data for DAYS_ENTRY_PAYMENT AND AMT_PAYMENT
**AMT_INSTALMENT == 0** causes the null data. <br>
Just remove **AMT_INSTALMENT == 0**.

In [30]:
joined_2[joined_2.DAYS_ENTRY_PAYMENT.isnull()] #113_144_2, 138_428_3, 194_879_2, 202_886_5

Unnamed: 0,SK_ID_PREV,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,TOTAL_REPAID_PER_INSTAL,SK_ID_CURR,NAME_PRODUCT_TYPE,...,NAME_EDUCATION_TYPE,LIVE_CITY_NOT_WORK_CITY,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,FLAG_EMAIL,FLAG_EMP_PHONE,FLAG_MOBIL,FLAG_CONT_MOBILE,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE
353184,1131442,2.0,3,-52.0,,0.0,,0.0,130761,x-sell,...,Secondary / secondary special,0,0,0,0,0,1,1,2.0,2.0
1299188,1384283,2.0,3,-43.0,,0.0,,0.0,153840,x-sell,...,Higher education,0,0,0,0,1,1,1,0.0,0.0
1724469,1948792,2.0,5,-68.0,,0.0,,0.0,321514,x-sell,...,Secondary / secondary special,0,0,0,0,1,1,1,0.0,0.0
2293594,2028865,3.0,3,-39.0,,0.0,,0.0,352148,x-sell,...,Secondary / secondary special,0,0,0,0,0,1,1,0.0,0.0


In [31]:
joined_2.dtypes

SK_ID_PREV                    int64
NUM_INSTALMENT_VERSION      float64
NUM_INSTALMENT_NUMBER         int64
DAYS_INSTALMENT             float64
DAYS_ENTRY_PAYMENT          float64
AMT_INSTALMENT              float64
AMT_PAYMENT                 float64
TOTAL_REPAID_PER_INSTAL     float64
SK_ID_CURR                    int64
NAME_PRODUCT_TYPE            object
NAME_YIELD_GROUP             object
DAYS_FIRST_DUE              float64
DAYS_LAST_DUE               float64
AMT_ANNUITY                 float64
AMT_CREDIT                  float64
AMT_GOODS_PRICE             float64
CNT_PAYMENT                 float64
INTEREST_RATE               float64
TARGET                        int64
CODE_GENDER                  object
DAYS_ID_PUBLISH               int64
CURR_AMT_CREDIT             float64
CURR_AMT_ANNUITY            float64
NAME_CONTRACT_TYPE           object
AMT_INCOME_TOTAL            float64
NAME_INCOME_TYPE             object
DAYS_EMPLOYED                 int64
FLAG_OWN_REALTY             

In [32]:
joined_2 = joined_2[joined_2.AMT_INSTALMENT != 0]

In [33]:
joined_2.isnull().sum(axis=0)

SK_ID_PREV                    0
NUM_INSTALMENT_VERSION        0
NUM_INSTALMENT_NUMBER         0
DAYS_INSTALMENT               0
DAYS_ENTRY_PAYMENT            0
AMT_INSTALMENT                0
AMT_PAYMENT                   0
TOTAL_REPAID_PER_INSTAL       0
SK_ID_CURR                    0
NAME_PRODUCT_TYPE             0
NAME_YIELD_GROUP              0
DAYS_FIRST_DUE              639
DAYS_LAST_DUE               639
AMT_ANNUITY                   0
AMT_CREDIT                    0
AMT_GOODS_PRICE               0
CNT_PAYMENT                   0
INTEREST_RATE                 0
TARGET                        0
CODE_GENDER                   0
DAYS_ID_PUBLISH               0
CURR_AMT_CREDIT               0
CURR_AMT_ANNUITY            153
NAME_CONTRACT_TYPE            0
AMT_INCOME_TOTAL              0
NAME_INCOME_TYPE              0
DAYS_EMPLOYED                 0
FLAG_OWN_REALTY               0
NAME_HOUSING_TYPE             0
NAME_EDUCATION_TYPE           0
LIVE_CITY_NOT_WORK_CITY       0
REG_CITY

In [34]:
joined_2.to_csv(output_dir / 'de_duplicated_data.csv')

In [35]:
cols_to_show = ['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_NUMBER', 
                'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
                'AMT_INSTALMENT', 'AMT_PAYMENT', 'INTEREST_RATE', 
                'AMT_CREDIT', 'AMT_ANNUITY']
joined_2.head()[cols_to_show]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,INTEREST_RATE,AMT_CREDIT,AMT_ANNUITY
0,1000011,198678,1,-435.0,-438.0,92435.0,92435.0,30.0,879831.0,92435.04
1,1000011,198678,2,-405.0,-411.0,92435.0,92435.0,30.0,879831.0,92435.04
2,1000011,198678,3,-375.0,-377.0,92435.0,92435.0,30.0,879831.0,92435.04
3,1000011,198678,4,-345.0,-346.0,92435.0,92435.0,30.0,879831.0,92435.04
4,1000011,198678,5,-315.0,-314.0,92435.0,92435.0,30.0,879831.0,92435.04
