### Goal of this Notebook
1. Load relevant csv files as dataframes.
2. Select the relevant column(s) from each csv file.

### Raw Dataset (downloaded from Kaggle)
1. application_train.csv.
2. previous_application.csv.

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and print the full path of every file found.
for dirname, _subdirs, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/raw-dataset/POS_CASH_balance.csv
/kaggle/input/raw-dataset/application_train.csv
/kaggle/input/raw-dataset/previous_application.csv
/kaggle/input/raw-dataset/installments_payments.csv


In [4]:
import pandas as pd
import os
from pathlib import Path

# Raw CSVs are read from the attached dataset folder; derived files are written to the
# working directory.
kaggle_root = Path('/kaggle')
input_dir = kaggle_root / 'input' / 'raw-dataset'
output_dir = kaggle_root / 'working'

### Start with **application_train.csv**

In [5]:
# Read the full application table; the trailing .shape displays (rows, columns)
# as a quick size check.
app_train_path = input_dir.joinpath('application_train.csv')
app_train_df = pd.read_csv(app_train_path)
app_train_df.shape

(307511, 122)

### Select current applicants with Cash loans

In [6]:
# Boolean mask: True where the contract type is exactly 'Cash loans'.
is_cash_loans = app_train_df['NAME_CONTRACT_TYPE'].eq('Cash loans')
cash_loans_only = app_train_df.loc[is_cash_loans]
cash_loans_only.shape  # sanity check: fewer rows, same number of columns

(278232, 122)

In [7]:
# Display the cash-loan rows whose AMT_ANNUITY value is missing.
cash_loans_only.loc[cash_loans_only['AMT_ANNUITY'].isna()]

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
47531,155054,0,Cash loans,M,N,N,0,180000.0,450000.0,,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
50035,157917,0,Cash loans,F,N,N,0,94500.0,450000.0,,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,1.0
51594,159744,0,Cash loans,F,N,N,0,202500.0,539100.0,,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,1.0
55025,163757,0,Cash loans,F,N,N,0,162000.0,296280.0,,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,4.0
59934,169487,0,Cash loans,M,Y,N,0,202500.0,360000.0,,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,6.0
75873,187985,0,Cash loans,M,Y,N,0,144000.0,219249.0,,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,11.0
89343,203726,0,Cash loans,F,Y,N,0,90000.0,157500.0,,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0
123872,243648,0,Cash loans,F,N,Y,0,202500.0,929088.0,,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,5.0
207186,340147,0,Cash loans,M,N,N,0,171000.0,486000.0,,...,0,0,0,0,0.0,0.0,1.0,1.0,0.0,2.0
227939,364022,0,Cash loans,F,N,Y,0,315000.0,628069.5,,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,2.0


### Duplicates Check
Consider the unique values of **SK_ID_CURR**, which identifies the current loan application. <br>
Check whether the number of unique values equals the number of rows belonging to cash loans.

In [8]:
# Every cash-loan row should carry a distinct SK_ID_CURR, so the number of unique IDs
# must match the number of rows. dropna=False keeps a missing ID counted as a value,
# matching len(Series.unique()).
count_unique_ids = cash_loans_only['SK_ID_CURR'].nunique(dropna=False)
count_rows_cash_loans = len(cash_loans_only)

# Enforce the check so a "Restart & Run All" stops here instead of silently exporting
# duplicated IDs; the final expression still displays the comparison result.
assert count_unique_ids == count_rows_cash_loans, (
    f"SK_ID_CURR is not unique: {count_rows_cash_loans} rows "
    f"but only {count_unique_ids} distinct IDs"
)
count_unique_ids == count_rows_cash_loans

True

### Identify all the columns to keep

In [9]:
# Columns retained from application_train.csv, grouped by theme.
identification_cols = [
    'SK_ID_CURR',
    'TARGET',
    'CODE_GENDER',
    'DAYS_ID_PUBLISH',
]
loan_info_cols = [
    'AMT_CREDIT',
    'AMT_ANNUITY',
    'NAME_CONTRACT_TYPE',
]
income_info_cols = [
    'AMT_INCOME_TOTAL',
    'NAME_INCOME_TYPE',
    'DAYS_EMPLOYED',
]
asset_info_cols = [
    'FLAG_OWN_REALTY',
    'NAME_HOUSING_TYPE',
]
socio_economic_cols = ['NAME_EDUCATION_TYPE']

# Address-mismatch flags and contact flags are combined into a single group.
address_cols = [
    'LIVE_CITY_NOT_WORK_CITY',
    'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY',
]
mobile_email_cols = [
    'FLAG_EMAIL',
    'FLAG_EMP_PHONE',
    'FLAG_MOBIL',
    'FLAG_CONT_MOBILE',
]
contact_details_accuracy_cols = [*address_cols, *mobile_email_cols]

# Default measures for borrowers "linked" to applicants.
other_relevant_cols = [
    'DEF_30_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE',
]

In [10]:
# Concatenate the column groups in a fixed order, then keep only those columns.
cols_to_keep = [
    *identification_cols,
    *loan_info_cols,
    *income_info_cols,
    *asset_info_cols,
    *socio_economic_cols,
    *contact_details_accuracy_cols,
    *other_relevant_cols,
]
cash_loans_only = cash_loans_only[cols_to_keep]
cash_loans_only.shape  # column count should now equal len(cols_to_keep)

(278232, 22)

### Save the new dataset belonging to current cash loans

In [11]:
# Write the trimmed cash-loan table to the working directory without the index column.
# TODO: look at the missing values again (e.g. the null AMT_ANNUITY rows inspected earlier).
new_path = output_dir.joinpath('curr_cash_loans.csv')
cash_loans_only.to_csv(new_path, index=False)

### Now consider previous_application.csv

In [12]:
# Read the previous-applications table and preview its first rows.
prev_app_path = input_dir.joinpath('previous_application.csv')
prev_app_df = pd.read_csv(prev_app_path)
prev_app_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [13]:
# Spot-check a single previous application (SK_ID_PREV 2631384), showing only its
# ID, AMT_ANNUITY and CNT_PAYMENT values.
prev_app_df.loc[
    prev_app_df['SK_ID_PREV'] == 2_631_384,
    ['SK_ID_PREV', 'AMT_ANNUITY', 'CNT_PAYMENT'],
]

Unnamed: 0,SK_ID_PREV,AMT_ANNUITY,CNT_PAYMENT
1383554,2631384,54022.14,36.0


### Check for duplicates
Each row should be uniquely identified by the pair **SK_ID_PREV** and **SK_ID_CURR**, which are the IDs of the previous and current loans, respectively. An empty result below means no duplicated pairs were found.

In [14]:
# Rows whose (SK_ID_PREV, SK_ID_CURR) pair repeats an earlier row; empty when the pair is unique.
prev_app_df[prev_app_df.duplicated(subset=['SK_ID_PREV', 'SK_ID_CURR'])]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL


### Filter for previous Cash loans which were approved

In [15]:
# Row-wise conditions, combined below with a logical AND.
is_cash_loan = prev_app_df['NAME_CONTRACT_TYPE'].eq('Cash loans')
is_approved = prev_app_df['NAME_CONTRACT_STATUS'].eq('Approved')
# Portfolio == 'Cash' serves as an additional layer of check on top of the contract type.
is_purpose = prev_app_df['NAME_PORTFOLIO'].eq('Cash')

prev_cash_loans = prev_app_df.loc[is_cash_loan & is_approved & is_purpose]
prev_cash_loans.shape

(312536, 37)

In [16]:
# Number of approved previous cash loans per NAME_PRODUCT_TYPE value.
prev_cash_loans.groupby('NAME_PRODUCT_TYPE')['NAME_PRODUCT_TYPE'].count()

NAME_PRODUCT_TYPE
walk-in     44850
x-sell     267686
Name: NAME_PRODUCT_TYPE, dtype: int64

In [17]:
# Number of rows per NAME_PORTFOLIO value; only 'Cash' should remain after the filter.
prev_cash_loans.groupby('NAME_PORTFOLIO')['NAME_PORTFOLIO'].count()

NAME_PORTFOLIO
Cash    312536
Name: NAME_PORTFOLIO, dtype: int64

### Keep Relevant Columns Only

In [18]:
# Columns retained from previous_application.csv.
# NOTE: loan_info_cols and cols_to_keep are rebound here; the same names were used earlier
# for the application_train selection, so re-run the notebook top to bottom.
ID_cols = ['SK_ID_PREV', 'SK_ID_CURR']

loan_info_cols = [
    'NAME_CONTRACT_TYPE',
    'NAME_PRODUCT_TYPE',
    'NAME_CONTRACT_STATUS',
    'NAME_YIELD_GROUP',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE',
]
repayment_info_cols = [
    'AMT_ANNUITY',
    'AMT_CREDIT',
    'AMT_GOODS_PRICE',
    'CNT_PAYMENT',
]

cols_to_keep = [*ID_cols, *loan_info_cols, *repayment_info_cols]

In [19]:
# Write only the selected columns of the approved previous cash loans, without the index.
new_path = output_dir.joinpath('prev_approved_cash_loans.csv')
prev_cash_loans.to_csv(new_path, columns=cols_to_keep, index=False)