# <center> **Home Credit Default Risk Assessment**
# <center> **Previous Applications Dataset**

# **Libraries**

In [1]:
import pandas as pd
import numpy as np

from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import ArbitraryNumberImputer

import functions
import importlib
importlib.reload(functions)

import warnings

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Data**

## **Load Data**

In [3]:
previous = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\previous_application.csv",
    index_col=False
)

## **Reduce Memory Usage**

In [4]:
previous = functions.reduce_memory_usage(previous)

Memory usage of dataframe is 471.48 MB
Memory usage after optimization is: 309.01 MB
Decreased by 34.5%


## **Clean Bad Data**

In [5]:
previous['DAYS_FIRST_DRAWING'][previous['DAYS_FIRST_DRAWING'] == 365243.0] = np.nan
previous['DAYS_FIRST_DUE'][previous['DAYS_FIRST_DUE'] == 365243.0] = np.nan
previous['DAYS_LAST_DUE_1ST_VERSION'][previous['DAYS_LAST_DUE_1ST_VERSION'] == 365243.0] = np.nan
previous['DAYS_LAST_DUE'][previous['DAYS_LAST_DUE'] == 365243.0] = np.nan
previous['DAYS_TERMINATION'][previous['DAYS_TERMINATION'] == 365243.0] = np.nan

## **Remove Infinity Values**

In [6]:
previous.replace([np.inf, -np.inf], np.nan, inplace=True)

## **Missing Values**

In [7]:
functions.MissingValues(previous)

Unnamed: 0,NumberMissing,PercentageMissing,DataType
RATE_INTEREST_PRIVILEGED,1664263,99.64,float16
RATE_INTEREST_PRIMARY,1664263,99.64,float16
DAYS_FIRST_DRAWING,1607509,96.25,float32
DAYS_TERMINATION,898978,53.82,float32
AMT_DOWN_PAYMENT,895844,53.64,float32
RATE_DOWN_PAYMENT,895844,53.64,float16
DAYS_LAST_DUE,884286,52.94,float32
NAME_TYPE_SUITE,820405,49.12,object
DAYS_LAST_DUE_1ST_VERSION,766929,45.92,float32
DAYS_FIRST_DUE,713710,42.73,float32


## **Drop Features (More than 50% Missing)**

In [8]:
columns = ['RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'RATE_DOWN_PAYMENT', 'AMT_DOWN_PAYMENT']
previous = previous.drop(columns, axis=1)

## **Imputation**

In [9]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(previous)
previous = ani.transform(previous)

In [10]:
ci = CategoricalImputer(imputation_method='missing', fill_value='UNKNOWN')
ci.fit(previous)
previous = ci.transform(previous)

## **Aggregation**

In [11]:
previous = previous.groupby('SK_ID_CURR').agg({
    'SK_ID_PREV': 'count',  
    'AMT_ANNUITY': 'mean',    
    'DAYS_DECISION': ['mean', 'max', 'min'],  
    'CNT_PAYMENT': ['sum'],
    'DAYS_FIRST_DUE': lambda x: x.max() - x.min(),
    'DAYS_LAST_DUE': lambda x: x.max() - x.min()
}).reset_index()

previous.columns = ['_'.join(col).strip() if type(col) is tuple else col for col in previous.columns]

previous = previous.rename(columns={
    'SK_ID_CURR_': 'SK_ID_CURR', 
    'SK_ID_PREV_count': 'NUM_PREVIOUS_APPLICATIONS',
    'AMT_ANNUITY_mean': 'AVG_ANNUITY_AMOUNT',
    'DAYS_DECISION_mean': 'AVG_DAYS_DECISION',
    'DAYS_DECISION_max': 'MAX_DAYS_DECISION',
    'DAYS_DECISION_min': 'MIN_DAYS_DECISION',
    'CNT_PAYMENT_sum': 'SUM_CNT_PAYMENT',
    'DAYS_FIRST_DUE_<lambda>': 'RANGE_DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_<lambda>': 'RANGE_DAYS_LAST_DUE',
})

In [12]:
previous.head()

Unnamed: 0,SK_ID_CURR,NUM_PREVIOUS_APPLICATIONS,AVG_ANNUITY_AMOUNT,AVG_DAYS_DECISION,MAX_DAYS_DECISION,MIN_DAYS_DECISION,SUM_CNT_PAYMENT,RANGE_DAYS_FIRST_DUE,RANGE_DAYS_LAST_DUE
0,100001,1,3951.0,-1740.0,-1740,-1740,8.0,0.0,0.0
1,100002,1,9251.775391,-606.0,-606,-606,24.0,0.0,0.0
2,100003,3,56553.988281,-1305.0,-746,-2341,30.0,1594.0,1444.0
3,100004,1,5357.25,-815.0,-815,-815,4.0,0.0,0.0
4,100005,2,-47592.898438,-536.0,-315,-757,-inf,99293.0,99533.0


# **Save Dataframe as CSV File**

In [13]:
previous.to_csv(r"C:\Users\Dell\Documents\AI\Risk\Data\Data\previous 25.csv", index=False)