# **Libraries**

In [1]:
import pandas as pd
import numpy as np

from feature_engine.imputation import ArbitraryNumberImputer

import functions
import importlib
importlib.reload(functions)

import warnings

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Data**

## **Load Data**

In [3]:
install = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\installments_payments.csv",
    index_col=False)

In [4]:
install.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


## **Reduce Memory Usage**

In [5]:
install = functions.reduce_memory_usage(install)

Memory usage of dataframe is 830.41 MB
Memory usage after optimization is: 311.40 MB
Decreased by 62.5%


## **Remove Infinity Values**

In [6]:
install.replace([np.inf, -np.inf], np.nan, inplace=True)

## **Missing Values**

In [7]:
functions.MissingValues(install)

Unnamed: 0,NumberMissing,PercentageMissing,DataType
DAYS_ENTRY_PAYMENT,2905,0.02,float16
AMT_PAYMENT,2905,0.02,float32


## **Imputation**

In [8]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(install)
install = ani.transform(install)

## **Aggregation**

In [9]:
install = install.groupby('SK_ID_CURR').agg({

    'SK_ID_PREV': 'count',  
    'AMT_INSTALMENT': ['sum', 'mean'],   
    'AMT_PAYMENT': ['sum', 'mean', 'max', 'min'],  

}).reset_index()


install['SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT'] = install[('AMT_PAYMENT', 'sum')] / install[('AMT_INSTALMENT', 'sum')]
install['MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT'] = install[('AMT_PAYMENT', 'mean')] - install[('AMT_INSTALMENT', 'mean')]


install.columns = ['_'.join(col).strip() if type(col) is tuple else col for col in install.columns]

install = install.rename(columns={

    'SK_ID_CURR_': 'SK_ID_CURR', 
    'SK_ID_PREV_count': 'NUM_PREVIOUS_APPLICATIONS',
    'AMT_INSTALMENT_sum': 'SUM_AMT_INSTALMENT',
    'AMT_INSTALMENT_mean': 'AVG_AMT_INSTALMENT',
    'AMT_PAYMENT_sum': 'SUM_AMT_PAYMENT',
    'AMT_PAYMENT_mean': 'AVG_AMT_PAYMENT',
    'AMT_PAYMENT_max': 'MAX_AMT_PAYMENT',
    'AMT_PAYMENT_min': 'MIN_AMT_PAYMENT',
    'SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT_': 'SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT',
    'MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT_': 'MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT'

})

In [10]:
install.head()

Unnamed: 0,SK_ID_CURR,NUM_PREVIOUS_APPLICATIONS,SUM_AMT_INSTALMENT,AVG_AMT_INSTALMENT,SUM_AMT_PAYMENT,AVG_AMT_PAYMENT,MAX_AMT_PAYMENT,MIN_AMT_PAYMENT,SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT,MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT
0,100001,7,41195.93,5885.132324,41195.93,5885.132324,17397.900391,3951.0,1.0,0.0
1,100002,19,219625.7,11559.24707,219625.7,11559.24707,53093.746094,9251.775391,1.0,0.0
2,100003,25,1618865.0,64754.585938,1618865.0,64754.585938,560835.375,6662.970215,1.0,0.0
3,100004,3,21288.46,7096.154785,21288.46,7096.154785,10573.964844,5357.25,1.0,0.0
4,100005,9,56161.84,6240.205078,56161.84,6240.205078,17656.244141,4813.200195,1.0,0.0


# **Save Dataframe as CSV File**

In [11]:
install.to_csv(r"C:\Users\Dell\Documents\AI\Risk\Data\Data\install.csv", index=False)