In [49]:
import pandas as pd
import numpy as np
import gc
%matplotlib inline

In [12]:
def one_hot_encoding(df, nan_as_category=True):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified, a new frame is returned.
    nan_as_category : bool
        Forwarded to ``pd.get_dummies`` as ``dummy_na``; when true an extra
        indicator column is emitted for missing values.

    Returns
    -------
    (pandas.DataFrame, list)
        The encoded frame and the names of the columns that were not present
        in the input.
    """
    columns_before = set(df.columns)

    # Collect the columns whose dtype is reported as 'object'.
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(
        df, columns=object_columns, dummy_na=nan_as_category)

    added_columns = [
        name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

In [63]:
def installments_payments(num_rows=None, nan_as_category=True, recent_days=365):
    """Build per-customer features from the installments payments table.

    Reads ``../input/installments_payments.csv``, derives row-level payment
    features, and aggregates everything to one row per ``SK_ID_CURR``.

    Parameters
    ----------
    num_rows : int or None
        Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads the whole file.
    nan_as_category : bool
        Forwarded to ``one_hot_encoding`` (previously this argument was
        accepted but ignored and ``True`` was always used).
    recent_days : int
        Rows with ``DAYS_INSTALMENT > -recent_days`` feed the ``RECENT_INSTAL_*``
        aggregates. The default of 365 reproduces the original window.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``. Contains the record/loan count features,
        the ``INSTAL_TIME_SPAN_*`` and ``INSTAL_TIMES_*`` statistics, the
        ``INSTAL_*`` aggregates over all rows and the ``RECENT_INSTAL_*``
        aggregates over the recent window (NaN for customers with no recent
        rows, because of the left merge).
    """
    ins = pd.read_csv('../input/installments_payments.csv', nrows=num_rows)
    ins, cat_cols = one_hot_encoding(ins, nan_as_category=nan_as_category)

    # Percentage and difference paid in each installment (amount paid and installment value)
    # NOTE(review): a zero AMT_INSTALMENT makes PAYMENT_PERC inf/NaN — confirm
    # that downstream consumers tolerate that.
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_PAYMENT'] - ins['AMT_INSTALMENT']
    ins['PAYMENT_NOT_ENOUGH'] = ins['PAYMENT_DIFF'] < 0

    # Days past due and days before due (no negative values).
    # ``where(cond, 0)`` keeps strictly positive values and maps everything
    # else -- including NaN, for which the comparison is False -- to 0, which
    # is exactly what the previous row-wise lambda did.
    days_late = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    days_early = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = days_late.where(days_late > 0, 0)
    ins['DBD'] = days_early.where(days_early > 0, 0)

    # Manual features: number of records, number of distinct previous loans
    # and average records per loan for each customer.
    by_customer = ins.groupby('SK_ID_CURR')
    ins_agg = by_customer[['SK_ID_PREV']].count().rename(
        columns={'SK_ID_PREV': 'INSTAL_USR_REC_CNT'})
    ins_agg['INSTAL_USR_LOAN_CNT'] = by_customer['SK_ID_PREV'].nunique()
    ins_agg['INSTAL_REC_CNT_PER_LOAN'] = (
        ins_agg['INSTAL_USR_REC_CNT'] / ins_agg['INSTAL_USR_LOAN_CNT'])

    # Lookup from previous loan to customer (first customer seen per loan),
    # built once and shared by both per-loan features below.
    loan_owner = ins[['SK_ID_PREV', 'SK_ID_CURR']].drop_duplicates('SK_ID_PREV')

    by_loan = ins.groupby('SK_ID_PREV')
    due_days = by_loan['DAYS_INSTALMENT']
    # TIME_SPAN: spread between the latest and earliest due day of a loan.
    time_span = (due_days.max() - due_days.min()).rename('TIME_SPAN')
    # INSTALL_TIMES: highest installment number recorded for a loan.
    install_times = by_loan['NUM_INSTALMENT_NUMBER'].max().rename('INSTALL_TIMES')

    # Roll each per-loan value up to the customer as max / min / mean.
    # The tuple order fixes the output column order.
    per_loan_features = (
        ('TIME_SPAN', 'INSTAL_TIME_SPAN_', time_span),
        ('INSTALL_TIMES', 'INSTAL_TIMES_', install_times),
    )
    for column, prefix, per_loan in per_loan_features:
        per_loan = per_loan.reset_index().merge(
            loan_owner, on='SK_ID_PREV', how='left')
        per_customer = per_loan.groupby('SK_ID_CURR')[column]
        for stat in ('max', 'min', 'mean'):
            ins_agg[prefix + stat.upper()] = per_customer.agg(stat)

    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['min', 'mean', 'var'],
        'PAYMENT_DIFF': ['min', 'mean', 'var'],
        'PAYMENT_NOT_ENOUGH': ['mean', 'sum'],  # NOT_ENOUGH's mean is the underpay ratio of a user
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    # Aggregates over every installment row of a customer.
    ins_agg_auto = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg_auto.columns = pd.Index(
        ['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg_auto.columns.tolist()])

    # Same aggregates restricted to installments due within the recent window.
    recent = ins[ins['DAYS_INSTALMENT'] > -recent_days]
    recent_agg = recent.groupby('SK_ID_CURR').agg(aggregations)
    recent_agg.columns = pd.Index(
        ['RECENT_INSTAL_' + e[0] + "_" + e[1].upper() for e in recent_agg.columns.tolist()])

    # Left merges keep every customer present in ins_agg; SK_ID_CURR is the
    # index name on both sides.
    ins_agg = ins_agg.merge(ins_agg_auto, on='SK_ID_CURR', how='left')
    ins_agg = ins_agg.merge(recent_agg, on='SK_ID_CURR', how='left')

    # Release the large intermediates before handing the result back.
    del ins, recent, ins_agg_auto, recent_agg, loan_owner, time_span, install_times
    gc.collect()
    return ins_agg
# Build the per-customer installment features, then preview the first rows
# instead of rendering the whole aggregated frame.
ins_agg = installments_payments()
ins_agg.head(10)

Unnamed: 0_level_0,INSTAL_USR_REC_CNT,INSTAL_USR_LOAN_CNT,INSTAL_REC_CNT_PER_LOAN,INSTAL_TIME_SPAN_MAX,INSTAL_TIME_SPAN_MIN,INSTAL_TIME_SPAN_MEAN,INSTAL_TIMES_MAX,INSTAL_TIMES_MIN,INSTAL_TIMES_MEAN,INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE,...,RECENT_INSTAL_AMT_INSTALMENT_MAX,RECENT_INSTAL_AMT_INSTALMENT_MEAN,RECENT_INSTAL_AMT_INSTALMENT_SUM,RECENT_INSTAL_AMT_PAYMENT_MIN,RECENT_INSTAL_AMT_PAYMENT_MAX,RECENT_INSTAL_AMT_PAYMENT_MEAN,RECENT_INSTAL_AMT_PAYMENT_SUM,RECENT_INSTAL_DAYS_ENTRY_PAYMENT_MAX,RECENT_INSTAL_DAYS_ENTRY_PAYMENT_MEAN,RECENT_INSTAL_DAYS_ENTRY_PAYMENT_SUM
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,7,2,3.500000,90.0,60.0,75.000000,4,4,4.000000,2,...,,,,,,,,,,
100002,19,1,19.000000,540.0,540.0,540.000000,19,19,19.000000,2,...,53093.745,12905.272500,154863.270,9251.775,53093.745,12905.272500,154863.270,-49.0,-207.583333,-2491.0
100003,25,3,8.333333,330.0,150.0,220.000000,12,6,8.333333,2,...,,,,,,,,,,
100004,3,1,3.000000,60.0,60.0,60.000000,3,3,3.000000,2,...,,,,,,,,,,
100005,9,1,9.000000,240.0,240.0,240.000000,9,9,9.000000,2,...,,,,,,,,,,
100006,16,3,5.333333,270.0,0.0,130.000000,10,1,5.333333,2,...,691786.890,89278.371818,982062.090,29027.520,691786.890,89278.371818,982062.090,-12.0,-152.636364,-1679.0
100007,66,5,13.200000,480.0,270.0,348.000000,17,10,12.600000,2,...,16046.100,16038.290769,208497.780,16037.640,16046.100,16038.290769,208497.780,-14.0,-196.076923,-2549.0
100008,35,4,8.750000,270.0,150.0,225.000000,10,6,8.500000,2,...,17885.835,17884.863000,178848.630,17876.115,17885.835,17884.863000,178848.630,-82.0,-215.000000,-2150.0
100009,51,8,6.375000,330.0,0.0,161.250000,12,1,6.500000,1,...,10418.670,8622.197609,198310.545,7499.565,10418.670,8622.197609,198310.545,-58.0,-213.130435,-4902.0
100010,10,1,10.000000,270.0,270.0,270.000000,10,10,10.000000,1,...,,,,,,,,,,
