In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the four raw training tables from CSV files in the working directory.
performance  = pd.read_csv('performance_train.csv')
facturation  = pd.read_csv('facturation_train.csv')
payments     = pd.read_csv('paiements_train.csv')
transactions = pd.read_csv('transactions_train.csv')

# Account identifiers, in the row order of the performance table.
customer_ids = performance['ID_CPTE']

# Vectorized mean instead of the built-in sum(), which walks the Series one
# element at a time in Python. Assumes `Default` is a 0/1 flag with no missing
# values, so its mean equals the share of defaulting clients.
print('Proportion of clients who default:', performance['Default'].mean())

Proportion of clients who default: 0.19336134453781512


# Raw Data Preview

#### Performance

In [3]:
# Preview the first five rows of the performance table.
performance.head(n=5)

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,99690111,2015-12-01,0
1,57427180,2012-12-01,0
2,29617912,2015-12-01,0
3,61632809,2015-12-01,0
4,14117855,2013-12-01,0


#### Facturation

In [4]:
# Preview the first five rows of the facturation (billing statement) table.
facturation.head(n=5)

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,99690111,2015-05-01,2015-05-03,8497.84,4293.12,16200.0,0
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
2,99690111,2015-06-01,2015-05-31,10790.95,5224.44,16200.0,0
3,99690111,2015-10-01,2015-10-04,12388.46,4786.08,16200.0,0
4,99690111,2015-11-01,2015-11-02,12746.5,4818.48,16200.0,0


#### Payments

In [5]:
# Preview the first five rows of the payments table.
payments.head(n=5)

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
0,99690111,208.0,2015-04-26 00:00:00,Q
1,99690111,176.8,2015-05-28 00:00:00,Q
2,99690111,200.0,2015-03-27 04:00:00,Q
3,99690111,80.8,2015-04-02 00:00:00,Q
4,99690111,250.0,2015-11-24 00:00:00,Q


#### Transactions

In [6]:
# Preview the first five rows of the transactions table.
transactions.head(n=5)

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
0,99690111,A,365767,DP,C,5927.0,52.53,E,2015-06-20 12:00:00,F,AN
1,99690111,L,2635650,DP,C,13343.0,28.35,B,2015-01-25 12:00:00,F,AN
2,99690111,L,2635650,DP,C,13343.0,0.0,A,2015-01-26 12:00:00,G,AN
3,99690111,J,680536,AF,C,9430.0,0.0,A,2015-03-25 08:00:00,G,AW
4,99690111,J,680536,AF,C,10600.0,0.0,A,2015-03-03 08:00:00,G,AW


In [7]:
# Report the number of rows in each raw table.
print('performance length:\t', performance.shape[0])
print('facturation length:\t', facturation.shape[0])
print('payments length:\t', payments.shape[0])
print('transactions length:\t', transactions.shape[0])

performance length:	 11900
facturation length:	 166543
payments length:	 292320
transactions length:	 690730


In [14]:
class customer:
    """Container bundling every record that belongs to one account.

    Parameters
    ----------
    customer_id
        Identifier of the account (an ``ID_CPTE`` value).
    performance
        DataFrame restricted to this customer's rows. It must contain at
        least one row and the columns ``PERIODID_MY`` and ``Default``.
    facturation, payments, transactions
        This customer's rows from the other tables; stored as given.

    Attributes
    ----------
    assessment
        ``PERIODID_MY`` of the customer's first performance row.
    default
        ``Default`` value of the customer's first performance row.

    Raises
    ------
    ValueError
        If ``performance`` has no rows.
    """

    def __init__(self, customer_id, performance, facturation, payments, transactions):

        self.customer_id  = customer_id
        self.performance  = performance
        self.facturation  = facturation
        self.payments     = payments
        self.transactions = transactions

        if len(performance) == 0:
            raise ValueError(
                'customer %r has no performance record' % (customer_id,)
            )

        # Positional access: a frame obtained by filtering the full table keeps
        # the original row labels, so the label 0 only exists for the customer
        # that happened to sit on the very first row of the source table.
        self.assessment = performance['PERIODID_MY'].iloc[0]
        self.default    = performance['Default'].iloc[0]

In [15]:
def generate_clients(customer_ids, *dfs, id_column='ID_CPTE'):
    """Lazily yield, per customer, the id followed by its rows from each table.

    Parameters
    ----------
    customer_ids
        Iterable of customer identifiers, processed in iteration order.
    *dfs
        DataFrames that each contain an ``id_column`` column.
    id_column
        Name of the column holding the customer identifier.

    Yields
    ------
    list
        ``[customer_id, sub_df_1, sub_df_2, ...]`` where ``sub_df_k`` holds
        the rows of ``dfs[k]`` whose ``id_column`` equals ``customer_id``,
        in their original order and with their original index labels. A
        customer absent from a table gets an empty frame with that table's
        columns.
    """
    # Group every table once up front. Rebuilding a full boolean mask over
    # each table for every single customer costs customers x rows comparisons.
    positions_by_table = [df.groupby(id_column, sort=False).indices for df in dfs]
    no_rows = slice(0, 0)

    for cus in customer_ids:

        yield [cus] + [
            df.iloc[positions.get(cus, no_rows)]
            for df, positions in zip(dfs, positions_by_table)
        ]

In [16]:
# Stream each customer's id and per-table rows, wrapping every record in a
# `customer` object.
client_generator = generate_clients(
    customer_ids,
    performance, facturation, payments, transactions,
)
clients = [customer(*record) for record in client_generator]

## Notes

- All `PERIODID_MY` values fall on the 1st day of a month, i.e. `PERIODID_MY = 201X-XX-01` (checked for `facturation` at the end of this notebook).

#### Performance

In [31]:
# Performance rows stored on the first `customer` object in `clients`.
clients[0].performance

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,99690111,2015-12-01,0


#### Facturation

In [60]:
# Facturation rows stored on the first `customer` object, in source-file order.
clients[0].facturation

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,99690111,2015-05-01,2015-05-03,8497.84,4293.12,16200.0,0
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
2,99690111,2015-06-01,2015-05-31,10790.95,5224.44,16200.0,0
3,99690111,2015-10-01,2015-10-04,12388.46,4786.08,16200.0,0
4,99690111,2015-11-01,2015-11-02,12746.5,4818.48,16200.0,0
5,99690111,2015-08-01,2015-08-02,10610.05,4753.35,16200.0,0
6,99690111,2015-09-01,2015-08-30,11417.12,4616.46,16200.0,0
7,99690111,2014-12-01,2014-12-03,1151.85,0.0,12000.0,0
8,99690111,2015-02-01,2015-01-31,4045.67,1148.45,16200.0,0
9,99690111,2015-12-01,2015-11-30,13119.6,4791.44,16200.0,0


In [76]:
# Same facturation rows, ordered by PERIODID_MY.
clients[0].facturation.sort_values('PERIODID_MY')

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
7,99690111,2014-12-01,2014-12-03,1151.85,0.0,12000.0,0
10,99690111,2015-01-01,2015-01-02,2298.96,350.0,12000.0,0
8,99690111,2015-02-01,2015-01-31,4045.67,1148.45,16200.0,0
11,99690111,2015-03-01,2015-03-03,5926.2,2567.25,16200.0,0
12,99690111,2015-04-01,2015-03-31,6916.62,3307.33,16200.0,0
0,99690111,2015-05-01,2015-05-03,8497.84,4293.12,16200.0,0
2,99690111,2015-06-01,2015-05-31,10790.95,5224.44,16200.0,0
13,99690111,2015-07-01,2015-07-05,10560.0,5127.54,16200.0,0
5,99690111,2015-08-01,2015-08-02,10610.05,4753.35,16200.0,0


#### Payments

In [77]:
# First client's payments, ordered by TRANSACTION_DTTM.
clients[0].payments.sort_values('TRANSACTION_DTTM')

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
12,99690111,262.5,2015-01-26 00:00:00,Q
11,99690111,303.0,2015-02-26 00:00:00,Q
2,99690111,200.0,2015-03-27 04:00:00,Q
3,99690111,80.8,2015-04-02 00:00:00,Q
0,99690111,208.0,2015-04-26 00:00:00,Q
1,99690111,176.8,2015-05-28 00:00:00,Q
10,99690111,303.0,2015-06-25 00:00:00,Q
7,99690111,618.0,2015-07-27 00:00:00,Q
6,99690111,267.5,2015-08-23 00:00:00,Q
8,99690111,226.6,2015-09-30 00:00:00,Q


#### Transactions

In [20]:
# Transaction rows stored on the first `customer` object, in source-file order.
clients[0].transactions

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
0,99690111,A,365767,DP,C,5927.0,52.53,E,2015-06-20 12:00:00,F,AN
1,99690111,L,2635650,DP,C,13343.0,28.35,B,2015-01-25 12:00:00,F,AN
2,99690111,L,2635650,DP,C,13343.0,0.00,A,2015-01-26 12:00:00,G,AN
3,99690111,J,680536,AF,C,9430.0,0.00,A,2015-03-25 08:00:00,G,AW
4,99690111,J,680536,AF,C,10600.0,0.00,A,2015-03-03 08:00:00,G,AW
5,99690111,J,680536,AF,C,12203.0,7.28,C,2015-02-16 16:00:00,B,AW
6,99690111,J,680536,AF,C,13314.0,7.00,C,2015-01-20 12:00:00,B,AW
7,99690111,J,680536,AF,C,9696.0,7.14,C,2015-01-11 08:00:00,B,AW
8,99690111,J,680536,AF,C,12203.0,0.00,A,2015-02-20 16:00:00,G,AW
9,99690111,J,680536,AF,C,10600.0,6.36,C,2015-03-04 08:00:00,B,AW


In [51]:
# Distinct last-two-character suffixes of PERIODID_MY across all facturation rows.
{period[-2:] for period in facturation['PERIODID_MY']}

{'01'}