In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Show all columns when a DataFrame is displayed.
# The previous setting changed 'display.max_rows', which left wide frames
# truncated and made every displayed frame print all of its rows.
pd.set_option('display.max_columns', None)

In [2]:
# Import banking data
# Note: This data was extracted in 1999

# Single place to change when the raw files live somewhere else.
DATA_DIR = './data/raw/data_berka'


def _read_berka(table_name, **read_csv_kwargs):
    """Read one semicolon-separated '<table_name>.asc' file from DATA_DIR.

    Extra keyword arguments are forwarded to ``pd.read_csv``.
    """
    return pd.read_csv(f'{DATA_DIR}/{table_name}.asc', sep=';', **read_csv_kwargs)


# Each record describes characteristics of a client
client = _read_berka('client')

# Each record describes static characteristics of an account
account = _read_berka('account')

# Each record describes a credit card issued to an account
card = _read_berka('card')

# Each record describes demographic characteristics of a district
district = _read_berka('district')

# Each record relates together a client with an account
# i.e. this relation describes the rights of clients to operate accounts
disp = _read_berka('disp')

# Each record describes characteristics of a payment order (debits only)
order = _read_berka('order')

# Each record describes one transaction on an account
# low_memory=False makes pandas read the file in one pass instead of chunks.
trans = _read_berka('trans', low_memory=False)

# Each record describes a loan granted for a given account
loan = _read_berka('loan')

FileNotFoundError: [Errno 2] No such file or directory: './data/raw/data_berka/client.asc'

In [None]:
# Derive gender, age and birth date from the packed `birth_number` column.
# The number is read as YYMMDD; a month field above 50 marks a female client
# and carries an offset of 50 that has to be removed.

# Left-pad to six digits so a value that lost its leading zero(s) when parsed
# as an integer (birth years 1900-1909) is still sliced at the right offsets.
birth_digits = client['birth_number'].astype(str).str.zfill(6)

birth_year = birth_digits.str[0:2].astype(int) + 1900
encoded_month = birth_digits.str[2:4].astype(int)
birth_day = birth_digits.str[4:6].astype(int)

is_female = encoded_month > 50
birth_month = encoded_month.where(~is_female, encoded_month - 50)

client['gender'] = np.where(is_female, 'F', 'M')
# Age is measured against 1996, the reference year used by this cell.
client['age'] = 1996 - birth_year
# Floor the age to its decade (e.g. 37 -> 30).
client['age_group'] = client['age'] // 10 * 10

# Vectorised date assembly; raises ValueError on an impossible date, like the
# row-by-row datetime(...) construction it replaces.
client['birth date'] = pd.to_datetime(
    pd.DataFrame({'year': birth_year, 'month': birth_month, 'day': birth_day})
)

# The packed source column is no longer needed once the features exist.
del client['birth_number']

In [None]:
# Split the first six characters of `issued` (read as YYMMDD) into parts.
issued_text = card['issued'].astype(str)
card['year'] = issued_text.str[0:2].astype(int) + 1900
card['month'] = issued_text.str[2:4].astype(int)
card['day'] = issued_text.str[4:6].astype(int)

# Assemble the date column-wise instead of calling datetime(...) once per row.
card['Card Issued Date'] = pd.to_datetime(card[['year', 'month', 'day']])

# Cards issued in 1997, reduced to the identifying columns plus a constant
# indicator column. `card` itself keeps the helper columns created above.
card_1997 = (
    card.loc[card['year'] == 1997]
    .drop(columns=['issued', 'year', 'month', 'day', 'Card Issued Date'])
    .reset_index(drop=True)
)
card_1997['Issued_0/1'] = 1

In [None]:
# Split `date` (read as YYMMDD) into its parts.
loan_date_text = loan['date'].astype(str)
loan['year'] = loan_date_text.str[0:2].astype(int) + 1900

# Month and day are only needed to build the date, so they stay local and are
# never added to `loan`.
loan['loan date'] = pd.to_datetime(
    pd.DataFrame({
        'year': loan['year'],
        'month': loan_date_text.str[2:4].astype(int),
        'day': loan_date_text.str[4:6].astype(int),
    })
)

# Loans dated 1997, reduced to the remaining columns plus a constant
# indicator column.
loan_1997 = (
    loan.loc[loan['year'] == 1997]
    .drop(columns=['date', 'duration', 'status', 'loan date', 'year', 'payments'])
    .reset_index(drop=True)
)
loan_1997['loan granted'] = 1

In [None]:
# Replace the anonymous A1..A16 headers with descriptive column names.
# The labels are kept exactly as they were (including their spelling) because
# other cells may select columns by these strings.
district_column_names = {
    'A1': 'district_id',
    'A2': 'district name',
    'A3': 'region',
    'A4': 'inhabitants',
    'A5': 'municipalities with inhabitants < 499',
    'A6': 'municipalities with inhabitants 500-1999',
    'A7': 'municipalities with inhabitants 2000-9999',
    'A8': 'municipalities with inhabitants >10000',
    'A9': 'cities',
    'A10': 'ratio of urban inhabitants',
    'A11': 'average salary',
    'A12': 'unemploymant rate 1995',
    'A13': 'unemploymant rate 1996',
    'A14': 'enterpreneurs per 1000 inhabitants',
    'A15': 'commited crimes 1995',
    'A16': 'commited crimes 1996',
}
district = district.rename(columns=district_column_names)

In [None]:
# Translate the payment-purpose codes; a single blank means no specific purpose.
order_purpose_labels = {
    'SIPO': 'household payment',
    'UVER': 'loan payment',
    ' ': 'other',
    'POJISTNE': 'insurance payment',
    'LEASING': 'lease',
}
# Listed in the alphabetical order pivot_table produces for these labels.
payment_columns = ['household payment', 'insurance payment', 'lease', 'loan payment', 'other']

order['k_symbol'] = order['k_symbol'].map(order_purpose_labels)

# One row per account, one column per payment purpose.
# aggfunc='sum' is explicit: the pivot_table default is the mean, which would
# understate an account holding several orders of the same purpose and make
# the "total" below a sum of averages.
# reindex guarantees every expected column exists even when a purpose never
# occurs in the data, so the total cannot fail with a KeyError.
order = (
    order.pivot_table(index=['account_id'], columns='k_symbol', values='amount',
                      aggfunc='sum', fill_value=0)
    .reindex(columns=payment_columns, fill_value=0)
)
order.columns.name = None
order = order.reset_index()

order['total order payment'] = order[payment_columns].sum(axis=1)


In [None]:
# Split `date` (read as YYMMDD) into its parts.
trans_date_text = trans['date'].astype(str)
trans['year'] = trans_date_text.str[0:2].astype(int) + 1900
trans['month'] = trans_date_text.str[2:4].astype(int)
trans['day'] = trans_date_text.str[4:6].astype(int)

# Assemble the date column-wise; calling datetime(...) once per row through
# DataFrame.apply is very slow on a table with one row per transaction.
trans['transaction_date'] = pd.to_datetime(trans[['year', 'month', 'day']])

del trans['date']

In [None]:
def FindMonthEnd(df):
    """Return a 0/1 Series indexed like `df` with 1 on the last row only.

    An empty input yields an empty Series instead of raising IndexError.
    Kept for any other cell that calls it; the pipeline below no longer
    needs it.
    """
    flags = pd.Series(0, index=df.index)
    if len(flags) > 0:
        flags.iloc[-1] = 1
    return flags


month_keys = ['account_id', 'year', 'month']

# Balance on the last transaction day of each month of 1996, per account.
# The days are sorted ASCENDING so that the last row of every
# (account, year, month) group is the latest day. The previous descending
# sort combined with "take the last row" picked the earliest day of the month.
# groupby(...).tail(1) replaces groupby(...).apply(FindMonthEnd): on
# pandas >= 2.0 that apply result carries the group keys in its index and can
# no longer be assigned back as a column.
month_end_rows = (
    trans.loc[trans['year'] == 1996, month_keys + ['day', 'balance']]
    .sort_values(month_keys + ['day'])
    .groupby(month_keys)
    .tail(1)
)

# Mean and median of those month-end balances per account.
trans_balance = month_end_rows.groupby('account_id')['balance'].agg(['mean', 'median'])
# Labels kept exactly as before (including the spelling of 'Montly') because
# other cells may select columns by these strings.
trans_balance.columns = ['Average Montly Balance', 'Median Monthly Balance']

trans_balance.columns.name = None
trans_balance = trans_balance.reset_index()

In [None]:
# Translate the operation codes; rows without a code are labelled
# 'Interest Credited'.
# NOTE(review): 'VYBER KARTOU' gets the same label as 'PREVOD NA UCET'
# ('remittance to another bank') - confirm this grouping is intended.
operation_labels = {
    'VKLAD': 'Credit in Cash',
    'PREVOD Z UCTU': 'Collection from another bank',
    'VYBER': 'withdrawal in cash',
    'PREVOD NA UCET': 'remittance to another bank',
    'VYBER KARTOU': 'remittance to another bank',
}
trans['operation'] = trans['operation'].map(operation_labels)
trans.loc[trans['operation'].isna(), 'operation'] = 'Interest Credited'

credit_columns = ['Collection from another bank', 'Credit in Cash', 'Interest Credited']
debit_columns = ['remittance to another bank', 'withdrawal in cash']

# Monthly amount per operation for every account in 1996.
# `trans` is deliberately NOT overwritten with the pivoted frame: later cells
# still filter it by `type` and sum `amount`, columns the pivot does not keep.
# Pivoting straight to (account_id, month) with aggfunc='sum' also avoids
# summing the datetime column `transaction_date`, which raises on
# pandas >= 2.0. The result matches the former two-step pivot-then-sum except
# when two rows share account, date, balance and operation: those were
# averaged before and are summed now.
# reindex guarantees every expected column exists even when an operation never
# occurs in 1996.
trans_96 = (
    trans.loc[trans['year'] == 1996]
    .pivot_table(index=['account_id', 'month'], columns='operation',
                 values='amount', aggfunc='sum', fill_value=0)
    .reindex(columns=credit_columns + debit_columns, fill_value=0)
)
trans_96.columns.name = None
trans_96 = trans_96.reset_index()

trans_96['Total Credit'] = trans_96[credit_columns].sum(axis=1)
trans_96['Total Debit'] = trans_96[debit_columns].sum(axis=1)
trans_96['monthly savings'] = trans_96['Total Credit'] - trans_96['Total Debit']

# Average the monthly figures per account. Only months that have at least one
# transaction contribute. The averaged `month` column is kept because the
# merge cell that follows deletes it explicitly.
trans_96 = trans_96.groupby('account_id').agg('mean')

trans_96.columns.name = None
trans_96 = trans_96.reset_index()

# Labels kept exactly as before - including the leading space in
# ' Average remittance to another bank' - because other cells may select
# columns by these strings.
trans_96 = trans_96.rename(columns={
    'Collection from another bank': 'Average Collection from another bank',
    'Credit in Cash': 'Average Credit in Cash',
    'Interest Credited': 'Average Interest Credited',
    'remittance to another bank': ' Average remittance to another bank',
    'withdrawal in cash': 'Average withdrawal in cash',
    'Total Credit': 'Average Total Credit',
    'Total Debit': 'Average Total Debit',
    'monthly savings': 'Average Monthly Savings',
})

In [None]:
# Join the per-account averages with the per-account balance statistics
# (inner join on account_id) and drop the averaged `month` column.
transaction_summary = (
    trans_96
    .merge(trans_balance, on='account_id')
    .drop(columns=['month'])
)

In [None]:
# Transactions booked in 1996; reused by the withdrawal aggregation cell.
trans96 = trans.loc[trans['year'] == 1996]

# Aggregate the total credit per account (rows whose type is 'PRIJEM')
credit_rows = trans96.loc[trans96['type'].isin(['PRIJEM'])]
trans_agg_credit = (
    credit_rows
    .groupby('account_id')['amount']
    .sum()
    .reset_index()
    .rename(columns={'amount': 'total_credit'})
)
trans_agg_credit.head()

In [None]:
# Aggregate the total withdrawal per account (rows whose type is 'VYDAJ' or 'VYBER')
withdrawal_rows = trans96.loc[trans96['type'].isin(['VYDAJ', 'VYBER'])]
trans_agg_withdrawal = (
    withdrawal_rows
    .groupby('account_id')['amount']
    .sum()
    .reset_index()
    .rename(columns={'amount': 'total_withdrawal'})
)
trans_agg_withdrawal.head()

In [None]:
# Attach the yearly credit and withdrawal totals. Left joins keep every account
# already in the summary; accounts without a matching total get NaN.
transaction_summary = (
    transaction_summary
    .merge(trans_agg_credit, how='left', on='account_id')
    .merge(trans_agg_withdrawal, how='left', on='account_id')
)

In [None]:
# Year the account was opened, taken from the first two characters of `date`
# (read as YYMMDD). Kept as a local Series because only the year is needed;
# the former `open month`, `day` and `open_date` helper columns were built
# and then deleted without being used.
open_year = account['date'].astype(str).str[0:2].astype(int) + 1900

# LOR: whole years between the opening year and 1996, the reference year used
# by this cell.
account['LOR'] = 1996 - open_year

# Keep accounts opened before 1997. The explicit copy makes the column
# assignment below act on an independent frame instead of a slice, which
# avoids SettingWithCopyWarning.
account = account[open_year < 1997].copy()

account['frequency'] = account['frequency'].map({'POPLATEK MESICNE': 'Monthly Issuance',
                                                 'POPLATEK TYDNE': 'Weekly Issuance',
                                                 'POPLATEK PO OBRATU': 'Issuance after transaction'})

del account['date']

In [None]:
# Attach each client's disposition rows (left join keeps every client).
client_disp = client.merge(disp, how='left', on='client_id')

# Rows whose disposition type is 'DISPONENT' get 'No' in both flags; every
# other row, including rows with no disposition type at all, gets 'Yes'.
disponent_flag = (client_disp['type'] == 'DISPONENT').map({True: 'No', False: 'Yes'})
client_disp['can apply loan?'] = disponent_flag
client_disp['permanent orders applicable?'] = disponent_flag