# Import CSV

In [38]:
import pandas as pd
# Show every column when a wide frame is displayed.
# The fully-qualified option name is used on purpose: the short pattern
# 'max_columns' matches more than one option on newer pandas releases and
# set_option() then raises an OptionError.
pd.set_option('display.max_columns', None)

# Import data
def read_csv(file, dtype=None):
    """Read a semicolon-separated CSV file into a DataFrame.

    Parameters
    ----------
    file : path or buffer forwarded to ``pd.read_csv``.
    dtype : optional column -> dtype mapping forwarded to ``pd.read_csv``
        (``None`` lets pandas infer every column).
    """
    return pd.read_csv(file, sep=";", dtype=dtype)

account_df = read_csv("data/account.csv")
card_df = read_csv("data/card_train.csv")
client_df = read_csv("data/client.csv")
disp_df = read_csv("data/disp.csv")
district_df = read_csv("data/district.csv")
loan_df = read_csv("data/loan_train.csv")
# 'bank' is forced to str instead of being inferred
# (presumably because the column is mostly empty -- confirm).
trans_df = read_csv("data/trans_train.csv", {'bank': 'str'})

# Clean column names (Some columns come with an extra whitespace)
def rename(df):
    """Return a copy of ``df`` whose column names have surrounding whitespace stripped."""
    return df.rename(columns=lambda x: x.strip())

district_df = rename(district_df)

trans_df

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,1548749,5270,930113,credit,credit in cash,800.0,800.0,,,
1,1548750,5270,930114,credit,collection from another bank,44749.0,45549.0,,IJ,80269753.0
2,3393738,11265,930114,credit,credit in cash,1000.0,1000.0,,,
3,3122924,10364,930117,credit,credit in cash,1100.0,1100.0,,,
4,1121963,3834,930119,credit,credit in cash,700.0,700.0,,,
...,...,...,...,...,...,...,...,...,...,...
396680,515914,1763,961231,withdrawal,withdrawal in cash,14.6,67769.5,payment for statement,,
396681,516262,1765,961231,withdrawal,withdrawal in cash,14.6,19708.1,payment for statement,,
396682,520019,1775,961231,withdrawal,withdrawal in cash,14.6,15944.5,payment for statement,,
396683,517894,1769,961231,withdrawal,withdrawal in cash,14.6,34679.4,payment for statement,,


## Join data

In [39]:
def join(df1, df2, key1, key2, suff, t="inner"):
    """Merge ``df1`` (left) with ``df2`` (right) and return the merged frame.

    Parameters
    ----------
    df1, df2 : the left and right DataFrames.
    key1 : column(s) of ``df1`` to join on.
    key2 : column(s) of ``df2`` to join on.
    suff : pair of suffixes (left, right) appended to overlapping column names.
    t : merge type passed as ``how`` (defaults to "inner").
    """
    merge_options = {
        "left_on": key1,
        "right_on": key2,
        "how": t,
        "suffixes": suff,
    }
    return df1.merge(df2, **merge_options)

# Build one wide table: start from the loans and attach, in this order,
# account -> disposition -> card -> transactions -> client -> district.
# Overlapping column names coming from the right-hand table get that table's suffix.
# NOTE(review): every step uses join()'s default inner merge, so rows without a
# match in a later table (e.g. dispositions that have no card) are dropped --
# confirm this is intended.
join_df = (
    loan_df
    .pipe(join, account_df, 'account_id', 'account_id', ['_loan', '_account'])
    .pipe(join, disp_df, 'account_id', 'account_id', ['', '_disp'])
    .pipe(join, card_df, 'disp_id', 'disp_id', ['', '_card'])
    .pipe(join, trans_df, 'account_id', 'account_id', ['', '_trans'])
    .pipe(join, client_df, 'client_id', 'client_id', ['', '_client'])
    .pipe(join, district_df, 'district_id', 'code', ['', '_district'])
)

# `df` is a second name for the same merged frame (no copy is made)
df = join_df
join_df.columns

Index(['loan_id', 'account_id', 'date_loan', 'amount', 'duration', 'payments',
       'status', 'district_id', 'frequency', 'date_account', 'disp_id',
       'client_id', 'type', 'card_id', 'type_card', 'issued', 'trans_id',
       'date', 'type_trans', 'operation', 'amount_trans', 'balance',
       'k_symbol', 'bank', 'account', 'birth_number', 'district_id_client',
       'code', 'name', 'region', 'no. of inhabitants',
       'no. of municipalities with inhabitants < 499',
       'no. of municipalities with inhabitants 500-1999',
       'no. of municipalities with inhabitants 2000-9999',
       'no. of municipalities with inhabitants >10000', 'no. of cities',
       'ratio of urban inhabitants', 'average salary', 'unemploymant rate '95',
       'unemploymant rate '96', 'no. of enterpreneurs per 1000 inhabitants',
       'no. of commited crimes '95', 'no. of commited crimes '96'],
      dtype='object')

In [40]:
# Display the loan table for inspection.
# Original author's note: this already counts as cleaning data.
loan_df

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
0,5314,1787,930705,96396,12,8033,-1
1,5316,1801,930711,165960,36,4610,1
2,6863,9188,930728,127080,60,2118,1
3,5325,1843,930803,105804,36,2939,1
4,7240,11013,930906,274740,60,4579,1
...,...,...,...,...,...,...,...
323,6818,9030,961212,155616,48,3242,1
324,5625,3189,961215,222180,60,3703,-1
325,6805,8972,961221,45024,48,938,1
326,7233,10963,961225,115812,36,3217,1


## Derive client and transaction features

In [41]:
import datetime

# NOTE(review): `age_bins` is reassigned in the "Discretize ages" step further down
# this cell before anything reads it, so this first definition appears to be unused.
age_bins = [20, 29, 39, 49, 65]
# Create a new gender column
def identify_gender(old_tup):
    """Return 'F' or 'M' for a row, based on the month digits of its 'birth_number'.

    The month is the pair of digits in the hundreds/thousands position of the
    number; a value above 12 is reported as 'F', anything else as 'M'.
    """
    month_digits = old_tup['birth_number'] % 10000 // 100
    if month_digits > 12:
        return 'F'
    return 'M'
# Derive the gender of every client row by row and store it in a new 'gender' column
client_df['gender'] = client_df.apply(identify_gender, axis=1)

def identify_year(old_tup, select_date='birth_number', century=1900):
    """Decode a YYMMDD integer stored in ``old_tup[select_date]`` into a datetime.

    Parameters
    ----------
    old_tup : row-like object indexable by column name.
    select_date : name of the field holding the encoded date.
    century : value added to the two-digit year (default 1900).

    Returns
    -------
    datetime.datetime for the decoded date. A value that is already a
    datetime is returned unchanged, so applying the decoder a second time to
    an already-converted column does not fail.

    Notes
    -----
    The century used to be picked by comparing the two-digit year with the
    current year, which made the result depend on the day the notebook runs
    and pushed every two-digit year at or below the current one into the 2000s.
    NOTE(review): the default assumes every date in this dataset belongs to the
    1900s (the loan, card and transaction dates displayed in the notebook are
    all 93-96) -- confirm for birth numbers; pass ``century=2000`` otherwise.
    """
    encoded = old_tup[select_date]
    if isinstance(encoded, datetime.datetime):
        # Already decoded (e.g. the cell is being re-run): nothing to do.
        return encoded

    year = century + encoded // 10000
    month = (encoded // 100) % 100
    if month > 12:
        # For cases when subject is female, month is +50
        month -= 50
    day = encoded % 100

    return datetime.datetime(year, month, day)
# Add a 'birth_date' column to the clients, decoded from the default 'birth_number' field
client_df['birth_date'] = client_df.apply(identify_year, axis=1)
# Overwrite the transactions' 'date' column with its decoded value
# (same decoder, pointed at the 'date' field instead of 'birth_number')
trans_df['date'] = trans_df.apply(lambda x: identify_year(x, select_date="date"), axis=1)

# Create a new age column for clients
def identify_age(old_tup, reference_date=None):
    """Return the age in whole years of the row's 'birth_date'.

    Parameters
    ----------
    old_tup : row-like object whose 'birth_date' exposes year/month/day.
    reference_date : date at which the age is measured. Defaults to today's
        date, which ties the result to the day the notebook is run; pass a
        fixed date to get reproducible ages.

    Returns
    -------
    int : year difference, minus one when the birthday has not yet been
    reached in the reference year.
    """
    born = old_tup['birth_date']
    today = datetime.date.today() if reference_date is None else reference_date
    birthday_reached = (today.month, today.day) >= (born.month, born.day)
    return today.year - born.year - (0 if birthday_reached else 1)
client_df['age'] = client_df.apply(identify_age, axis=1)

# Discretize ages
# Bins are left-closed ([20, 30), [30, 40), [40, 50), [50, 66), [66, 150)) so that
# every edge matches its label: 20 falls in '20-29' and 66 in the last group.
# With pd.cut's default right-closed bins and the previous edges, age 20 was left
# unbinned (NaN) and age 66 was labelled '50-65'.
# Ages below 20 still fall outside every bin and become NaN.
# NOTE(review): 'reformers' presumably means retirees -- label kept as is.
age_bins = [20, 30, 40, 50, 66, 150]
labels = ['20-29', '30-39', '40-49', '50-65', 'reformers']
client_df['age_bins'] = pd.cut(x=client_df['age'], bins=age_bins, labels=labels, right=False)

# Convert amount to negative according to transaction type
def convert_amount(old_tup):
    """Return the signed amount of a transaction row.

    Rows whose 'type' is "credit" keep their 'amount' unchanged; every other
    type yields a negative amount. ``-abs(...)`` is used rather than a plain
    negation so that applying the conversion to an already-converted row
    (e.g. when the cell is run a second time) does not flip the sign back.
    """
    amount = old_tup['amount']
    if old_tup['type'] == "credit":
        return amount
    return -abs(amount)
# Apply the sign conversion to every transaction row, overwriting 'amount' in place
trans_df['amount'] = trans_df.apply(convert_amount, axis=1)

# Display the transformed transactions
trans_df

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,1548749,5270,1993-01-13,credit,credit in cash,800.0,800.0,,,
1,1548750,5270,1993-01-14,credit,collection from another bank,44749.0,45549.0,,IJ,80269753.0
2,3393738,11265,1993-01-14,credit,credit in cash,1000.0,1000.0,,,
3,3122924,10364,1993-01-17,credit,credit in cash,1100.0,1100.0,,,
4,1121963,3834,1993-01-19,credit,credit in cash,700.0,700.0,,,
...,...,...,...,...,...,...,...,...,...,...
396680,515914,1763,1996-12-31,withdrawal,withdrawal in cash,-14.6,67769.5,payment for statement,,
396681,516262,1765,1996-12-31,withdrawal,withdrawal in cash,-14.6,19708.1,payment for statement,,
396682,520019,1775,1996-12-31,withdrawal,withdrawal in cash,-14.6,15944.5,payment for statement,,
396683,517894,1769,1996-12-31,withdrawal,withdrawal in cash,-14.6,34679.4,payment for statement,,


In [42]:
# Net transaction amount per account and calendar month
# (sums trans_df['amount'] as currently stored, grouped by the year and month
# of the transaction date and by account)

account_trans_df = join(account_df, trans_df, 'account_id', 'account_id', ['_account', '_trans'])
balance_df = (
    account_trans_df
    .groupby([
        account_trans_df.date_trans.dt.year,
        account_trans_df.date_trans.dt.month,
        account_trans_df.account_id,
    ])['amount']
    .sum()
    .rename_axis(['year', 'month', 'account_id'])
    .reset_index()
)
# Only the last expression of a cell is rendered, so the mean has to be printed
# explicitly; as a bare expression in the middle of the cell it was silently discarded.
print(balance_df['amount'].mean())
card_df

Unnamed: 0,card_id,disp_id,type,issued
0,1005,9285,classic,931107
1,104,588,classic,940119
2,747,4915,classic,940205
3,70,439,classic,940208
4,577,3687,classic,940215
...,...,...,...,...
172,243,1478,classic,961213
173,162,967,junior,961223
174,594,3794,junior,961227
175,609,3893,classic,961229


In [72]:
# Count, for every account, how many cards of each type are attached to it:
# cards are linked to accounts through their disposition, then the
# (account, card type) pairs are counted and spread into one column per type.
card_disp = join(card_df, disp_df, 'disp_id', 'disp_id', ['_card', '_disp'])
(
    card_disp
    .groupby(['account_id', 'type_card'])
    .size()
    .unstack(fill_value=0)
)

type_card,classic,gold,junior
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33,0,1,0
43,0,0,1
66,1,0,0
71,0,0,1
73,1,0,0
...,...,...,...
10227,1,0,0
10411,0,1,0
10520,0,1,0
11042,1,0,0
