# AC FEUP 21/22

In [24]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np

# show every column when displaying wide DataFrames instead of eliding them.
# The fully qualified key is used on purpose: the short 'max_columns' pattern
# is matched against all option names, and pandas versions that also define
# 'styler.render.max_columns' reject it with an OptionError (ambiguous key).
pd.set_option('display.max_columns', None)

# n_jobs value to pass to sklearn models (by scikit-learn convention -1 means
# "use all available processors")
configNJobs = -1

## Import data

### Training data

In [33]:
# import data
def read_csv(file, dtype=None):
    """Load a semicolon-separated CSV file from the ``data/`` directory.

    Parameters
    ----------
    file : str
        File name relative to ``data/`` (e.g. ``"account.csv"``).
    dtype : dict, optional
        Column name -> dtype overrides forwarded to ``pandas.read_csv``.
        ``None`` (the default) lets pandas infer the type of every column.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    # ``None`` replaces the former ``{}`` default: a mutable default object is
    # shared between calls, and pandas infers all dtypes for either value.
    return pd.read_csv("data/" + file, sep=";", dtype=dtype)

# Training tables: one DataFrame per CSV file.
loan_df = read_csv("loan_train.csv")
account_df = read_csv("account.csv")
disp_df = read_csv("disp.csv")
client_df = read_csv("client.csv")
card_df = read_csv("card_train.csv")

# The 'bank' column is read as a string column explicitly.
trans_df = read_csv("trans_train.csv", dtype={'bank': 'str'})

# Some district column names come with extra surrounding whitespace; strip it.
district_df = read_csv("district.csv").rename(columns=str.strip)

trans_df.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,1548749,5270,930113,credit,credit in cash,800.0,800.0,,,
1,1548750,5270,930114,credit,collection from another bank,44749.0,45549.0,,IJ,80269753.0
2,3393738,11265,930114,credit,credit in cash,1000.0,1000.0,,,
3,3122924,10364,930117,credit,credit in cash,1100.0,1100.0,,,
4,1121963,3834,930119,credit,credit in cash,700.0,700.0,,,


### Test data

In [26]:
# Test-set counterparts of the loan, card and transaction tables.
loan_test_df = read_csv("loan_test.csv")
card_test_df = read_csv("card_test.csv")
trans_test_df = read_csv("trans_test.csv", dtype={'bank': 'str'})

## Data analysis

### Account data

In [34]:
# inspect the dtype pandas inferred for every account column
account_df.dtypes

account_id      int64
district_id     int64
frequency      object
date            int64
dtype: object

The `frequency` column is categorical (stored as `object`), so it may need to be encoded before it can be used by a model.

In [44]:
# count the rows for each distinct value of the 'frequency' column
account_df['frequency'].value_counts()

monthly issuance              4167
weekly issuance                240
issuance after transaction      93
Name: frequency, dtype: int64

In [38]:
# number of missing values in each account column
account_df.isna().sum()

account_id     0
district_id    0
frequency      0
date           0
dtype: int64

The account table has no missing values.

## Join data

In [27]:
def join(df1, df2, key1, key2, suff, t="inner"):
    """Merge two DataFrames on (possibly differently named) key columns.

    df1, df2   -- left and right frames
    key1, key2 -- key column(s) in ``df1`` and ``df2`` respectively
    suff       -- pair of suffixes applied to overlapping non-key column names
    t          -- merge type handed to pandas as ``how`` (default ``"inner"``)

    Returns the merged DataFrame.
    """
    merge_options = {
        "left_on": key1,
        "right_on": key2,
        "how": t,
        "suffixes": suff,
    }
    return df1.merge(df2, **merge_options)

# Fold every table into one wide frame, starting from the loans.
# Each step uses join's default merge type, so rows without a match on the
# right-hand table are dropped along the way.
join_df = (
    join(loan_df, account_df, 'account_id', 'account_id', ['_loan', '_account'])
    .pipe(join, disp_df, 'account_id', 'account_id', ['', '_disp'])
    .pipe(join, card_df, 'disp_id', 'disp_id', ['', '_card'])
    .pipe(join, trans_df, 'account_id', 'account_id', ['', '_trans'])
    .pipe(join, client_df, 'client_id', 'client_id', ['', '_client'])
    .pipe(join, district_df, 'district_id', 'code', ['', '_district'])
)

df = join_df
join_df.head()

Unnamed: 0,loan_id,account_id,date_loan,amount,duration,payments,status,district_id,frequency,date_account,disp_id,client_id,type,card_id,type_card,issued,trans_id,date,type_trans,operation,amount_trans,balance,k_symbol,bank,account,birth_number,district_id_client,code,name,region,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96
0,6577,7753,940311,51696,24,2154,1,74,monthly issuance,930208,9285,9593,OWNER,1005,classic,931107,2349697,930208,credit,credit in cash,600.0,600.0,,,,685128,74,74,Ostrava - mesto,north Moravia,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
1,6577,7753,940311,51696,24,2154,1,74,monthly issuance,930208,9285,9593,OWNER,1005,classic,931107,2349709,930212,credit,credit in cash,19588.0,20188.0,,,,685128,74,74,Ostrava - mesto,north Moravia,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
2,6577,7753,940311,51696,24,2154,1,74,monthly issuance,930208,9285,9593,OWNER,1005,classic,931107,2349705,930212,credit,credit in cash,27078.0,47266.0,,,,685128,74,74,Ostrava - mesto,north Moravia,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
3,6577,7753,940311,51696,24,2154,1,74,monthly issuance,930208,9285,9593,OWNER,1005,classic,931107,3492040,930228,credit,,119.6,47385.6,interest credited,,,685128,74,74,Ostrava - mesto,north Moravia,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
4,6577,7753,940311,51696,24,2154,1,74,monthly issuance,930208,9285,9593,OWNER,1005,classic,931107,2350078,930310,withdrawal,withdrawal in cash,12000.0,35385.6,,,,685128,74,74,Ostrava - mesto,north Moravia,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,18782,18347


## Processing data

### Derive clients' gender

In [28]:
def identify_gender(old_tup):
    """Return 'F' or 'M' for a client row, based on its 'birth_number'.

    The birth number is a YYMMDD integer; a month field above 12 marks a
    female client (the month is stored with an offset), anything else is male.
    """
    encoded_month = (old_tup['birth_number'] // 100) % 100
    if encoded_month > 12:
        return 'F'
    return 'M'

# add a 'gender' column, decoded row by row from each client's birth_number
client_df['gender'] = client_df.apply(identify_gender, axis='columns')

### Simplify dates

In [30]:
from datetime import datetime, date

def identify_year(old_tup, select_date, century=1900):
    """Convert a YYMMDD integer field of a row into a ``datetime``.

    Parameters
    ----------
    old_tup : mapping / row
        Row holding the encoded date.
    select_date : str
        Name of the field to decode (e.g. ``"birth_number"`` or ``"date"``).
    century : int, optional
        Value added to the two-digit year. Defaults to 1900: the transactions
        shown in this notebook are from the 1990s, and birth years are
        presumably 19xx as well -- confirm before reusing on other data.

    Returns
    -------
    datetime.datetime
        The decoded date (time part is midnight).
    """
    encoded = old_tup[select_date]

    # A fixed century replaces the former comparison with date.today(): that
    # pivot moved every two-digit year not greater than the current one into
    # the 2000s (e.g. 18xxxx -> 2018) and made the result depend on the day
    # the notebook was run.
    year = encoded // 10000 + century

    month = (encoded // 100) % 100
    if month > 12:
        # For cases when subject is female, month is +50
        month -= 50
    day = encoded % 100

    return datetime(year, month, day)

# decode the clients' birth numbers into a 'birth_date' column
client_df['birth_date'] = client_df.apply(identify_year, axis='columns', select_date="birth_number")
# replace the transactions' encoded 'date' values with decoded dates
trans_df['date'] = trans_df.apply(identify_year, axis='columns', select_date="date")

### Discretize client ages

In [None]:
# create a new age column for clients
def identify_age(old_tup, reference_date=None):
    """Return the age, in completed years, of the client in a row.

    Parameters
    ----------
    old_tup : mapping / row
        Row whose 'birth_date' value exposes ``year``, ``month`` and ``day``.
    reference_date : datetime.date, optional
        Date at which the age is measured. Defaults to today's date; pass a
        fixed date to get results that do not change with the day the
        notebook is run.

    Returns
    -------
    int
        Number of whole years between the birth date and the reference date.
    """
    born = old_tup['birth_date']
    today = date.today() if reference_date is None else reference_date

    # one year less when this year's birthday has not been reached yet
    birthday_reached = (today.month, today.day) >= (born.month, born.day)
    return today.year - born.year - (0 if birthday_reached else 1)
# compute every client's age row by row from its birth_date
client_df['age'] = client_df.apply(identify_age, axis='columns')

# discretize ages into the half-open intervals
#   [20, 30), [30, 40), [40, 50), [50, 66), [66, 150)
# so that each bin holds exactly the ages named by its label.
# right=False is required for that: with pd.cut's default right-closed
# intervals the former edges [20, 29, 39, 49, 66, 150] left age 20 without a
# bin and placed age 66 under '50-65'.
# Ages outside [20, 150) still get NaN.
age_bins = [20, 30, 40, 50, 66, 150]
labels = ['20-29', '30-39', '40-49', '50-65', 'reformers']
client_df['age_bins'] = pd.cut(x=client_df['age'], bins=age_bins, labels=labels, right=False)

### Apply transaction types to the transaction amount

In [None]:
# sign the amount according to the transaction type
def convert_amount(old_tup):
    """Return the row's amount signed by its transaction type.

    Rows whose 'type' is "credit" keep their amount unchanged; every other
    type yields a negative amount.
    """
    amount = old_tup['amount']
    if old_tup['type'] == "credit":
        return amount
    # -abs() instead of plain negation keeps the conversion idempotent:
    # re-running the cell on already-converted data no longer flips
    # non-credit amounts back to positive.
    return -abs(amount)
# overwrite 'amount' with its signed value, computed row by row
trans_df['amount'] = trans_df.apply(convert_amount, axis='columns')

trans_df

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,1548749,5270,1993-01-13,credit,credit in cash,800.0,800.0,,,
1,1548750,5270,1993-01-14,credit,collection from another bank,44749.0,45549.0,,IJ,80269753.0
2,3393738,11265,1993-01-14,credit,credit in cash,1000.0,1000.0,,,
3,3122924,10364,1993-01-17,credit,credit in cash,1100.0,1100.0,,,
4,1121963,3834,1993-01-19,credit,credit in cash,700.0,700.0,,,
...,...,...,...,...,...,...,...,...,...,...
396680,515914,1763,1996-12-31,withdrawal,withdrawal in cash,-14.6,67769.5,payment for statement,,
396681,516262,1765,1996-12-31,withdrawal,withdrawal in cash,-14.6,19708.1,payment for statement,,
396682,520019,1775,1996-12-31,withdrawal,withdrawal in cash,-14.6,15944.5,payment for statement,,
396683,517894,1769,1996-12-31,withdrawal,withdrawal in cash,-14.6,34679.4,payment for statement,,


## TODO

In [6]:
# sum of transaction amounts per account for every (year, month)

balance_df = (
    join(account_df, trans_df, 'account_id', 'account_id', ['_account', '_trans'])
    # group on the transaction date's year and month plus the account id
    .pipe(lambda merged: merged.groupby(
        [merged.date_trans.dt.year, merged.date_trans.dt.month, merged.account_id]
    ))['amount']
    .sum()
    .rename_axis(['year', 'month', 'account_id'])
    .reset_index()
)
balance_df.head()

Unnamed: 0,year,month,account_id,amount
0,1993,1,9,400.0
1,1993,1,163,900.0
2,1993,1,192,300.0
3,1993,1,212,200.0
4,1993,1,280,300.0


In [7]:
# attach every card to its disposition row, which carries the account_id
card_disp = join(card_df, disp_df, 'disp_id', 'disp_id', ['_card', '_disp'])
# one row per account, one column per card type, cells = number of cards
cardtypes_df = (
    card_disp
    .groupby(['account_id', 'type_card'])
    .size()
    .unstack(fill_value=0)
)

# card_df.groupby(card_df.account_id)['type'].value_counts()