# AC FEUP 21/22

## Main dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
import numpy as np
import sklearn

# use to not cut columns
pd.set_option('max_columns', None)

# njobs to use in sklearn models
configNJobs = -1

## Import data

### Training data

In [2]:
# import data
def read_csv(file, dtype={}, na_values=None):
    return pd.read_csv("data/" + file, sep=";", dtype=dtype, na_values=na_values)

account_df = read_csv("account.csv")
card_df = read_csv("card_train.csv")
client_df = read_csv("client.csv")
disp_df = read_csv("disp.csv")
district_df = read_csv("district.csv", na_values="?")
loan_df = read_csv("loan_train.csv")
trans_df = read_csv("trans_train.csv", {'bank': 'str'})

# clean column names (Some columns come with an extra whitespace)
district_df = district_df.rename(columns=lambda x: x.strip())

trans_df.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,1548749,5270,930113,credit,credit in cash,800.0,800.0,,,
1,1548750,5270,930114,credit,collection from another bank,44749.0,45549.0,,IJ,80269753.0
2,3393738,11265,930114,credit,credit in cash,1000.0,1000.0,,,
3,3122924,10364,930117,credit,credit in cash,1100.0,1100.0,,,
4,1121963,3834,930119,credit,credit in cash,700.0,700.0,,,


### Test data

In [3]:
card_test_df = read_csv("card_test.csv")
loan_test_df = read_csv("loan_test.csv")
trans_test_df = read_csv("trans_test.csv", {'bank': 'str'})

## Tyding missing values and categorical data

### Account data

In [4]:
account_df.dtypes

account_id      int64
district_id     int64
frequency      object
date            int64
dtype: object

The frequency is possibly categorical data. Let's look at the cardinality of the attribute and whether it has an implicit order.

In [5]:
account_df['frequency'].value_counts()

monthly issuance              4167
weekly issuance                240
issuance after transaction      93
Name: frequency, dtype: int64

There are 3  possible values for the **frquency attribute**: _monthly issuance_, _weekly issuance_, _issuance_ after transation. There is an implicit order between weekly and monthly, but with _issuance after transation_ it isn't so obvious. It was decided that, since it is expected that people do more than 1 transaction per week, _issuance after transation_ also relates to the other 2 in terms of time.

When considering the time intervals between transations: _issuance after transation_ < _weekly issuance_ < _monthly issuance_.

There is also a magnitude difference between all of them, e.g.: monthly is around 4 times more frequent than weekly. This raises the issue of how much more frequent is _issuance after transaction_ than _weekly issuance_. Let's assume the _weekly issuance_ is 7 time less frequent.

Let's replace this categorical data by ordered integers:

In [6]:
account_df.replace({'frequency':
                    {'issuance after transation': 1, 'weekly issuance': 7, 'monthly issuance': 30}
                   }, inplace=True)
account_df['frequency']

0       30
1       30
2       30
3       30
4       30
        ..
4495    30
4496    30
4497    30
4498    30
4499    30
Name: frequency, Length: 4500, dtype: object

In [7]:
account_df.isnull().sum()

account_id     0
district_id    0
frequency      0
date           0
dtype: int64

No NULL data to deal with on this dataframe.

### Card data

In [8]:
card_df.dtypes

card_id     int64
disp_id     int64
type       object
issued      int64
dtype: object

The **type** attribute is probably categorical data.

In [9]:
card_df['type'].value_counts()

classic    127
junior      41
gold         9
Name: type, dtype: int64

_junior_ accounts are for underarge people (lowest tier) and usually it isn't possible to withdrawl money from them, and _gold_ account are the highest tier. There is an implicit order here: _junior_ < _classic_ < _gold_.

In [10]:
card_df.replace({'type':
                    {'junior': 0, 'classic': 1, 'gold': 2}
                   }, inplace=True)
card_df['type']

0      1
1      1
2      1
3      1
4      1
      ..
172    1
173    0
174    0
175    1
176    1
Name: type, Length: 177, dtype: int64

In [11]:
card_df.isnull().sum()

card_id    0
disp_id    0
type       0
issued     0
dtype: int64

No NULL data to deal with on this dataframe.

### Client data

In [12]:
client_df.dtypes

client_id       int64
birth_number    int64
district_id     int64
dtype: object

In [13]:
card_df.isnull().sum()

card_id    0
disp_id    0
type       0
issued     0
dtype: int64

There are no NULL values or categorical data => nothing to do.

### Disposition data

In [14]:
disp_df.dtypes

disp_id        int64
client_id      int64
account_id     int64
type          object
dtype: object

The **type** attribute is probably categorical data.

In [15]:
disp_df['type'].value_counts()

OWNER        4500
DISPONENT     869
Name: type, dtype: int64

There are only two categories here. We can replace this by a boolean depicting where the disposition is owner or not.

In [16]:
disp_df['is_owner'] = disp_df['type'].apply(lambda x: True if x == "OWNER" else False)
disp_df.drop(['type'], axis=1, inplace=True)
disp_df.head()

Unnamed: 0,disp_id,client_id,account_id,is_owner
0,1,1,1,True
1,2,2,2,True
2,3,3,2,False
3,4,4,3,True
4,5,5,3,False


In [17]:
disp_df.isnull().sum()

disp_id       0
client_id     0
account_id    0
is_owner      0
dtype: int64

No NULL data to deal with on this dataframe.

### District data

In [18]:
district_df.dtypes

code                                                  int64
name                                                 object
region                                               object
no. of inhabitants                                    int64
no. of municipalities with inhabitants < 499          int64
no. of municipalities with inhabitants 500-1999       int64
no. of municipalities with inhabitants 2000-9999      int64
no. of municipalities with inhabitants >10000         int64
no. of cities                                         int64
ratio of urban inhabitants                          float64
average salary                                        int64
unemploymant rate '95                               float64
unemploymant rate '96                               float64
no. of enterpreneurs per 1000 inhabitants             int64
no. of commited crimes '95                          float64
no. of commited crimes '96                            int64
dtype: object

Both name and region are categorical data.

In [19]:
print("Number of different 'code' values:", len(district_df['code'].value_counts()))
print("Number of different 'name' values:", len(district_df['name'].value_counts()))

Number of different 'code' values: 77
Number of different 'name' values: 77


We can see that there is a direct association between the **code** and **name** attributes. This means they are redundant and we don't have to deal with the **name** attribute.

There is a problem with this. The **code** attributes are integers which indirectly imply an order, even though the names of the regions are nominal features.

// TODO for now it is dropped.

In [20]:
district_df.drop(['name'], axis=1, inplace=True)

The **region** feature is categorial and there is no order associated. This means this attribute should be encoded in order to remove it's categorical nature and still remain unordered. One-Hot encoding could be used for simplicity, but Binary enconding is be better.

In [21]:
district_df['region'].value_counts()

south Moravia      14
central Bohemia    12
east Bohemia       11
north Moravia      11
west Bohemia       10
north Bohemia      10
south Bohemia       8
Prague              1
Name: region, dtype: int64

In [22]:
from category_encoders import BinaryEncoder

encoder = BinaryEncoder(cols=['region'])
district_df = encoder.fit_transform(district_df)
district_df.head()

Unnamed: 0,code,region_0,region_1,region_2,region_3,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96
0,1,0,0,0,1,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677.0,99107
1,2,0,0,1,0,88884,80,26,6,2,5,46.7,8507,1.67,1.85,132,2159.0,2674
2,3,0,0,1,0,75232,55,26,4,1,5,41.7,8980,1.95,2.21,111,2824.0,2813
3,4,0,0,1,0,149893,63,29,6,2,6,67.4,9753,4.64,5.05,109,5244.0,5892
4,5,0,0,1,0,95616,65,30,4,1,6,51.4,9307,3.85,4.43,118,2616.0,3040


In [23]:
district_df.isnull().sum()

code                                                0
region_0                                            0
region_1                                            0
region_2                                            0
region_3                                            0
no. of inhabitants                                  0
no. of municipalities with inhabitants < 499        0
no. of municipalities with inhabitants 500-1999     0
no. of municipalities with inhabitants 2000-9999    0
no. of municipalities with inhabitants >10000       0
no. of cities                                       0
ratio of urban inhabitants                          0
average salary                                      0
unemploymant rate '95                               1
unemploymant rate '96                               0
no. of enterpreneurs per 1000 inhabitants           0
no. of commited crimes '95                          1
no. of commited crimes '96                          0
dtype: int64

There is 1 NULL value in _unemploymant rate '95_ and _no. of commited crimes '95_.

In [24]:
district_df[district_df["unemploymant rate '95"].isnull()]

Unnamed: 0,code,region_0,region_1,region_2,region_3,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96
68,69,1,0,0,0,42821,4,13,5,1,3,48.4,8173,,7.01,124,,1358


Both of these missing value are for the district of _Jesenik_. We can find the mean growth/shrinkage of the unemploymant and number of crimes commit from 1995 to 1996, and use that to fill the missing values of 1995 for this district.

In [25]:
unem_rate_95 = district_df[district_df["unemploymant rate '95"].notnull()]["unemploymant rate '95"]
unem_rate_96 = district_df[district_df["unemploymant rate '96"].notnull()]["unemploymant rate '96"]

unem_growth_series = unem_rate_95 / unem_rate_96
unem_growth = unem_growth_series.sum() / unem_growth_series.count()

district_df["unemploymant rate '95"] = district_df.apply(lambda x: x["unemploymant rate '96"] * unem_growth if np.isnan(x["unemploymant rate '95"]) else x["unemploymant rate '95"], axis=1)

In [26]:
no_crime_95 = district_df[district_df["no. of commited crimes '95"].notnull()]["no. of commited crimes '95"]
no_crime_96 = district_df[district_df["no. of commited crimes '96"].notnull()]["no. of commited crimes '96"]

crime_growth_series = no_crime_95 / no_crime_96
crime_growth = crime_growth_series.sum() / crime_growth_series.count()

district_df["no. of commited crimes '95"] = district_df.apply(lambda x: x["no. of commited crimes '96"] * crime_growth if np.isnan(x["no. of commited crimes '95"]) else x["unemploymant rate '95"], axis=1)

In [27]:
district_df[district_df["code"] == 69]

Unnamed: 0,code,region_0,region_1,region_2,region_3,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96
68,69,1,0,0,0,42821,4,13,5,1,3,48.4,8173,5.759683,7.01,124,1321.520511,1358


The missing values have been taken care of.

### Loan data

In [34]:
loan_df.dtypes

loan_id       int64
account_id    int64
date          int64
amount        int64
duration      int64
payments      int64
status        int64
dtype: object

There is no categorical data to deal with on this dataframe.

In [36]:
loan_df.isnull().sum()

loan_id       0
account_id    0
date          0
amount        0
duration      0
payments      0
status        0
dtype: int64

There are no NULL values to fill on this dataframe.

### Transaction data

In [37]:
trans_df.dtypes

trans_id        int64
account_id      int64
date          float64
type           object
operation      object
amount        float64
balance       float64
k_symbol       object
bank           object
account       float64
dtype: object

There are 4 categorical attributes in this dataframe: _type_, _operation_, _k\_symbol_, and _bank_.

In [40]:
trans_df['type'].value_counts()

withdrawal            232093
credit                159468
withdrawal in cash      5124
Name: type, dtype: int64

In [41]:
trans_df['operation'].value_counts()

withdrawal in cash              165270
remittance to another bank       70737
credit in cash                   62202
collection from another bank     26505
credit card withdrawal            1210
Name: operation, dtype: int64

In [42]:
trans_df['k_symbol'].value_counts()

interest credited                        70761
payment for statement                    58377
household                                42839
                                         19065
old-age pension                          13502
insurrance payment                        6592
sanction interest if negative balance      305
Name: k_symbol, dtype: int64

In [43]:
trans_df['bank'].value_counts()

ST    8114
GH    7886
EF    7878
AB    7666
UV    7618
OP    7595
IJ    7536
YZ    7471
QR    7413
KL    7397
WX    7033
CD    7009
MN    6626
Name: bank, dtype: int64

In [38]:
trans_df.isnull().sum()

trans_id           0
account_id         0
date               0
type               0
operation      70761
amount             0
balance            0
k_symbol      185244
bank          299443
account       294456
dtype: int64

## Processing data

### Derive clients' gender

In [28]:
def identify_gender(old_tup):
    birth = old_tup['birth_number']
    month = (birth // 100) % 100
    return False if month > 12 else True 

# create a new gender column
# it is derived from the clients' birth_number
client_df['is_male'] = client_df.apply(identify_gender, axis=1)

### Simplify dates

In [29]:
from datetime import datetime, date

def identify_year(old_tup, select_date):
    birth = old_tup[select_date]
    today = date.today()

    year = birth // 10000
    year = (year + 1900) if year > (today.year % 100) else (year + 2000)
    month = (birth // 100) % 100
    month = month - 50 if month > 12 else month # For cases when subject is female, month is +50
    day = birth % 100

    return datetime(year, month, day)

# create new birth date attribute for client
client_df['birth_date'] = client_df.apply(lambda x: identify_year(x, "birth_number"), axis=1)
# create new date for transactions
trans_df['date'] = trans_df.apply(lambda x: identify_year(x, "date").timestamp(), axis=1)

### Discretize client ages

In [30]:
# create a new age column for clients
def identify_age(old_tup):
    born = old_tup['birth_date']
    today = date.today()
    age = today.year - born.year - ((today.month, today.day) < (born.month, born.day))
    return age
client_df['age'] = client_df.apply(identify_age, axis=1)

# discretize ages
age_bins=[20, 29, 39, 49, 66, 150]
labels=['20-29', '30-39', '40-49', '50-65', 'reformers']
client_df['age_bins'] = pd.cut(x=client_df['age'], bins=age_bins, labels=labels)
# we don't need the birth dates anymore
client_df.drop(['age_bins'], axis=1)

Unnamed: 0,client_id,birth_number,district_id,is_male,birth_date,age
0,1,706213,18,False,1970-12-13,50
1,2,450204,1,True,1945-02-04,76
2,3,406009,1,False,1940-10-09,81
3,4,561201,5,True,1956-12-01,64
4,5,605703,5,False,1960-07-03,61
...,...,...,...,...,...,...
5364,13955,456030,1,False,1945-10-30,76
5365,13956,430406,1,True,1943-04-06,78
5366,13968,680413,61,True,1968-04-13,53
5367,13971,626019,67,False,1962-10-19,59


### Apply transaction types to the transaction amount

In [31]:
# convert ammount to negative according to transaction type
def convert_amount(old_tup):
    ammount = old_tup['amount']
    t = old_tup['type']
    return ammount if t == "credit" else -ammount
trans_df['amount'] = trans_df.apply(convert_amount, axis=1)

trans_df

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,1548749,5270,726879600.0,credit,credit in cash,800.0,800.0,,,
1,1548750,5270,726966000.0,credit,collection from another bank,44749.0,45549.0,,IJ,80269753.0
2,3393738,11265,726966000.0,credit,credit in cash,1000.0,1000.0,,,
3,3122924,10364,727225200.0,credit,credit in cash,1100.0,1100.0,,,
4,1121963,3834,727398000.0,credit,credit in cash,700.0,700.0,,,
...,...,...,...,...,...,...,...,...,...,...
396680,515914,1763,851990400.0,withdrawal,withdrawal in cash,-14.6,67769.5,payment for statement,,
396681,516262,1765,851990400.0,withdrawal,withdrawal in cash,-14.6,19708.1,payment for statement,,
396682,520019,1775,851990400.0,withdrawal,withdrawal in cash,-14.6,15944.5,payment for statement,,
396683,517894,1769,851990400.0,withdrawal,withdrawal in cash,-14.6,34679.4,payment for statement,,


## Join data

In [32]:
def join(df1, df2, key1, key2, suff, t="inner"):
    return df1.merge(df2, left_on=key1, right_on=key2, how=t, suffixes=suff)

# join all data into a single table
join_df = join(loan_df, account_df, 'account_id', 'account_id', ['_loan', '_account'])
join_df = join(join_df, disp_df, 'account_id', 'account_id', ['', '_disp'])
join_df = join(join_df, card_df, 'disp_id', 'disp_id', ['', '_card'])
join_df = join(join_df, trans_df, 'account_id', 'account_id', ['', '_trans'])
join_df = join(join_df, client_df, 'client_id', 'client_id', ['', '_client'])
join_df = join(join_df, district_df, 'district_id', 'code', ['', '_district'])

df = join_df
join_df.head()

Unnamed: 0,loan_id,account_id,date_loan,amount,duration,payments,status,district_id,frequency,date_account,disp_id,client_id,is_owner,card_id,type,issued,trans_id,date,type_trans,operation,amount_trans,balance,k_symbol,bank,account,birth_number,district_id_client,is_male,birth_date,age,age_bins,code,region_0,region_1,region_2,region_3,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96
0,6577,7753,940311,51696,24,2154,1,74,30,930208,9285,9593,True,1005,1,931107,2349697,729126000.0,credit,credit in cash,600.0,600.0,,,,685128,74,False,1968-01-28,53,50-65,74,1,0,0,0,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,4.75,18347
1,6577,7753,940311,51696,24,2154,1,74,30,930208,9285,9593,True,1005,1,931107,2349709,729471600.0,credit,credit in cash,19588.0,20188.0,,,,685128,74,False,1968-01-28,53,50-65,74,1,0,0,0,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,4.75,18347
2,6577,7753,940311,51696,24,2154,1,74,30,930208,9285,9593,True,1005,1,931107,2349705,729471600.0,credit,credit in cash,27078.0,47266.0,,,,685128,74,False,1968-01-28,53,50-65,74,1,0,0,0,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,4.75,18347
3,6577,7753,940311,51696,24,2154,1,74,30,930208,9285,9593,True,1005,1,931107,3492040,730854000.0,credit,,119.6,47385.6,interest credited,,,685128,74,False,1968-01-28,53,50-65,74,1,0,0,0,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,4.75,18347
4,6577,7753,940311,51696,24,2154,1,74,30,930208,9285,9593,True,1005,1,931107,2350078,731718000.0,withdrawal,withdrawal in cash,-12000.0,35385.6,,,,685128,74,False,1968-01-28,53,50-65,74,1,0,0,0,323870,0,0,0,1,1,100.0,10673,4.75,5.44,100,4.75,18347


### Save clean data

In [33]:
train_data_clean='cool_data/train-data-clean.csv'
df.to_csv(train_data_clean, index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'cool_data/train-data-clean.csv'

## Classification

In [None]:
def getXy(df):
    # inputs
    X = df.drop(["status"], axis=1)
    # target
    y = df["status"]
    return (X, y)

Note that we scaled our data. We did this because we need it to obtain better results on distance-based models: K-NN and SVC. This doesn't affect tree-based algorithms so we do it here.

In [None]:
from sklearn.model_selection import StratifiedKFold

k = 10
kf = StratifiedKFold(n_splits=k)

We will be using **Stratified K-Fold with 10 folds** for our cross-validations.

### Establishing a baseline

Let's establish a baseline for our classifiers.

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

baseline_clfs = [
    {"name": "Decision tree", "clf": DecisionTreeClassifier()},
    {"name": "Naive Bayes", "clf": GaussianNB()},
]

scoring = {"accuracy": "accuracy",
           "precision": "precision_weighted",
           "recall": "recall_weighted",
           "f1": "f1_weighted"}

# save classifier information in DataFrame: name, score, fit_time, score_time
baseline_df = pd.DataFrame()
for clf_entry in baseline_clfs:
    scores = cross_validate(clf_entry["clf"], X_train, y_train, scoring=scoring, cv=kf, n_jobs=configNJobs)
    info = scores | {"classifier": [clf_entry["name"]] * k}
    baseline_df = baseline_df.append(pd.DataFrame(info))

baseline_df.head()