# AC FEUP 21/22

## Main dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
import numpy as np
import sklearn

# use to not cut columns
pd.set_option('max_columns', None)

# njobs to use in sklearn models
configNJobs = -1

## Import data

### Training data

In [2]:
# import data
def read_csv(file, dtype={}, na_values=None):
    return pd.read_csv("data/" + file, sep=";", dtype=dtype, na_values=na_values)

account_df = read_csv("account.csv")
card_df = read_csv("card_train.csv")
client_df = read_csv("client.csv")
disp_df = read_csv("disp.csv")
district_df = read_csv("district.csv", na_values="?")
loan_df = read_csv("loan_train.csv")
trans_df = read_csv("trans_train.csv", dtype={'bank': 'str'})

# clean column names (Some columns come with an extra whitespace)
district_df.rename(columns=lambda x: x.strip(), inplace=True)

trans_df.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,1548749,5270,930113,credit,credit in cash,800.0,800.0,,,
1,1548750,5270,930114,credit,collection from another bank,44749.0,45549.0,,IJ,80269753.0
2,3393738,11265,930114,credit,credit in cash,1000.0,1000.0,,,
3,3122924,10364,930117,credit,credit in cash,1100.0,1100.0,,,
4,1121963,3834,930119,credit,credit in cash,700.0,700.0,,,


### Test data

In [3]:
card_test_df = read_csv("card_test.csv")
loan_test_df = read_csv("loan_test.csv")
trans_test_df = read_csv("trans_test.csv", dtype={'bank': 'str'})

## Dealing with missing values

### Account data

In [4]:
account_df.isnull().sum()

account_id     0
district_id    0
frequency      0
date           0
dtype: int64

No NULL data to deal with on this dataframe.

### Card data

In [5]:
card_df.isnull().sum()

card_id    0
disp_id    0
type       0
issued     0
dtype: int64

No NULL data to deal with on this dataframe.

In [6]:
card_test_df.isnull().sum()

card_id    0
disp_id    0
type       0
issued     0
dtype: int64

### Client data

In [7]:
card_df.isnull().sum()

card_id    0
disp_id    0
type       0
issued     0
dtype: int64

There are no NULL values on this dataset to deal with.

### Disposition data

In [8]:
disp_df.isnull().sum()

disp_id       0
client_id     0
account_id    0
type          0
dtype: int64

No NULL data to deal with on this dataframe.

### District data

In [9]:
district_df.isnull().sum()

code                                                0
name                                                0
region                                              0
no. of inhabitants                                  0
no. of municipalities with inhabitants < 499        0
no. of municipalities with inhabitants 500-1999     0
no. of municipalities with inhabitants 2000-9999    0
no. of municipalities with inhabitants >10000       0
no. of cities                                       0
ratio of urban inhabitants                          0
average salary                                      0
unemploymant rate '95                               1
unemploymant rate '96                               0
no. of enterpreneurs per 1000 inhabitants           0
no. of commited crimes '95                          1
no. of commited crimes '96                          0
dtype: int64

There is 1 NULL value in _unemploymant rate '95_ and _no. of commited crimes '95_.

In [10]:
district_df[district_df["unemploymant rate '95"].isnull()]

Unnamed: 0,code,name,region,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96
68,69,Jesenik,north Moravia,42821,4,13,5,1,3,48.4,8173,,7.01,124,,1358


Both of these missing value are for the district of _Jesenik_. We can find the mean growth/shrinkage of the unemploymant and number of crimes commit from 1995 to 1996, and use that to fill the missing values of 1995 for this district.

In [11]:
unem_rate_95 = district_df[district_df["unemploymant rate '95"].notnull()]["unemploymant rate '95"]
unem_rate_96 = district_df[district_df["unemploymant rate '96"].notnull()]["unemploymant rate '96"]

unem_growth_series = unem_rate_95 / unem_rate_96
unem_growth = unem_growth_series.sum() / unem_growth_series.count()

district_df["unemploymant rate '95"] = district_df.apply(lambda x: x["unemploymant rate '96"] * unem_growth if np.isnan(x["unemploymant rate '95"]) else x["unemploymant rate '95"], axis=1)

In [12]:
no_crime_95 = district_df[district_df["no. of commited crimes '95"].notnull()]["no. of commited crimes '95"]
no_crime_96 = district_df[district_df["no. of commited crimes '96"].notnull()]["no. of commited crimes '96"]

crime_growth_series = no_crime_95 / no_crime_96
crime_growth = crime_growth_series.sum() / crime_growth_series.count()

district_df["no. of commited crimes '95"] = district_df.apply(lambda x: x["no. of commited crimes '96"] * crime_growth if np.isnan(x["no. of commited crimes '95"]) else x["unemploymant rate '95"], axis=1)

In [13]:
district_df[district_df["code"] == 69]

Unnamed: 0,code,name,region,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96
68,69,Jesenik,north Moravia,42821,4,13,5,1,3,48.4,8173,5.759683,7.01,124,1321.520511,1358


The missing values have been taken care of.

### Loan data

In [14]:
loan_df.isnull().sum()

loan_id       0
account_id    0
date          0
amount        0
duration      0
payments      0
status        0
dtype: int64

There are no NULL values to fill on this dataframe.

In [15]:
loan_test_df.isnull().sum()

loan_id         0
account_id      0
date            0
amount          0
duration        0
payments        0
status        354
dtype: int64

These missing values (_status_) don't matter as they are the targets of our prediction model.

### Transaction data

In [16]:
trans_df.isnull().sum()

trans_id           0
account_id         0
date               0
type               0
operation      70761
amount             0
balance            0
k_symbol      185244
bank          299443
account       294456
dtype: int64

In [17]:
trans_test_df.isnull().sum()

trans_id          0
account_id        0
date              0
type              0
operation      5130
amount            0
balance           0
k_symbol      17419
bank          24377
account       21061
dtype: int64

There are 4 attributes with a large quantity of null values on this dataframe that have to dealt with: _operation_, _k\_symbol_, _bank_, and _account_.

In [18]:
trans_df['operation'].value_counts()

withdrawal in cash              165270
remittance to another bank       70737
credit in cash                   62202
collection from another bank     26505
credit card withdrawal            1210
Name: operation, dtype: int64

In [19]:
len(trans_df[trans_df['operation'].isnull()]) / len(trans_df['operation']) * 100.0

17.838083113805663

The _operation_ attribute is categorical and doesn't reveal an inherited order. It can be enconded with 3 attributes using binary encoding.

This attribute is problematic because it contains a significant number of NULL values ($17.8\%$).

It was decided that these rows will be filled with _unkown_.

In [20]:
trans_df['operation'].fillna('unknown', inplace=True)

trans_test_df['operation'].fillna('unknown', inplace=True)

In [21]:
trans_df['k_symbol'].value_counts()

interest credited                        70761
payment for statement                    58377
household                                42839
                                         19065
old-age pension                          13502
insurrance payment                        6592
sanction interest if negative balance      305
Name: k_symbol, dtype: int64

There is a category that is an empty string. It is likelly that the missing values were intended to be part of this category aswell. It is assumed that these rows with the _empty_ category refer to common transactions that aren't related to pensions, insurrance, etc... The missing values and the empty rows of this attribute will be renamed to _no_symbol_.

In [22]:
def treat_null_k_symbol(row):
    k_symb = row['k_symbol']
    if isinstance(k_symb, float):
        return 'no_symbol'
    elif k_symb == " ":
        return 'no_symbol'
    else:
        return k_symb
    

trans_df['k_symbol'] = trans_df.apply(treat_null_k_symbol, axis=1)
trans_test_df['k_symbol'] = trans_test_df.apply(treat_null_k_symbol, axis=1)
trans_df['k_symbol'].value_counts()

no_symbol                                204309
interest credited                         70761
payment for statement                     58377
household                                 42839
old-age pension                           13502
insurrance payment                         6592
sanction interest if negative balance       305
Name: k_symbol, dtype: int64

There many rows where we don't know which is the bank of the partner.

In [23]:
trans_df['bank'].value_counts()

ST    8114
GH    7886
EF    7878
AB    7666
UV    7618
OP    7595
IJ    7536
YZ    7471
QR    7413
KL    7397
WX    7033
CD    7009
MN    6626
Name: bank, dtype: int64

In [24]:
trans_df['account'].value_counts()

0.0           4987
15916598.0      48
62457513.0      48
6149286.0       48
42988401.0      48
              ... 
99218499.0       1
54261672.0       1
28306174.0       1
91523855.0       1
53761804.0       1
Name: account, Length: 4536, dtype: int64

The unknown information about the partner's bank tranlates to unkown information about the partner. This is sometimes represented by a **0** on the _account_ attribute.

In [25]:
print("Number of unknown banks:", len(trans_df[trans_df['bank'].isnull()]))
print("Number of unknown partners:", len(trans_df[trans_df['account'].isnull()]) + len(trans_df[trans_df['account'] == 0]))

Number of unknown banks: 299443
Number of unknown partners: 299443


The number of unknown banks is the same as the number of unkown partners => these 2 features are highly correlated. Even though these 2 attributes aren't likelly to contribute anything to our dataset, we can create an **'unknown' bank** and set all unkown _account_ attributes to 0.

In [26]:
trans_df['account'].fillna(0, inplace=True)
trans_df['bank'].fillna("unknown", inplace=True)

trans_test_df['account'].fillna(0, inplace=True)
trans_test_df['bank'].fillna("unknown", inplace=True)

trans_df.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,1548749,5270,930113,credit,credit in cash,800.0,800.0,no_symbol,unknown,0.0
1,1548750,5270,930114,credit,collection from another bank,44749.0,45549.0,no_symbol,IJ,80269753.0
2,3393738,11265,930114,credit,credit in cash,1000.0,1000.0,no_symbol,unknown,0.0
3,3122924,10364,930117,credit,credit in cash,1100.0,1100.0,no_symbol,unknown,0.0
4,1121963,3834,930119,credit,credit in cash,700.0,700.0,no_symbol,unknown,0.0


## Join data

It is needed to join all datasets into a **single one**.

In [27]:
def join(df1, df2, key1, key2, suff, t="left"):
    return df1.merge(df2, left_on=key1, right_on=key2, how=t, suffixes=suff)

In [28]:
def join_all(card_df, loan_df, trans_df):
    join_df = join(loan_df,
                   account_df,
                   'account_id',
                   'account_id',
                   ['_loan', '_account'])
    join_df = join(join_df, trans_df, 'account_id', 'account_id', ['', '_trans'])
    #
    join_df = join(join_df, district_df, 'district_id', 'code', ['', '_district'])
    join_df = join(join_df, client_df, 'district_id', 'district_id', ['', '_client'])
    #
    join_df = join(join_df,
                   disp_df,
                   ['account_id', 'client_id'],
                   ['account_id', 'client_id'],
                   ['', '_disp'])
    #
    join_df = join(join_df, card_df, 'disp_id', 'disp_id', ['', '_card'])
    #
    join_df.drop(['loan_id',
                  'account_id',
                  'district_id',
                  'client_id',
                  'trans_id',
                  'client_id',
                  'disp_id',
                  'card_id'], axis=1, inplace=True)
    #
    return join_df

This leaves our dataset with a considerable ammount of _trash_, so we drop the **IDs** remaining from the dataset joins.

In [29]:
df = join_all(card_df, loan_df, trans_df)
df_test = join_all(card_test_df, loan_test_df, trans_test_df)
df.head()

Unnamed: 0,date_loan,amount,duration,payments,status,frequency,date_account,date,type,operation,amount_trans,balance,k_symbol,bank,account,code,name,region,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,birth_number,type_disp,type_card,issued
0,930705,96396,12,8033,-1,weekly issuance,930322,930322,credit,credit in cash,1100.0,1100.0,no_symbol,unknown,0.0,30,Sokolov,west Bohemia,94812,15,13,8,2,10,81.8,9650,3.38,3.67,100,3.38,2804,515911,,,
1,930705,96396,12,8033,-1,weekly issuance,930322,930322,credit,credit in cash,1100.0,1100.0,no_symbol,unknown,0.0,30,Sokolov,west Bohemia,94812,15,13,8,2,10,81.8,9650,3.38,3.67,100,3.38,2804,330425,,,
2,930705,96396,12,8033,-1,weekly issuance,930322,930322,credit,credit in cash,1100.0,1100.0,no_symbol,unknown,0.0,30,Sokolov,west Bohemia,94812,15,13,8,2,10,81.8,9650,3.38,3.67,100,3.38,2804,545421,,,
3,930705,96396,12,8033,-1,weekly issuance,930322,930322,credit,credit in cash,1100.0,1100.0,no_symbol,unknown,0.0,30,Sokolov,west Bohemia,94812,15,13,8,2,10,81.8,9650,3.38,3.67,100,3.38,2804,635102,,,
4,930705,96396,12,8033,-1,weekly issuance,930322,930322,credit,credit in cash,1100.0,1100.0,no_symbol,unknown,0.0,30,Sokolov,west Bohemia,94812,15,13,8,2,10,81.8,9650,3.38,3.67,100,3.38,2804,690814,,,


In [30]:
df.isnull().sum()

date_loan                                                 0
amount                                                    0
duration                                                  0
payments                                                  0
status                                                    0
frequency                                                 0
date_account                                              0
date                                                      0
type                                                      0
operation                                                 0
amount_trans                                              0
balance                                                   0
k_symbol                                                  0
bank                                                      0
account                                                   0
code                                                      0
name                                    

### Save joint data

In [31]:
train_data_join='cool_data/train-data-join.csv'
df.to_csv(train_data_join, index=False)

## Cleaning data

### Work with client's birth_number

From this attribute we can extract the client's 

In [32]:
def identify_gender(row):
    birth = row['birth_number']
    month = (birth // 100) % 100
    return False if month > 12 else True

# create a new gender column
df['is_male'] = df.apply(identify_gender, axis=1)

In [34]:
def identify_year(row):
    birth = row['birth_number']
    year = birth // 10000
    return year

# create a new age column with client's age in year
# dataset is from 99
df['age'] = 99 - df.apply(identify_year, axis=1)

In [35]:
df.drop(['birth_number'], axis=1, inplace=True)

### Discretize client ages

In [36]:
# discretize ages
age_bins=[0, 20, 29, 39, 49, 66, 150]
labels=['0-19', '20-29', '30-39', '40-49', '50-65', 'retired']
df['age'] = pd.cut(x=df['age'], bins=age_bins, labels=labels)

df['age']

0          40-49
1          50-65
2          40-49
3          30-39
4          30-39
           ...  
3687953    20-29
3687954    20-29
3687955    40-49
3687956    30-39
3687957    30-39
Name: age, Length: 3687958, dtype: category
Categories (6, object): ['0-19' < '20-29' < '30-39' < '40-49' < '50-65' < 'retired']

### Loan date

Separa the loan date into a month and year, so it can be explored later: maybe more loans are granted during winter or summer time.

In [39]:
# separate the loan date into loan year and loan month
df['year_load'] = df['date_loan'].apply(lambda x: x // 10000)
df['month_load'] = df['date_loan'].apply(lambda x: x // 100 % 100)

In [40]:
df.drop(['date_loan'], axis=1, inplace=True)

### Apply transaction types to the transaction amount

Withdrawls imply a negative balance change on the account's funds.

In [51]:
def convert_amount(row):
    ammount = abs(row['amount'])
    # else are withdrawls
    return ammount if row['type'] == "credit" else -ammount

# convert ammount to negative according to transaction type
df['amount'] = df.apply(convert_amount, axis=1)

## Deal with categorical data

### Account data

In [None]:
account_df.dtypes

The frequency is possibly categorical data. Let's look at the cardinality of the attribute and whether it has an implicit order.

In [None]:
account_df['frequency'].value_counts()

There are 3  possible values for the **frquency attribute**: _monthly issuance_, _weekly issuance_, _issuance_ after transation. There is an implicit order between weekly and monthly, but with _issuance after transation_ it isn't so obvious. It was decided that, since it is expected that people do more than 1 transaction per week, _issuance after transation_ also relates to the other 2 in terms of time.

When considering the time intervals between transations: _issuance after transation_ < _weekly issuance_ < _monthly issuance_.

There is also a magnitude difference between all of them, e.g.: monthly is around 4 times more frequent than weekly. This raises the issue of how much more frequent is _issuance after transaction_ than _weekly issuance_. Let's assume the _weekly issuance_ is 7 time less frequent.

Let's replace this categorical data by ordered integers:

In [None]:
account_df.replace({'frequency':
                    {'issuance after transaction': 1, 'weekly issuance': 7, 'monthly issuance': 30}
                   }, inplace=True)
account_df['frequency']

### Card data

In [None]:
card_df.dtypes

The **type** attribute is probably categorical data.

In [None]:
card_df['type'].value_counts()

_junior_ accounts are for underarge people (lowest tier) and usually it isn't possible to withdrawl money from them, and _gold_ account are the highest tier. There is an implicit order here: _junior_ < _classic_ < _gold_.

In [None]:
card_df.replace({'type':
                    {'junior': 0, 'classic': 1, 'gold': 2}
                   }, inplace=True)
card_df['type']

### Client data

In [None]:
client_df.dtypes

_age_ is a categorical value. Each category represents a range. This ranges can be divided into 2 attributes: the start and the end of the range.

In [None]:
def uncat_age_bins(row, is_begin=True):
    age = row['age']
    if age == 'retired':
        return 66 if is_begin else 150
    return age.split('-')[0 if is_begin else 1]
    

client_df['age_start'] = client_df.apply(uncat_age_bins, axis=1)
client_df['age_end'] = client_df.apply(lambda x: uncat_age_bins(x, False), axis=1)

client_df.drop(['age'], axis=1, inplace=True)

client_df.head()

### Disposition data

In [None]:
disp_df.dtypes

The **type** attribute is probably categorical data.

In [None]:
disp_df['type'].value_counts()

There are only two categories here. We can replace this by a boolean depicting where the disposition is owner or not.

In [None]:
disp_df['is_owner'] = disp_df['type'].apply(lambda x: True if x == "OWNER" else False)
disp_df.drop(['type'], axis=1, inplace=True)
disp_df.head()

### District data

In [None]:
district_df.dtypes

Both name and region are categorical data.

In [None]:
print("Number of different 'code' values:", len(district_df['code'].value_counts()))
print("Number of different 'name' values:", len(district_df['name'].value_counts()))

We can see that there is a direct association between the **code** and **name** attributes. This means they are redundant and we don't have to deal with the **name** attribute.

There is a problem with this. The **code** attributes are integers which indirectly imply an order, even though the names of the regions are nominal features, but it encoding it would generate too many attributes, so it was decided that it would be dropped.

In [None]:
district_df.drop(['name'], axis=1, inplace=True)

The **region** feature is categorial and there is no order associated. This means this attribute should be encoded in order to remove it's categorical nature and still remain unordered. One-Hot encoding could be used for simplicity, but Binary enconding is be better.

In [None]:
district_df['region'].value_counts()

In [None]:
from category_encoders import BinaryEncoder

encoder = BinaryEncoder(cols=['region'])
district_df = encoder.fit_transform(district_df)
district_df.head()

### Loan data

In [None]:
loan_df.dtypes

There is no categorical data to deal with on this dataframe.

### Transaction data

In [None]:
trans_df.dtypes

There are also 4 categorical attributes in this dataframe: _type_, _operation_, _k\_symbol_, and _bank_.

In [None]:
trans_df['type'].value_counts()

The _type_ attribute has 3 categories with no distinct order between them.

In [None]:
from category_encoders import BinaryEncoder

encoder = BinaryEncoder(cols=['type'])
trans_df = encoder.fit_transform(trans_df)
trans_df.head()

This data could have been encoded with One-Hot encoding, which is simpler. Binary encoding is used in order to use 1 less attribute.

In [None]:
trans_df.drop(['operation', 'k_symbol', 'bank'], axis=1, inplace=True)

## Classification

In [None]:
def getXy(df):
    # inputs
    X = df.drop(["status"], axis=1)
    # target
    y = df["status"]
    return (X, y)

X_train, y_train = getXy(df)
# // TODO scale data?

Note that we scaled our data. We did this because we need it to obtain better results on distance-based models: K-NN and SVC. This doesn't affect tree-based algorithms so we do it here.

In [None]:
from sklearn.model_selection import StratifiedKFold

k = 10
kf = StratifiedKFold(n_splits=k)

We will be using **Stratified K-Fold with 10 folds** for our cross-validations.

### Establishing a baseline

Let's establish a baseline for our classifiers.

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

baseline_clfs = [
    {"name": "Decision tree", "clf": DecisionTreeClassifier()},
    #{"name": "K-Nearest neighbors", "clf": KNeighborsClassifier()},
    {"name": "Naive Bayes", "clf": GaussianNB()},
]

scoring = {"accuracy": "accuracy",
           "precision": "precision_weighted",
           "recall": "recall_weighted",
           "f1": "f1_weighted"}

# save classifier information in DataFrame: name, score, fit_time, score_time
baseline_df = pd.DataFrame()
for clf_entry in baseline_clfs:
    scores = cross_validate(clf_entry["clf"], X_train, y_train, scoring=scoring, cv=kf, n_jobs=configNJobs)
    info = scores | {"classifier": [clf_entry["name"]] * k}
    baseline_df = baseline_df.append(pd.DataFrame(info))

baseline_df