# Machine learning
## Preprocessing
We need to deal with nans and one hot encode the categorical variables.

In [86]:
# import neccesary libraries
import pandas as pd
import numpy as np

In [87]:
data = pd.read_csv("cleared_data.csv") # name cleared_data might be misleading because i didnt remove nans and one-hot-encoded the categorical variables
data = data.apply(lambda column: column.astype("category") if column.dtype == "object" else column)

RANDOM_SEED = 34
OVERSAPLING_NUM = 1000

Before removing nans i want to divide the dataset into `test_data` and `train_data`. \
As said in `analysis.ipynb` data is **EXTREMELY** unbalanced, so we want to make sure that the `test_data` accuratly represents the data distribution (we want to avoid the situation where there is none or only a handful of `high_credit_risk` observations)

### Train Test Split

In [88]:
# select non nan data
non_data = data.dropna()

# select 50 high credit resk observations for test data and 500 low risk observations
test_data = non_data[non_data["credit_risk"] == 1].sample(50, random_state=RANDOM_SEED)
test_data = pd.concat([test_data, non_data[non_data["credit_risk"] == 0].sample(500, random_state=RANDOM_SEED)])

training_data = data.drop(index = test_data.index)

# great so now we have non Nan 550 test set and 9450 training set

### Nan's in training set
mindless deleting nans would mean loss of a lot of valuable information *(especially when information is precious for high risk observations)*, so instead let's replace Nan's with propable values

In [89]:
def deal_with_nans(data: pd.DataFrame) -> pd.DataFrame:
    for column in data.columns:
        if data[column].dtype == "category" or data[column].dtype == "object":
            data[column] = data[column].ffill() # use this instead of fillna() because for some ungodly reason it doesnt delete all the Nan's
        elif data[column].dtype == np.int64:
            data[column] = data[column].fillna(data[column].median())
        else:
            data[column] = data[column].fillna(data[column].mean())
    return data

training_data = deal_with_nans(training_data)
del training_data["Unnamed: 0"]

### Dealing with unbalanced target variable
We need to crteate synthetic data in order to mitigate the negatives of training with unbalanced dataset. And then oversample it in `training_data`.

In [90]:
generated_data = pd.DataFrame()
for column in training_data.columns:
    generated_data[column] = training_data[training_data["credit_risk"] == 1][column].sample(OVERSAPLING_NUM,
                                                                                             replace = True,
                                                                                             ignore_index=True,
                                                                                             random_state=RANDOM_SEED)

generated_data.head()

Unnamed: 0,age,income,children,credit_history,overdue_payments,active_loans,years_in_job,employment_type,owns_property,assets_value,other_loans,education,city,marital_status,credit_risk
0,24,29019.0,1,brak historii,0.488188,6,14,stała,1.0,69812.0,1,1,1,kawaler/panna,1
1,36,21491.0,0,dobra historia,2.0,1,2,stała,1.0,65182.0,1,0,2,żonaty/zamężna,1
2,31,40000.0,1,dobra historia,1.0,6,14,stała,0.595168,67790.0,1,1,1,kawaler/panna,1
3,43,22152.0,0,brak historii,0.488188,1,13,określona,0.595168,80787.621879,0,1,0,żonaty/zamężna,1
4,37,40000.0,2,dobra historia,2.0,2,2,stała,0.0,46504.0,1,1,2,żonaty/zamężna,1


In [91]:
training_data = pd.concat([training_data, generated_data]) ## add generated data to training set

### one hot encoding categorical non-ordinal features

Unnamed: 0,age,income,children,credit_history,overdue_payments,active_loans,years_in_job,employment_type,owns_property,assets_value,other_loans,education,city,marital_status,credit_risk
0,44,15689.000000,0,dobra historia,0.000000,2,9,samozatrudnienie,0.595168,80787.621879,1,2,0,żonaty/zamężna,0
1,38,18906.000000,4,brak historii,0.000000,0,1,stała,1.000000,62965.000000,0,1,1,kawaler/panna,0
2,46,16338.000000,2,brak historii,0.488188,2,4,brak,1.000000,124967.000000,0,0,2,żonaty/zamężna,0
3,55,23276.000000,3,dobra historia,1.000000,2,10,stała,1.000000,52147.000000,1,1,0,kawaler/panna,0
4,37,40000.000000,1,brak historii,0.488188,1,9,określona,0.000000,33957.000000,1,2,0,kawaler/panna,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,54,28640.000000,2,dobra historia,1.000000,5,13,stała,0.595168,197150.000000,1,1,2,żonaty/zamężna,1
996,23,40000.000000,0,dobra historia,0.000000,5,4,brak,1.000000,40198.000000,0,1,0,żonaty/zamężna,1
997,59,23568.985369,3,dobra historia,0.000000,2,5,stała,1.000000,33067.000000,0,0,2,żonaty/zamężna,1
998,60,23568.985369,2,dobra historia,0.000000,0,4,stała,0.595168,33849.000000,0,1,1,żonaty/zamężna,1
