# Classifier for the probability of a customer's timely loan repayment.

## Choosen ML algorithm - Random forest

In [1]:
import pandas as pd
import numpy as np
import pickle
import statsmodels.api as sm
from sklearn import metrics

import matplotlib.pyplot as plt
plt.style.use('seaborn-ticks')
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

### 1. Preparing dataset

In [2]:
# Import data and replace missing values by mean

df = pd.read_csv ('data_DS_HW_train.csv', sep=None, decimal=',', index_col=0)

df.fillna(df.mean(), inplace=True)

In [3]:
# Divide columns due to type of data

target = "TARGET"
numFeatures=df.select_dtypes(include='number').columns.tolist()
catFeatures=df.select_dtypes(exclude='number').columns.tolist()

In [4]:
# Encoding
from sklearn import preprocessing

# Dictionary including maps
mapy = {}
for feature in catFeatures:
    # Encoder initiation
    le = preprocessing.LabelEncoder()
    # Fitting the column
    df[feature] = le.fit_transform(df[feature])
    # Save the map
    mapy[feature] = le

In [5]:
#Delete "TARGET" column from features set
features = df.columns.tolist()
features.remove(target)

### 2. Preparing wrapper to cross validation and first launch

In [6]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


def CVTestRFClass(nFolds = 5, randomState=2020, debug=False, features=features, *args, **kwargs):
    kf = KFold(n_splits=nFolds, shuffle=True, random_state=randomState)

    # lists to capture results
    testResults = []
    trainResults = []
    predictions = []
    indices = []

    # Loop to validate model over consecutive folds
    for train, test in kf.split(df.index.values):
        # Estimator preparing
        clf = RandomForestClassifier(*args, **kwargs, random_state=randomState, n_jobs=-1)
        if debug:
            print(clf)
        # Train the model
        clf.fit(df.iloc[train][features], df.iloc[train][target])

        # Prepare forecasts for train and test sets
        # UWAGA Sklearn return two columns of probability for both classes
        predsTrain = clf.predict_proba(df.iloc[train][features])[:,1]
        preds = clf.predict_proba(df.iloc[test][features])[:,1]
        
        # Save predictions about fold
        predictions.append(preds.tolist().copy())
        
        # With indexes in original data frame
        indices.append(df.iloc[test].index.tolist().copy())
        
        # Calculate fitting score due to metric ROC-AUC
        trainScore = roc_auc_score((df[target].iloc[train]==1).astype(int), predsTrain)
        testScore = roc_auc_score((df[target].iloc[test]==1).astype(int), preds)
        
        # Save results in list
        trainResults.append(trainScore)
        testResults.append(testScore)
        
        # Informations about every fold with train results able to print during program works
        if debug:
            print("Train AUC:", trainScore,
                  "Valid AUC:", testScore)
        
    return trainResults, testResults, predictions, indices


In [7]:
#Launch wrapper on prepared dataset

trainResults, testResults, predictions, indices = CVTestRFClass(debug=True)
print(np.mean(testResults))

RandomForestClassifier(n_jobs=-1, random_state=2020)
Train AUC: 1.0 Valid AUC: 0.9075969827586206
RandomForestClassifier(n_jobs=-1, random_state=2020)
Train AUC: 1.0 Valid AUC: 0.9105294723398379
RandomForestClassifier(n_jobs=-1, random_state=2020)
Train AUC: 1.0 Valid AUC: 0.9162146591272805
RandomForestClassifier(n_jobs=-1, random_state=2020)
Train AUC: 1.0 Valid AUC: 0.916139745175171
RandomForestClassifier(n_jobs=-1, random_state=2020)
Train AUC: 1.0 Valid AUC: 0.9193997668997668
0.9139761252601353


### 3. Boosting results by find best hyperparameters

In [8]:
# Loop over n_estimators to boost results
for k in [10, 25, 50, 100, 200, 500, 1000]:
    trainResults, testResults, predictions, indices = CVTestRFClass(n_estimators=k)
    print(k, np.mean(trainResults), np.mean(testResults), np.mean(trainResults) - np.mean(testResults))

10 0.9998375879566304 0.8830157161526193 0.11682187180401105
25 0.9999995103176265 0.90402630562448 0.09597320469314641
50 1.0 0.9105728745192276 0.08942712548077236
100 1.0 0.9139761252601353 0.08602387473986473
200 1.0 0.9135435604754519 0.08645643952454807
500 1.0 0.914477871251497 0.08552212874850296
1000 1.0 0.9149336840574935 0.08506631594250647


In [9]:
# Loop over max_depth
for k in range(2,22,2):
    trainResults, testResults, predictions, indices = CVTestRFClass(n_estimators=100, max_depth=k)
    print(k, np.mean(trainResults), np.mean(testResults), np.mean(trainResults) - np.mean(testResults))

2 0.727595349822181 0.7174124142116243 0.01018293561055672
4 0.7824029218529794 0.752734122356996 0.029668799495983444
6 0.8511660024280945 0.7868046297485811 0.06436137267951336
8 0.921340876118505 0.8257407527593965 0.0956001233591085
10 0.9693958696642866 0.8583411433579021 0.11105472630638447
12 0.9915564408479028 0.8828070200739493 0.10874942077395355
14 0.998535019604948 0.8965574043080528 0.1019776152968952
16 0.9998110877222501 0.9065761433041905 0.09323494441805968
18 0.9999855532476252 0.9101241799523473 0.08986137329527788
20 0.9999997508363994 0.912259075665844 0.0877406751705554


In [10]:
# Loop over max_features
for k in range(2, 50, 2):
    trainResults, testResults, predictions, indices = CVTestRFClass(n_estimators=100, max_depth=10,
                                                                    max_features=k)
    print(k, np.mean(trainResults), np.mean(testResults), np.mean(trainResults) - np.mean(testResults))

2 0.911709705409223 0.8157750044723814 0.09593470093684164
4 0.9362079612687225 0.8343749671805686 0.10183299408815394
6 0.950714843016307 0.8426187079404677 0.10809613507583937
8 0.9574686093281111 0.8499840060166296 0.10748460331148146
10 0.9631773207036558 0.8534713621190921 0.10970595858456367
12 0.9666905528488717 0.8573767371001371 0.10931381574873467
14 0.9679060863919918 0.8595237452506728 0.10838234114131906
16 0.9701192091326833 0.8597843866683093 0.1103348224643741
18 0.9713698297917333 0.8597744586406495 0.11159537115108376
20 0.9730866084045868 0.8623740982113208 0.110712510193266
22 0.9739634060106204 0.8638108636376873 0.11015254237293304
24 0.9747055517431115 0.8630122017594154 0.11169334998369607
26 0.9757755921754804 0.8663416979006506 0.10943389427482975
28 0.975618042436223 0.8647056436676834 0.11091239876853953
30 0.977157896503486 0.867120906791164 0.1100369897123219
32 0.9768801056055691 0.8644727633392076 0.11240734226636151
34 0.9774533450742202 0.8642838794978

In [11]:
# Loop over min_samples_split
for k in [2, 4, 6, 8, 10, 15, 20, 30]:
    trainResults, testResults, predictions, indices = CVTestRFClass(n_estimators=100, max_depth=10,
                                                                    max_features=12,
                                                                    min_samples_split=k)
    print(k, np.mean(trainResults), np.mean(testResults), np.mean(trainResults) - np.mean(testResults))

2 0.9666905528488717 0.8573767371001371 0.10931381574873467
4 0.9640363138502472 0.8558289863562045 0.10820732749404272
6 0.9605926133995337 0.8521457532883744 0.10844686011115934
8 0.9570100693473973 0.8481120248423911 0.10889804450500618
10 0.9534493942442019 0.8460294804437494 0.10741991380045257
15 0.9444088530537578 0.8421659710750008 0.10224288197875697
20 0.937313604170494 0.8371264307833648 0.10018717338712924
30 0.9248841729251598 0.8296298494192598 0.09525432350590002


In [12]:
# Loop over min_samples_leaf
for k in range(1, 2, 4):
    trainResults, testResults, predictions, indices = CVTestRFClass(n_estimators=100, max_depth=10,
                                                                    max_features=4,
                                                                    min_samples_split=2, min_samples_leaf=k)
    print(k, np.mean(trainResults), np.mean(testResults))

1 0.9362079612687225 0.8343749671805686


In [13]:
# It seems best results is acquire for belowed specification:

# Model 1
trainResults, testResults, predictions, indices = CVTestRFClass(n_estimators=100, max_depth=10,
                                                                    max_features=4,
                                                                    min_samples_split=2, min_samples_leaf=1)
print(k, np.mean(trainResults), np.mean(testResults))

modelRF = {
    "name":"RF",
    "description":"Model RF, ze zmiennymi kategorycznymi z LE",
    "specification":'n_estimators=500, max_depth=20, max_features=4, min_samples_split=4, min_samples_leaf=1',
    "trainResults":trainResults.copy(),
    "testResults":testResults.copy(),
    "predictions":predictions.copy(),
    "indices":indices.copy(),
}

1 0.9362079612687225 0.8343749671805686


### 4.  Finding optimal hyperparameters value - grid search

In [14]:
# List to save results
results = []

import random
# Loop for consecutive searchings

for k in range(50):
    # Przygotujmy słownik parametrów
    # Previous results are references
    params = {
        "max_depth" : random.randint(6, 11),
        "max_features" : random.randint(8, 12),
        "min_samples_split" : random.randint(2, 6),
    }
    # Value of min_samples_leaf must be smaller than min_samples_split
    params["min_samples_leaf"] = random.randint(1, params["min_samples_split"])
    
    # Model estimation for drawn hyperparameters
    trainResults, testResults, predictions, indices = CVTestRFClass(n_estimators=100,
                                                                    max_depth=params["max_depth"],
                                                                    max_features=params["max_features"],
                                                                    min_samples_split=params["min_samples_split"],
                                                                    min_samples_leaf=params["min_samples_leaf"])
    
    # Save results
    results.append((params.copy(), np.mean(trainResults), np.mean(testResults)))
    
    # Print result of current itaretion
    print(params, np.mean(trainResults), np.mean(testResults))
    
    # Every 10 iterations print top 5 results
    if k>1 and k%10==0:    
        print("\n Top 5:")
        for param, train1, test1 in sorted(results, key=lambda x: x[2], reverse=True)[0:5]:
            print(param, train1, test1)
        print()

{'max_depth': 6, 'max_features': 12, 'min_samples_split': 4, 'min_samples_leaf': 3} 0.8443981166339265 0.7843635333930384
{'max_depth': 9, 'max_features': 9, 'min_samples_split': 6, 'min_samples_leaf': 1} 0.9332443137155636 0.833422465305536
{'max_depth': 11, 'max_features': 9, 'min_samples_split': 5, 'min_samples_leaf': 1} 0.973437746586046 0.8634170540595123
{'max_depth': 7, 'max_features': 11, 'min_samples_split': 6, 'min_samples_leaf': 4} 0.8733854429719905 0.798317086394292
{'max_depth': 8, 'max_features': 9, 'min_samples_split': 5, 'min_samples_leaf': 3} 0.9013046615261826 0.8142751221727487
{'max_depth': 8, 'max_features': 12, 'min_samples_split': 6, 'min_samples_leaf': 5} 0.9028805814727464 0.816729689615198
{'max_depth': 8, 'max_features': 10, 'min_samples_split': 5, 'min_samples_leaf': 5} 0.8962236576548397 0.8116393680723037
{'max_depth': 6, 'max_features': 9, 'min_samples_split': 3, 'min_samples_leaf': 1} 0.8419210948996583 0.782742919888338
{'max_depth': 11, 'max_features'

{'max_depth': 8, 'max_features': 12, 'min_samples_split': 4, 'min_samples_leaf': 4} 0.9054867978357277 0.817878708017745
{'max_depth': 7, 'max_features': 12, 'min_samples_split': 3, 'min_samples_leaf': 1} 0.8821006130765147 0.8035981326656094
{'max_depth': 9, 'max_features': 12, 'min_samples_split': 4, 'min_samples_leaf': 4} 0.9303005899316827 0.8306090496252121


In [15]:
# Grid search method brings best result with these values of hyperparams:

# Model 2
trainResults, testResults, predictions, indices = CVTestRFClass(n_estimators=100, max_depth=11,
                                                                    max_features=8,
                                                                    min_samples_split=4, min_samples_leaf=1)
print(k, np.mean(trainResults), np.mean(testResults))

modelRF = {
    "name":"RF",
    "description":"Model RF, ze zmiennymi kategorycznymi z LE",
    "specification":'n_estimators=500, max_depth=20, max_features=4, min_samples_split=4, min_samples_leaf=1',
    "trainResults":trainResults.copy(),
    "testResults":testResults.copy(),
    "predictions":predictions.copy(),
    "indices":indices.copy(),
}

49 0.9720148677063827 0.8600271975943612


###### Due to both of these results to the main part of the analysis will be used <number of model> model. It is possible to look variables were most important during modeling.

### 5. Feature importance

In [16]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


# Preparing wrapper
def CVTestRFClass(nFolds = 5, randomState=2020, debug=False, features=features, saveModels = False, *args, **kwargs):
    
    # Preparing cross-validation
    kf = KFold(n_splits=nFolds, shuffle=True, random_state=randomState)

    # To save the memory information about the fols are only numbers of rows
    testResults = []
    
    # Adding information about train set
    trainResults = []
    
    # Store predictions for every single fold
    predictions = []
    
    # With index in original dataset
    indices = []
    
    # Preparing list of models to store
    models = []
    
    # Loop to models validating
    for train, test in kf.split(df.index.values):
        # Preparing estimator
        clf = RandomForestClassifier(*args, **kwargs, random_state=randomState, n_jobs=-1)
        if debug:
            print(clf)
        # Model testing
        clf.fit(df.iloc[train][features], df.iloc[train][target])

        # Preparing forecasts for train and test datasets
        predsTrain = clf.predict_proba(df.iloc[train][features])[:,1]
        preds = clf.predict_proba(df.iloc[test][features])[:,1]
        
        # Store forecasts informations about fold
        predictions.append(preds.tolist().copy())
        
        # With their indexes in original dataframe
        indices.append(df.iloc[test].index.tolist().copy())
        
        # Calculate fitting score by ROC-AUC metric
        trainScore = roc_auc_score((df[target].iloc[train]==1).astype(int), predsTrain)
        testScore = roc_auc_score((df[target].iloc[test]==1).astype(int), preds)
        
        # Save iteration results
        trainResults.append(trainScore)
        testResults.append(testScore)
        
        # Informatio about every fold with train results are possible to print during calculating
        if debug:
            print("Train AUC:", trainScore,
                  "Valid AUC:", testScore)
        
        # Save results in model's list
        if saveModels:
            models.append(clf)
    
    # List of returnig results with new variable - models
    return trainResults, testResults, predictions, indices, models

In [17]:
trainResults, testResults, predictions, indices, models = CVTestRFClass(n_estimators=100, max_depth=11,
                                                                    max_features=8,
                                                                    min_samples_split=4, min_samples_leaf=1,
                                                                        saveModels=True)

In [18]:
# Print list of features frequency
imps = list(zip(models[0].feature_importances_, features))
imps.sort(reverse=True)
imps

[(0.01845588034248679,
  'swo_ind_same_pesel_different_lender_count_unique_p0_30'),
 (0.0172654470163505, 'swo_ind_loans_amount_p0_210'),
 (0.016142303228840273, 'swo_ind_same_pesel_applications_count_p0_30'),
 (0.01575806886447401, 'swo_ind_loans_amount_p0_plus'),
 (0.01574297584640283,
  'swo_ind_same_pesel_different_lender_count_unique_p0_7'),
 (0.015225114419605914, 'swo_ind_loans_number_p0_plus'),
 (0.014991410496605535, 'sumpaiddebt'),
 (0.014867623173844163, 'score'),
 (0.014524489065822842, 'swo_ind_same_pesel_applications_count_p0_7'),
 (0.014365524865479604, 'swo_ind_loans_amount_p0_360'),
 (0.013462641994346927,
  'swo_ind_same_pesel_different_lender_count_unique_p0_90'),
 (0.013451663174098656, 'chap_coef'),
 (0.013441536768877044, 'swo_ind_same_pesel_applications_count_p0_90'),
 (0.013212011091770542, 'swo_ind_loans_amount_p0_90'),
 (0.012741799751542089, 'swo_ind_loans_number_p0_210'),
 (0.0126934595217492, 'averagepaiddebt'),
 (0.01238538092216892, 'swo_ind_same_pesel_ap

###### Here we can see wchich variaables were most important during the validation process. It could be useful business information for future modeling.

### 6. Test sample - probability predictions

In [19]:
# Import data and replace missing values by mean
df_t = pd.read_csv ('data_DS_HW_test.csv', sep=None, decimal=',', index_col=0)
df_t.fillna(df_t.mean(), inplace=True)

In [20]:
# Divide columns due to type of data
numFeatures_t=df_t.select_dtypes(include='number').columns.tolist()
catFeatures_t=df_t.select_dtypes(exclude='number').columns.tolist()

In [21]:
# Encoding
from sklearn import preprocessing

# Dictionary including maps
mapy = {}
for feature in catFeatures_t:
    # Encoder initiation
    le = preprocessing.LabelEncoder()
    # Fitting the column
    df_t[feature] = le.fit_transform(df_t[feature])
    # Save the map
    mapy[feature] = le

In [22]:
# Create features set
features = df_t.columns.tolist()

In [23]:
# For the probabilities capture would be use model with best specification on the train set with belowed specification:
clf = RandomForestClassifier(n_estimators=100, max_depth=11, max_features=8, min_samples_split=4, min_samples_leaf=1, 
                             random_state=2022, n_jobs=-1)
clf.fit(df[features], df[target])

RandomForestClassifier(max_depth=11, max_features=8, min_samples_split=4,
                       n_jobs=-1, random_state=2022)

In [24]:
# Calculating probabilities
data=pd.DataFrame()
for i in range (5):
    preds = (clf.predict_proba(df_t.iloc[i].array.reshape(1, -1)))
    data = data.append(preds.tolist().copy())
    
data.rename(columns = {0:'probability_of_0', 1:'probability_of_1'}, inplace = True)

In [25]:
# Create a file with probabilities
result = pd.DataFrame(zip(df_t.index, data['probability_of_1']))
result.rename(columns = {0:'application_id', 1:'probability_of_1'}, inplace = True)
result.set_index('application_id')

Unnamed: 0_level_0,probability_of_1
application_id,Unnamed: 1_level_1
223,0.091164
20671,0.437221
2218,0.195571
19196,0.123065
13088,0.305594


In [26]:
# Save the file as 'xlsx' same path
result.to_excel("probabilities.xlsx")