In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
pd.set_option('display.max.columns',None)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [3]:
train_data = pd.read_csv("train_set_feature_engineering.csv")
valid_data = pd.read_csv("valid_set_feature_engineering.csv")

# Model building

In [38]:
# Version with all columns
X_train = train_data.drop('FINALIZED_LOAN', axis=1)

y_train = train_data['FINALIZED_LOAN']

X_test = valid_data.drop('FINALIZED_LOAN', axis =1 )

y_test = valid_data['FINALIZED_LOAN']

1. Support vector machine

In [7]:
# Initialize models
svm_model = SVC(kernel='linear', C=1)


# Fit models
svm_model.fit(X_train, y_train)


# Predict on the testing set
y_pred_svm = svm_model.predict(X_test)


# Calculate accuracies
accuracy_svm = accuracy_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
gini_svm = 2 * roc_auc_score(y_test, y_pred_svm) - 1



accuracy_svm, recall_svm, gini_svm

(0.855054432348367, 0.448692152917505, 0.37805197631706333)

2. K Nearest Neighbours

In [23]:
# # Initialize models
# knn_model = KNeighborsClassifier(n_neighbors=5)


# # Fit models
# knn_model.fit(X_train, y_train)

# # Predict on the testing set
# y_pred_knn = knn_model.predict(X_test)

# # Calculate accuracies
# accuracy_knn = accuracy_score(y_test, y_pred_knn)

# recall_knn = recall_score(y_test, y_pred_knn)

# gini_knn = 2 * roc_auc_score(y_test, y_pred_knn) - 1



# accuracy_knn, recall_knn, gini_knn

3. Gradient Boosting

In [10]:
# Initialize models
gbm_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)

# Fit models
gbm_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_gbm = gbm_model.predict(X_test)

# Calculate accuracies
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)

recall_gbm = recall_score(y_test, y_pred_gbm)

gini_gbm = 2 * roc_auc_score(y_test, y_pred_gbm) - 1



accuracy_gbm, recall_gbm, gini_gbm

(0.8715396578538103, 0.5291750503018109, 0.4633178023253577)

4. Gaussian naive bayes

In [39]:
# Initialize models
nb_model = GaussianNB()

# Fit models
nb_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_nb = nb_model.predict(X_test)

# Calculate accuracies
accuracy_nb = accuracy_score(y_test, y_pred_nb)

recall_nb = recall_score(y_test, y_pred_nb)

gini_nb = 2 * roc_auc_score(y_test, y_pred_nb) - 1



accuracy_nb, recall_nb, gini_nb

(0.8055987558320373, 0.9114688128772636, 0.6977086951436358)

5. Multi-layer Perceptron (neural network)

In [13]:
# Initialize models
nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)

nn_model.fit(X_train, y_train)

y_pred_nn = nn_model.predict(X_test)

# Calculate accuracies
accuracy_nn = accuracy_score(y_test, y_pred_nn)

recall_nn = recall_score(y_test, y_pred_nn)

gini_nn = 2 * roc_auc_score(y_test, y_pred_nn) - 1

accuracy_nn, recall_nn, gini_nn

(0.8438569206842924, 0.028169014084507043, 0.021178579941754982)

6. Random Forest

In [21]:
# Re-initialize and train the model with correct data
rf_classifier_corrected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_corrected.fit(X_train, y_train)

# Predict on the corrected test set
y_pred_rf = rf_classifier_corrected.predict(X_test)

# Re-evaluate the model with corrected data
accuracy_rf = accuracy_score(y_test, y_pred_rf)

recall_rf = recall_score(y_test, y_pred_rf)

gini_rf = 2 * roc_auc_score(y_test, y_pred_rf) - 1

accuracy_rf, recall_rf, gini_rf

(0.8914463452566096, 0.5472837022132797, 0.5016619214921614)

# Insights

It seems that in terms of recall and gini the best model is Gaussian Naive Bayes. Let's how these statistics will change when we choose only subset of columns. Let's choose these the most important ones according to previous analysis.

In [25]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import recall_score, roc_auc_score
from itertools import chain, combinations

# Function to get all subsets of a list
def all_subsets(ss):
    return chain(*map(lambda x: combinations(ss, x), range(1, len(ss)+1)))

# Column names to consider for the model
columns_to_consider = [
    'LENGTH_RELATIONSHIP_WITH_CLIENT', 'LOYALTY_TO_AGE', 'SENIORITY_RELATIONSHIP_RATIO',
    'RELATIONSHIP_LENGTH_CAT', 'INCOME_AGE_INTERACTION', 'INCOME_SENIORITY_INTERACTION',
    'INCOME_LOG', 'AGE', 'INCOME', 'CAREER_STABILITY_RATIO'
]

# Split the dataset into features (X) and target (y)
X_train = train_data.drop('FINALIZED_LOAN', axis=1)
y_train = train_data['FINALIZED_LOAN']

# Initialize an empty list to store the results
results = []

# Iterate over all subsets of the specified columns
for subset in all_subsets(columns_to_consider):
    # Convert subset to list to index dataframe
    subset_list = list(subset)

    # Train a GaussianNB model on the subset
    nb_model = GaussianNB()
    nb_model.fit(X_train[subset_list], y_train)

    # Predict on the training set (for simplicity, we'll use the training set for evaluation)
    y_pred_nb = nb_model.predict(X_train[subset_list])

    # Calculate metrics
    accuracy_nb = accuracy_score(y_train, y_pred_nb)
    recall_nb = recall_score(y_train, y_pred_nb)
    gini_nb = 2 * roc_auc_score(y_train, y_pred_nb) - 1

    # Append the results
    results.append({
        'Columns': subset_list,
        'Gini': gini_nb,
        'Recall': recall_nb
    })

# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(results)

results_df

Unnamed: 0,Columns,Gini,Recall
0,[LENGTH_RELATIONSHIP_WITH_CLIENT],0.409535,0.543681
1,[LOYALTY_TO_AGE],0.106720,0.194636
2,[SENIORITY_RELATIONSHIP_RATIO],0.000000,0.000000
3,[RELATIONSHIP_LENGTH_CAT],0.411989,0.546932
4,[INCOME_AGE_INTERACTION],0.000000,0.000000
...,...,...,...
1018,"[LENGTH_RELATIONSHIP_WITH_CLIENT, LOYALTY_TO_A...",0.697953,0.898415
1019,"[LENGTH_RELATIONSHIP_WITH_CLIENT, LOYALTY_TO_A...",0.666401,0.849248
1020,"[LENGTH_RELATIONSHIP_WITH_CLIENT, SENIORITY_RE...",0.696123,0.890289
1021,"[LOYALTY_TO_AGE, SENIORITY_RELATIONSHIP_RATIO,...",0.684272,0.887444


In [36]:
results_df = results_df.sort_values(by='Gini', ascending=False)
results_df


Unnamed: 0,Columns,Gini,Recall
1022,"[LENGTH_RELATIONSHIP_WITH_CLIENT, LOYALTY_TO_A...",0.704492,0.911012
1015,"[LENGTH_RELATIONSHIP_WITH_CLIENT, LOYALTY_TO_A...",0.704348,0.911824
1016,"[LENGTH_RELATIONSHIP_WITH_CLIENT, LOYALTY_TO_A...",0.703184,0.913450
971,"[LENGTH_RELATIONSHIP_WITH_CLIENT, LOYALTY_TO_A...",0.702795,0.909793
710,"[LENGTH_RELATIONSHIP_WITH_CLIENT, SENIORITY_RE...",0.702631,0.901260
...,...,...,...
8,[INCOME],0.000000,0.000000
135,"[SENIORITY_RELATIONSHIP_RATIO, INCOME_LOG, INC...",0.000000,0.000000
4,[INCOME_AGE_INTERACTION],0.000000,0.000000
5,[INCOME_SENIORITY_INTERACTION],0.000000,0.000000


In [37]:
top_columns = results_df.iloc[0]['Columns']
top_columns

['LENGTH_RELATIONSHIP_WITH_CLIENT',
 'LOYALTY_TO_AGE',
 'SENIORITY_RELATIONSHIP_RATIO',
 'RELATIONSHIP_LENGTH_CAT',
 'INCOME_AGE_INTERACTION',
 'INCOME_SENIORITY_INTERACTION',
 'INCOME_LOG',
 'AGE',
 'INCOME',
 'CAREER_STABILITY_RATIO']

So we can see that we should choose all of the 10 most important columns in that model

In [4]:
#Version with only most important columns

selected_features = ['LENGTH_RELATIONSHIP_WITH_CLIENT',
 'LOYALTY_TO_AGE',
 'SENIORITY_RELATIONSHIP_RATIO',
 'RELATIONSHIP_LENGTH_CAT',
 'INCOME_AGE_INTERACTION',
 'INCOME_SENIORITY_INTERACTION',
 'INCOME_LOG',
 'AGE',
 'INCOME',
 'CAREER_STABILITY_RATIO']

X_train = train_data[selected_features]

y_train = train_data['FINALIZED_LOAN']

X_test =  valid_data[selected_features]

y_test = valid_data['FINALIZED_LOAN']


In [5]:
# Initialize models
nb_model = GaussianNB()

# Fit models
nb_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_nb = nb_model.predict(X_test)

# Calculate accuracies
accuracy_nb = accuracy_score(y_test, y_pred_nb)

recall_nb = recall_score(y_test, y_pred_nb)

gini_nb = 2 * roc_auc_score(y_test, y_pred_nb) - 1



accuracy_nb, recall_nb, gini_nb

(0.8055987558320373, 0.9114688128772636, 0.6977086951436358)

Recall: 0,91 and Gini: 0,70 seem not that bad, but accuracy is now only 0,81