In [29]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve
import numpy as np
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [8]:
data = pd.read_csv("train_set_test_1.csv")
train_data = data
valid_data = pd.read_csv("valid_set_test_1.csv")

# Feature standarization

Let's start with feature standarization

In [18]:
# Identify numerical columns (excluding obvious binary/categorical columns)
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
binary_or_categorical_cols = ['HAS_DEPENDENTS', 'CURRENT_ACCOUNT', 'SALARY_ACCOUNT', 'FINALIZED_LOAN']
numerical_cols = [col for col in numerical_cols if col not in binary_or_categorical_cols]

# Apply StandardScaler to the numerical columns
scaler = preprocessing.StandardScaler().fit(data[numerical_cols])

# Scale the numerical features
data_scaled = data.copy()
data_scaled[numerical_cols] = scaler.transform(data[numerical_cols])

# Show the mean and scale of the scaler for verification and the first few rows of the scaled data
scaler_means = scaler.mean_
scaler_scales = scaler.scale_
data_scaled.head(), scaler_means, scaler_scales

(        AGE  HAS_DEPENDENTS    INCOME  WORK_SENIORITY  BUSINESS AGE  \
 0  1.590496               0 -0.462023       -0.293262     -0.151460   
 1  1.516167               0 -0.355947       -0.293262     -0.151460   
 2 -1.011048               0 -0.551598       -0.851101     -1.041664   
 3 -0.342080               0 -0.080937        0.450524     -0.418521   
 4 -0.416410               0 -0.506025       -0.293262     -0.151460   
 
    LENGTH_RELATIONSHIP_WITH_CLIENT  CURRENT_ACCOUNT  SALARY_ACCOUNT  \
 0                        -0.826098                0               0   
 1                         0.624576                0               0   
 2                        -0.826098                1               0   
 3                        -0.618859                0               0   
 4                        -0.826098                1               0   
 
    FINALIZED_LOAN  PRODUCT_B  ...  INCOME_SENIORITY_INTERACTION  \
 0               0  -0.541844  ...                     -0.211765

# Feature selection

We'll use some of the most popular feature selection techniques to determine which of the features that are in our dataset have a strong predictive potential in terms of feature FINALISED_LOAN

1. Let's start with Chi-square Test. This test is applicable for categorical variables. It evaluates whether the observed distribution of variables differs from the expected distribution. We'll use it for binary features (because we currently don't have any categorical features) in relation to the "FINALIZED_LOAN" target variable.

In [20]:
# Identify binary and continuous variables

# Assuming variables with a unique count of 2 are binary

binary_columns = [col for col in data.columns if data[col].nunique() == 2]

continuous_columns = [col for col in data.columns if col not in binary_columns]



# Separate the dataframe into binary and continuous dataframes

data_binary = data[binary_columns]

data_continuous = data[continuous_columns]



# Remove the target variable 'FINALIZED_LOAN' from continuous columns as it's our target

binary_columns.remove('FINALIZED_LOAN')



# Display the categorization

binary_columns, continuous_columns

(['HAS_DEPENDENTS',
  'CURRENT_ACCOUNT',
  'SALARY_ACCOUNT',
  'PRODUCT_B',
  'PRODUCT_C',
  'PRODUCT_E',
  'PRODUCT_F',
  'AREA_County capital',
  'AREA_Rural area',
  'AREA_Urban area',
  'RESIDENTIAL_PLACE_Living with family',
  'RESIDENTIAL_PLACE_Other',
  'RESIDENTIAL_PLACE_Owner with mortgage',
  'RESIDENTIAL_PLACE_Owner without mortgage',
  'RESIDENTIAL_PLACE_Rental',
  'EDUCATION_College',
  'EDUCATION_Highschool',
  'EDUCATION_Middle school',
  'EDUCATION_Missing',
  'EDUCATION_Other',
  'EDUCATION_Post secondary school',
  'EDUCATION_Post-graduate',
  'EDUCATION_Primary school',
  'EDUCATION_University',
  'EDUCATION_Vocational school',
  'MARITAL_STATUS_divorced',
  'MARITAL_STATUS_married',
  'MARITAL_STATUS_single',
  'MARITAL_STATUS_widow',
  'ECONOMIC_SECTOR_Accommodation and food service activities',
  'ECONOMIC_SECTOR_Agriculture, hunting and forestry',
  'ECONOMIC_SECTOR_Construction',
  'ECONOMIC_SECTOR_Education',
  'ECONOMIC_SECTOR_Electricity and gas',
  'ECONOMIC

In [10]:
# Preparing the data for Chi-square test

X_binary = data_binary.drop('FINALIZED_LOAN', axis=1)  # Features

y = data['FINALIZED_LOAN']  # Target



# Apply Chi-square test

chi_selector = SelectKBest(chi2, k='all')

chi_selector.fit(X_binary, y)



# Get the scores for each feature

chi_scores = pd.DataFrame({'Feature': X_binary.columns, 'Chi2 Score': chi_selector.scores_}).sort_values(by='Chi2 Score', ascending=False)


chi_scores

Unnamed: 0,Feature,Chi2 Score
10,RESIDENTIAL_PLACE_Living with family,253.365598
27,MARITAL_STATUS_single,233.988018
13,RESIDENTIAL_PLACE_Owner without mortgage,172.216559
2,SALARY_ACCOUNT,160.592788
4,PRODUCT_C,132.328507
...,...,...
53,EMPLOYEE_NO_between 251-500,0.084651
52,EMPLOYEE_NO_between 21-50,0.063611
34,ECONOMIC_SECTOR_Financial and insurance activi...,0.055915
30,"ECONOMIC_SECTOR_Agriculture, hunting and forestry",0.048540


The Chi-square test results provide a score for each binary feature, reflecting how strongly each feature is associated with the target variable "FINALIZED_LOAN". Higher scores suggest a stronger association.

Top features based on Chi-square scores include:
* RESIDENTIAL_PLACE_Living with family
* MARITAL_STATUS_single
* RESIDENTIAL_PLACE_Owner without mortgage
* SALARY_ACCOUNT
* PRODUCT_C

2. Next, we'll perform the ANOVA F-test for continuous features to evaluate their relationship with the target variable. The ANOVA F-test results show the relationship between each continuous feature and the target variable "FINALIZED_LOAN". The F-value measures the degree of variance between group means (for each feature), with a higher value indicating a stronger association. The P-value assesses the significance of this association, with values below a typical threshold (e.g., 0.05) suggesting statistical significance.

In [21]:
# Prepare the data for ANOVA F-test

X_continuous = data_continuous

# Apply ANOVA F-test

f_values, p_values = f_classif(X_continuous, y)


# Combine the results into a dataframe

anova_scores = pd.DataFrame({'Feature': X_continuous.columns, 'F-value': f_values, 'P-value': p_values}).sort_values(by='F-value', ascending=False)


anova_scores

Unnamed: 0,Feature,F-value,P-value
4,LENGTH_RELATIONSHIP_WITH_CLIENT,5466.097253,0.0
5,RELATIONSHIP_LENGTH_CAT,5130.548722,0.0
9,LOYALTY_TO_AGE,3350.447832,0.0
11,SENIORITY_RELATIONSHIP_RATIO,1867.896203,0.0
2,WORK_SENIORITY,735.290998,3.989156e-158
0,AGE,479.200627,1.372593e-104
10,CAREER_STABILITY_RATIO,415.97756,3.150052e-91
12,WORK_SENIORITY_TO_BUSINESS_AGE,300.767022,1.0020920000000001e-66
7,INCOME_AGE_INTERACTION,265.389118,3.6928490000000002e-59
8,INCOME_SENIORITY_INTERACTION,209.387947,3.879034e-47


* Top continuous feature based on the ANOVA F-test is LENGTH_RELATIONSHIP_WITH_CLIENT. 
* We can also see which of the features that we have created are the most important and have actually provided some information into our set. One of the best features are: RELATIONSHIP_LENGTH_CAT, LOYALTY_TO_AGE, SENIORITY_RELATIONSHIP_RATIO and CAREER_STABILITY_RATIO.
* We can notice that INCOME_LOG is better than a regular INCOME

3. The feature importance scores from the Random Forest model indicate the relative importance of each feature in predicting the target variable "FINALIZED_LOAN".

In [22]:
# Prepare the data for the Random Forest model

X = data.drop('FINALIZED_LOAN', axis=1)  # All features excluding the target


# Initialize and fit the Random Forest model

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X, y)


# Get feature importances

feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_}).sort_values(by='Importance', ascending=False)


feature_importances.head(10)

Unnamed: 0,Feature,Importance
5,LENGTH_RELATIONSHIP_WITH_CLIENT,0.128542
65,LOYALTY_TO_AGE,0.100464
67,SENIORITY_RELATIONSHIP_RATIO,0.09308
61,RELATIONSHIP_LENGTH_CAT,0.067719
0,AGE,0.050178
63,INCOME_AGE_INTERACTION,0.048185
66,CAREER_STABILITY_RATIO,0.048132
64,INCOME_SENIORITY_INTERACTION,0.046073
2,INCOME,0.045714
62,INCOME_LOG,0.044274


The top features with the highest importance scores are:
* LENGTH_RELATIONSHIP_WITH_CLIENT: This feature has the highest importance, emphasizing the significance of the client's relationship length with the finalization of a loan.
* LOYALTY_TO_AGE - This feature shows that the client has spent a significant part of his life cooperating with this bank, so he's probably satisfied with the service and more likely to finalise the loan.
* SENIORITY_RELATIONSHIP_RATIO 	: The age of the individual is another important predictor, which may reflect financial stability or reliability.
* RELATIONSHIP_LENGTH_CAT, AGE

4. Now let's compute GINI for each feature

In [23]:
def gini_binary(target_counts):
    total = sum(target_counts)
    sum_squares = sum((n / total) ** 2 for n in target_counts)
    return 1 - sum_squares

gini_indices = {}

# For binary features
for column in binary_columns:
    counts = data.groupby(column)['FINALIZED_LOAN'].value_counts().unstack(fill_value=0)
    gini_index = 0
    for _, row in counts.iterrows():
        gini_index += (row.sum() / data.shape[0]) * gini_binary(row.values)
    gini_indices[column] = gini_index

# For continuous features (using median as a simple split point)
for column in continuous_columns:
    median = data[column].median()
    below_median = data[data[column] <= median]['FINALIZED_LOAN'].value_counts()
    above_median = data[data[column] > median]['FINALIZED_LOAN'].value_counts()
    gini_below = gini_binary(below_median.values)
    gini_above = gini_binary(above_median.values)
    weighted_gini = (below_median.sum() / data.shape[0]) * gini_below + (above_median.sum() / data.shape[0]) * gini_above
    gini_indices[column] = weighted_gini
    
# Convert the Gini indices into a sorted DataFrame for better readability

gini_df = pd.DataFrame(list(gini_indices.items()), columns=['Feature', 'Gini Index']).sort_values(by='Gini Index')


gini_df

Unnamed: 0,Feature,Gini Index
66,RELATIONSHIP_LENGTH_CAT,0.198911
65,LENGTH_RELATIONSHIP_WITH_CLIENT,0.201750
70,LOYALTY_TO_AGE,0.224080
72,SENIORITY_RELATIONSHIP_RATIO,0.248284
10,RESIDENTIAL_PLACE_Living with family,0.266905
...,...,...
53,EMPLOYEE_NO_between 251-500,0.274193
52,EMPLOYEE_NO_between 21-50,0.274193
34,ECONOMIC_SECTOR_Financial and insurance activi...,0.274194
30,"ECONOMIC_SECTOR_Agriculture, hunting and forestry",0.274194


We can see that columns with the biggest Feature Importance in Random Forest Models have the smallest Gini index

Let's create logistic regression model to check what is predictability of particular sets of columns

In [26]:
# Separate features and target

X_train = train_data.drop('FINALIZED_LOAN', axis=1)

y_train = train_data['FINALIZED_LOAN']

X_test = valid_data.drop('FINALIZED_LOAN', axis =1 )

y_test = valid_data['FINALIZED_LOAN']

# Initialize the Logistic Regression model

logistic_model = LogisticRegression(max_iter=10000)



# Fit the model to the training data

logistic_model.fit(X_train, y_train)



# Predict on the testing set

y_pred = logistic_model.predict(X_test)



# Calculate evaluation metrics

accuracy = accuracy_score(y_test, y_pred)

precision = precision_score(y_test, y_pred)

recall = recall_score(y_test, y_pred)

roc_auc = roc_auc_score(y_test, y_pred)



accuracy, precision, recall, roc_auc

(0.8590979782270607,
 0.5604395604395604,
 0.4104627766599598,
 0.6757979814131292)

Ok, so logistic regression has on our set acurracy 85,91%. That is after preprocessing one hot encoding and standarization

Now let's see the acurracy of logistic regression model based only on the most important columns according to chi-square test, Anova-F Test and Random Forest Feature Importance

In [27]:
# List of selected features based on their importance

selected_features = [

    'LENGTH_RELATIONSHIP_WITH_CLIENT',

    'AGE',

    'WORK_SENIORITY',

    'PRODUCT_C',

    'RESIDENTIAL_PLACE_Owner without mortgage', 
    
    'RESIDENTIAL_PLACE_Living with family',
    
    'MARITAL_STATUS_single',
    
    'SALARY_ACCOUNT',
    
    'RELATIONSHIP_LENGTH_CAT',
    
    'LOYALTY_TO_AGE',
    
    'SENIORITY_RELATIONSHIP_RATIO'

]



# Subset the data to include only the selected features and the target variable

X_train = train_data[selected_features]

y_train = train_data['FINALIZED_LOAN']

X_test =  valid_data[selected_features]

y_test = valid_data['FINALIZED_LOAN']




# Initialize the Logistic Regression model

logistic_model_selected = LogisticRegression(max_iter=1000)



# Fit the model to the training data

logistic_model_selected.fit(X_train, y_train)



# Predict on the testing set

y_pred_selected = logistic_model_selected.predict(X_test)



# Calculate evaluation metrics

accuracy_selected = accuracy_score(y_test, y_pred_selected)

precision_selected = precision_score(y_test, y_pred_selected)

recall_selected = recall_score(y_test, y_pred_selected)

roc_auc_selected = roc_auc_score(y_test, y_pred_selected)



accuracy_selected, precision_selected, recall_selected, roc_auc_selected

(0.8572317262830482,
 0.5605095541401274,
 0.35412474849094566,
 0.6516760607796892)

In [28]:
# List of selected features based on their importance

selected_features = [

    'LENGTH_RELATIONSHIP_WITH_CLIENT'
]



# Subset the data to include only the selected features and the target variable

X_train = train_data[selected_features]

y_train = train_data['FINALIZED_LOAN']

X_test =  valid_data[selected_features]

y_test = valid_data['FINALIZED_LOAN']




# Initialize the Logistic Regression model

logistic_model_selected = LogisticRegression(max_iter=1000)



# Fit the model to the training data

logistic_model_selected.fit(X_train, y_train)



# Predict on the testing set

y_pred_selected = logistic_model_selected.predict(X_test)



# Calculate evaluation metrics

accuracy_selected = accuracy_score(y_test, y_pred_selected)

precision_selected = precision_score(y_test, y_pred_selected)

recall_selected = recall_score(y_test, y_pred_selected)

roc_auc_selected = roc_auc_score(y_test, y_pred_selected)



accuracy_selected, precision_selected, recall_selected, roc_auc_selected

(0.8171073094867807,
 0.33574007220216606,
 0.18712273641851107,
 0.5597129502548773)

In [15]:
y_pred_selected.sum()

277

Now let's compare our results to different models

In [59]:
#Version with only most important columns

# X_train = train_data[selected_features]

# y_train = train_data['FINALIZED_LOAN']

# X_test =  valid_data[selected_features]

# y_test = valid_data['FINALIZED_LOAN']

# Version with all columns
X_train = train_data.drop('FINALIZED_LOAN', axis=1)

y_train = train_data['FINALIZED_LOAN']

X_test = valid_data.drop('FINALIZED_LOAN', axis =1 )

y_test = valid_data['FINALIZED_LOAN']



In [1]:
# Initialize models
svm_model = SVC(kernel='linear', C=1)


# Fit models
svm_model.fit(X_train, y_train)


# Predict on the testing set
y_pred_svm = svm_model.predict(X_test)


# Calculate accuracies
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)

recall_svm = recall_score(y_test, y_pred_svm)

roc_auc_svm = roc_auc_score(y_test, y_pred_svm)



accuracy_svm, precision_svm, recall_svm, roc_auc_svm

NameError: name 'SVC' is not defined

In [55]:
# Initialize models
knn_model = KNeighborsClassifier(n_neighbors=5)


# Fit models
knn_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_knn = knn_model.predict(X_test)

# Calculate accuracies
accuracy_knn = accuracy_score(y_test, y_pred_knn)

precision_knn = precision_score(y_test, y_pred_knn)

recall_knn = recall_score(y_test, y_pred_knn)

roc_auc_knn = roc_auc_score(y_test, y_pred_knn)



accuracy_knn, precision_knn, recall_knn, roc_auc_knn

(0.8398133748055988,
 0.47715736040609136,
 0.3782696177062374,
 0.6512392974476736)

In [57]:
# Initialize models
gbm_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)

# Fit models
gbm_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_gbm = gbm_model.predict(X_test)

# Calculate accuracies
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)

precision_gbm = precision_score(y_test, y_pred_gbm)

recall_gbm = recall_score(y_test, y_pred_gbm)

roc_auc_gbm = roc_auc_score(y_test, y_pred_gbm)



accuracy_gbm, precision_gbm, recall_gbm, roc_auc_gbm

(0.8429237947122862,
 0.4891304347826087,
 0.36217303822937624,
 0.6465022659873886)

In [62]:
# Initialize models
nb_model = GaussianNB()

# Fit models
nb_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_nb = nb_model.predict(X_test)

# Calculate accuracies
accuracy_nb = accuracy_score(y_test, y_pred_nb)

precision_nb = precision_score(y_test, y_pred_nb)

recall_nb = recall_score(y_test, y_pred_nb)

roc_auc_nb = roc_auc_score(y_test, y_pred_nb)

accuracy_nb, precision_nb, recall_nb, roc_auc_nb

(0.8205287713841368,
 0.4540229885057471,
 0.7947686116700201,
 0.8100038790506098)

In [63]:
# Initialize models
nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)

nn_model.fit(X_train, y_train)

y_pred_nn = nn_model.predict(X_test)

# Calculate accuracies
accuracy_nn = accuracy_score(y_test, y_pred_nn)

precision_nn = precision_score(y_test, y_pred_nn)

recall_nn = recall_score(y_test, y_pred_nn)

roc_auc_nn = roc_auc_score(y_test, y_pred_nn)

accuracy_nn, precision_nn, recall_nn,      hgg                                       n nmbbnbmbnnmbmbbbbb


     

(0.8423017107309487,
 0.32142857142857145,
 0.018108651911468814,
 0.5055591088843584)