In [1]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve

import numpy as np

In [2]:
data = pd.read_csv("train_set_one_hot_encoding.csv")
train_data = data
valid_data = pd.read_csv("valid_set_one_hot_encoding.csv")

# Feature selection

We'll use some of the most popular feature selection techniques to determine which of the features that are in our dataset have a strong predictive potential in terms of feature FINALISED_LOAN

1. Let's start with Chi-square Test. This test is applicable for categorical variables. It evaluates whether the observed distribution of variables differs from the expected distribution. We'll use it for binary features (because we currently don't have any categorical features) in relation to the "FINALIZED_LOAN" target variable.

In [3]:
# Identify binary and continuous variables

# Assuming variables with a unique count of 2 are binary

binary_columns = [col for col in data.columns if data[col].nunique() == 2]

continuous_columns = [col for col in data.columns if col not in binary_columns]



# Separate the dataframe into binary and continuous dataframes

data_binary = data[binary_columns]

data_continuous = data[continuous_columns]



# Remove the target variable 'FINALIZED_LOAN' from continuous columns as it's our target

binary_columns.remove('FINALIZED_LOAN')



# Display the categorization

binary_columns, continuous_columns

(['CURRENT_ACCOUNT',
  'SALARY_ACCOUNT',
  'PRODUCT_B',
  'PRODUCT_C',
  'PRODUCT_E',
  'PRODUCT_F',
  'AREA_County capital',
  'AREA_Rural area',
  'AREA_Urban area',
  'RESIDENTIAL_PLACE_Living with family',
  'RESIDENTIAL_PLACE_Other',
  'RESIDENTIAL_PLACE_Owner with mortgage',
  'RESIDENTIAL_PLACE_Owner without mortgage',
  'RESIDENTIAL_PLACE_Rental',
  'EDUCATION_College',
  'EDUCATION_Highschool',
  'EDUCATION_Middle school',
  'EDUCATION_Missing',
  'EDUCATION_Other',
  'EDUCATION_Post secondary school',
  'EDUCATION_Post-graduate',
  'EDUCATION_Primary school',
  'EDUCATION_University',
  'EDUCATION_Vocational school',
  'MARITAL_STATUS_divorced',
  'MARITAL_STATUS_married',
  'MARITAL_STATUS_single',
  'MARITAL_STATUS_widow',
  'ECONOMIC_SECTOR_Accommodation and food service activities',
  'ECONOMIC_SECTOR_Agriculture, hunting and forestry',
  'ECONOMIC_SECTOR_Construction',
  'ECONOMIC_SECTOR_Education',
  'ECONOMIC_SECTOR_Electricity and gas',
  'ECONOMIC_SECTOR_Financial an

In [4]:
# Preparing the data for Chi-square test

X_binary = data_binary.drop('FINALIZED_LOAN', axis=1)  # Features

y = data['FINALIZED_LOAN']  # Target



# Apply Chi-square test

chi_selector = SelectKBest(chi2, k='all')

chi_selector.fit(X_binary, y)



# Get the scores for each feature

chi_scores = pd.DataFrame({'Feature': X_binary.columns, 'Chi2 Score': chi_selector.scores_}).sort_values(by='Chi2 Score', ascending=False)


chi_scores

Unnamed: 0,Feature,Chi2 Score
9,RESIDENTIAL_PLACE_Living with family,253.594924
26,MARITAL_STATUS_single,234.200711
12,RESIDENTIAL_PLACE_Owner without mortgage,172.365385
1,SALARY_ACCOUNT,160.689517
3,PRODUCT_C,132.682056
25,MARITAL_STATUS_married,110.821141
41,ECONOMIC_SECTOR_Public administration and defence,102.385946
5,PRODUCT_F,97.992699
45,ECONOMIC_SECTOR_Wholesale and retail trade,52.258119
4,PRODUCT_E,45.698765


The Chi-square test results provide a score for each binary feature, reflecting how strongly each feature is associated with the target variable "FINALIZED_LOAN". Higher scores suggest a stronger association.

Top features based on Chi-square scores include:
* RESIDENTIAL_PLACE_Living with family
* MARITAL_STATUS_single
* RESIDENTIAL_PLACE_Owner without mortgage
* SALARY_ACCOUNT
* PRODUCT_C

2. Next, we'll perform the ANOVA F-test for continuous features to evaluate their relationship with the target variable. The ANOVA F-test results show the relationship between each continuous feature and the target variable "FINALIZED_LOAN". The F-value measures the degree of variance between group means (for each feature), with a higher value indicating a stronger association. The P-value assesses the significance of this association, with values below a typical threshold (e.g., 0.05) suggesting statistical significance.

In [5]:
# Prepare the data for ANOVA F-test

X_continuous = data_continuous

# Apply ANOVA F-test

f_values, p_values = f_classif(X_continuous, y)


# Combine the results into a dataframe

anova_scores = pd.DataFrame({'Feature': X_continuous.columns, 'F-value': f_values, 'P-value': p_values}).sort_values(by='F-value', ascending=False)


anova_scores

Unnamed: 0,Feature,F-value,P-value
5,LENGTH_RELATIONSHIP_WITH_CLIENT,5467.828269,0.0
3,WORK_SENIORITY,734.674372,5.348239e-158
0,AGE,478.880432,1.6026819999999998e-104
4,BUSINESS AGE,98.160256,4.5419680000000005e-23
2,INCOME,35.450449,2.6746e-09
1,NO_OF_DEPENDENTS,5.808996,0.01595634


Top continuous feature based on the ANOVA F-test is LENGTH_RELATIONSHIP_WITH_CLIENT

3. The feature importance scores from the Random Forest model indicate the relative importance of each feature in predicting the target variable "FINALIZED_LOAN".

In [6]:
# Prepare the data for the Random Forest model

X = data.drop('FINALIZED_LOAN', axis=1)  # All features excluding the target


# Initialize and fit the Random Forest model

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X, y)


# Get feature importances

feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_}).sort_values(by='Importance', ascending=False)


feature_importances.head(10)

Unnamed: 0,Feature,Importance
5,LENGTH_RELATIONSHIP_WITH_CLIENT,0.317025
2,INCOME,0.106691
0,AGE,0.095836
3,WORK_SENIORITY,0.062843
4,BUSINESS AGE,0.051338
6,CURRENT_ACCOUNT,0.016929
1,NO_OF_DEPENDENTS,0.013603
28,EDUCATION_University,0.013275
9,PRODUCT_C,0.01318
18,RESIDENTIAL_PLACE_Owner without mortgage,0.013177


The top features with the highest importance scores are:
* LENGTH_RELATIONSHIP_WITH_CLIENT: This feature has the highest importance, emphasizing the significance of the client's relationship length with the finalization of a loan.
* INCOME: The income of the individual also plays a crucial role in the prediction, highlighting its importance in the decision-making process for loan finalization.
* AGE: The age of the individual is another important predictor, which may reflect financial stability or reliability.
* WORK_SENIORITY and BUSINESS AGE: These features related to work experience and business longevity also contribute significantly to the prediction.

4. Now let's compute GINI for each feature

In [7]:
def gini_binary(target_counts):
    total = sum(target_counts)
    sum_squares = sum((n / total) ** 2 for n in target_counts)
    return 1 - sum_squares

gini_indices = {}

# For binary features
for column in binary_columns:
    counts = data.groupby(column)['FINALIZED_LOAN'].value_counts().unstack(fill_value=0)
    gini_index = 0
    for _, row in counts.iterrows():
        gini_index += (row.sum() / data.shape[0]) * gini_binary(row.values)
    gini_indices[column] = gini_index

# For continuous features (using median as a simple split point)
for column in continuous_columns:
    median = data[column].median()
    below_median = data[data[column] <= median]['FINALIZED_LOAN'].value_counts()
    above_median = data[data[column] > median]['FINALIZED_LOAN'].value_counts()
    gini_below = gini_binary(below_median.values)
    gini_above = gini_binary(above_median.values)
    weighted_gini = (below_median.sum() / data.shape[0]) * gini_below + (above_median.sum() / data.shape[0]) * gini_above
    gini_indices[column] = weighted_gini
    
# Convert the Gini indices into a sorted DataFrame for better readability

gini_df = pd.DataFrame(list(gini_indices.items()), columns=['Feature', 'Gini Index']).sort_values(by='Gini Index')


gini_df

Unnamed: 0,Feature,Gini Index
60,LENGTH_RELATIONSHIP_WITH_CLIENT,0.201772
9,RESIDENTIAL_PLACE_Living with family,0.266958
55,AGE,0.267032
12,RESIDENTIAL_PLACE_Owner without mortgage,0.267071
26,MARITAL_STATUS_single,0.267777
...,...,...
52,EMPLOYEE_NO_between 251-500,0.274253
51,EMPLOYEE_NO_between 21-50,0.274254
33,ECONOMIC_SECTOR_Financial and insurance activi...,0.274254
29,"ECONOMIC_SECTOR_Agriculture, hunting and forestry",0.274254


In [None]:
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA


# Standardizing the data

scaler = StandardScaler()

data_scaled = scaler.fit_transform(data) 



# Applying PCA - Let's retain components that explain 95% of the variance

pca = PCA(n_components=0.95)

data_pca = pca.fit_transform(data_scaled)



# Number of components chosen and variance explained by each component

n_components = pca.n_components_

explained_variance = pca.explained_variance_ratio_



n_components, explained_variance

Let's create logistic regression model to check what is predictability of particular sets of columns

In [9]:
# Separate features and target

X_train = train_data.drop('FINALIZED_LOAN', axis=1)

y_train = train_data['FINALIZED_LOAN']

X_test = valid_data.drop('FINALIZED_LOAN', axis =1 )

y_test = valid_data['FINALIZED_LOAN']

# Initialize the Logistic Regression model

logistic_model = LogisticRegression(max_iter=1000)



# Fit the model to the training data

logistic_model.fit(X_train, y_train)



# Predict on the testing set

y_pred = logistic_model.predict(X_test)



# Calculate evaluation metrics

accuracy = accuracy_score(y_test, y_pred)

precision = precision_score(y_test, y_pred)

recall = recall_score(y_test, y_pred)

roc_auc = roc_auc_score(y_test, y_pred)



accuracy, precision, recall, roc_auc

(0.8429237947122862,
 0.48787878787878786,
 0.323943661971831,
 0.6308827949299919)

Ok, so logistic regression has on our set acurracy 84,29%. That is after preprocessing and one hot encoding.

Now let's see the acurracy of logistic regression model based only on the most important columns according 

In [12]:
# List of selected features based on their importance

selected_features = [

    'LENGTH_RELATIONSHIP_WITH_CLIENT',

    'INCOME',

    'AGE',

    'WORK_SENIORITY',

    'BUSINESS AGE',

    'CURRENT_ACCOUNT',

    'NO_OF_DEPENDENTS',

    'EDUCATION_University',

    'PRODUCT_C',

    'RESIDENTIAL_PLACE_Owner without mortgage', 
    
    'RESIDENTIAL_PLACE_Living with family'

]



# Subset the data to include only the selected features and the target variable

X_train = train_data[selected_features]

y_train = train_data['FINALIZED_LOAN']

X_test =  valid_data[selected_features]

y_test = valid_data['FINALIZED_LOAN']




# Initialize the Logistic Regression model

logistic_model_selected = LogisticRegression(max_iter=1000)



# Fit the model to the training data

logistic_model_selected.fit(X_train, y_train)



# Predict on the testing set

y_pred_selected = logistic_model_selected.predict(X_test)



# Calculate evaluation metrics

accuracy_selected = accuracy_score(y_test, y_pred_selected)

precision_selected = precision_score(y_test, y_pred_selected)

recall_selected = recall_score(y_test, y_pred_selected)

roc_auc_selected = roc_auc_score(y_test, y_pred_selected)



accuracy_selected, precision_selected, recall_selected, roc_auc_selected

(0.838258164852255,
 0.46647230320699706,
 0.32193158953722334,
 0.6273013356074637)

In [13]:
# List of selected features based on their importance

selected_features = [

    'LENGTH_RELATIONSHIP_WITH_CLIENT'
]



# Subset the data to include only the selected features and the target variable

X_train = train_data[selected_features]

y_train = train_data['FINALIZED_LOAN']

X_test =  valid_data[selected_features]

y_test = valid_data['FINALIZED_LOAN']




# Initialize the Logistic Regression model

logistic_model_selected = LogisticRegression(max_iter=1000)



# Fit the model to the training data

logistic_model_selected.fit(X_train, y_train)



# Predict on the testing set

y_pred_selected = logistic_model_selected.predict(X_test)



# Calculate evaluation metrics

accuracy_selected = accuracy_score(y_test, y_pred_selected)

precision_selected = precision_score(y_test, y_pred_selected)

recall_selected = recall_score(y_test, y_pred_selected)

roc_auc_selected = roc_auc_score(y_test, y_pred_selected)



accuracy_selected, precision_selected, recall_selected, roc_auc_selected

(0.8171073094867807,
 0.33574007220216606,
 0.18712273641851107,
 0.5597129502548773)

In [15]:
y_pred_selected.sum()

277

Now let's compare our results to different models

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

X_train = train_data.drop('FINALIZED_LOAN', axis=1)

y_train = train_data['FINALIZED_LOAN']

X_test = valid_data.drop('FINALIZED_LOAN', axis =1 )

y_test = valid_data['FINALIZED_LOAN']

# Initialize models
svm_model = SVC(kernel='linear', C=1)
knn_model = KNeighborsClassifier(n_neighbors=5)
gbm_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
nb_model = GaussianNB()
nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)

# Fit models
svm_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)
gbm_model.fit(X_train, y_train)
nb_model.fit(X_train, y_train)
nn_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_svm = svm_model.predict(X_test)
y_pred_knn = knn_model.predict(X_test)
y_pred_gbm = gbm_model.predict(X_test)
y_pred_nb = nb_model.predict(X_test)
y_pred_nn = nn_model.predict(X_test)

# Calculate accuracies
accuracy_svm = accuracy_score(y_test, y_pred_svm)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
accuracy_nn = accuracy_score(y_test, y_pred_nn)

accuracy_svm, accuracy_knn, accuracy_gbm, accuracy_nb, accuracy_nn