# 1. Loading of the libraries

In [None]:
!pip install ydata-profiling --quiet

In [None]:
import pandas as pd

In [None]:
from ydata_profiling import ProfileReport

# Load of the dataset

In [None]:
df = pd.read_csv("bankloan.csv")


# Display of the Data

In [None]:
print("Data first rows")
print(df.head())

Data first rows
   Row ID        id  member_id  loan_amnt  funded_amnt        term  int_rate  \
0     1.0  60516983   64537751      20000        20000   36 months     12.29   
1     2.0  60187139   64163931      11000        11000   36 months     12.69   
2     3.0  60356453   64333218       7000         7000   36 months      9.99   
3     4.0  59955769   63900496      10000        10000   36 months     10.99   
4     5.0  58703693   62544456       9550         9550   36 months     19.99   

   installment grade sub_grade  ... application_type annual_inc_joint  \
0       667.06     C        C1  ...       INDIVIDUAL              NaN   
1       369.00     C        C2  ...       INDIVIDUAL              NaN   
2       225.84     B        B3  ...       INDIVIDUAL              NaN   
3       327.34     B        B4  ...       INDIVIDUAL              NaN   
4       354.87     E        E4  ...       INDIVIDUAL              NaN   

  dti_joint  acc_now_delinq tot_coll_amt tot_cur_bal total_rev_h

In [None]:
print("Basic statistics")
print(df.describe(include='all'))

Basic statistics
            Row ID            id     member_id      loan_amnt    funded_amnt  \
count   368.000000  2.129990e+05  2.129990e+05  212999.000000  212999.000000   
unique         NaN           NaN           NaN            NaN            NaN   
top            NaN           NaN           NaN            NaN            NaN   
freq           NaN           NaN           NaN            NaN            NaN   
mean    184.500000  6.103515e+07  6.515927e+07   15257.965530   15257.965530   
std     106.376689  4.734904e+06  5.215173e+06    8611.713377    8611.713377   
min       1.000000  5.670500e+04  7.082500e+04    1000.000000    1000.000000   
25%      92.750000  5.783411e+07  6.158651e+07    8500.000000    8500.000000   
50%     184.500000  6.137900e+07  6.549753e+07   14000.000000   14000.000000   
75%     276.250000  6.503778e+07  6.956436e+07   20000.000000   20000.000000   
max     368.000000  6.861687e+07  7.351969e+07   35000.000000   35000.000000   

              term    

# Creation of the profiling report

In [None]:
profile = ProfileReport( df,
    title="Bank Loan Dataset Profiling Report",
    explorative=True)

αφαιρεσα το profile.to_notebook_iframe() που εφτιαχνε το μεγαλο αρχειο, γιατί δεν μπορούσα να το ανεβάσω στο GitHub.

# 2. Loan amount

I want to see what the column is called for the loan

In [None]:
print(df.columns)

In [30]:
mean_loan = df["loan_amnt"].mean()
max_loan = df["loan_amnt"].max()
min_loan = df["loan_amnt"].min()


print("Loan Amount Statistics")
print(f"Mean loan amount: {mean_loan:.2f}")
print(f"Maximum loan amount: {max_loan}")
print(f"Minimum loan amount: {min_loan}")


Loan Amount Statistics
Mean loan amount: 15257.97
Maximum loan amount: 35000
Minimum loan amount: 1000


What variables are not needed

I believe that we do not need to include Identifiers in the dataset, as they cannot be used in any way for prediction. Furthermore, variables that describe what happened after the loan was granted to the customer. Also, columns that have many missing values, text and descriptions, and many unesecary.

In [None]:
columns_to_drop = [ 'Row ID', 'id', 'member_id','loan_status', 'out_prncp', 'total_pymnt', 'total_rec_prncp',
    'total_rec_int', 'total_rec_late_fee', 'recoveries',
    'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
    'next_pymnt_d','title', 'emp_title','mths_since_last_delinq', 'mths_since_last_major_derog',
    'annual_inc_joint', 'dti_joint','Unnamed: 50', '36months', '60months'
]

In [None]:
columns_to_drop = [c for c in columns_to_drop if c in df.columns]

df_clean = df.drop(columns=columns_to_drop)

4. Deciding of the customer will take the loan or not

In [None]:
accepted_categories = ["A", "B1", "B2"]

In [None]:
df_clean["target"] = df_clean["grade"].apply(lambda x: 1 if x in accepted_categories else 0)

In [None]:
print(df_clean["target"].value_counts())

# Loan amount ranges with >= 15% approval

In [21]:
bins = range(0, int(df_clean["loan_amnt"].max()) + 5000, 5000)

df_clean["loan_range"] = pd.cut(df_clean["loan_amnt"], bins=bins)


approval_rates = df_clean.groupby("loan_range")["target"].mean()

valid_ranges = approval_rates[approval_rates >= 0.15]

print("Approval Rate per Loan Amount Range:")
print(approval_rates)

print("\nRanges with at least 15% approval rate:")
print(valid_ranges)

Approval Rate per Loan Amount Range:
loan_range
(0, 5000]         0.128269
(5000, 10000]     0.205032
(10000, 15000]    0.177184
(15000, 20000]    0.172313
(20000, 25000]    0.185653
(25000, 30000]    0.197253
(30000, 35000]    0.066064
Name: target, dtype: float64

Ranges with at least 15% approval rate:
loan_range
(5000, 10000]     0.205032
(10000, 15000]    0.177184
(15000, 20000]    0.172313
(20000, 25000]    0.185653
(25000, 30000]    0.197253
Name: target, dtype: float64


  approval_rates = df_clean.groupby("loan_range")["target"].mean()


#3. Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df["target"] = df["grade"].apply(lambda x: 1 if x in accepted_categories else 0)

X = df.select_dtypes(include=["int64", "float64"])
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

print("Training set:", X_train.shape[0])
print("Test set:", X_test.shape[0])
print("Target distribution in training set:\n", y_train.value_counts(normalize=True))
print("Target distribution in test set:\n", y_test.value_counts(normalize=True))




Training set: 149099
Test set: 63900
Target distribution in training set:
 target
0    0.826894
1    0.173106
Name: proportion, dtype: float64
Target distribution in test set:
 target
0    0.826886
1    0.173114
Name: proportion, dtype: float64


# Classifier for the prediction on the loan

The variable we need to take ot from the data set is grade because target A,B1,B2 is in relatuon with this variable.

In [22]:
df = df.drop(columns=["grade"])

In [23]:
X = df.select_dtypes(include=["int64", "float64"])
y = df["target"]

print("Selected features:", X.columns.tolist())

Selected features: ['Row ID', 'id', 'member_id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'total_pymnt', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 'Unnamed: 50', '36months', '60months', 'target']


In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

Normalization

In [26]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


I use Random Forest

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42)


clf.fit(X_train_scaled, y_train)


y_pred = clf.predict(X_test_scaled)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:", classification_report(y_test, y_pred))


cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm)

Accuracy: 1.0
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     52838
           1       1.00      1.00      1.00     11062

    accuracy                           1.00     63900
   macro avg       1.00      1.00      1.00     63900
weighted avg       1.00      1.00      1.00     63900

Confusion Matrix: [[52838     0]
 [    0 11062]]


# 4. Feature Importance

In [33]:
importances = clf.feature_importances_
feature_names = X.columns

feat_imp_df = pd.DataFrame({ "feature": feature_names,"importance": importances})

top15 = feat_imp_df.sort_values(by="importance", ascending=False).head(15)

print("Top 15 important features:")
print(top15)

Top 15 important features:
             feature  importance
5           int_rate    0.463039
36            target    0.439439
20     total_rec_int    0.026480
32  total_rev_hi_lim    0.014268
19   total_rec_prncp    0.013021
15        revol_util    0.009383
18       total_pymnt    0.008294
10    inq_last_6mths    0.004136
24   last_pymnt_amnt    0.002603
8                dti    0.002578
17         out_prncp    0.002452
7         annual_inc    0.002246
6        installment    0.002204
3          loan_amnt    0.001763
4        funded_amnt    0.001603


Correlation matrix for the 15 features

In [29]:
top15_features = top15["feature"].tolist()

X_top15 = X_train[top15_features]

corr_matrix = X_top15.corr()

print("Correlation matrix for top 15 features:")
print(corr_matrix)


Correlation matrix for top 15 features:
                  int_rate    target  total_rec_int  total_rev_hi_lim  \
int_rate          1.000000 -0.599040       0.461684         -0.194113   
target           -0.599040  1.000000      -0.218710          0.213460   
total_rec_int     0.461684 -0.218710       1.000000          0.117930   
total_rev_hi_lim -0.194113  0.213460       0.117930          1.000000   
total_rec_prncp  -0.108021  0.113593       0.470648          0.189852   
revol_util        0.205924 -0.219213       0.154621         -0.107706   
total_pymnt       0.091080  0.005292       0.729886          0.189222   
inq_last_6mths    0.229509 -0.142613       0.044660          0.016647   
last_pymnt_amnt   0.068155  0.006876       0.357710          0.173296   
dti               0.067632 -0.046820       0.024613          0.025821   
out_prncp         0.133876 -0.026933       0.555183          0.345209   
annual_inc       -0.090421  0.089303       0.116758          0.273516   
installment