# Bachelor's thesis: Shallow Self-learning

### The corresponding paper is: Kozodoi, Nikita, et al. "Shallow self-learning for reject inference in credit scoring." Machine Learning and Knowledge Discovery in Databases: European Conference, ECML PKDD 2019, Würzburg, Germany, September 16–20, 2019, Proceedings, Part III. Springer International Publishing, 2020.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score,roc_curve, brier_score_loss

In [2]:
accepts = pd.DataFrame()
accepts = pd.read_csv('../data/New_accepts.csv',encoding = "ISO-8859-1", low_memory=False)
rejects = pd.DataFrame()
rejects = pd.read_csv('../data/New_rejects.csv',encoding = "ISO-8859-1", low_memory=False)

In [3]:
X_acc = accepts.copy()
X_rej = rejects.copy()
y_rej = X_rej.pop("loan_status")
y_acc = X_acc.pop("loan_status")

## Preprocessing

### I store a reference dataframe, because I compare this reference with the true outcomes to my predicted probability of default.

In [4]:
scaler=StandardScaler()
scaler.fit(X_acc[['loan_amnt', 'emp_length', 'dti', 'fico']])
X_acc[['loan_amnt', 'emp_length', 'dti', 'fico']] = scaler.transform(X_acc[['loan_amnt', 'emp_length', 'dti', 'fico']])
X_rej[['loan_amnt', 'emp_length', 'dti', 'fico']] = scaler.transform(X_rej[['loan_amnt', 'emp_length', 'dti', 'fico']])

In [5]:
X_acc = pd.get_dummies(data=X_acc, columns = ['addr_state'], drop_first = True)
X_rej = pd.get_dummies(data=X_rej, columns = ['addr_state'], drop_first = True)

In [6]:
scaler.fit(X_acc[['loan_amnt', 'emp_length', 'dti', 'fico']])
X_acc[['loan_amnt', 'emp_length', 'dti', 'fico']] = scaler.transform(X_acc[['loan_amnt', 'emp_length', 'dti', 'fico']])
X_rej[['loan_amnt', 'emp_length', 'dti', 'fico']] = scaler.transform(X_rej[['loan_amnt', 'emp_length', 'dti', 'fico']])

In [7]:
accepts_reference = np.column_stack([X_acc, y_acc])
accepts_reference = pd.DataFrame(accepts_reference)
rejects_reference = np.column_stack([X_rej, y_rej])
rejects_reference = pd.DataFrame(rejects_reference)

In [8]:
accepts_reference.columns = ['loan_amnt', 'emp_length', 'dti', 'fico','addr_state_1',"addr_state_2","addr_state_3",'loan_status']
rejects_reference.columns = ['loan_amnt', 'emp_length', 'dti', 'fico','addr_state_1',"addr_state_2","addr_state_3",'loan_status']

## Shallow Self-learning

### First you can include the Isolation Forest to filter the data

isolation_forest = IsolationForest(random_state=42)
isolation_forest.fit(X_rej)
scores = isolation_forest.decision_function(X_rej)
b_top = np.percentile(scores, 0)  # Adjust percentile as required
b_bot = np.percentile(scores, 100)

Xr = (X_rej[scores > b_top] | X_rej[scores < b_bot])

# Self-learning

### We also need to set up a list to append the predicted probabilities

In [9]:
Pred_probas = y_acc.copy()

In [10]:
X_star = X_rej.copy()

### Here are the parameters to tune. Alpha is the labeled percentage of rejects which are sampled per loop. Theta is the imbalance parameter, max_iteration is how often the algorithm samples rejects and you can set the penalty parameter for the regression.

In [11]:
alpha = 0.01
theta = 1 
max_iteration = 5
penalty_parameter = 2**2

### This is the final model

In [12]:
while (len(X_star) > 0 and max_iteration >= 0):

    
    # Train Logistic Regression in all data in X_star
    
    model = LogisticRegression(C = penalty_parameter, solver='liblinear', random_state=42)
    model.fit(X_acc, y_acc)
    y_rejected_pred = model.predict_proba(X_star)[:, 1]

    
    # Establish the parameters and caluclate thresholds for appending
    
    # Define as per your requirements
    # Imbalance parameter
    
    cg = np.percentile(y_rejected_pred, alpha * 100)
    cb = np.percentile(y_rejected_pred, (1 - alpha * theta) * 100)
    
    # Select the filtered observations by the thresholds
    
    selected_indices = (y_rejected_pred <= cg) | (y_rejected_pred >= cb)
    X_selected = X_star[selected_indices]
    y_selected_probs = (y_rejected_pred[selected_indices])
    y_selected = [int(i >= cg) for i in y_selected_probs]
        
    # If there are no selected observations break

    if len(y_selected) == 0:
        print("Stopping criterion met: No new samples exceed the thresholds.")
        break
    
    
    # Remove observations from X_star and append them to X_acc and their inferred labels to y_acc for further model training in the next steps

    X_star = X_star[~selected_indices] 
    X_acc = np.vstack([X_acc, X_selected])
    y_acc = np.concatenate([y_acc, y_selected])
    Pred_probas = np.concatenate([Pred_probas, y_selected_probs])

    max_iteration = max_iteration-1
    



## Results

### First I create my unbiased set with the inferred labels

In [13]:
data_predicted = np.column_stack([X_acc, Pred_probas])
data_predicted = pd.DataFrame(data_predicted)

In [15]:
data_predicted.columns = ['loan_amnt', 'emp_length', 'dti', 'fico','addr_state_1',"addr_state_2","addr_state_3",'loan_status']

### Then I map my rejects with the predicted labels to the true labels. You can also merge on the accepts and it will show perfect metrics because we know the true labels of the accepts.

In [16]:
columns = [col for col in accepts_reference.columns if col != 'loan_status']

# Zusammenführen der beiden DataFrames auf Basis der Schlüsselspalten
comparison = data_predicted.merge(
    rejects_reference, 
    on=columns, 
    suffixes=('_pred', '_true'), 
    how='inner'
)

# Vergleich der Vergleichsspalte
#vergleich_df['Vergleich_stimmt'] = vergleich_df['loan_status_klein'] == vergleich_df['loan_status_gross']

In [17]:
comparison['loan_status_true'] = comparison['loan_status_true'].astype(float)
comparison['loan_status_pred'] = comparison['loan_status_pred'].astype(float)

In [19]:
auc = roc_auc_score(comparison['loan_status_true'], comparison['loan_status_pred'])
print("AUC:", auc)

AUC: 0.5839945963124618


In [20]:
brier = brier_score_loss(comparison['loan_status_true'], comparison['loan_status_pred'],pos_label=1)
print("Brier score:", brier)

Brier score: 0.3520623601650927


In [21]:
fpr, tpr, thresholds = roc_curve(comparison['loan_status_true'], comparison['loan_status_pred']) 
ks_statistic = max(tpr - fpr)
print("KS-Statistic:",ks_statistic)

KS-Statistic: 0.16931435210878132
