Data Preprocessing Tools

Importing the libraries

In [1]:
import numpy as np
import pandas as pd

Importing the training dataset

In [2]:
# Load the raw loan records from disk.
dataset = pd.read_csv('bank_loan_main.csv')

# Only a subset of the available columns is used as features, picked by position:
# Current Loan Amount, Term, Credit Score, Annual Income, Years in current job,
# Number of Credit Problems, Current Credit Balance, Bankruptcies, Tax Liens.
feature_positions = [2, 3, 4, 5, 7, 14, 15, 17, 18]
X = dataset.iloc[:, feature_positions].values

# The last column (status of the loan) is the target.
y = dataset.iloc[:, -1].values

Taking care of missing data (numeric feature columns 2–8 of X)

In [3]:
from sklearn.impute import SimpleImputer

# Mean imputation only applies to numerical values, so it is restricted to the
# slice 2:9 of X (columns 2 through 8; the upper bound 9 is excluded).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit and fill in one step, then round every value in the slice to 0 decimals.
X[:, 2:9] = np.round(imputer.fit_transform(X[:, 2:9]), 0)

Encoding the Independent Variable (col: Loan_Term)

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 1 of X and pass every other column through unchanged.
# In the printed result the generated dummy variables occupy the leading columns.
ColTrnsfm = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1])],
    remainder='passthrough',
)
X = np.array(ColTrnsfm.fit_transform(X))
print(X)

[[0.0 1.0 445412 ... 228190.0 1.0 0.0]
 [0.0 1.0 262328 ... 229976.0 0.0 0.0]
 [0.0 1.0 99999999 ... 297996.0 0.0 0.0]
 ...
 [0.0 1.0 103136 ... 109554.0 1.0 0.0]
 [0.0 1.0 530332 ... 404225.0 0.0 0.0]
 [0.0 1.0 99999999 ... 45600.0 1.0 0.0]]


Encoding the Dependent Variable (Col: Loan_Status)

In [5]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of class labels present in y, then replace y with their encoded form.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

**Splitting the Dataset into the training and Test Sets**

In [6]:
# Hold out 25% of the rows as the test set; random_state is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

**Feature Scaling - Standardisation (recommended; values mostly within [-3, 3]) vs Normalisation (range [0, 1]; for specific situations)**

In [7]:
from sklearn.preprocessing import StandardScaler  # standardisation

# Note: scaling is not applied to the encoded independent variables, so the
# first two columns are skipped and only columns 2 onward are standardised.
sc = StandardScaler()
sc.fit(X_train[:, 2:])  # the scaler is fitted on the training rows only
X_train[:, 2:] = sc.transform(X_train[:, 2:])
X_test[:, 2:] = sc.transform(X_test[:, 2:])


**Making the Confusion Matrix and Printing Accuracy Scores**

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
import datetime

In [9]:
class calculate_score:
    """Evaluate a fitted classifier on a hold-out set and print a short report.

    Creating an instance runs the evaluation immediately: it predicts on
    ``X_test``, prints the classification report, the AUC score and the
    elapsed time, and keeps every computed value on the instance so results
    from different models can be inspected or compared afterwards.

    Parameters
    ----------
    X_test
        Feature matrix handed to the model's ``predict``.
    y_test
        True labels matching ``X_test``.
    Model_name : str
        Label used in the printed heading.
    model : optional
        Fitted estimator exposing ``predict``. When omitted, the notebook-level
        variable ``classifier`` is used, so existing three-argument calls keep
        working unchanged.
    started_at : datetime.datetime, optional
        Start of the interval reported as run time. When omitted, the
        notebook-level variable ``time_start`` is used.

    Attributes
    ----------
    y_pred, confusion_matrix, report, auc, run_time
        The values computed during evaluation.

    Raises
    ------
    NameError
        If ``model`` / ``started_at`` are omitted and the corresponding
        notebook-level variable has not been defined.
    """

    def __init__(self, X_test, y_test, Model_name, model=None, started_at=None):
        self.X_test = X_test
        self.y_test = y_test
        self.Model_name = Model_name

        # Prefer the explicitly supplied estimator; fall back to the global
        # `classifier` only for backward compatibility with older cells.
        self.model = model if model is not None else classifier

        self.y_pred = self.model.predict(X_test)
        # Not printed, but kept so it can be inspected after the run.
        self.confusion_matrix = confusion_matrix(y_test, self.y_pred)

        print(f"{Model_name} Model Evaluation: " )
        self.report = classification_report(y_test, self.y_pred)
        print(self.report)

        # NOTE(review): the AUC is computed from the output of `predict`, not
        # from probabilities or decision scores; kept as-is so the reported
        # numbers stay comparable with earlier runs.
        self.auc = roc_auc_score(y_test, self.y_pred)
        print(f"AUC score: {round(self.auc,2)}" )

        # k-fold cross-validation is intentionally not run here.

        # Resolved last so a missing timer does not suppress the report above.
        timer_origin = started_at if started_at is not None else time_start
        self.run_time = datetime.datetime.now() - timer_origin
        print(f"Time: {self.run_time}")


**Training Models in order to check Accuracy Scores**

In [10]:
# 5.1 Naïve Bayes: fit on the training set, then evaluate on the test set.
time_start = datetime.datetime.now()  # read by calculate_score to report the run time
from sklearn.naive_bayes import GaussianNB

# calculate_score reads the model from the notebook-level name `classifier`.
classifier = GaussianNB().fit(X_train, y_train)

Model_name = 'Naïve Bayes'
Testing_Set = calculate_score(X_test, y_test, Model_name)

Naïve Bayes Model Evaluation: 
              precision    recall  f1-score   support

           0       0.31      0.76      0.44      5647
           1       0.88      0.50      0.64     19353

    accuracy                           0.56     25000
   macro avg       0.59      0.63      0.54     25000
weighted avg       0.75      0.56      0.59     25000

AUC score: 0.63
Time: 0:00:00.172955


In [11]:
# 5.2 Logistic Regression: fit on the training set, then evaluate on the test set.
time_start = datetime.datetime.now()  # timer origin used by calculate_score
from sklearn.linear_model import LogisticRegression

# The model must live in `classifier`, which calculate_score reads.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)

Model_name = 'Logistic Regression'
Testing_Set = calculate_score(X_test, y_test, Model_name)

Logistic Regression Model Evaluation: 
              precision    recall  f1-score   support

           0       1.00      0.20      0.34      5647
           1       0.81      1.00      0.90     19353

    accuracy                           0.82     25000
   macro avg       0.91      0.60      0.62     25000
weighted avg       0.85      0.82      0.77     25000

AUC score: 0.6
Time: 0:00:01.207955


In [12]:
# 5.3 SVM with a linear kernel: fit on the training set, then evaluate on the test set.
time_start = datetime.datetime.now()  # timer origin used by calculate_score
from sklearn.svm import SVC

# The fitted model is bound to `classifier`, the name calculate_score looks up.
classifier = SVC(kernel='linear', random_state=0).fit(X_train, y_train)

Model_name = 'SVM'
Testing_Set = calculate_score(X_test, y_test, Model_name)

SVM Model Evaluation: 
              precision    recall  f1-score   support

           0       1.00      0.20      0.34      5647
           1       0.81      1.00      0.90     19353

    accuracy                           0.82     25000
   macro avg       0.91      0.60      0.62     25000
weighted avg       0.85      0.82      0.77     25000

AUC score: 0.6
Time: 0:04:39.505874


In [13]:
# 5.4 Kernel SVM (RBF kernel): fit on the training set, then evaluate on the test set.
time_start = datetime.datetime.now()  # timer origin used by calculate_score
from sklearn.svm import SVC

# The fitted model is bound to `classifier`, the name calculate_score looks up.
classifier = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)

Model_name = 'Kernel SVM'
Testing_Set = calculate_score(X_test, y_test, Model_name)

Kernel SVM Model Evaluation: 
              precision    recall  f1-score   support

           0       1.00      0.20      0.34      5647
           1       0.81      1.00      0.90     19353

    accuracy                           0.82     25000
   macro avg       0.90      0.60      0.62     25000
weighted avg       0.85      0.82      0.77     25000

AUC score: 0.6
Time: 0:08:21.058099


In [14]:
# 5.5 K-Nearest Neighbors: fit on the training set, then evaluate on the test set.
time_start = datetime.datetime.now()  # timer origin used by calculate_score
from sklearn.neighbors import KNeighborsClassifier

# Five neighbours with the Minkowski metric at p=2.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)

Model_name = 'K-Nearest Neighbors'
Testing_Set = calculate_score(X_test, y_test, Model_name)

K-Nearest Neighbors Model Evaluation: 
              precision    recall  f1-score   support

           0       0.57      0.30      0.39      5647
           1       0.82      0.93      0.87     19353

    accuracy                           0.79     25000
   macro avg       0.69      0.62      0.63     25000
weighted avg       0.76      0.79      0.76     25000

AUC score: 0.62
Time: 0:00:08.763829


In [15]:
# 5.6 Decision Tree: fit on the training set, then evaluate on the test set.
time_start = datetime.datetime.now()  # timer origin used by calculate_score
from sklearn.tree import DecisionTreeClassifier

# Entropy split criterion; the seed is fixed for repeatable trees.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

Model_name = 'Decision Tree Classifier'
Testing_Set = calculate_score(X_test, y_test, Model_name)

Decision Tree Classifier Model Evaluation: 
              precision    recall  f1-score   support

           0       0.44      0.42      0.43      5647
           1       0.83      0.85      0.84     19353

    accuracy                           0.75     25000
   macro avg       0.64      0.63      0.64     25000
weighted avg       0.75      0.75      0.75     25000

AUC score: 0.63
Time: 0:00:01.238916


In [16]:
# 5.7 Random Forest: fit on the training set, then evaluate on the test set.
time_start = datetime.datetime.now()  # timer origin used by calculate_score
from sklearn.ensemble import RandomForestClassifier

# Ten entropy-criterion trees; the seed is fixed for repeatable forests.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

Model_name = 'Random Forest Classification'
Testing_Set = calculate_score(X_test, y_test, Model_name)

Random Forest Classification Model Evaluation: 
              precision    recall  f1-score   support

           0       0.55      0.33      0.41      5647
           1       0.83      0.92      0.87     19353

    accuracy                           0.79     25000
   macro avg       0.69      0.63      0.64     25000
weighted avg       0.76      0.79      0.77     25000

AUC score: 0.63
Time: 0:00:02.794041


In [17]:
# 5.8 XGBoost: fit on the training set, then evaluate on the test set.
# The timer is reset here like in every other model cell; without it the
# reported time is measured from whichever cell last assigned `time_start`.
time_start = datetime.datetime.now()
from xgboost import XGBClassifier

# `UserWarning=None` was removed: it is not an XGBoost parameter and does not
# configure or silence anything.
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

Model_name = 'XGBoost Classification'
Testing_Set = calculate_score(X_test, y_test, Model_name)

XGBoost Classification Model Evaluation: 
              precision    recall  f1-score   support

           0       1.00      0.20      0.34      5647
           1       0.81      1.00      0.90     19353

    accuracy                           0.82     25000
   macro avg       0.91      0.60      0.62     25000
weighted avg       0.85      0.82      0.77     25000

AUC score: 0.6
Time: 0:00:07.179171
