# ***Libraries & Tools***

In [77]:
import numpy as np
import pandas as pd
import re
import time
import json
from itertools import product
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, RocCurveDisplay, PrecisionRecallDisplay, make_scorer
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from collections import Counter
from sklearn.compose import make_column_transformer, make_column_selector
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# ***Data Overview and Exploration***




Two datasets are provided.  The original dataset, in the form provided by Prof. Hofmann, contains categorical/symbolic attributes and is in the file "german.data". In this project, we utilize the original dataset and convert each categorical/symbolic value of a variable to a numerical value using one hot encoding. One hot encoding is the ideal method for converting variable values because the variables exhibit no inherent order among the categories.

In [33]:
gcd = pd.read_csv('German_Credit_Data.data', delimiter='\s+', header=None)

In [None]:
gcd.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


Replace the column number with column names. The column names are specified in the dataset description.

In [34]:
column_names = [
    'Status of existing checking account',
    'Duration in month',
    'Credit history',
    'Purpose',
    'Credit amount',
    'Savings account/bonds',
    'Present employment since',
    'Installment rate in percentage of disposable income',
    'Personal status and sex',
    'Other debtors / guarantors',
    'Present residence since',
    'Property',
    'Age in years',
    'Other installment plans',
    'Housing',
    'Number of existing credits at this bank',
    'Job',
    'Number of people being liable to provide maintenance for',
    'Telephone',
    'foreign worker',
    'Class'
  ]

In [35]:
columns_mapping = {old_name: new_name for old_name, new_name in zip(gcd.columns, column_names)}

gcd.rename(columns=columns_mapping, inplace=True)

In [None]:
gcd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                                                    Non-Null Count  Dtype 
---  ------                                                    --------------  ----- 
 0   Status of existing checking account                       1000 non-null   object
 1   Duration in month                                         1000 non-null   int64 
 2   Credit history                                            1000 non-null   object
 3   Purpose                                                   1000 non-null   object
 4   Credit amount                                             1000 non-null   int64 
 5   Savings account/bonds                                     1000 non-null   object
 6   Present employment since                                  1000 non-null   object
 7   Installment rate in percentage of disposable income       1000 non-null   int64 
 8   Personal status and sex      

No missing values

In [None]:
gcd.dropna().shape

(1000, 21)

No duplicates

In [None]:
gcd.drop_duplicates().shape

(1000, 21)

There is an imbalance between the classes which we will not address in this notebook.

In [None]:
gcd['Class'].value_counts()

1    700
2    300
Name: Class, dtype: int64

Use one hot encoding to convert the existing variable values to numerical values.

In [36]:
object_cols = gcd.select_dtypes(include=['object']).columns

gcd = pd.get_dummies(gcd, columns=object_cols)

In [37]:
gcd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 62 columns):
 #   Column                                                    Non-Null Count  Dtype
---  ------                                                    --------------  -----
 0   Duration in month                                         1000 non-null   int64
 1   Credit amount                                             1000 non-null   int64
 2   Installment rate in percentage of disposable income       1000 non-null   int64
 3   Present residence since                                   1000 non-null   int64
 4   Age in years                                              1000 non-null   int64
 5   Number of existing credits at this bank                   1000 non-null   int64
 6   Number of people being liable to provide maintenance for  1000 non-null   int64
 7   Class                                                     1000 non-null   int64
 8   Status of existing checking account_A11

# ***Data Mining and Classification***
***Note: Class 1 = Good Customer, Class 2 = Bad Customer***

The rows represent the actual classification and The columns the predicted classification.

It is worse to class a customer as good when they are bad (5), than it is to class a customer as bad when they are good (1).

In [38]:
ORIGINAL_COST = [
    [0,1],
    [5,0]
]

## ***Feature Selection using SVM***
Feature selection with a linear SVM is time consuming. However, we can use undersampling to take all the instances of the minority class and reduce the size of the majority class. Use this methodology only if you want to improve the results of the classification. Otherwise, this methodology takes a lot of time to run.


In [None]:
minority_class = gcd[gcd['Class'] == 2]
majority_class = gcd[gcd['Class'] == 1]

minority_sample = minority_class

majority_sample = majority_class.sample(n=300, random_state=42)

gcd_balanced = pd.concat([minority_sample, majority_sample])

In [None]:
gcd_balanced['Class'].value_counts()

2    300
1    300
Name: Class, dtype: int64

In [None]:
def calc_cost(y_true, y_pred, cost_matrix):
    conf = confusion_matrix(y_true,y_pred).T
    return np.sum(conf * cost_matrix)

In [None]:
def find_best_params(estimator, params, train_set, validation_set, starting_point = 0):
    train_x = train_set[0]
    train_y = train_set[1]
    test_x = validation_set[0]
    test_y = validation_set[1]

    min_cost = np.inf
    best_params = {}
    full_params_set = []

    if type(params) == dict:

        for values in product(*params.values()):
            point = dict(zip(params.keys(), values))
            full_params_set.append(point)

    elif type(params) == list:

        for params_subset in params:
            for values in product(*params_subset.values()):
                point = dict(zip(params_subset.keys(), values))
                full_params_set.append(point)
    else:
        return []

    steps = len(full_params_set)
    counter = starting_point
    print(f"Testing {steps} models in total.")
    start = time.time()

    for params in full_params_set[starting_point:]:

        estimator.set_params(**params)
        estimator.fit(train_x,train_y)
        pred_y = estimator.predict(test_x)
        cost_matrix = np.matrix([[0,1], [5,0]])
        cost = calc_cost(test_y, pred_y, cost_matrix)

        if cost < min_cost:
            min_cost = cost
            best_params = params

        print("________________________________________________________________________________________")
        print(f"{counter}/{steps} | Cost: {cost} | Elapsed: {int((time.time()-start)*100)/100}s | {params}")
        print("________________________________________________________________________________________")

    return best_params

In [None]:
linear_params = {"C":[0.1,1.0,10.0,100.0],"kernel":["linear"]}

y = gcd['Class']
x = gcd.drop(columns = ['Class'])

x_train,x_test,y_train,y_test = train_test_split(x, y, test_size = 0.20)
x_train,x_val,y_train,y_val = train_test_split(x_train, y_train, test_size = 0.25)

In [None]:
best_params = find_best_params(SVC(), linear_params, [x_train, y_train], [x_val, y_val])

In [None]:
print(best_params)

In [None]:
classifier = SVC(*best_params)
classifier.fit(x_train, y_train)
weights = zip(list(gcd.columns), classifier.coef_.todense().data)
weights_sorted = {k: v for k, v in sorted(weights.items(), key=lambda item: item[1])}
with open("Important_SVC_Features.json","w") as f:
    json.dump(weights_sorted,f)

## ***Feature Selection using GridSearch with RF***
We can use the GridSearch method with the Random Forest algorithm to evaluate the importance of each variable and identify the optimal features for classifying customer types more effectively.

In [None]:
y = gcd['Class']
x = gcd.drop(columns = ['Class'])

In [None]:
def custom_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    cost = np.sum(cm * ORIGINAL_COST)
    return -cost  # Minimize cost, so negative of cost is returned

In [None]:
custom_scorer = make_scorer(custom_score, greater_is_better = False)

In [None]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, n_jobs=-1, scoring=custom_scorer)


In [None]:
grid_search.fit(x, y)

In [None]:
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

best_rf_classifier = grid_search.best_estimator_

best_rf_classifier.fit(x, y)

feature_importances = best_rf_classifier.feature_importances_

Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}


In [None]:
feature_importance_sum = {}

for feature, importance in zip(x.columns, feature_importances):
    feature_name = re.sub(r'^(.*?)_.+', r'\1', feature)

    if feature_name in feature_importance_sum:
        feature_importance_sum[feature_name] += importance
    else:
        feature_importance_sum[feature_name] = importance

sorted_feature_importance_sum = sorted(feature_importance_sum.items(), key=lambda x: x[1], reverse=True)

for feature, importance_sum in sorted_feature_importance_sum:
    print(f"{feature}: {importance_sum}")

Status of existing checking account: 0.18127052680713
Credit amount: 0.09963120887039098
Duration in month: 0.08502755277969097
Credit history: 0.07943903009449499
Purpose: 0.07526519681385582
Age in years: 0.07131138040402563
Savings account/bonds: 0.05696442042004994
Present employment since: 0.05243765670601543
Property: 0.044531448912716966
Other installment plans: 0.03890869221327936
Personal status and sex: 0.03212342377332732
Job: 0.030707166210147484
Housing: 0.030348743444134216
Installment rate in percentage of disposable income: 0.026335110898120195
Other debtors / guarantors: 0.025730195747058995
Present residence since: 0.022868905519949165
Telephone: 0.017995911650055568
Number of existing credits at this bank: 0.013141854015556249
foreign worker: 0.008117241945819281
Number of people being liable to provide maintenance for: 0.007844332774181536


In [None]:
top_features = [sorted_feature_importance_sum[i][0] for i in range(13)]

complete_features = []

for feature in top_features:
  for column in gcd.columns:
      if column.startswith(feature):
          suffix = column[len(feature):].lstrip('_')
          if suffix != '':
            complete_part = feature + '_' + suffix
          else:
            complete_part = feature

          complete_features.append(complete_part)

top_features = complete_features
for complete_feat in top_features: print(complete_feat)

Status of existing checking account_A11
Status of existing checking account_A12
Status of existing checking account_A13
Status of existing checking account_A14
Credit amount
Duration in month
Credit history_A30
Credit history_A31
Credit history_A32
Credit history_A33
Credit history_A34
Purpose_A40
Purpose_A41
Purpose_A410
Purpose_A42
Purpose_A43
Purpose_A44
Purpose_A45
Purpose_A46
Purpose_A48
Purpose_A49
Age in years
Savings account/bonds_A61
Savings account/bonds_A62
Savings account/bonds_A63
Savings account/bonds_A64
Savings account/bonds_A65
Present employment since_A71
Present employment since_A72
Present employment since_A73
Present employment since_A74
Present employment since_A75
Property_A121
Property_A122
Property_A123
Property_A124
Other installment plans_A141
Other installment plans_A142
Other installment plans_A143
Personal status and sex_A91
Personal status and sex_A92
Personal status and sex_A93
Personal status and sex_A94
Job_A171
Job_A172
Job_A173
Job_A174
Housing_A151


## ***Classification***

Split the dataset into a training set (75%) and a testing set (25%).



In [79]:
x_train, x_test, y_train, y_test = train_test_split(gcd.drop(columns=['Class']), gcd['Class'], test_size=0.25, random_state=42, stratify=gcd['Class'])

In [80]:
classifier_names = [
    'Random Forests', 'Linear SVM', 'Naive Bayes'
]

classifiers = [RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10, min_samples_split=10),
               SVC(kernel='linear'),
               GaussianNB()
               ]

### ***Cost-based Evaluation***
We run three classifiers, evaluate their performance, and calculate the total cost of their results.

In [None]:
for name, clf in zip(classifier_names, classifiers):
  print(name)
  clf.fit(x_train, y_train)
  y_pred = clf.predict(x_test)
  print(classification_report(y_test, y_pred))
  conf_m = confusion_matrix(y_test, y_pred)
  print(f'Confusion Matrix: \n{conf_m}\n')
  print(f'Total Cost: {np.sum(conf_m * ORIGINAL_COST)}\n')

Random Forests
              precision    recall  f1-score   support

           1       0.79      0.93      0.85       175
           2       0.70      0.41      0.52        75

    accuracy                           0.77       250
   macro avg       0.75      0.67      0.69       250
weighted avg       0.76      0.77      0.75       250

Confusion Matrix: 
[[162  13]
 [ 44  31]]

Total Cost: 233

Linear SVM
              precision    recall  f1-score   support

           1       0.79      0.88      0.83       175
           2       0.62      0.47      0.53        75

    accuracy                           0.76       250
   macro avg       0.71      0.67      0.68       250
weighted avg       0.74      0.76      0.74       250

Confusion Matrix: 
[[154  21]
 [ 40  35]]

Total Cost: 221

Naive Bayes
              precision    recall  f1-score   support

           1       0.82      0.75      0.79       175
           2       0.52      0.63      0.57        75

    accuracy            

Based on the above results we can conclude the following:
- The NB (Naive Bayes) has the lowest cost
- The NB classifies 28 customers as "Good" even though they are "Bad". Compared to the other algorithms, NB has the lowest count for the FP (False Positives) but the highest count for the FN (False Negatives - Customers that are "Good" but are classified as "Bad")
- The RF (Random Forest) algorithm has the lowest count for the FN, which means that it is very good at classifying "Good" customers
- The performance of the SVM falls somewhere in between the other two algorithms




### ***Minimizing the Expected Cost***

In [72]:
min_expected_cost_classifiers = [
    RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10, min_samples_split=10),
    SVC(kernel='linear', probability=True),
    GaussianNB()
]

**Note: Before we use .predict_proba, we have to replace the category values of the target variable. So 1 ("Good") will be 0 and 2 ("Bad") will be 1.**

In [62]:
# Create a new dataframe to encode (replace) the target variable values
gcd_encoded_classes = gcd.copy()

In [63]:
gcd_encoded_classes['Class'] = gcd_encoded_classes['Class'].replace({1: 0, 2: 1})

In [64]:
# Create a new training and testing set using the new dataframe
x_train, x_test, y_train, y_test = train_test_split(gcd_encoded_classes.drop(columns=['Class']), gcd_encoded_classes['Class'], test_size=0.25, random_state=42, stratify=gcd_encoded_classes['Class'])

**Cost minimization without probability calibration**

In [73]:
for name, clf in zip(classifier_names, min_expected_cost_classifiers):
  print(name)
  model = clf.fit(x_train, y_train)
  y_pred_prob = model.predict_proba(x_test)
  y_pred = np.argmin(np.matmul(y_pred_prob, np.array(ORIGINAL_COST)), axis=1)
  print(classification_report(y_test, y_pred))
  conf_m = confusion_matrix(y_test, y_pred)
  print(f'Confusion Matrix: \n{conf_m}\n')
  print(f'Total Cost: {np.sum(conf_m * ORIGINAL_COST)}\n')

Random Forests
              precision    recall  f1-score   support

           0       0.96      0.38      0.54       175
           1       0.40      0.96      0.56        75

    accuracy                           0.55       250
   macro avg       0.68      0.67      0.55       250
weighted avg       0.79      0.55      0.55       250

Confusion Matrix: 
[[ 66 109]
 [  3  72]]

Total Cost: 124

Linear SVM
              precision    recall  f1-score   support

           0       0.86      0.46      0.60       175
           1       0.39      0.83      0.53        75

    accuracy                           0.57       250
   macro avg       0.63      0.64      0.57       250
weighted avg       0.72      0.57      0.58       250

Confusion Matrix: 
[[80 95]
 [13 62]]

Total Cost: 160

Naive Bayes
              precision    recall  f1-score   support

           0       0.86      0.68      0.76       175
           1       0.50      0.75      0.60        75

    accuracy                

Based on the above results we can conclude that:
- The Naive Bayes (NB) and Support Vector Machine (SVM) algorithms are the primary contenders for minimizing cost without the need for probability calibration.
- The Naive Bayes (NB) algorithm exhibits a higher count of False Positives compared to SVM. However, it also demonstrates the lowest count of False Negatives and the most significant difference in True Positives. If one of our priorities is to minimize the loss of "Good" clients while effectively identifying and managing "Bad" clients, then NB emerges as the preferred choice.
- The Support Vector Machine (SVM) displays a lower count of False Positives compared to NB, albeit with a significantly higher count of False Negatives. If prioritizing the preservation of "Good" clients is not our primary concern, then SVM emerges as the preferred choice.

**Cost minimization with sigmoid calibration**

In [74]:
for name, clf in zip(classifier_names, min_expected_cost_classifiers):
  print(name)
  cc = CalibratedClassifierCV(clf, method="sigmoid", cv=3)
  model = cc.fit(x_train, y_train)
  y_pred_prob = model.predict_proba(x_test)
  y_pred = np.argmin(np.matmul(y_pred_prob, np.array(ORIGINAL_COST)), axis=1)
  print(classification_report(y_test, y_pred))
  conf_m = confusion_matrix(y_test, y_pred)
  print(f'Confusion Matrix: \n{conf_m}\n')
  print(f'Total Cost: {np.sum(conf_m * ORIGINAL_COST)}\n')

Random Forests
              precision    recall  f1-score   support

           0       0.91      0.46      0.61       175
           1       0.42      0.89      0.57        75

    accuracy                           0.59       250
   macro avg       0.66      0.68      0.59       250
weighted avg       0.76      0.59      0.60       250

Confusion Matrix: 
[[81 94]
 [ 8 67]]

Total Cost: 134

Linear SVM
              precision    recall  f1-score   support

           0       0.92      0.32      0.47       175
           1       0.37      0.93      0.53        75

    accuracy                           0.50       250
   macro avg       0.64      0.63      0.50       250
weighted avg       0.75      0.50      0.49       250

Confusion Matrix: 
[[ 56 119]
 [  5  70]]

Total Cost: 144

Naive Bayes
              precision    recall  f1-score   support

           0       0.88      0.46      0.61       175
           1       0.41      0.85      0.55        75

    accuracy                

Based on the above results we can conclude that
- The Random Forest (RF) emerges as the optimal algorithm in this scenario, as it effectively balances True Positives and False Negatives while simultaneously minimizing the count of False Positives.

**Cost Minimization with isotonic calibration**

In [75]:
for name, clf in zip(classifier_names, min_expected_cost_classifiers):
  print(name)
  cc = CalibratedClassifierCV(clf, method="isotonic", cv=3)
  model = cc.fit(x_train, y_train)
  y_pred_prob = model.predict_proba(x_test)
  y_pred = np.argmin(np.matmul(y_pred_prob, np.array(ORIGINAL_COST)), axis=1)
  print(classification_report(y_test, y_pred))
  conf_m = confusion_matrix(y_test, y_pred)
  print(f'Confusion Matrix: \n{conf_m}\n')
  print(f'Total Cost: {np.sum(conf_m * ORIGINAL_COST)}\n')

Random Forests
              precision    recall  f1-score   support

           0       0.91      0.50      0.65       175
           1       0.43      0.88      0.58        75

    accuracy                           0.62       250
   macro avg       0.67      0.69      0.61       250
weighted avg       0.76      0.62      0.63       250

Confusion Matrix: 
[[88 87]
 [ 9 66]]

Total Cost: 132

Linear SVM
              precision    recall  f1-score   support

           0       0.92      0.38      0.53       175
           1       0.39      0.92      0.55        75

    accuracy                           0.54       250
   macro avg       0.65      0.65      0.54       250
weighted avg       0.76      0.54      0.54       250

Confusion Matrix: 
[[ 66 109]
 [  6  69]]

Total Cost: 139

Naive Bayes
              precision    recall  f1-score   support

           0       0.89      0.50      0.64       175
           1       0.42      0.85      0.56        75

    accuracy                

Based on the above results we can conclude that:
- All algorithms exhibit a low count of False Positives but a high count of False Negatives with the SVM having the highest count and the biggest difference between True Positives and False Negatives. Nevertheless, the Naive Bayes and Random Forest algorithms excel in this regard by effectively balancing True Positives with False Negatives, a balance not achieved by the SVM.

### ***Rebalancing***

**Undersampling**

In [81]:
print(Counter(y_train))

Counter({1: 525, 2: 225})


We reduce the size of the first class ("Good") with undersampling to be equal to the size of the second class ("Bad").

In [82]:
sampler = RandomUnderSampler(sampling_strategy={1: 225, 2: 225}, random_state=42)
x_rs, y_rs = sampler.fit_resample(x_train, y_train)

for name, clf in zip(classifier_names, classifiers):
  print(name)
  clf.fit(x_rs, y_rs)
  y_pred = clf.predict(x_test)
  print(classification_report(y_test, y_pred))
  conf_m = confusion_matrix(y_test, y_pred)
  print(f'Confusion Matrix: \n{conf_m}\n')
  print(f'Total Cost: {np.sum(conf_m * ORIGINAL_COST)}\n')

Random Forests
              precision    recall  f1-score   support

           1       0.87      0.68      0.76       175
           2       0.50      0.76      0.61        75

    accuracy                           0.70       250
   macro avg       0.69      0.72      0.68       250
weighted avg       0.76      0.70      0.72       250

Confusion Matrix: 
[[119  56]
 [ 18  57]]

Total Cost: 146

Linear SVM
              precision    recall  f1-score   support

           1       0.87      0.59      0.71       175
           2       0.46      0.80      0.58        75

    accuracy                           0.66       250
   macro avg       0.67      0.70      0.65       250
weighted avg       0.75      0.66      0.67       250

Confusion Matrix: 
[[104  71]
 [ 15  60]]

Total Cost: 146

Naive Bayes
              precision    recall  f1-score   support

           1       0.83      0.71      0.77       175
           2       0.50      0.67      0.57        75

    accuracy            

**Oversampling**:
We increase the size of the minority class ("Bad") with oversampling to be equal to the size of the majority class ("Good").

In [83]:
sampler = RandomOverSampler(sampling_strategy={1: 525, 2: 525}, random_state=42)
x_rs, y_rs = sampler.fit_resample(x_train, y_train)

for name, clf in zip(classifier_names, classifiers):
  print(name)
  clf.fit(x_rs, y_rs)
  y_pred = clf.predict(x_test)
  print(classification_report(y_test, y_pred))
  conf_m = confusion_matrix(y_test, y_pred)
  print(f'Confusion Matrix: \n{conf_m}\n')
  print(f'Total Cost: {np.sum(conf_m * ORIGINAL_COST)}\n')

Random Forests
              precision    recall  f1-score   support

           1       0.84      0.76      0.80       175
           2       0.54      0.65      0.59        75

    accuracy                           0.73       250
   macro avg       0.69      0.71      0.69       250
weighted avg       0.75      0.73      0.73       250

Confusion Matrix: 
[[133  42]
 [ 26  49]]

Total Cost: 172

Linear SVM
              precision    recall  f1-score   support

           1       0.85      0.61      0.71       175
           2       0.45      0.76      0.57        75

    accuracy                           0.65       250
   macro avg       0.65      0.68      0.64       250
weighted avg       0.73      0.65      0.67       250

Confusion Matrix: 
[[106  69]
 [ 18  57]]

Total Cost: 159

Naive Bayes
              precision    recall  f1-score   support

           1       0.85      0.70      0.76       175
           2       0.50      0.71      0.59        75

    accuracy            

**Combination of undersampling and oversampling**: We reduce the number of samples in the majority class and increase the number of samples in the minority class.

In [84]:
sampler = RandomUnderSampler(sampling_strategy={1: 225, 2: 225}, random_state=42)
x_rs, y_rs = sampler.fit_resample(x_train, y_train)
sampler = RandomOverSampler(sampling_strategy={1: 225, 2: 525}, random_state=42)
x_rs, y_rs = sampler.fit_resample(x_rs, y_rs)

for name, clf in zip(classifier_names, classifiers):
  print(name)
  clf.fit(x_rs, y_rs)
  y_pred = clf.predict(x_test)
  print(classification_report(y_test, y_pred))
  conf_m = confusion_matrix(y_test, y_pred)
  print(f'Confusion Matrix: \n{conf_m}\n')
  print(f'Total Cost: {np.sum(conf_m * ORIGINAL_COST)}\n')



Random Forests
              precision    recall  f1-score   support

           1       0.91      0.52      0.66       175
           2       0.44      0.88      0.59        75

    accuracy                           0.63       250
   macro avg       0.68      0.70      0.62       250
weighted avg       0.77      0.63      0.64       250

Confusion Matrix: 
[[91 84]
 [ 9 66]]

Total Cost: 129

Linear SVM
              precision    recall  f1-score   support

           1       0.89      0.42      0.57       175
           2       0.40      0.88      0.55        75

    accuracy                           0.56       250
   macro avg       0.64      0.65      0.56       250
weighted avg       0.74      0.56      0.57       250

Confusion Matrix: 
[[ 74 101]
 [  9  66]]

Total Cost: 146

Naive Bayes
              precision    recall  f1-score   support

           1       0.85      0.63      0.73       175
           2       0.46      0.73      0.57        75

    accuracy                

### ***Cost-based Evaluation using Sample Weights***
Now we want to use sample weights to evaluate the cost of the classification.

First we assign weights to each training instance. The weights can be defined using the default cost matrix or creating a custom cost matrix based on the credit amount.

In [None]:
weights = np.zeros(y_train.shape[0])
weights[np.where(y_train == 1)] = 1;
weights[np.where(y_train == 2)] = 5;

In [None]:
for name, clf in zip(classifier_names, classifiers):
  print(name)
  clf.fit(x_train, y_train, weights)
  y_pred = clf.predict(x_test)
  print(classification_report(y_test, y_pred))
  conf_m = confusion_matrix(y_test, y_pred)
  print(f'Confusion Matrix: \n{conf_m}\n')
  print(f'Total Cost: {np.sum(conf_m * ORIGINAL_COST)}\n')

Random Forests
              precision    recall  f1-score   support

           1       0.87      0.70      0.77       175
           2       0.51      0.75      0.61        75

    accuracy                           0.71       250
   macro avg       0.69      0.72      0.69       250
weighted avg       0.76      0.71      0.72       250

Confusion Matrix: 
[[122  53]
 [ 19  56]]

Total Cost: 148

Linear SVM
              precision    recall  f1-score   support

           1       0.88      0.41      0.55       175
           2       0.38      0.87      0.53        75

    accuracy                           0.54       250
   macro avg       0.63      0.64      0.54       250
weighted avg       0.73      0.54      0.55       250

Confusion Matrix: 
[[ 71 104]
 [ 10  65]]

Total Cost: 154

Naive Bayes
              precision    recall  f1-score   support

           1       0.86      0.68      0.76       175
           2       0.50      0.75      0.60        75

    accuracy            

Based on the above results we can conclude the following:
- The Support Vector Machine (SVM) demonstrates a favorable outcome with minimal False Positives (FP); however, it concurrently exhibits the highest count of False Negatives (FN). This performance is less than ideal because while the model makes fewer errors in categorizing "Bad" customers as "Good," it overlooks a significant number of customers by misclassifying them as "Bad."
- The Random Forest (RF) algorithm boasts the lowest cost and proves to be the more favorable choice. Although it slightly underperforms compared to the SVM in terms of False Positives (FP), with a difference of nine, it manages to retain a considerable number of "Good" customers, making it a more reliable option.






Next, we evaluate the performance of the classifiers using the credit amount as weight. We keep the weight for class 1 ("Good") the same as before to be consistent with the documentation.

In [None]:
weights = np.zeros(y_train.shape[0])
weights[np.where(y_train == 1)] = 1
weights[np.where(y_train == 2)] = x_train.loc[y_train == 2, 'Credit amount']

In [None]:
for name, clf in zip(classifier_names, classifiers):
  print(name)
  clf.fit(x_train, y_train, weights)
  y_pred = clf.predict(x_test)
  print(classification_report(y_test, y_pred))
  conf_m = confusion_matrix(y_test, y_pred)
  print(f'Confusion Matrix: \n{conf_m}\n')
  print(f'Total Cost: {np.sum(conf_m * ORIGINAL_COST)}\n')

Random Forests
              precision    recall  f1-score   support

           1       0.91      0.17      0.29       175
           2       0.33      0.96      0.49        75

    accuracy                           0.41       250
   macro avg       0.62      0.57      0.39       250
weighted avg       0.74      0.41      0.35       250

Confusion Matrix: 
[[ 30 145]
 [  3  72]]

Total Cost: 160

Linear SVM
              precision    recall  f1-score   support

           1       0.85      0.19      0.31       175
           2       0.33      0.92      0.48        75

    accuracy                           0.41       250
   macro avg       0.59      0.55      0.40       250
weighted avg       0.69      0.41      0.36       250

Confusion Matrix: 
[[ 33 142]
 [  6  69]]

Total Cost: 172

Naive Bayes
              precision    recall  f1-score   support

           1       0.88      0.41      0.55       175
           2       0.38      0.87      0.53        75

    accuracy            

Based on the above results we can conclude the following:
- The Naive Bayes (NB) algorithm demonstrates the lowest cost among the models evaluated. While it exhibits a slightly higher count of False Positives (FP) compared to the Random Forest (RF) by seven and the Support Vector Machine (SVM) by four, it compensates with the lowest count of False Negatives (FN). This characteristic makes NB the preferred choice over the other algorithms.
- The low count of False Positives (FP) in the Random Forest and SVM algorithms aligns with expectations, considering the potentially significant weight associated with misclassifying the second class.