## German Credit Data

Model development with imbalanced data for classification

This example initially uses the German credit data.

[ML tutorial](https://machinelearningmastery.com/imbalanced-classification-of-good-and-bad-credit/)

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# import data: read the raw CSV into a DataFrame for the exploratory cells below
df = pd.read_csv("german_credit.csv")

In [None]:
# preview the first rows of the raw data
df.head()

In [None]:
# (number of rows, number of columns) of the raw data
df.shape

In [None]:
# column labels of the raw data
df.columns

In [None]:
# print a concise summary of the frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# descriptive statistics for every column (include='all' covers non-numeric
# columns too), transposed so that each feature is shown as a row
df.describe(include='all').T

In [None]:
# test harness and baseline model evaluation for the german credit dataset
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyClassifier

# load the dataset
def load_dataset(full_path):
    """Load the credit data and return encoded inputs and a 0/1 target.

    Args:
        full_path: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV file that contains a ``Credit_risk`` column.

    Returns:
        A tuple ``(X, y)``. ``X`` is the output of the fitted
        ``ColumnTransformer``: object/bool columns are one-hot encoded and
        every other column is passed through unchanged. ``y`` is the
        ``Credit_risk`` column with ``'GOOD'`` encoded as 0 and ``'BAD'``
        encoded as 1; labels outside the mapping are left as they are.
    """
    # read the CSV into a DataFrame
    dataframe = pd.read_csv(full_path)
    # split into inputs and outputs
    X, y = dataframe.drop(['Credit_risk'], axis=1), dataframe['Credit_risk']
    # select categorical features
    cat_ix = X.select_dtypes(include=['object', 'bool']).columns
    # one hot encode cat features only
    ct = ColumnTransformer([('o', OneHotEncoder(), cat_ix)], remainder='passthrough')
    X = ct.fit_transform(X)
    # encode the target variable to have the classes 0 and 1
    mapping = {'GOOD': 0, 'BAD': 1}
    # map() rather than replace(): replace() with a str -> int dict depends on
    # silent downcasting of the object column, which newer pandas deprecates;
    # without it y would stay object dtype and scikit-learn could not treat it
    # as a binary target. map() infers an integer dtype from the results.
    y = y.map(lambda label: mapping.get(label, label))
    return X, y

# calculate f2 score
def f2(y_true, y_pred):
    """Return the F-beta score of ``y_pred`` against ``y_true`` with beta=2."""
    score = fbeta_score(y_true, y_pred, beta=2)
    return score

# evaluate a model
def evaluate_model(X, y, model, *, n_splits=10, n_repeats=3, random_state=1):
    """Score a model with repeated stratified k-fold cross-validation.

    Args:
        X: Input features passed to ``cross_val_score``.
        y: Target values passed to ``cross_val_score``.
        model: Estimator (or pipeline) to evaluate.
        n_splits: Number of folds per repeat (default 10).
        n_repeats: Number of times the k-fold split is repeated (default 3).
        random_state: Seed for the fold assignment (default 1), so repeated
            calls evaluate on the same splits.

    Returns:
        The scores returned by ``cross_val_score``, computed with the ``f2``
        metric.
    """
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    # wrap the f2 function so cross_val_score can use it as the metric
    metric = make_scorer(f2)
    # evaluate model; n_jobs=-1 is passed through to cross_val_score
    return cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)

In [None]:
# baseline run: file holding the credit data
full_path = 'german_credit.csv'
# read the file and encode inputs and target
X, y = load_dataset(full_path)
# report the dimensions and the class counts of the target
print(X.shape, y.shape, Counter(y))
# reference model: always predicts class 1
model = DummyClassifier(strategy='constant', constant=1)
# cross-validated F2 scores of the reference model
scores = evaluate_model(X, y, model)
# report the mean and the standard deviation of those scores
mean_f2, std_f2 = np.mean(scores), np.std(scores)
print('Mean F2: %.3f (%.3f)' % (mean_f2, std_f2))

### Evaluate models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# candidate estimators keyed by a short label; each value is a dict that
# holds the estimator under the 'model' key
model_dict = {
    label: {'model': estimator}
    for label, estimator in [
        ('LR', LogisticRegression(solver='liblinear')),
        ('LDA', LinearDiscriminantAnalysis()),
        ('NB', GaussianNB()),
        ('GPC', GaussianProcessClassifier()),
        ('SVM', SVC(gamma='scale')),
    ]
}

In [None]:
# show the configured estimator of every entry in model_dict
for entry in model_dict.values():
    print(entry['model'])

In [None]:
# load the dataset
def load_dataset1(full_path):
    """Load the credit data without encoding the input features.

    Args:
        full_path: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV file that contains a ``Credit_risk`` column.

    Returns:
        A tuple ``(X, y, cat_ix, num_ix)``. ``X`` is the DataFrame without
        the ``Credit_risk`` column. ``y`` is that column with ``'GOOD'``
        encoded as 0 and ``'BAD'`` encoded as 1; labels outside the mapping
        are left as they are. ``cat_ix`` holds the labels of the object/bool
        columns of ``X`` and ``num_ix`` the labels of its int64/float64
        columns.
    """
    # read the CSV into a DataFrame
    dataframe = pd.read_csv(full_path)
    # split into inputs and outputs
    X, y = dataframe.drop(['Credit_risk'], axis=1), dataframe['Credit_risk']
    # select categorical and numerical features
    cat_ix = X.select_dtypes(include=['object', 'bool']).columns
    num_ix = X.select_dtypes(include=['int64', 'float64']).columns
    # encode the target variable to have the classes 0 and 1
    mapping = {'GOOD': 0, 'BAD': 1}
    # map() rather than replace(): replace() with a str -> int dict depends on
    # silent downcasting of the object column, which newer pandas deprecates;
    # without it y would stay object dtype and scikit-learn could not treat it
    # as a binary target. map() infers an integer dtype from the results.
    y = y.map(lambda label: mapping.get(label, label))
    return X, y, cat_ix, num_ix

In [None]:
# Method to create scores from model dictionary
def model_run(X, y, cat_ix, num_ix, models=None):
    """Cross-validate every model in a model dictionary and summarize scores.

    Args:
        X: Input features; must contain the columns named in ``cat_ix`` and
            ``num_ix``.
        y: Target values.
        cat_ix: Columns to one-hot encode.
        num_ix: Columns to scale with ``MinMaxScaler``.
        models: Optional mapping of model name to a dict holding the
            estimator under the ``'model'`` key. Defaults to the module-level
            ``model_dict``.

    Returns:
        A list with one dict per model, holding the keys ``'model'``,
        ``'mean_score'`` and ``'std_score'``.
    """
    if models is None:
        # fall back to the notebook-level dictionary, as before
        models = model_dict
    result = []
    for model_name, model_param in models.items():
        # only the columns listed in cat_ix / num_ix are handed to the
        # transformers; no remainder option is set here
        ct = ColumnTransformer([('c', OneHotEncoder(), cat_ix), ('n', MinMaxScaler(), num_ix)])
        # preprocessing lives inside the pipeline that evaluate_model scores
        pipeline = Pipeline(steps=[('t', ct), ('m', model_param['model'])])
        scores = evaluate_model(X, y, pipeline)
        result.append({
            'model': model_name,
            'mean_score': np.mean(scores),
            'std_score': np.std(scores),
        })
    return result

In [None]:
# model comparison run: file holding the credit data
full_path = 'german_credit.csv'
# read the file; also get the categorical and numerical column labels
X, y, cat_ix, num_ix = load_dataset1(full_path)
# cross-validate every model in model_dict
scores = model_run(X, y, cat_ix, num_ix)
# collect the per-model summaries in a table
score_columns = ['model', 'mean_score', 'std_score']
df_scores = pd.DataFrame(scores, columns=score_columns)
df_scores

In [None]:
# plot the results - df_scores holds one row per model (model_run already
# averaged all cross-validation scores), so each box is drawn from a single
# mean_score value rather than from the distribution of scores
df_scores.boxplot(column='mean_score', by='model')
plt.show()