In [124]:
from sklearn.datasets import fetch_openml
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
from tensorflow.keras.models import Sequential
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scikeras.wrappers import KerasClassifier



In [125]:
# fetch dataset 
statlog_german_credit_data = fetch_openml(data_id=31, as_frame=True)
# data (as pandas dataframes) 

X = statlog_german_credit_data.data
y = statlog_german_credit_data.target
# metadata 
print(X.columns)


Index(['checking_status', 'duration', 'credit_history', 'purpose',
       'credit_amount', 'savings_status', 'employment',
       'installment_commitment', 'personal_status', 'other_parties',
       'residence_since', 'property_magnitude', 'age', 'other_payment_plans',
       'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker'],
      dtype='object')


In [126]:
print(X.shape)
print(y.shape)

(1000, 20)
(1000,)


In [127]:
print(X)
y = y.apply(lambda x: 1 if x == 'bad' else 0)
y


    checking_status  duration                  credit_history  \
0                <0         6  critical/other existing credit   
1          0<=X<200        48                   existing paid   
2       no checking        12  critical/other existing credit   
3                <0        42                   existing paid   
4                <0        24              delayed previously   
..              ...       ...                             ...   
995     no checking        12                   existing paid   
996              <0        30                   existing paid   
997     no checking        12                   existing paid   
998              <0        45                   existing paid   
999        0<=X<200        45  critical/other existing credit   

                 purpose  credit_amount    savings_status  employment  \
0               radio/tv           1169  no known savings         >=7   
1               radio/tv           5951              <100      1<=X<4   


0      0
1      1
2      0
3      0
4      1
      ..
995    0
996    0
997    0
998    1
999    0
Name: class, Length: 1000, dtype: category
Categories (2, int64): [1, 0]

In [202]:
x_train, x_, y_train, y_ = train_test_split(X, y, test_size=0.30, random_state=1)

# Split the 40% subset above into two: one half for cross validation and the other for the test set
x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.5, random_state=1)

# Delete temporary variables
del x_, y_

print(f"the shape of the training set (input) is: {x_train.shape}")
print(f"the shape of the training set (target) is: {y_train.shape}\n")
print(f"the shape of the cross validation set (input) is: {x_cv.shape}")
print(f"the shape of the cross validation set (target) is: {y_cv.shape}\n")
print(f"the shape of the test set (input) is: {x_test.shape}")
print(f"the shape of the test set (target) is: {y_test.shape}")


the shape of the training set (input) is: (700, 20)
the shape of the training set (target) is: (700,)

the shape of the cross validation set (input) is: (150, 20)
the shape of the cross validation set (target) is: (150,)

the shape of the test set (input) is: (150, 20)
the shape of the test set (target) is: (150,)


In [203]:
x_train

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
731,<0,24,existing paid,radio/tv,1987,<100,1<=X<4,2,male single,none,4,real estate,21,none,rent,1,unskilled resident,2,none,yes
716,no checking,30,critical/other existing credit,radio/tv,3077,no known savings,>=7,3,male single,none,2,car,40,none,own,2,skilled,2,yes,yes
640,<0,18,existing paid,education,750,<100,unemployed,4,female div/dep/mar,none,1,real estate,27,none,own,1,unemp/unskilled non res,1,none,yes
804,0<=X<200,12,existing paid,new car,7472,no known savings,unemployed,1,female div/dep/mar,none,2,real estate,24,none,rent,1,unemp/unskilled non res,1,none,yes
737,<0,18,existing paid,new car,4380,100<=X<500,1<=X<4,3,male single,none,4,car,35,none,own,1,unskilled resident,2,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,no checking,10,existing paid,used car,2901,no known savings,<1,1,female div/dep/mar,none,4,real estate,31,none,rent,1,skilled,1,none,yes
72,<0,8,critical/other existing credit,other,1164,<100,>=7,3,male single,none,4,no known property,51,bank,for free,2,high qualif/self emp/mgmt,2,yes,yes
908,no checking,15,delayed previously,used car,3594,<100,<1,1,female div/dep/mar,none,2,life insurance,46,none,own,2,unskilled resident,1,none,yes
235,<0,24,existing paid,radio/tv,1823,<100,unemployed,4,male single,none,2,car,30,stores,own,1,high qualif/self emp/mgmt,2,none,yes


In [204]:
numerical_features = ['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
categorical_features = list(set(X.columns) - set(numerical_features))


In [205]:
# Preprocessing for numerical data: StandardScaler
numerical_transformer = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: OneHotEncoder
categorical_transformer = OneHotEncoder(drop='first')  # drop='first' to avoid the dummy variable trap

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
preprocessor.fit(x_train)


In [206]:
X_train_transformed = preprocessor.transform(x_train)

In [207]:
X_train_transformed.shape

(700, 76)

In [208]:

# def createModel(optimizer='adam', init='glorot_uniform'):
#     model = Sequential([
#         Dense(256, input_shape=(X_train_transformed.shape[1],), activation='relu', kernel_initializer=init),
#         Dense(128, activation='relu', kernel_initializer=init),
#         Dense(64, activation='relu', kernel_initializer=init),
#         Dense(32, activation='relu', kernel_initializer=init),
#         Dense(16, activation='relu', kernel_initializer=init),
#         Dense(1, activation='sigmoid', kernel_initializer=init)
#     ])
#     model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
#     return model

In [209]:
keras_model = KerasClassifier(build_fn=createModel,  epochs=100, batch_size=10, verbose=0)
# Create and fit the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', LogisticRegression(max_iter=1000))])


In [210]:
pipeline.fit(x_train, y_train)

In [211]:
# keras_model_ = pipeline.named_steps['model'].model_
# keras_model_.summary()

In [212]:

# Predict and calculate training error
yhat_train = pipeline.predict(x_train)
train_error = np.mean(yhat_train != y_train)
print(f"Fraction of misclassified training data: {train_error}")

# Predict and calculate cross-validation error
yhat_cv = pipeline.predict(x_cv)
cv_error = np.mean(yhat_cv != y_cv)
print(f"Fraction of misclassified cross-validation data: {cv_error}")


Fraction of misclassified training data: 0.18571428571428572
Fraction of misclassified cross-validation data: 0.22


In [213]:
yhat_test = pipeline.predict(x_test)
test_error = np.mean(yhat_test != y_test)
print(f"Fraction of misclassified test data: {test_error}")
df_new = pd.DataFrame({
    'Prediction': yhat_test,
    'Actual Result': y_test
})
finalDataFrame = pd.concat([x_test, df_new], axis=1)

Fraction of misclassified test data: 0.26666666666666666


In [176]:
finalDataFrame[:50]

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,Prediction,Actual Result
486,no checking,12,existing paid,radio/tv,3077,<100,1<=X<4,2,male single,none,...,52,none,own,1,skilled,1,yes,yes,0,0
872,<0,24,critical/other existing credit,business,1382,100<=X<500,4<=X<7,4,male single,none,...,26,none,own,2,skilled,1,yes,yes,0,0
404,0<=X<200,18,delayed previously,new car,2899,no known savings,>=7,4,male single,none,...,43,none,own,1,skilled,2,none,yes,0,0
911,0<=X<200,24,critical/other existing credit,furniture/equipment,4736,<100,<1,2,female div/dep/mar,none,...,25,bank,own,1,unskilled resident,1,none,yes,0,1
531,0<=X<200,15,existing paid,new car,2631,100<=X<500,1<=X<4,2,female div/dep/mar,none,...,28,none,rent,2,skilled,1,yes,yes,0,1
608,no checking,18,existing paid,radio/tv,2051,<100,<1,4,male single,none,...,33,none,own,1,skilled,1,none,yes,0,0
671,no checking,36,existing paid,business,5742,100<=X<500,4<=X<7,2,male single,none,...,31,none,own,2,skilled,1,yes,yes,0,0
242,<0,48,no credits/all paid,used car,4605,<100,>=7,3,male single,none,...,24,none,for free,2,skilled,2,none,yes,1,1
374,0<=X<200,60,all paid,other,14782,100<=X<500,>=7,3,female div/dep/mar,none,...,60,bank,for free,2,high qualif/self emp/mgmt,1,yes,yes,1,1
797,no checking,12,critical/other existing credit,furniture/equipment,1258,<100,<1,2,female div/dep/mar,none,...,22,none,rent,2,unskilled resident,1,none,yes,0,0


In [177]:
finalDataFrame[50:]

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,Prediction,Actual Result
258,no checking,15,existing paid,used car,3812,100<=X<500,<1,1,female div/dep/mar,none,...,23,none,own,1,skilled,1,yes,yes,0,0
194,0<=X<200,45,existing paid,radio/tv,3031,100<=X<500,1<=X<4,4,male single,guarantor,...,21,none,rent,1,skilled,1,none,yes,1,1
794,no checking,24,existing paid,furniture/equipment,3062,500<=X<1000,>=7,4,male single,none,...,32,none,rent,1,skilled,1,yes,yes,0,0
35,0<=X<200,45,critical/other existing credit,radio/tv,4746,<100,<1,4,male single,none,...,25,none,own,2,unskilled resident,1,none,yes,1,1
494,<0,12,critical/other existing credit,new car,2122,<100,1<=X<4,3,male single,none,...,39,none,rent,2,unskilled resident,2,none,no,0,0
967,no checking,15,existing paid,radio/tv,3568,<100,>=7,4,female div/dep/mar,none,...,54,bank,rent,1,high qualif/self emp/mgmt,1,yes,yes,0,0
50,0<=X<200,24,delayed previously,furniture/equipment,2333,no known savings,<1,4,male single,none,...,29,bank,own,1,unskilled resident,1,none,yes,0,0
267,no checking,24,existing paid,radio/tv,1533,<100,<1,4,female div/dep/mar,none,...,38,stores,own,1,skilled,1,yes,yes,0,0
207,0<=X<200,12,critical/other existing credit,domestic appliance,1424,<100,4<=X<7,4,male single,none,...,26,none,own,1,skilled,1,none,yes,0,0
358,no checking,12,existing paid,radio/tv,776,<100,1<=X<4,4,male mar/wid,none,...,28,none,own,1,skilled,1,none,yes,0,0
