In [69]:
from sklearn.datasets import fetch_openml
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [70]:
# fetch dataset 
statlog_german_credit_data = fetch_openml(data_id=31, as_frame=True)
# data (as pandas dataframes) 

X = statlog_german_credit_data.data
y = statlog_german_credit_data.target
# metadata 
print(X.columns)

Index(['checking_status', 'duration', 'credit_history', 'purpose',
       'credit_amount', 'savings_status', 'employment',
       'installment_commitment', 'personal_status', 'other_parties',
       'residence_since', 'property_magnitude', 'age', 'other_payment_plans',
       'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker'],
      dtype='object')


In [71]:
print(X.shape)
print(y.shape)

(1000, 20)
(1000,)


In [72]:
print(X)
y = y.apply(lambda x: 1 if x == 'bad' else 0)
y


    checking_status  duration                  credit_history  \
0                <0         6  critical/other existing credit   
1          0<=X<200        48                   existing paid   
2       no checking        12  critical/other existing credit   
3                <0        42                   existing paid   
4                <0        24              delayed previously   
..              ...       ...                             ...   
995     no checking        12                   existing paid   
996              <0        30                   existing paid   
997     no checking        12                   existing paid   
998              <0        45                   existing paid   
999        0<=X<200        45  critical/other existing credit   

                 purpose  credit_amount    savings_status  employment  \
0               radio/tv           1169  no known savings         >=7   
1               radio/tv           5951              <100      1<=X<4   


0      0
1      1
2      0
3      0
4      1
      ..
995    0
996    0
997    0
998    1
999    0
Name: class, Length: 1000, dtype: category
Categories (2, int64): [1, 0]

In [73]:
numerical_features = ['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
categorical_features = list(set(X.columns) - set(numerical_features))

In [74]:
# Preprocessing for numerical data: StandardScaler
numerical_transformer = StandardScaler()

# Preprocessing for categorical data: OneHotEncoder
categorical_transformer = OneHotEncoder(drop='first')  # drop='first' to avoid the dummy variable trap

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create and fit the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', LinearRegression())])

In [77]:
# Get 60% of the dataset as the training set. Put the remaining 40% in temporary variables: x_ and y_.
x_train, x_, y_train, y_ = train_test_split(X, y, test_size=0.40, random_state=1)

# Split the 40% subset above into two: one half for cross validation and the other for the test set
x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.50, random_state=1)

# Delete temporary variables
del x_, y_

print(f"the shape of the training set (input) is: {x_train.shape}")
print(f"the shape of the training set (target) is: {y_train.shape}\n")
print(f"the shape of the cross validation set (input) is: {x_cv.shape}")
print(f"the shape of the cross validation set (target) is: {y_cv.shape}\n")
print(f"the shape of the test set (input) is: {x_test.shape}")
print(f"the shape of the test set (target) is: {y_test.shape}")
#PS: Data doesn't need scaling

the shape of the training set (input) is: (600, 20)
the shape of the training set (target) is: (600,)

the shape of the cross validation set (input) is: (200, 20)
the shape of the cross validation set (target) is: (200,)

the shape of the test set (input) is: (200, 20)
the shape of the test set (target) is: (200,)


In [78]:
x_train

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
371,no checking,18,critical/other existing credit,radio/tv,6070,<100,>=7,3,male single,none,4,car,33,none,own,2,skilled,1,yes,yes
45,no checking,11,critical/other existing credit,new car,1393,<100,<1,4,female div/dep/mar,none,4,car,35,none,own,2,high qualif/self emp/mgmt,1,none,yes
560,<0,24,existing paid,used car,2964,no known savings,>=7,4,male single,none,4,no known property,49,bank,for free,1,skilled,2,yes,yes
748,no checking,21,existing paid,used car,5248,no known savings,1<=X<4,1,male single,none,3,car,26,none,own,1,skilled,1,none,yes
419,0<=X<200,18,existing paid,new car,1042,no known savings,1<=X<4,4,female div/dep/mar,none,2,life insurance,33,none,own,1,skilled,1,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,no checking,10,existing paid,used car,2901,no known savings,<1,1,female div/dep/mar,none,4,real estate,31,none,rent,1,skilled,1,none,yes
72,<0,8,critical/other existing credit,other,1164,<100,>=7,3,male single,none,4,no known property,51,bank,for free,2,high qualif/self emp/mgmt,2,yes,yes
908,no checking,15,delayed previously,used car,3594,<100,<1,1,female div/dep/mar,none,2,life insurance,46,none,own,2,unskilled resident,1,none,yes
235,<0,24,existing paid,radio/tv,1823,<100,unemployed,4,male single,none,2,car,30,stores,own,1,high qualif/self emp/mgmt,2,none,yes
