In [66]:
from ucimlrepo import fetch_ucirepo

# General
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
# fetch dataset 
statlog_german_credit_data = fetch_ucirepo(id = 144) 

In [3]:
# data (as pandas dataframes) 
X = statlog_german_credit_data.data.features 
y = statlog_german_credit_data.data.targets 

# EDA

In [4]:
X

Unnamed: 0,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Attribute11,Attribute12,Attribute13,Attribute14,Attribute15,Attribute16,Attribute17,Attribute18,Attribute19,Attribute20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A14,12,A32,A42,1736,A61,A74,3,A92,A101,4,A121,31,A143,A152,1,A172,1,A191,A201
996,A11,30,A32,A41,3857,A61,A73,4,A91,A101,4,A122,40,A143,A152,1,A174,1,A192,A201
997,A14,12,A32,A43,804,A61,A75,4,A93,A101,4,A123,38,A143,A152,1,A173,1,A191,A201
998,A11,45,A32,A43,1845,A61,A73,4,A93,A101,4,A124,23,A143,A153,1,A173,1,A192,A201


In [51]:
y

Unnamed: 0,class
0,1
1,2
2,1
3,1
4,2
...,...
995,1
996,1
997,1
998,2


In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Attribute1   1000 non-null   object
 1   Attribute2   1000 non-null   int64 
 2   Attribute3   1000 non-null   object
 3   Attribute4   1000 non-null   object
 4   Attribute5   1000 non-null   int64 
 5   Attribute6   1000 non-null   object
 6   Attribute7   1000 non-null   object
 7   Attribute8   1000 non-null   int64 
 8   Attribute9   1000 non-null   object
 9   Attribute10  1000 non-null   object
 10  Attribute11  1000 non-null   int64 
 11  Attribute12  1000 non-null   object
 12  Attribute13  1000 non-null   int64 
 13  Attribute14  1000 non-null   object
 14  Attribute15  1000 non-null   object
 15  Attribute16  1000 non-null   int64 
 16  Attribute17  1000 non-null   object
 17  Attribute18  1000 non-null   int64 
 18  Attribute19  1000 non-null   object
 19  Attribute20  1000 non-null  

In [37]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   class   1000 non-null   int64
dtypes: int64(1)
memory usage: 7.9 KB


In [13]:
data_dic = pd.DataFrame(statlog_german_credit_data.variables)
print(data_dic)

           name     role         type     demographic  \
0    Attribute1  Feature  Categorical            None   
1    Attribute2  Feature      Integer            None   
2    Attribute3  Feature  Categorical            None   
3    Attribute4  Feature  Categorical            None   
4    Attribute5  Feature      Integer            None   
5    Attribute6  Feature  Categorical            None   
6    Attribute7  Feature  Categorical           Other   
7    Attribute8  Feature      Integer            None   
8    Attribute9  Feature  Categorical  Marital Status   
9   Attribute10  Feature  Categorical            None   
10  Attribute11  Feature      Integer            None   
11  Attribute12  Feature  Categorical            None   
12  Attribute13  Feature      Integer             Age   
13  Attribute14  Feature  Categorical            None   
14  Attribute15  Feature  Categorical           Other   
15  Attribute16  Feature      Integer            None   
16  Attribute17  Feature  Categ

## Data Cleaning

In [46]:
# Seperate Features by type

categorical_features = data_dic[(data_dic["type"] == "Categorical" ) | (data_dic["type"] == "Binary") & (data_dic["role"] == "feature")]["name"].values
print(categorical_features)
#categorical_features.info()
#binary_features = data_dic[data_dic["type"] == "Binary"]["name"]

numerical_features = data_dic[data_dic["type"] == "Integer"]["name"].values
print(numerical_features)

['Attribute1' 'Attribute3' 'Attribute4' 'Attribute6' 'Attribute7'
 'Attribute9' 'Attribute10' 'Attribute12' 'Attribute14' 'Attribute15'
 'Attribute17']
['Attribute2' 'Attribute5' 'Attribute8' 'Attribute11' 'Attribute13'
 'Attribute16' 'Attribute18']


In [47]:
numeric_transformer = Pipeline(
    steps = [("imputer", SimpleImputer(strategy = "median")), ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps = [
        ("encoder", OneHotEncoder(handle_unknown = "ignore")),
        ("selector", SelectPercentile(chi2, percentile = 50)),
    ]
)
    
preprocessor = ColumnTransformer(
    transformers = [
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [48]:
logistic_pipeline = Pipeline(
    steps = [("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y["class"].values, test_size = 0.30, random_state = 12345)

In [65]:
logistic_pipeline.fit(X_train, y_train)
print("model score: %.3f" % logistic_pipeline.score(X_test, y_test))

model score: 0.757


In [19]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X)
enc.categories_

[array(['A11', 'A12', 'A13', 'A14'], dtype=object),
 array([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22,
        24, 26, 27, 28, 30, 33, 36, 39, 40, 42, 45, 47, 48, 54, 60, 72],
       dtype=int64),
 array(['A30', 'A31', 'A32', 'A33', 'A34'], dtype=object),
 array(['A40', 'A41', 'A410', 'A42', 'A43', 'A44', 'A45', 'A46', 'A48',
        'A49'], dtype=object),
 array([  250,   276,   338,   339,   343,   362,   368,   385,   392,
          409,   426,   428,   433,   448,   454,   458,   484,   518,
          522,   571,   585,   590,   601,   609,   618,   625,   626,
          629,   639,   640,   652,   654,   660,   662,   666,   672,
          674,   682,   683,   684,   685,   691,   697,   700,   701,
          707,   708,   709,   717,   719,   727,   730,   731,   741,
          745,   750,   753,   754,   759,   760,   763,   766,   776,
          781,   783,   790,   795,   797,   802,   804,   806,   836,
          841,   846,   860,   866,   874,   882

In [None]:
rf = RandomForestRegressor(n_estimators = 100,
                                  random_state = 12345,
                                  oob_score = True)

rf.fit(X_train, y_train)

In [67]:
random_forest_pipeline = Pipeline(
    steps = [("preprocessor", preprocessor), ("classifier", RandomForestClassifier())]
)

In [68]:
random_forest_pipeline.fit(X_train, y_train)
print("model score: %.3f" % random_forest_pipeline.score(X_test, y_test))

model score: 0.793
