In [2]:
import pandas as pd
import numpy as np

# Load the training set, the test set and the column dictionary in one pass.
app_train, app_test, column_description = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataset/application_train_aai.csv',
        'dataset/application_test_aai.csv',
        'dataset/HomeCredit_columns_description.csv',
    )
)

In [1]:
import sklearn

# Display the installed scikit-learn version so the environment used for this
# run is recorded next to the results.
sklearn.__version__

'1.2.1'

In [3]:
# Split the object-dtype columns of the training frame by cardinality:
# exactly two distinct non-null values -> bin_cols, anything else -> multilabel_cols.
df_cat = app_train.select_dtypes(object)
n_levels = df_cat.nunique()  # distinct non-null values per column
bin_cols = [name for name in df_cat.columns if n_levels[name] == 2]
multilabel_cols = [name for name in df_cat.columns if n_levels[name] != 2]

In [4]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

In [5]:
# Separate the label column from the feature columns of the training data.
train_y = app_train["TARGET"]
train_df = app_train.drop(columns=["TARGET"])

In [15]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# shuffled split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_df,
    train_y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [32]:
# Same feature/label separation for the test file.
y_test = app_test['TARGET']
X_test = app_test.drop(columns=['TARGET'])

In [12]:
# Two-level categorical columns are ordinal-encoded; a category not seen during
# fit is encoded as NaN.
bin_enc = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)
# Remaining categorical columns are one-hot encoded; unseen categories are ignored.
one_hot = OneHotEncoder(handle_unknown='ignore')
# Columns not listed in either group are passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('bin_enc', bin_enc, bin_cols),
        ('one_hot', one_hot, multilabel_cols),
    ],
    remainder='passthrough',
)

In [25]:
X_val = transformer.transform(X_val)
X_val

array([[0., 0., 1., ..., 0., 1., 2.],
       [0., 1., 0., ..., 1., 0., 2.],
       [1., 0., 1., ..., 2., 0., 1.],
       ...,
       [0., 1., 1., ..., 1., 1., 1.],
       [0., 0., 1., ..., 1., 0., 2.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
X_test = transformer.transform(X_test)
X_test

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 5.],
       [0., 1., 1., ..., 0., 2., 3.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 1., 0., 2.],
       [0., 0., 1., ..., 0., 0., 8.]])

In [16]:
X_train = transformer.fit_transform(X_train)
X_train

array([[0., 1., 0., ..., 0., 1., 6.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 2.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 5.]])

In [35]:
# Wrap the encoded arrays in DataFrames so they can be inspected with pandas.
df, df_val, df_test = (
    pd.DataFrame(split) for split in (X_train, X_val, X_test)
)

In [36]:
# All three splits must end up with the same number of encoded columns.
tuple(frame.shape for frame in (df, df_val, df_test))

((196806, 246), (49202, 246), (61503, 246))

In [37]:
from sklearn.impute import SimpleImputer

# Learn per-column medians from the training split, then fill its missing values.
imp = SimpleImputer(strategy='median')
imp.fit(X_train)
X_train = imp.transform(X_train)
X_train

array([[0., 1., 0., ..., 0., 1., 6.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 2.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 5.]])

In [41]:
# Fill missing values in the held-out splits with the medians fitted on the training split.
X_val, X_test = (imp.transform(split) for split in (X_val, X_test))

In [42]:
# Rebuild the inspection DataFrames from the imputed arrays.
df, df_val, df_test = (
    pd.DataFrame(split) for split in (X_train, X_val, X_test)
)

In [43]:
# Total number of missing cells left in the validation frame after imputation.
df_val.isnull().sum(axis="index").sum()

0

In [44]:
# Total number of missing cells left in the test frame after imputation.
df_test.isnull().sum(axis="index").sum()

0

In [45]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max from the training split and rescale it.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_train

array([[0.        , 1.        , 0.        , ..., 0.        , 0.05263158,
        0.24      ],
       [0.        , 0.        , 0.        , ..., 0.03703704, 0.        ,
        0.04      ],
       [0.        , 0.        , 0.        , ..., 0.03703704, 0.        ,
        0.04      ],
       ...,
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.08      ],
       [0.        , 1.        , 1.        , ..., 0.        , 0.05263158,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.2       ]])

In [47]:
# Sanity check: overall (max, min) of the scaled training matrix.
X_train.max(), X_train.min()

(1.0, 0.0)

In [48]:
# Rescale the held-out splits with the scaler fitted on the training split.
# Because the min/max come from the training split, values here can fall outside [0, 1].
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
[(split.max(), split.min()) for split in (X_val, X_test)]

[(1.0104807473228525, -0.0028756290438534116),
 (13.736842105263158, -0.02851665468487896)]

In [49]:
# Preview the scaled validation matrix.
X_val

array([[0.        , 0.        , 1.        , ..., 0.        , 0.05263158,
        0.08      ],
       [0.        , 1.        , 0.        , ..., 0.03703704, 0.        ,
        0.08      ],
       [1.        , 0.        , 1.        , ..., 0.07407407, 0.        ,
        0.04      ],
       ...,
       [0.        , 1.        , 1.        , ..., 0.03703704, 0.05263158,
        0.04      ],
       [0.        , 0.        , 1.        , ..., 0.03703704, 0.        ,
        0.08      ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [51]:
# Largest value in the scaled test matrix.
X_test.max()

13.736842105263158