In [10]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [11]:
# Location of the raw Loan_Default dataset (CSV hosted on GitHub).
DATA_URL = "https://raw.githubusercontent.com/Kolo-Naukowe-Data-Science-PW/ML_intro_24-25/refs/heads/main/data/Loan_Default.csv"

# Load the whole CSV into a single DataFrame; every later cell starts from `data`.
data = pd.read_csv(DATA_URL)

## 1. Podział na zbiory Train - Test
* Zbiór Train to 80% danych, a zbiór Test to 20% danych. Wiersze są wybierane losowo
  (a nie np. pierwsze 80% do Train), dlatego ustalamy random_state, aby podział był powtarzalny
* Jako target (y) do przewidywania bierzemy kolumnę Status
* Pozbywamy się kolumn ID oraz year, ponieważ nic nie wniosą do modelu

In [12]:
# Feature matrix: everything except the target ('Status') and the two columns
# the notebook discards ('ID', 'year').
x = data.drop(columns=['Status', 'ID', 'year'])

# Prediction target.
y = data['Status']

In [13]:
# 80/20 train/test split; the fixed random_state keeps the split reproducible
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## 2. Przetwarzanie danych

In [14]:
# Partition the training features by dtype: 'object' columns are treated as
# categorical, int64/float64 columns as numerical. Columns of any other dtype
# end up in neither list.
categorical_columns = [
    name for name, dtype in X_train.dtypes.items() if dtype == 'object'
]
numerical_columns = [
    name for name, dtype in X_train.dtypes.items() if dtype in ['int64', 'float64']
]

# Missing-value count per categorical column. Note that the counts are taken
# over the full `data` frame, not only over the training split.
print("Braki w danych Kateogrycznych")
data[categorical_columns].isna().sum()

Braki w danych Kateogrycznych


loan_limit                   3344
Gender                          0
approv_in_adv                 908
loan_type                       0
loan_purpose                  134
Credit_Worthiness               0
open_credit                     0
business_or_commercial          0
Neg_ammortization             121
interest_only                   0
lump_sum_payment                0
construction_type               0
occupancy_type                  0
Secured_by                      0
total_units                     0
credit_type                     0
co-applicant_credit_type        0
age                           200
submission_of_application     200
Region                          0
Security_Type                   0
dtype: int64

In [15]:
# Missing-value count per numerical column, computed over the full `data`
# frame (same convention as the categorical report).
print("Braki w danych Numerycznych")
data[numerical_columns].isna().sum()

Braki w danych Numerycznych


loan_amount                 0
rate_of_interest        36439
Interest_rate_spread    36639
Upfront_charges         39642
term                       41
property_value          15098
income                   9150
Credit_Score                0
LTV                     15098
dtir1                   24121
dtype: int64

In [16]:
# Numerical features: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform() from failing on categories
# that were not present when the encoder was fitted.
# The step name 'onehot' is looked up later to recover the feature names.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline. The order here
# (numerical first, categorical second) is the order of the output columns.
pipeline = ColumnTransformer([
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
])

In [None]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformation to the test split (no statistics are learned from test data).
train_values = pipeline.fit_transform(X_train)
test_values = pipeline.transform(X_test)

# Column labels: numerical columns keep their names; the one-hot columns take
# the names produced by the fitted encoder. The encoder is queried without the
# input column names, so its columns come out as x<i>_<category>.
onehot_encoder = pipeline.named_transformers_['cat'].named_steps['onehot']
transformed_columns = numerical_columns + list(onehot_encoder.get_feature_names_out())

# Wrap the transformed arrays in DataFrames. The row index is a fresh
# 0..n-1 range, not the original index of X_train / X_test.
X_train_transformed = pd.DataFrame(train_values, columns=transformed_columns)
X_test_transformed = pd.DataFrame(test_values, columns=transformed_columns)

In [18]:
X_train_transformed.head()

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,property_value,income,Credit_Score,LTV,dtir1,...,x17_<25,x17_>74,x18_not_inst,x18_to_inst,x19_North,x19_North-East,x19_central,x19_south,x20_Indriect,x20_direct
0,-1.382543,-1.089765,-0.838457,-0.154489,0.425386,-1.110773,-0.68322,-0.818669,-0.053303,-2.684156,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1.220632,-0.066885,-0.680613,-0.164956,0.425386,1.274777,0.046618,0.25087,-0.315776,0.524081,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,-1.111379,-0.834045,2.380745,0.035311,-2.658811,-0.994405,-0.738651,-0.784168,0.300334,0.42059,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,-0.460585,-0.087342,-0.08713,-0.164956,0.425386,-0.587116,0.203672,-0.861796,0.303168,0.317098,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.841002,0.188835,-1.107733,-0.416427,0.425386,0.983856,0.314533,0.009361,-0.345044,-0.303851,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [19]:
X_train_transformed.describe(include='all')

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,property_value,income,Credit_Score,LTV,dtir1,...,x17_<25,x17_>74,x18_not_inst,x18_to_inst,x19_North,x19_North-East,x19_central,x19_south,x20_Indriect,x20_direct
count,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,...,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0
mean,5.705323e-18,-7.305801e-16,8.038233000000001e-17,2.067059e-16,-4.571428e-16,-4.9391370000000006e-17,-5.693375000000001e-17,3.718317e-16,1.65305e-16,-2.56172e-16,...,0.008954,0.048589,0.353736,0.646264,0.50211,0.008399,0.058452,0.431039,0.00021,0.99979
std,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,...,0.094203,0.215008,0.478131,0.478131,0.499998,0.091263,0.234596,0.495224,0.014497,0.014497
min,-1.70794,-8.249927,-9.118744,-1.089982,-4.098103,-1.401694,-1.061997,-1.724327,-1.740993,-3.408597,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.7317496,-0.5783249,-0.5534866,-0.6445091,0.425386,-0.587116,-0.4707358,-0.870421,-0.2360485,-0.5108341,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,-0.1894214,-0.08734241,-0.08713045,-0.1649559,0.425386,-0.2089189,-0.1751051,-0.00788952,0.05062967,0.1101151,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
75%,0.5698381,0.4445553,0.4278792,0.2927654,0.425386,0.3147385,0.2129101,0.8632672,0.2831916,0.6275727,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
max,17.59894,8.116157,6.564722,20.29271,0.425386,46.60024,88.02446,1.725799,187.4085,2.386929,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
X_train_transformed.isnull().sum()

loan_amount             0
rate_of_interest        0
Interest_rate_spread    0
Upfront_charges         0
term                    0
                       ..
x19_North-East          0
x19_central             0
x19_south               0
x20_Indriect            0
x20_direct              0
Length: 69, dtype: int64