### Importing libraries & functions





In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression


### Importing dataset

In [2]:
dataset=pd.read_csv("../Downloads/home-credit-default-risk/application_train.csv")

### Data preparation

In [3]:

dataset.shape

(307511, 122)

In [4]:

dataset.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:

dataset=dataset.drop('SK_ID_CURR',axis=1)
dataset.shape

(307511, 121)

In [6]:

dataset.isna().sum()

TARGET                            0
NAME_CONTRACT_TYPE                0
CODE_GENDER                       0
FLAG_OWN_CAR                      0
FLAG_OWN_REALTY                   0
                              ...  
AMT_REQ_CREDIT_BUREAU_DAY     41519
AMT_REQ_CREDIT_BUREAU_WEEK    41519
AMT_REQ_CREDIT_BUREAU_MON     41519
AMT_REQ_CREDIT_BUREAU_QRT     41519
AMT_REQ_CREDIT_BUREAU_YEAR    41519
Length: 121, dtype: int64

In [7]:
# filling missing values with mean


In [8]:

dataset['TARGET'].value_counts()

TARGET
0    282686
1     24825
Name: count, dtype: int64

In [9]:
categorical_columns = dataset.select_dtypes(include=['object','string']).columns.tolist()

#Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Apply one-hot encoding to the categorical columns
one_hot_encoded = encoder.fit_transform(dataset[categorical_columns])

#Create a DataFrame with the one-hot encoded columns
#We use get_feature_names_out() to get the column names for the encoded data
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate the one-hot encoded dataframe with the original dataframe
df_encoded = pd.concat([dataset, one_hot_df], axis=1)

# Drop the original categorical columns
df_encoded = df_encoded.drop(categorical_columns, axis=1)

# Display the resulting dataframe
print(f"Encoded Employee data : \n{df_encoded}")

Encoded Employee data : 
        TARGET  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0            1             0          202500.0    406597.5      24700.5   
1            0             0          270000.0   1293502.5      35698.5   
2            0             0           67500.0    135000.0       6750.0   
3            0             0          135000.0    312682.5      29686.5   
4            0             0          121500.0    513000.0      21865.5   
...        ...           ...               ...         ...          ...   
307506       0             0          157500.0    254700.0      27558.0   
307507       0             0           72000.0    269550.0      12001.5   
307508       0             0          153000.0    677664.0      29979.0   
307509       1             0          171000.0    370107.0      20205.0   
307510       0             0          157500.0    675000.0      49117.5   

        AMT_GOODS_PRICE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  \
0      

In [10]:
dataset=df_encoded.fillna(0)


In [11]:
from sklearn.feature_selection import VarianceThreshold

# Mengatur threshold untuk VarianceThreshold
# Variance yang rendah berarti data tersebut kurang variatif, dan mungkin tidak berguna
selector = VarianceThreshold(threshold=0.02)

# Fit dan transform data
dataset_reduced = selector.fit_transform(dataset)

# Mengubah kembali ke DataFrame
dataset_reduced = pd.DataFrame(dataset_reduced, columns=dataset.columns[selector.get_support(indices=True)])

# Menampilkan jumlah fitur sebelum dan sesudah
print(f"Jumlah fitur sebelum reduksi: {dataset.shape[1]}")
print(f"Jumlah fitur setelah reduksi: {dataset_reduced.shape[1]}")

# Mengganti dataset lama dengan yang baru
dataset = dataset_reduced




Jumlah fitur sebelum reduksi: 251
Jumlah fitur setelah reduksi: 115


In [12]:
dataset.isna().sum()

TARGET                             0
CNT_CHILDREN                       0
AMT_INCOME_TOTAL                   0
AMT_CREDIT                         0
AMT_ANNUITY                        0
                                  ..
WALLSMATERIAL_MODE_Panel           0
WALLSMATERIAL_MODE_Stone, brick    0
WALLSMATERIAL_MODE_nan             0
EMERGENCYSTATE_MODE_No             0
EMERGENCYSTATE_MODE_nan            0
Length: 115, dtype: int64

### Train Test Split

In [24]:
y = dataset.iloc[:, 0].values
X = dataset.iloc[:, 1:115].values

In [25]:

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=y)

In [26]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Risk Model building

In [27]:
classifier =  LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Model *performance*

In [28]:
print(confusion_matrix(y_test,y_pred))

[[56526    12]
 [ 4954    11]]


In [29]:
print(accuracy_score(y_test, y_pred))

0.9192559712534348


In [30]:
r2_score(y_test,y_pred)

-0.08803614049273478

In [31]:
mean_squared_error(y_test,y_pred) 
 

0.08074402874656521

In [32]:
predictions = classifier.predict_proba(X_test)
predictions

array([[0.96253967, 0.03746033],
       [0.93714906, 0.06285094],
       [0.89593005, 0.10406995],
       ...,
       [0.91358809, 0.08641191],
       [0.86797155, 0.13202845],
       [0.93503382, 0.06496618]])

In [33]:


df_prediction_prob = pd.DataFrame(predictions, columns = ['prob_0', 'prob_1'])
df_prediction_target = pd.DataFrame(classifier.predict(X_test), columns = ['predicted_TARGET'])
df_test_dataset = pd.DataFrame(y_test,columns= ['Actual Outcome'])

dfx=pd.concat([df_test_dataset, df_prediction_prob, df_prediction_target], axis=1)



dfx.tail()

Unnamed: 0,Actual Outcome,prob_0,prob_1,predicted_TARGET
61498,0.0,0.684214,0.315786,0.0
61499,0.0,0.939933,0.060067,0.0
61500,1.0,0.913588,0.086412,0.0
61501,0.0,0.867972,0.132028,0.0
61502,0.0,0.935034,0.064966,0.0


In [34]:
df_prediction_target['predicted_TARGET'].value_counts()

predicted_TARGET
0.0    61480
1.0       23
Name: count, dtype: int64

### Coding ends here!