In [1]:
import sys
sys.path.append('../')  


In [2]:
import pandas as pd 
from utils.constants import Constants

In [3]:
df = pd.read_csv('../data/loan_data.csv')
df

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


## Step 1: Perform data quality check

In [4]:
# Step 1: Perform data quality check

# Check for missing values
missing_values = df.isnull().sum()
missing_values


SK_ID_CURR                        0
TARGET                            0
NAME_CONTRACT_TYPE                0
CODE_GENDER                       0
FLAG_OWN_CAR                      0
                              ...  
AMT_REQ_CREDIT_BUREAU_DAY     41519
AMT_REQ_CREDIT_BUREAU_WEEK    41519
AMT_REQ_CREDIT_BUREAU_MON     41519
AMT_REQ_CREDIT_BUREAU_QRT     41519
AMT_REQ_CREDIT_BUREAU_YEAR    41519
Length: 122, dtype: int64

In [5]:
# Impute missing values for numerical columns with the mean
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# Impute missing values for categorical columns with the mode
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

In [6]:
# Verify if there are any missing values left
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

Series([], dtype: int64)


In [7]:
df.isna().sum()

SK_ID_CURR                    0
TARGET                        0
NAME_CONTRACT_TYPE            0
CODE_GENDER                   0
FLAG_OWN_CAR                  0
                             ..
AMT_REQ_CREDIT_BUREAU_DAY     0
AMT_REQ_CREDIT_BUREAU_WEEK    0
AMT_REQ_CREDIT_BUREAU_MON     0
AMT_REQ_CREDIT_BUREAU_QRT     0
AMT_REQ_CREDIT_BUREAU_YEAR    0
Length: 122, dtype: int64

In [8]:
df.isnull().sum()
df = df.dropna()

In [9]:
from utils.utils import Utils

In [10]:
balanced_df = Utils.balance_dataset(df)

In [11]:
X_train, X_test, y_train, y_test = Utils.train_test_split_data(balanced_df.drop(columns=['TARGET']), balanced_df['TARGET'])

In [12]:
correlation_float = df[numerical_cols].corrwith(df['TARGET'])


In [13]:
correlation_float

SK_ID_CURR                   -0.002108
TARGET                        1.000000
CNT_CHILDREN                  0.019187
AMT_INCOME_TOTAL             -0.003982
AMT_CREDIT                   -0.030369
                                ...   
AMT_REQ_CREDIT_BUREAU_DAY     0.002464
AMT_REQ_CREDIT_BUREAU_WEEK    0.000718
AMT_REQ_CREDIT_BUREAU_MON    -0.011356
AMT_REQ_CREDIT_BUREAU_QRT    -0.001842
AMT_REQ_CREDIT_BUREAU_YEAR    0.018160
Length: 106, dtype: float64

In [14]:
# Find the first maximum correlation
max_corr_column_1 = correlation_float.idxmax()
max_corr_value_1 = correlation_float[max_corr_column_1]

# Drop the column with the first maximum correlation
correlation_float = correlation_float.drop(max_corr_column_1)

# Find the second maximum correlation
max_corr_column_2 = correlation_float.idxmax()
max_corr_value_2 = correlation_float[max_corr_column_2]

# Drop the column with the second maximum correlation
correlation_float = correlation_float.drop(max_corr_column_2)

# Find the third maximum correlation
max_corr_column_3 = correlation_float.idxmax()
max_corr_value_3 = correlation_float[max_corr_column_3]

# Print the results
print("Top 3 columns with the highest correlation values:")
print("1. Column:", max_corr_column_1, "Meaning", Constants.COLUMN_MEANING[max_corr_column_1], "Correlation:", max_corr_value_1)
print("2. Column:", max_corr_column_2, "Meaning", Constants.COLUMN_MEANING[max_corr_column_2], "Correlation:", max_corr_value_2)
print("3. Column:", max_corr_column_3, "Meaning", Constants.COLUMN_MEANING[max_corr_column_3], "Correlation:", max_corr_value_3)


Top 3 columns with the highest correlation values:
1. Column: TARGET Meaning Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases) Correlation: 1.0
2. Column: DAYS_BIRTH Meaning Client's age in days at the time of application Correlation: 0.07823930830982692
3. Column: REGION_RATING_CLIENT_W_CITY Meaning Approximately at what hour did the client apply for the loan Correlation: 0.060892667564823415


In [15]:
# Create a new feature by combining time spent at the company and number of projects
import numpy as np
df['Engagement'] = np.where(df['CNT_CHILDREN'] == 0, np.nan, df['DAYS_EMPLOYED'] / df['CNT_CHILDREN'])
df['Engagement']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
307506   NaN
307507   NaN
307508   NaN
307509   NaN
307510   NaN
Name: Engagement, Length: 307511, dtype: float64

In [16]:
# Calculate the correlation of the new feature with the target variable
correlation_engagement = df['Engagement'].corr(df['TARGET'])
print("Correlation between Engagement and TARGET:", correlation_engagement)


Correlation between Engagement and TARGET: -0.003142777045031686


In [20]:

# Step 3: Define the m
# Step 4: Train the models using k-fold cross-validation
trained_models = Utils.train_models(Constants.MODELS, X_train, y_train)

# Step 5: Evaluate the models on the test set
evaluation_results = Utils.evaluate_models(trained_models, X_test, y_test)

# Step 6: Determine the best-performing model
best_model = max(evaluation_results, key=lambda k: evaluation_results[k]['Average Accuracy'])
print(f"Best Model: {best_model}")
print("Evaluation Results:")
for metric, value in evaluation_results[best_model].items():
    print(f"{metric}: {value:.4f}")

AttributeError: type object 'Constants' has no attribute 'MODELS'

In [None]:
# Step 7: Train the best model using all available data in the train set
best_model.fit(X_train, y_train)

# Step 8: Test the trained model against the test set
y_pred = best_model.predict(X_test)

# Step 9: Output the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Evaluation Results on Test Set:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
