# Credit Scoring

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the train and test datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Data Inspection: Show first few rows of the data
print("\nTrain Data Head:")
print(train_data.head())
print("\nTest Data Head:")
print(test_data.head())

# Data Inspection: Show summary statistics
print("\nTrain Data Summary Statistics:")
print(train_data.describe())
print("\nTest Data Summary Statistics:")
print(test_data.describe())

# Data Inspection: Check for missing values
print("\nMissing Values in Train Data:")
print(train_data.isnull().sum())
print("\nMissing Values in Test Data:")
print(test_data.isnull().sum())


Train Data Head:
          id target  day month  duration  contactId  age  gender         job  \
0  432148809     no   27   may       166        623   30  female      worker   
1  432184318     no   26   oct       183       1992   42  female     manager   
2  432182482     no    5   jun       227       2778   26  female    services   
3  432150520     no    2   jun        31       3070   34    male  unemployed   
4  432145870     no   15   may      1231       6583   48    male      worker   

  maritalStatus        education creditFailure  accountBalance house credit  \
0       married       highSchool            no            -202    no     no   
1       married     uniGraduated            no            2463    no     no   
2        single       highSchool            no            2158   yes    yes   
3      divorced     uniGraduated           yes              75   yes     no   
4       married  secondarySchool            no             559   yes     no   

  contactType  numberOfCon

In [3]:
# Handle Missing Data
numerical_cols = train_data.select_dtypes(include=['float64', 'int64']).columns
imputer = SimpleImputer(strategy='median')
train_data[numerical_cols] = imputer.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = imputer.transform(test_data[numerical_cols])

categorical_cols = train_data.select_dtypes(include=['object']).columns
imputer_cat = SimpleImputer(strategy='most_frequent')
train_data[categorical_cols] = imputer_cat.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = imputer_cat.transform(test_data[categorical_cols])

# Data Inspection: Check for missing values again after imputation
print("\nMissing Values After Imputation in Train Data:")
print(train_data.isnull().sum())
print("\nMissing Values After Imputation in Test Data:")
print(test_data.isnull().sum())


Missing Values After Imputation in Train Data:
id                              0
target                          0
day                             0
month                           0
duration                        0
contactId                       0
age                             0
gender                          0
job                             0
maritalStatus                   0
education                       0
creditFailure                   0
accountBalance                  0
house                           0
credit                          0
contactType                     0
numberOfContacts                0
daySinceLastCampaign            0
numberOfContactsLastCampaign    0
lastCampaignResult              0
dtype: int64

Missing Values After Imputation in Test Data:
id                              0
target                          0
day                             0
month                           0
duration                        0
contactId                       0
age     

In [4]:
# Drop the target column from the datasets before one-hot encoding
train_data = train_data.drop(columns=['target_yes'], errors='ignore')
test_data = test_data.drop(columns=['target_yes'], errors='ignore')

# Handle potential mismatch in categorical columns between train and test datasets
train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=pd.NA)

# Data Inspection: Check the shape and columns after alignment
print("\nTrain Data Shape After Alignment:")
print(train_data.shape)
print("\nTest Data Shape After Alignment:")
print(test_data.shape)



Train Data Shape After Alignment:
(31480, 20)

Test Data Shape After Alignment:
(13731, 20)


In [5]:
# One-hot encoding for categorical variables
train_data_encoded = pd.get_dummies(train_data, drop_first=True)
test_data_encoded = pd.get_dummies(test_data, drop_first=True)

# Ensure the train and test data have the same columns after encoding
train_data_encoded, test_data_encoded = train_data_encoded.align(test_data_encoded, join='left', axis=1, fill_value=0)

# Data Inspection: Check the shape and columns after one-hot encoding
print("\nTrain Data Shape After One-Hot Encoding:")
print(train_data_encoded.shape)
print("\nTest Data Shape After One-Hot Encoding:")
print(test_data_encoded.shape)


Train Data Shape After One-Hot Encoding:
(31480, 46)

Test Data Shape After One-Hot Encoding:
(13731, 46)


In [6]:
# Feature Scaling for numerical columns
scaler = StandardScaler()
numerical_cols_for_scaling = train_data_encoded.select_dtypes(include=['float64', 'int64']).columns

train_data_encoded[numerical_cols_for_scaling] = scaler.fit_transform(train_data_encoded[numerical_cols_for_scaling])
test_data_encoded[numerical_cols_for_scaling] = scaler.transform(test_data_encoded[numerical_cols_for_scaling])

# Data Inspection: Check the first few rows after scaling
print("\nTrain Data After Scaling:")
print(train_data_encoded.head())
print("\nTest Data After Scaling:")
print(test_data_encoded.head())


Train Data After Scaling:
         id       day  duration  contactId       age  accountBalance  \
0 -1.237755  1.345768 -0.360515  -1.722418 -1.028856       -0.521309   
1  1.483857  1.225620 -0.294257  -1.722350  0.100128        0.374697   
2  1.343135 -1.297472 -0.122766  -1.722311 -1.405183        0.272152   
3 -1.106615 -1.657914 -0.886682  -1.722296 -0.652528       -0.428179   
4 -1.463017 -0.095999  3.790353  -1.722122  0.664620       -0.265452   

   numberOfContacts  daySinceLastCampaign  numberOfContactsLastCampaign  \
0         -0.248364             -0.108017                     -0.236603   
1         -0.248364             -0.108017                     -0.236603   
2         -0.566915             -0.108017                     -0.236603   
3          0.070186             -0.108017                     -0.236603   
4         -0.248364             -0.108017                     -0.236603   

   target_yes  ...  education_uniGraduated  education_unknown  \
0       False  ...      

In [7]:
# Ensure that the 'target' column exists and is correctly prepared
y = train_data_encoded['target_yes']
X = train_data_encoded.drop('target_yes', axis=1)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optionally, inspect the shapes of the training and testing data
print(f"Training Features Shape: {X_train.shape}")
print(f"Testing Features Shape: {X_test.shape}")
print(f"Training Target Shape: {y_train.shape}")
print(f"Testing Target Shape: {y_test.shape}")

Training Features Shape: (25184, 45)
Testing Features Shape: (6296, 45)
Training Target Shape: (25184,)
Testing Target Shape: (6296,)


In [8]:
# Model training using Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Model evaluation
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")



Classification Report:
              precision    recall  f1-score   support

       False       0.93      0.97      0.95      5580
        True       0.64      0.45      0.53       716

    accuracy                           0.91      6296
   macro avg       0.79      0.71      0.74      6296
weighted avg       0.90      0.91      0.90      6296

ROC-AUC Score: 0.9357


In [9]:
# Hyperparameter tuning (optional)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
