## Predicting Default on Payments of Credit Card Clients

# 3. Model Training & Model Evaluation

Within the ML Analytics Recruitment Challenge, the goal of this notebook is to train machine learning models to predict whether a credit card client will default in the following month and evaluate the performance of these models.


***

### Main Insights:

<div class="alert alert-block alert-info">
    Write stuff.
</div>

<div class="alert alert-block alert-info">
    Write stuff.
</div>


***


### Imports

In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings

from typing import Dict, List

seed = 17
np.random.seed(seed)

# warnings.filterwarnings("ignore")

### Utils

#### Data processor

In [67]:
class DataProcessor:
    """
    Class that aggregates a set of functions for loading, cleaning, feature engineering, encoding,
    and normalize data, among others, with the goal of preparing the data to be feed into a model.
    
    Parameters
    ----------
    mappings : Dict<str, Dict>
        Dictionary that specifies the mapping of categorical features. The keys in the dictionary 
        are the names of the columns that represent the categorical features, and the values are 
        also dictionaries that specify the mapping of the categorical values. 
    
    seed : int, optional
        Integer used to control the random state, default is 17. 
        
    target_col_name : str, optional
        Name of the target variable, default is 'target'.
        
    val_size : float, optional
        Represents the propotion of the train dataset to include in the validation split, by default
        is 0.2.
    """
    def __init__(
        self,
        mappings: Dict,
        seed: int = 17,
        target_col_name: str = "target",
        val_size: float = 0.2
    ):
        self.target_name = target_col_name
        
        self._mappings = mappings
        self._seed = seed
        self._val_size = val_size
        
        
        # Variables to be initialized later:
        self.train_data = None
        self.test_data = None
        self.X_train = None
        self.y_train = None
        self.X_val = None
        self.y_val = None
        self.X_test = None
        self.y_test = None
        self.standard_scaler = None
    
    
    def load_train_data(self, file_directory: str):
        """Loads the train data from a given directory."""
        self.train_data = pd.read_csv(file_directory)
        self._initial_dataset_uniformization(dataset_type="train")
    
    def load_test_data(self, file_directory: str):
        """Loads the test data from a given directory."""
        self.test_data = pd.read_csv(file_directory)
        self._initial_dataset_uniformization(dataset_type="test")
    
    def split_data(self):
        """Splits the train set into train and validation sets, and decomposes all sets into X and y."""
        self.X_train = self.train_data.iloc[:, :-1]
        self.y_train = self.train_data.iloc[:, -1]
        
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X_train, 
            self.y_train, 
            test_size=self._val_size, 
            shuffle=True,
            random_state=self._seed,
            stratify=self.y_train
        )
        
        self.val_data = pd.concat([self.X_val, self.y_val], axis=1)
        
        self.X_test = self.test_data.iloc[:, :-1]
        self.y_test = self.test_data.iloc[:, -1]
        
    def treat_categorical_variables(self, drop_original_vars: bool = True):
        """Reencodes of 'education' variable and creation of flags for clients who are male, married, or single."""
        self.X_train = self._treat_categorical_variables(self.X_train, drop_original_vars)
        self.X_val = self._treat_categorical_variables(self.X_val, drop_original_vars)
        self.X_test = self._treat_categorical_variables(self.X_test, drop_original_vars)
        
    def feature_engineering(
        self, 
        calculate_bill_to_limit_bal_ratio: bool = False,
        calculate_pay_to_bill_ratio: bool = False,
        calculate_num_negative_bill_statements: bool = False,
        calculate_payment_delays: bool = False,
        calculate_payment_change_rate: bool = False,
        calculate_bill_change_rate: bool = False,
        calculate_total_payment: bool = False,
        list_vars_to_drop: List = None
    ):
        """ """
        self._calculate_bill_to_limit_bal_ratio = calculate_bill_to_limit_bal_ratio
        self._calculate_pay_to_bill_ratio = calculate_pay_to_bill_ratio
        self._calculate_num_negative_bill_statements = calculate_num_negative_bill_statements
        self._calculate_payment_delays = calculate_payment_delays
        self._calculate_payment_change_rate = calculate_payment_change_rate
        self._calculate_bill_change_rate = calculate_bill_change_rate
        self._calculate_total_payment = calculate_total_payment
        self._list_vars_to_drop = list_vars_to_drop
        
        # Perform feature engineering:
        self.X_train = self._feature_engineering(self.X_train)
        self.X_val = self._feature_engineering(self.X_val)
        self.X_test = self._feature_engineering(self.X_test)
        
    def perform_oversampling(self):
        """ """
        pass
    
    def treat_outliers(self):
        """Filters outliers according to IQR criteria and replaces the values by Q1 or Q3."""
        pass
    
    def standardize_data(self):
        """Standardize features by removing the mean and scaling to unit variance."""
        self.standard_scaler = StandardScaler()
        
        self.X_train = self.standard_scaler.fit_transform(self.X_train)
        self.X_val = self.standard_scaler.transform(self.X_val)
        self.X_test = self.standard_scaler.transform(self.X_test)
    
    def print_dataset_stats(self, normalize: bool = False):
        """Prints number of samples per set and target classes proportion."""
        print(f"Train dataset: {len(self.X_train)} samples, with the target classes as follow:")
        display(self.train_data.target.value_counts(normalize=normalize))
        
        print(f"Val dataset: {len(self.X_val)} samples, with the target classes as follow:")
        display(self.val_data.target.value_counts(normalize=normalize))
        
        print(f"Test dataset: {len(self.X_test)} samples, with the target classes as follow:")
        display(self.test_data.target.value_counts(normalize=normalize))
        
    def _map_values(self, col, mapping):
        """Function to encode a column in a dataframe."""
        return col.map(mapping)
    
    def _initial_dataset_uniformization(self, dataset_type: str):
        """Performs initial steps in the dataset such as renaming columns and removing 'id' column."""
        rename_cols_map = {"default.payment.next.month": "target", "PAY_0": "PAY_1"}
        
        if dataset_type == "train":
            self.train_data.rename(columns=rename_cols_map, inplace=True)
            self.train_data.columns = [col.lower() for col in self.train_data.columns]
            self.train_data.drop(columns=["id"], axis=1, inplace=True)
        elif dataset_type == "test":
            self.test_data.rename(columns=rename_cols_map, inplace=True)
            self.test_data.columns = [col.lower() for col in self.test_data.columns]
            self.test_data.drop(columns=["id"], axis=1, inplace=True)
        else:
            raise ValueError(f"dataset_type arg must be 'train' or 'test', got {dataset_type} instead.")
            
    def _treat_categorical_variables(self, data: pd.DataFrame, drop_original_vars: bool = True):
        """Internal function that reencodes of 'education' variable and creation of flags for clients who are male,
        married, or single."""
        assert "education" in self._mappings, "There is no mapping to reencode 'education' variable."
        
        data.education = self._map_values(data.education, self._mappings.get("education"))
        
        data["is_male"] = np.where(data.sex == 1, 1, 0)
        data["is_married"] = np.where(data.marriage == 1, 1, 0)
        data["is_single"] = np.where(data.marriage == 2, 1, 0)
        
        if drop_original_vars:
            data.drop(["sex", "marriage"], axis=1, inplace=True)
                    
        return data
            
    def _feature_engineering(self, data: pd.DataFrame):
        """ """
        def avoid_zero_division(row, pay_amt, bill_amt):
            """"Util function that returns the pay_amtX when bill_amtX is equal to zero."""
            if row[bill_amt] == 0:
                # If pay_amt is positive, it means the client has overpaid:
                return -row[pay_amt]
        
            return 0.0 if row[pay_amt] == 0 else row[pay_amt] / row[bill_amt]
        
        def calculate_change_rate(row, col_amt1, col_amt2):
            """Calculates the change rate (pay_amt or bill_amt) between two consecutive months."""
            if row[col_amt1] == 0:
                return 0.0
            else:
                return (row[col_amt2] - row[col_amt1] / row[col_amt1])

        if self._calculate_bill_to_limit_bal_ratio:
            data["bill_amt1_limit_bal_ratio"] = data.bill_amt1 / data.limit_bal
            data["bill_amt2_limit_bal_ratio"] = data.bill_amt2 / data.limit_bal
            data["bill_amt3_limit_bal_ratio"] = data.bill_amt3 / data.limit_bal
            data["bill_amt4_limit_bal_ratio"] = data.bill_amt4 / data.limit_bal
            data["bill_amt5_limit_bal_ratio"] = data.bill_amt5 / data.limit_bal
            data["bill_amt6_limit_bal_ratio"] = data.bill_amt6 / data.limit_bal
        
        if self._calculate_pay_to_bill_ratio:
            data["pay_amt1_bill_amt1_ratio"] = data.apply(lambda x: avoid_zero_division(x, "pay_amt1", "bill_amt1"), axis=1)
            data["pay_amt2_bill_amt2_ratio"] = data.apply(lambda x: avoid_zero_division(x, "pay_amt2", "bill_amt2"), axis=1)
            data["pay_amt3_bill_amt3_ratio"] = data.apply(lambda x: avoid_zero_division(x, "pay_amt3", "bill_amt3"), axis=1)
            data["pay_amt4_bill_amt4_ratio"] = data.apply(lambda x: avoid_zero_division(x, "pay_amt4", "bill_amt4"), axis=1)
            data["pay_amt5_bill_amt5_ratio"] = data.apply(lambda x: avoid_zero_division(x, "pay_amt5", "bill_amt5"), axis=1)
            data["pay_amt6_bill_amt6_ratio"] = data.apply(lambda x: avoid_zero_division(x, "pay_amt6", "bill_amt6"), axis=1)
        
        if self._calculate_num_negative_bill_statements:
            bill_amt_cols = ["bill_amt1", "bill_amt2", "bill_amt3", "bill_amt4", "bill_amt5", "bill_amt6"]
            data["num_overpays"] = (data[bill_amt_cols] < 0).sum(axis=1)
        
        if self._calculate_payment_delays:
            data["payment_delay_amt1"] = (data.bill_amt1 - data.pay_amt1).apply(lambda x: max(0, x))
            data["payment_delay_amt2"] = (data.bill_amt2 - data.pay_amt2).apply(lambda x: max(0, x))
            data["payment_delay_amt3"] = (data.bill_amt3 - data.pay_amt3).apply(lambda x: max(0, x))
            data["payment_delay_amt4"] = (data.bill_amt4 - data.pay_amt4).apply(lambda x: max(0, x))
            data["payment_delay_amt5"] = (data.bill_amt5 - data.pay_amt5).apply(lambda x: max(0, x))
            data["payment_delay_amt6"] = (data.bill_amt6 - data.pay_amt6).apply(lambda x: max(0, x))
        
        if self._calculate_payment_change_rate:
            data["payment_change_rate_amt1_amt2"] = data.apply(lambda x: calculate_change_rate(x, "pay_amt1", "pay_amt2"), axis=1)
            data["payment_change_rate_amt2_amt3"] = data.apply(lambda x: calculate_change_rate(x, "pay_amt2", "pay_amt3"), axis=1)
            data["payment_change_rate_amt3_amt4"] = data.apply(lambda x: calculate_change_rate(x, "pay_amt3", "pay_amt4"), axis=1)
            data["payment_change_rate_amt4_amt5"] = data.apply(lambda x: calculate_change_rate(x, "pay_amt4", "pay_amt5"), axis=1)
            data["payment_change_rate_amt5_amt6"] = data.apply(lambda x: calculate_change_rate(x, "pay_amt5", "pay_amt6"), axis=1)
        
        if self._calculate_bill_change_rate:
            data["bill_change_rate_amt1_amt2"] = data.apply(lambda x: calculate_change_rate(x, "bill_amt1", "bill_amt2"), axis=1)
            data["bill_change_rate_amt2_amt3"] = data.apply(lambda x: calculate_change_rate(x, "bill_amt2", "bill_amt3"), axis=1)
            data["bill_change_rate_amt3_amt4"] = data.apply(lambda x: calculate_change_rate(x, "bill_amt3", "bill_amt4"), axis=1)
            data["bill_change_rate_amt4_amt5"] = data.apply(lambda x: calculate_change_rate(x, "bill_amt4", "bill_amt5"), axis=1)
            data["bill_change_rate_amt5_amt6"] = data.apply(lambda x: calculate_change_rate(x, "bill_amt5", "bill_amt6"), axis=1)
        
        if self._calculate_total_payment:
            pay_amt_cols = ["pay_amt1", "pay_amt2", "pay_amt3", "pay_amt4", "pay_amt5", "pay_amt6"]
            data["total_payment"] = (data[pay_amt_cols]).sum(axis=1)
        
        if self._list_vars_to_drop and len(self._list_vars_to_drop) > 0:
            data.drop(self._list_vars_to_drop, axis=1, inplace=True)
        
        return data
        

#### Model Trainer

In [61]:
class ModelTrainer():
    """"""
    pass  # feature selection should be here (and not in data processor), as well as random search CV!

#### Model Evaluator

In [62]:
class ModelEvaluator():
    """"""
    pass  # classification reports 

### Data loading, preprocessing and feature engineering

In [63]:
train_dir = "../data/train_data.csv"
test_dir = "../data/test_data.csv"

bill_amt_cols = ["bill_amt1", "bill_amt2", "bill_amt3", "bill_amt4", "bill_amt5", "bill_amt6"]
pay_amt_cols = ["pay_amt1", "pay_amt2", "pay_amt3", "pay_amt4", "pay_amt5", "pay_amt6"]

mappings = {
    "education": {
        1: 1, #"Graduate School", 
        2: 2, #"University", 
        3: 3, #"High School", 
        4: 4, #"Others",
        5: 4, #"Unknown",
        6: 4, #"Unknown"     
    }
}

In [68]:
data_processor = DataProcessor(
    mappings=mappings
)

# Load train and test data:
data_processor.load_train_data(train_dir)
data_processor.load_test_data(test_dir)

# Split train into train and val datasets:
data_processor.split_data()

# Deal with categorical variables:
data_processor.treat_categorical_variables(drop_original_vars=True)

# Check datasets composition:
data_processor.print_dataset_stats()

# Perform feature engineering:
data_processor.feature_engineering(
    calculate_bill_to_limit_bal_ratio=True,
    calculate_pay_to_bill_ratio=True,
    calculate_num_negative_bill_statements=True,
    calculate_payment_delays=True,
    calculate_payment_change_rate=True,
    calculate_bill_change_rate=True,
    calculate_total_payment=True,
    list_vars_to_drop=None
)

# Deal with outliers:
data_processor.treat_outliers() # TODO: implement

# Apply smote to oversample the minority class:
data_processor.perform_oversampling() # TODO: implement

# Standardize data:
# data_processor.standardize_data()

data_processor.X_train.sample(10)

Train dataset: 19200 samples, with the target classes as follow:


0    18691
1     5309
Name: target, dtype: int64

Val dataset: 4800 samples, with the target classes as follow:


0    3738
1    1062
Name: target, dtype: int64

Test dataset: 6000 samples, with the target classes as follow:


0    4673
1    1327
Name: target, dtype: int64

AttributeError: 'numpy.ndarray' object has no attribute 'sample'

### Feature selection

In [69]:
data_processor.X_val

# data_processor.print_datasets_stats()

array([[ 0.09916111,  0.21860313,  0.60424911, ...,  0.94317227,
         0.96657501, -0.09379415],
       [-0.9030341 ,  1.57270063, -1.13841718, ..., -0.42527236,
        -0.35872959,  0.25168828],
       [-1.13430992,  1.57270063,  0.49533247, ..., -0.44909367,
        -0.6464422 , -0.30432708],
       ...,
       [ 0.48462081, -1.13549437, -0.7027506 , ...,  0.31565163,
         0.06756384,  0.48903852],
       [ 1.48681603,  0.21860313,  0.38641583, ..., -0.65778231,
        -0.64642532, -0.51427384],
       [ 0.87008051, -1.13549437, -0.59383396, ..., -0.65778231,
        -0.64642532, -0.51427384]])

### Models training

### Models evaluation

### Conclusions and next steps