**A step-by-step guide to preprocessing your data, training models, and evaluating their performance using the utilities in each `train_utils` module.**

# Import Libraries

In [1]:
import pandas as pd
from pathlib import Path
from train_utils.feature_engineering import preprocess_dataframe, extract_historical_loan_features, calculate_loan_repayment_features
from train_utils.run_training import run_training
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier

# Feature Engineering

In [2]:
# Directory containing the training CSVs (relative to the working directory)
data_dir = Path('machine-learning/')

# Loan applications: read the CSV, then apply the shared preprocessing step
loans_raw = pd.read_csv(data_dir / "train_loan_data.csv")
train_loan_data = preprocess_dataframe(loans_raw)

# Repayments: same treatment, with `paid_at` parsed as a datetime on load
payments_raw = pd.read_csv(data_dir / "train_payment_data.csv", parse_dates=['paid_at'])
train_repayment_data = preprocess_dataframe(payments_raw)

# Build the two feature tables from the preprocessed frames
approval_statuses = ['Approved', 'Declined', 'Cancelled', 'Expired']
historical_loan_features = extract_historical_loan_features(
    train_loan_data, known_approval_statuses=approval_statuses
)

repayment_transaction_types = ['Deposit', 'Discount']
historical_repayment_features = calculate_loan_repayment_features(
    train_repayment_data, transaction_types=repayment_transaction_types
)

# Attach the target (`paid_late`) plus identifying columns used later when
# splitting the feature data into train and test sets.
# No `on=` is given, so pandas joins on every column name the two frames share.
# NOTE(review): presumably that is only `loan_id` -- confirm, and consider
# making the key explicit.
target_and_metadata_columns = [
    'loan_id', 'business_id', 'application_number', 'loan_number', 'paid_late', 'sector'
]
historical_loan_features = pd.merge(
    historical_loan_features, train_loan_data[target_and_metadata_columns]
)

# Join each row to the repayment features of its last approved loan, then cast
# the target to int. This is pandas' default inner join, so rows whose
# `last_approved_loan_id` has no match in the repayment features are dropped.
feature_data = pd.merge(
    historical_loan_features,
    historical_repayment_features,
    left_on='last_approved_loan_id',
    right_on='loan_id',
).assign(paid_late=lambda merged: merged['paid_late'].astype(int))

# Model Training

In [17]:
# Columns of `feature_data` passed to training as numerical features
numerical_features = [
    'num_previous_applications',
    'mean_principal_previous',
    'mean_owing_previous',
    'mean_employee_count_previous',
    'num_approved_previous',
    'num_declined_previous',
    'num_cancelled_previous',
    'num_expired_previous',
    'total_amount_paid',
    'num_payments',
    'max_payment_amount',
    'duration',
    'single_repayment',
    'num_Deposit',
    'total_Deposit_amount',
    'num_Discount',
    'total_Discount_amount',
]

# No categorical features are used yet; add column names here to include some.
categorical_features = []

# Estimator to fit.
# Alternative: model = RandomForestClassifier()
model = LogisticRegressionCV()

# Search grid passed to run_training. Keys follow the `<step>__<parameter>`
# form and each value is a list of candidates; with one candidate per key the
# grid contains a single combination.
# NOTE(review): the `logisticregressioncv` prefix presumably matches the step
# name run_training gives the model inside its pipeline -- confirm there.
# Alternative for the random forest:
#     model_params = {'randomforestclassifier__n_estimators': [100, 200]}
estimator_step = 'logisticregressioncv'
fixed_settings = {'class_weight': 'balanced', 'random_state': 42, 'max_iter': 1000}
model_params = {
    f'{estimator_step}__{option}': [value] for option, value in fixed_settings.items()
}

# Scoring string handed to run_training (alternative: 'accuracy')
scoring = 'neg_log_loss'

# Train; returns the fitted model and an evaluation object holding the metrics
trained_model, evaluation = run_training(
    feature_data,
    numerical_features,
    categorical_features,
    model,
    model_params,
    scoring=scoring,
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


# Evaluation

In [18]:
# Display the metrics stored on `evaluation.test_metrics`
# (precision, recall, F1 score and confusion matrix).
# NOTE(review): in the saved output, precision (~0.035) is barely above the
# positive rate (145 of 4956 rows, ~0.029), so these predictions carry little
# signal -- worth interpreting before relying on this model.
evaluation.test_metrics

{'precision': 0.034812880765883375,
 'recall': 0.5517241379310345,
 'f1_score': 0.06549324600900532,
 'confusion_matrix': array([[2593, 2218],
        [  65,   80]])}

In [19]:
# Display the same set of metrics as stored on `evaluation.train_metrics`,
# for comparison with the test metrics.
evaluation.train_metrics

{'precision': 0.03437024146182293,
 'recall': 0.42934782608695654,
 'f1_score': 0.06364551863041289,
 'confusion_matrix': array([[7976, 4439],
        [ 210,  158]])}