In [None]:
#import packages#
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import  train_test_split, cross_val_score
import lightgbm as lgb
import optuna
import optuna.integration.lightgbm as opt_lgb
from sklearn.metrics import log_loss, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix, brier_score_loss
import joblib
from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import os
import json
import shap
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [None]:
# training_helper #

## load dictionary
def load_dictionary(dictionary_path):
    """Read the data dictionary CSV and return the column lists it flags.

    The dictionary is expected to have a ``columns_cleaned`` column holding
    column names, plus the flag columns ``use_for_training``,
    ``hold_out_columns`` and ``prediction_target``; a row is selected for a
    list when its flag equals the string ``"Y"``.

    Args:
        dictionary_path: Path (or anything ``pd.read_csv`` accepts) of the
            data dictionary CSV.

    Returns:
        tuple: ``(columns_for_training, hold_out_columns, prediction_target)``,
        each a list of names taken from ``columns_cleaned``.
    """
    dictionary = pd.read_csv(dictionary_path)

    def flagged(flag_column):
        # Names of the rows whose flag column is exactly "Y".
        is_flagged = dictionary[flag_column] == "Y"
        return dictionary.loc[is_flagged, "columns_cleaned"].tolist()

    columns_for_training = flagged("use_for_training")
    hold_out_columns = flagged("hold_out_columns")
    prediction_target = flagged("prediction_target")
    return columns_for_training, hold_out_columns, prediction_target

## select features
def select_data(data, dictionary_path):
    """Add the binary label and keep only the columns flagged for training.

    The label is ``True`` where the prediction-target column is ``>= 0`` and
    ``False`` otherwise. Object-dtype columns of the selected frame are cast
    to ``category``.

    Side effect: the ``"label"`` column is also written onto the frame passed
    in by the caller (other cells of this notebook read ``df["label"]`` after
    splitting). Apart from that column, the caller's frame is left untouched.

    Args:
        data: Input DataFrame containing the training columns and the
            prediction-target column named in the data dictionary.
        dictionary_path: Path of the data dictionary CSV (see
            ``load_dictionary``).

    Returns:
        tuple: ``(selected, hold_out_columns, target_column)`` where
        ``selected`` is an independent copy holding the training columns plus
        the label, and ``target_column`` is the string ``"label"``.

    Raises:
        ValueError: If the dictionary does not flag exactly one
            prediction-target column.
    """
    columns_for_training, hold_out_columns, prediction_target = load_dictionary(dictionary_path)

    # load_dictionary returns a list; the label can only be derived from a
    # single column, so fail early with a readable message otherwise.
    if len(prediction_target) != 1:
        raise ValueError(
            "Expected exactly one prediction_target column in the data dictionary, "
            f"found {len(prediction_target)}: {prediction_target}"
        )

    target_column = "label"
    data[target_column] = data[prediction_target[0]] >= 0

    # Explicit copy: the dtype assignment below must not write into a slice
    # of the caller's frame (SettingWithCopy hazard).
    selected = data[columns_for_training + [target_column]].copy()

    # Define categorical features
    cat_features = list(selected.select_dtypes(include=['object']).columns)
    if cat_features:
        selected[cat_features] = selected[cat_features].astype("category")
    return selected, hold_out_columns, target_column

## train vs. test split
def balanced_train_validation_test(data, target_column, random_state):
    """Stratified 80/20 train/test split of ``data`` on ``target_column``.

    Args:
        data: DataFrame holding the feature columns and the target column.
        target_column: Name of the label column; it is removed from the
            feature frames and used for stratification.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    is_feature = data.columns != target_column
    features = data.loc[:, is_feature]
    labels = data[target_column]

    splits = train_test_split(
        features,
        labels,
        test_size=0.20,
        random_state=random_state,
        stratify=labels,
    )
    X_train, X_test, y_train, y_test = splits
    return X_train, X_test, y_train, y_test

## split function
def fundtap_train_test_split(data, dictionary_path, random_state=456, weight_column="fundtap_profit_loss"):
    """Select the training data and split it into train/test parts.

    The frame is first reduced by ``select_data``; the hold-out columns are
    then excluded from the feature matrices, and the absolute value of
    ``weight_column`` is returned as per-row instance weight.

    Args:
        data: Raw input DataFrame (``select_data`` adds a ``"label"`` column
            to it).
        dictionary_path: Path of the data dictionary CSV.
        random_state: Seed for the stratified split.
        weight_column: Column whose absolute value is used as instance
            weight. It must be among the columns flagged for training in the
            dictionary, otherwise it is not present after selection.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, train_instance_weight,
        test_instance_weight, train_holdout, test_holdout)``.

    Raises:
        KeyError: If ``weight_column`` is missing from the selected data.
    """
    data, hold_out_columns, target_column = select_data(data, dictionary_path)

    if weight_column not in data.columns:
        raise KeyError(
            f"weight column '{weight_column}' is not among the columns selected for training"
        )

    # NOTE(review): a column only stays out of X if the dictionary flags it as
    # hold-out. If `weight_column` drives the label and is not flagged as
    # hold-out, it leaks into the features - confirm against the dictionary.
    model_frame = data.loc[:, ~data.columns.isin(hold_out_columns)]
    X_train, X_test, y_train, y_test = balanced_train_validation_test(
        model_frame, target_column, random_state
    )

    # Single .loc lookup (rows, column) instead of chained indexing.
    train_instance_weight = np.abs(data.loc[X_train.index, weight_column])
    test_instance_weight = np.abs(data.loc[X_test.index, weight_column])

    train_holdout = data.loc[X_train.index, hold_out_columns]
    test_holdout = data.loc[X_test.index, hold_out_columns]

    return X_train, X_test, y_train, y_test, train_instance_weight, test_instance_weight, train_holdout, test_holdout



In [None]:
# hp_tuning_helper #

def learning_rate_decay_power_0995(current_iter):
    """Learning-rate schedule: 0.1 * 0.995**current_iter, floored at 1e-3.

    Args:
        current_iter: Boosting iteration index (0-based in the callers here).

    Returns:
        The decayed learning rate, or ``1e-3`` once the decayed value is no
        longer greater than that floor.
    """
    floor = 1e-3
    decayed = 0.1 * np.power(.995, current_iter)
    if decayed > floor:
        return decayed
    return floor


def hp_tuning_init_param(X_train, y_train, metric, k_folds, feval, random_state):
    """Run Optuna's ``LightGBMTunerCV`` and return the best parameters found.

    Args:
        X_train: Training features.
        y_train: Training labels.
        metric: Value passed as the LightGBM ``"metric"`` parameter.
        k_folds: Number of CV folds (``nfold``).
        feval: Custom evaluation function forwarded to the tuner.
        random_state: Used both as the LightGBM ``random_state`` and as the
            tuner's ``optuna_seed``.

    Returns:
        ``tuner.best_params`` after the tuner has run.
    """
    # Only warnings and above from Optuna.
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    base_params = {
        "objective": "binary",
        "metric": metric,
        "boosting_type": "gbdt",
        'random_state' : random_state,
        'verbose':-1,
        'is_unbalance': 'true'
    }

    training_set = opt_lgb.Dataset(X_train, label=y_train)

    # Per-iteration learning-rate schedule applied during tuning.
    lr_schedule = lgb.reset_parameter(learning_rate=learning_rate_decay_power_0995)

    tuner = opt_lgb.LightGBMTunerCV(
        base_params,
        training_set,
        verbose_eval=False,
        early_stopping_rounds=100,
        nfold=k_folds,
        stratified=True,
        show_progress_bar=False,
        optuna_seed=random_state,
        feval=feval,
        callbacks=[lr_schedule],
    )
    tuner.run()

    return tuner.best_params


In [None]:
# result_interpretation_helper #

def _binary_confusion_summary(y_true, y_pred):
    """Return ``(tn, fp, fn, tp, fpr, fnr)`` for binary labels.

    ``labels=[0, 1]`` pins the confusion matrix to 2x2 even when only one
    class occurs in ``y_true``/``y_pred``; without it ``ravel()`` would yield
    a single value and the unpacking would fail. A rate whose denominator is
    zero comes out as NaN/inf (NumPy division) rather than raising.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    fpr = fp / float(fp + tn)
    fnr = fn / float(fn + tp)
    return tn, fp, fn, tp, fpr, fnr


def accuracy_analysis(classifier, X_train, X_test, y_train, y_test, test_instance_weight):
    """Summarise binary classification quality in a one-row DataFrame.

    ``classifier.predict`` output is treated as the positive-class score and
    thresholded at 0.5 (scores below 0.5 become class 0).

    Args:
        classifier: Fitted model whose ``predict`` returns scores in [0, 1]
            (presumably a LightGBM Booster - confirm with callers).
        X_train, y_train: Training features/labels, used only for the
            ``train_*`` confusion-matrix columns.
        X_test, y_test: Test features/labels for all other metrics.
        test_instance_weight: pandas Series of per-row weights for the test
            set, used for the weighted metrics and the loss columns.

    Returns:
        DataFrame with one row and the columns ``weighted_mcc``,
        ``weighted_logloss``, ``weighted_bs``, ``mcc``, ``logloss``,
        ``brier_score_loss``, ``tn``, ``fp``, ``fn``, ``tp``, ``fpr``,
        ``fnr`` and their ``train_`` counterparts for the confusion matrix.
        When the weights are not all identical, ``total_loss``, ``fp_loss``
        and ``fn_loss`` (sum of weights over misclassified rows) are added.
    """
    pred_proba = classifier.predict(X_test)
    yhat = np.where(pred_proba < 0.5, 0, 1)
    train_yhat = np.where(classifier.predict(X_train) < 0.5, 0, 1)

    tn, fp, fn, tp, fpr, fnr = _binary_confusion_summary(y_test, yhat)
    train_tn, train_fp, train_fn, train_tp, train_fpr, train_fnr = _binary_confusion_summary(y_train, train_yhat)

    # Name/value pairs kept together so a column can never be mislabeled.
    metrics = {
        "weighted_mcc": matthews_corrcoef(y_test, yhat, sample_weight=test_instance_weight),
        "weighted_logloss": log_loss(y_test, pred_proba, sample_weight=test_instance_weight),
        "weighted_bs": brier_score_loss(y_test, pred_proba, sample_weight=test_instance_weight),
        "mcc": matthews_corrcoef(y_test, yhat),
        "logloss": log_loss(y_test, pred_proba),
        "brier_score_loss": brier_score_loss(y_test, pred_proba),
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "tp": tp,
        "fpr": fpr,
        "fnr": fnr,
        "train_tn": train_tn,
        'train_fp': train_fp,
        "train_fn": train_fn,
        "train_tp": train_tp,
        "train_fpr": train_fpr,
        "train_fnr": train_fnr,
    }

    # Built as a single column and transposed, as before, so every metric
    # shares one dtype in the resulting row.
    accuracy_df = pd.DataFrame(list(metrics.values())).transpose()
    accuracy_df.columns = list(metrics.keys())

    # Weight-based losses are only meaningful when the weights actually differ.
    if test_instance_weight.unique().size > 1:
        misclassified = yhat != y_test
        accuracy_df["total_loss"] = test_instance_weight[misclassified].sum()
        accuracy_df["fp_loss"] = test_instance_weight[misclassified & (yhat == 1)].sum()
        accuracy_df["fn_loss"] = test_instance_weight[misclassified & (yhat == 0)].sum()

    return accuracy_df



def multiclass_accuracy_analysis(classifier, X_test, y_test):
    """Return the confusion matrix of arg-max predictions as a DataFrame.

    Args:
        classifier: Fitted model whose ``predict`` returns one score per
            class for each row (the arg-max along axis 1 is the prediction).
        X_test: Test features.
        y_test: True class labels.

    Returns:
        DataFrame wrapping ``confusion_matrix(y_test, predicted_labels)``.
    """
    class_scores = classifier.predict(X_test)
    predicted_labels = list(class_scores.argmax(axis=1))
    matrix = confusion_matrix(y_test, predicted_labels)
    return pd.DataFrame(matrix)


    
def get_feature_importance(classifier):
    """Return feature importances sorted from most to least important.

    Args:
        classifier: Trained LightGBM ``Booster`` (anything exposing
            ``feature_name()`` and ``feature_importance()``).

    Returns:
        DataFrame with the columns ``Features`` and ``Importances``, sorted
        by ``Importances`` in descending order (original row index kept).
    """
    # Booster exposes `feature_importance()` (singular); the previous
    # `feature_importances()` call does not exist on it.
    feature_importance = pd.DataFrame({
        'Features': classifier.feature_name(),
        'Importances': classifier.feature_importance(),
    })
    return feature_importance.sort_values(by='Importances', ascending=False)

In [None]:
# training #

def get_warm_start_parameter(warm_start_file):
    """
    Load warm-start parameters from a JSON file, if the file is present.

    Args:
    warm_start_file (str): Path to the warm start parameter JSON file.

    Returns:
    dict or str: The parsed JSON content when the path exists, otherwise the
    empty string ``""`` (used by callers as a "no warm start" sentinel).
    """
    # Guard clause: a missing file is signalled with the empty-string sentinel.
    if not os.path.exists(warm_start_file):
        return ""

    with open(warm_start_file, 'r') as handle:
        return json.load(handle)
    



def train_profit_loss_binary(data_path, dictionary_path, warm_start_file, model_dir, new_customer):
    """Tune, evaluate and persist a binary profit/loss LightGBM model.

    Steps:
      1. Read the CSV at ``data_path`` and split it with
         ``fundtap_train_test_split`` (seed 456).
      2. Tune hyper-parameters with Optuna (35 trials, 5-fold stratified
         ``lgb.cv`` with early stopping and the decaying learning rate from
         ``learning_rate_decay_power_0995``); the first trial is seeded with
         the warm-start parameters.
      3. Train on the training split with the best parameters, score it on
         the test split via ``accuracy_analysis``.
      4. Retrain on train+test and write the model, the accuracy table and
         the best hyper-parameters to ``model_dir``.

    Args:
        data_path: CSV file with the processed training data.
        dictionary_path: Path of the data dictionary CSV.
        warm_start_file: JSON file of hyper-parameters used to seed the first
            Optuna trial. When it does not exist, built-in defaults are used.
            The ``<prefix>hyperparamter.json`` file written by this function
            has the format this argument expects.
        model_dir: Output directory; created if missing.
        new_customer: When truthy, prior-history columns are dropped from the
            data and outputs use the ``new_customer_profitloss`` prefix,
            otherwise ``existing_customer_profitloss``.

    Returns:
        None. Outputs are the three files written to ``model_dir``.
    """
    random_state = 456
    n_trials = 35

    # Metric keys as named by lgb.cv; newer LightGBM versions prefix them
    # (e.g. "valid "), so they are matched by suffix further down.
    score_mean = "binary_logloss-mean"
    score_stdv = "binary_logloss-stdv"

    # NOTE(review): some of these defaults lie outside the ranges searched in
    # `objective` (feature_fraction 0.88 > 0.8, lambda_l1/lambda_l2 < 1e-3);
    # Optuna presumably still evaluates them as given - confirm.
    default_warm_start = {"bagging_fraction": 0.5492535456145099, "bagging_freq": 2, "feature_fraction": 0.88, "lambda_l1": 0.00011548574578690704, "lambda_l2": 1.3199945533897172e-06, "max_depth": 12, "min_child_samples": 10, "num_leaves": 48}

    fixed_param = {
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting_type': 'gbdt',
        'random_state': random_state,
        'verbose': -1,
        'feature_pre_filter': False
    }

    # Load and label data
    df = pd.read_csv(data_path)

    # Read the warm-start file once; "" is the "file not found" sentinel.
    loaded_param = get_warm_start_parameter(warm_start_file)
    warm_starting_param = loaded_param if loaded_param != "" else default_warm_start

    if new_customer:
        # New customers have no prior funding history to learn from.
        columns_to_drop = ["funded_outstanding", "priorfundtaphistoryfundedsum", "priorfundtaphistorycompletedsum", "priorfundtaphistoryduesum", "priorfundtaphistorypendingsum"]
        df = df.drop(columns=columns_to_drop, errors='ignore')

    # full-model
    X_train, X_test, y_train, y_test, train_instance_weight, test_instance_weight, _, _ = fundtap_train_test_split(
        df, dictionary_path, random_state=random_state
    )

    def objective(trial):
        """Return CV mean + stdv of binary log-loss for one sampled config."""
        max_depth = trial.suggest_int('max_depth', 2, 14)
        # A tree of depth d has at most 2**d - 1 usable leaves here.
        max_num_leaves = (2 ** max_depth) - 1
        dtrain = lgb.Dataset(X_train, label=y_train, weight=train_instance_weight)
        param = {
            'objective': 'binary',
            'metric': "binary_logloss",
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'max_depth': max_depth,
            'num_leaves': trial.suggest_int('num_leaves', 2, min(128, max_num_leaves)),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 0.8),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
            'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-3, 1.0),
            'seed': random_state,
        }

        lgbcv = lgb.cv(
            param,
            dtrain,
            nfold=5,
            stratified=True,
            num_boost_round=10000,
            callbacks=[
                lgb.early_stopping(100),
                lgb.reset_parameter(learning_rate=learning_rate_decay_power_0995),
            ],
        )

        # Match by suffix so both "binary_logloss-mean" and prefixed variants
        # such as "valid binary_logloss-mean" are found.
        mean_key = next((key for key in lgbcv if key.endswith(score_mean)), None)
        stdv_key = next((key for key in lgbcv if key.endswith(score_stdv)), None)
        if mean_key is None or stdv_key is None:
            print(f"Keys {score_mean} and {score_stdv} not found in CV results.")
            return float('inf')  # Large value so this trial is not selected

        # With early stopping the history is cut at the best iteration, so its
        # length is the round count to reuse for the final fit.
        trial.set_user_attr("best_iteration", len(lgbcv[mean_key]))
        return lgbcv[mean_key][-1] + lgbcv[stdv_key][-1]

    # Seeded TPE sampler; with n_jobs > 1 the trial order can still vary.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    # Only search-space parameters matter here: `objective` never reads the
    # fixed LightGBM settings from the trial.
    study.enqueue_trial(dict(warm_starting_param))
    study.optimize(objective, n_trials=n_trials, gc_after_trial=True, n_jobs=3)

    best_params = study.best_params
    # Fall back to LightGBM's default round count if the best trial carries no
    # iteration info (only possible when the CV keys were not found).
    best_iteration = study.best_trial.user_attrs.get("best_iteration", 100)
    train_params = {**fixed_param, **best_params}

    def fit(dataset):
        # Same learning-rate schedule and round count that the CV score of
        # the selected trial was measured with.
        return lgb.train(
            train_params,
            dataset,
            num_boost_round=best_iteration,
            callbacks=[lgb.reset_parameter(learning_rate=learning_rate_decay_power_0995)],
        )

    # Hold-out evaluation: fit on the training split only.
    classifier = fit(lgb.Dataset(X_train, label=y_train, weight=train_instance_weight))
    accuracy = accuracy_analysis(
        classifier, X_train, X_test, y_train, y_test, test_instance_weight)

    # Final model: refit on all available rows before saving.
    final_dtrain = lgb.Dataset(
        pd.concat([X_train, X_test]),
        label=pd.concat([y_train, y_test]),
        weight=pd.concat([train_instance_weight, test_instance_weight]),
    )
    final_clf = fit(final_dtrain)

    # save components
    model_dir = Path(model_dir)
    model_dir.mkdir(exist_ok=True, parents=True)
    if new_customer:
        file_prefix = "new_customer_profitloss"
    else:
        file_prefix = "existing_customer_profitloss"
    # File names (including the "hyperparamter" spelling) are kept unchanged
    # so existing consumers of these outputs keep working.
    joblib.dump(final_clf, Path(model_dir, file_prefix+"classifier.joblib"))
    accuracy.to_csv(Path(model_dir, file_prefix+"accuracy.csv"))
    with open(Path(model_dir, file_prefix+"hyperparamter.json"), 'w') as fp:
        json.dump({**best_params}, fp)
    


In [None]:
# Function to clean dtype #
# Load the raw training CSV. low_memory=False makes pandas read the file in
# one pass instead of in chunks.
df = pd.read_csv('../data/train.csv', low_memory=False)


# Positional indices of the columns to be coerced to int64 below.
# NOTE(review): hard-coded for the current layout of train.csv - re-check
# them whenever the file's column order changes.
mixed_type_column_indices = [7, 16]  # Replace with your actual column indices

# Convert the column indices to column names
mixed_type_columns = df.columns[mixed_type_column_indices]

# Function to convert non-int64 values to 0
def convert_to_int64(df, columns):
    """Coerce the given columns to int64, mapping unparseable values to 0.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to modify.
        columns: Iterable of column names to convert.

    Returns:
        The same ``df`` object, with each listed column as ``int64``.
    """
    for column_name in columns:
        # Values that cannot be parsed as numbers become NaN ...
        numeric_values = pd.to_numeric(df[column_name], errors='coerce')
        # ... and NaN becomes 0 before the cast to int64.
        df[column_name] = numeric_values.fillna(0).astype('int64')
    return df

# Coerce the selected columns to int64; values that are not numeric become 0.
df = convert_to_int64(df, mixed_type_columns)

# Save the cleaned DataFrame to a new CSV file (without the index column);
# the next cell reads this file back.
cleaned_data_path = '../data/train_cleaned.csv'
df.to_csv(cleaned_data_path, index=False)


In [None]:
# Function to clean column names #
def clean_column_names(df):
    """Return a copy with sanitised column names and no missing profit/loss.

    Every character other than ASCII letters, digits and ``_`` is removed
    from the column names; rows whose ``fundtap_profit_loss`` is missing are
    then dropped. The input frame is left unchanged (previously its column
    names were overwritten in place while the returned frame was a different
    object).

    Args:
        df: DataFrame with string column names.

    Returns:
        A new DataFrame with cleaned column names and only the rows that
        have a ``fundtap_profit_loss`` value.

    Raises:
        KeyError: If no ``fundtap_profit_loss`` column exists after cleaning.
    """
    cleaned_names = df.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)
    # rename() returns a new frame, so the caller's columns stay untouched.
    renamed = df.rename(columns=dict(zip(df.columns, cleaned_names)))
    return renamed.dropna(subset=['fundtap_profit_loss'])

# Read back the dtype-cleaned CSV written by the previous cell, sanitise the
# column names and drop the rows without a fundtap_profit_loss value.
train_data = pd.read_csv('../data/train_cleaned.csv')
processed_data = clean_column_names(train_data)

# Save the processed DataFrame to a CSV file; this path is the input of the
# training call below.
processed_data_path = '../data/processed_data.csv'
processed_data.to_csv(processed_data_path, index=False)

In [None]:
# Train according to the data-dictionary #
dictionary_path = "../data/fundtap-data-dictionary.csv"
# Placeholder path: when this file does not exist, train_profit_loss_binary
# falls back to its built-in default warm-start parameters.
warm_start_file = 'path/to/warm_start_parameters.json'
# Output directory for the model, accuracy table and hyper-parameter JSON.
model_dir = Path('../model_output')
model_dir.mkdir(exist_ok=True, parents=True)
# True: drop the prior-history columns and use the "new_customer_" file prefix.
new_customer = True

# Run the training function (reads processed_data_path written by the cell above)
train_profit_loss_binary(processed_data_path, dictionary_path, warm_start_file, model_dir, new_customer)


In [None]:
# Re-create the train/test split on the in-memory processed data (same seed
# as the training run) so its pieces can be inspected and exported below.
df = processed_data
X_train, X_test, y_train, y_test, train_instance_weight, test_instance_weight, train_holdout, test_holdout = fundtap_train_test_split(
    df,
    dictionary_path,
    random_state=456,
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
# Print the shape, as the label says (previously the whole Series was printed).
print("Train instance weight shape:", train_instance_weight.shape)
print("Test instance weight shape:", test_instance_weight.shape)
print(train_holdout)
print(test_holdout)
print(len(df.columns))
# "label" exists on df because the split adds it to the frame it is given.
print(df["label"])



In [None]:
import pandas as pd

def export_features_to_txt(X_train, X_test, y_train, y_test, file_path,
                           train_instance_weight=None, test_instance_weight=None):
    """Write a plain-text overview of the split data to ``file_path``.

    The file lists the feature names and first five rows of ``X_train`` and
    ``X_test``, the first five values of ``y_train`` and ``y_test``, and the
    complete train/test instance-weight series.

    Args:
        X_train, X_test: Feature DataFrames.
        y_train, y_test: Label Series.
        file_path: Destination text file; missing parent directories are
            created.
        train_instance_weight, test_instance_weight: Weight Series to export.
            When omitted, the module-level variables of the same names are
            used (the behaviour before these parameters existed).

    Raises:
        KeyError: If a weight series is omitted and no module-level variable
            of that name exists.
    """
    # Legacy fallback: older calls pass only five arguments and rely on the
    # notebook globals created by the split cell.
    if train_instance_weight is None:
        train_instance_weight = globals()["train_instance_weight"]
    if test_instance_weight is None:
        test_instance_weight = globals()["test_instance_weight"]

    # open(..., 'w') does not create directories, so make sure they exist.
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)

    sections = [
        ("Features in X_train:", "\n".join(map(str, X_train.columns))),
        ("First 5 rows of X_train:", X_train.head().to_string()),
        ("Features in X_test:", "\n".join(map(str, X_test.columns))),
        ("First 5 rows of X_test:", X_test.head().to_string()),
        ("First 5 values of y_train:", y_train.head().to_string()),
        ("First 5 values of y_test:", y_test.head().to_string()),
        ("Features of train_instance_weight:", train_instance_weight.to_string()),
        ("Features of test_instance_weight:", test_instance_weight.to_string()),
    ]

    with open(file_path, 'w') as f:
        for title, body in sections:
            # Each section: title line, content, then one blank line.
            f.write(title + "\n")
            f.write(body + "\n\n")

# Relies on X_train, X_test, y_train, y_test and the train/test instance-weight
# Series created by the split cell above being present in the notebook namespace.
export_features_to_txt(X_train, X_test, y_train, y_test, "../model_output_history/model_features/features_train02.txt")


Functions updated
1. Fixed the missing weighted scores and the unexpectedly high FPR by further cleaning the data (6, 7, fundtap_profit_loss).
2. Tried to handle warm starting.
3. Output the training data features and instance weights as a txt file.
4. Added some more parameters, but they hurt the model performance (higher FPR and FNR).

Problems to be checked:
1. The warm start file is not created. Where should it be built?
2. Keep the added parameters, or remove them to regain the better performance score?