In [None]:
#import packages#
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import  train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
import lightgbm as lgb
import optuna
import optuna.integration.lightgbm as opt_lgb
from sklearn.metrics import log_loss, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix, brier_score_loss
import argparse
import joblib
from lightgbm import LGBMClassifier
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import optuna
import json
import gc
import shap
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [None]:
#training_helper#

def load_dictionary(dictionary_path):
    """Read the data-dictionary CSV and return its flagged column lists.

    Args:
        dictionary_path: Path of a CSV with the columns "columns_cleaned",
            "use_for_training" and "hold_out_columns".

    Returns:
        Tuple ``(training_columns, hold_out_columns)``: the "columns_cleaned"
        values of the rows whose respective flag column equals "Y".
    """
    dictionary = pd.read_csv(dictionary_path)

    def _names_flagged(flag_column):
        # Rows are kept only when the flag is exactly the string "Y".
        return dictionary.loc[dictionary[flag_column] == "Y", "columns_cleaned"].tolist()

    training_columns = _names_flagged("use_for_training")
    hold_out_columns = _names_flagged("hold_out_columns")
    return training_columns, hold_out_columns

def select_data(data, dictionary_path, target_column):
    """Keep only the training features plus the target column.

    Feature columns are those flagged for training in the data dictionary,
    minus any hold-out column and minus the target itself. Object-typed
    columns of the result are converted to the pandas "category" dtype.

    Args:
        data: Source DataFrame; it is not modified.
        dictionary_path: Path of the data-dictionary CSV (see load_dictionary).
        target_column: Name of the label column to keep alongside the features.

    Returns:
        A new DataFrame with the feature columns followed by the target column.
    """
    training_columns, hold_out_columns = load_dictionary(dictionary_path)

    # Hold-out columns and the target must never be used as features.
    excluded = set(hold_out_columns) | {target_column}
    feature_columns = [col for col in training_columns if col not in excluded]

    # Explicit copy: the dtype conversion below assigns columns, and doing that
    # on a slice of the caller's frame triggers SettingWithCopyWarning.
    selected = data[feature_columns + [target_column]].copy()

    # Treat string-like (object) columns as categorical features.
    cat_features = list(selected.select_dtypes(include=['object']).columns)
    if cat_features:
        selected[cat_features] = selected[cat_features].astype("category")
    return selected

def balanced_train_validation_test(data, target_column, random_state):
    """Split ``data`` 80/20 into train and test sets, stratified on the target.

    Args:
        data: DataFrame containing the features and the target column.
        target_column: Name of the label column.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    is_feature = data.columns != target_column
    features = data.loc[:, is_feature]
    labels = data[target_column]
    # Stratifying on the labels keeps the class proportions equal in both parts.
    parts = train_test_split(
        features,
        labels,
        test_size=0.20,
        random_state=random_state,
        stratify=labels,
    )
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test


def fundtap_train_test_split(data, dictionary_path, target_column="label", random_state=456):
    """Select features, split train/test and derive per-row instance weights.

    Args:
        data: DataFrame holding the features, the target column and a
            "fundtap_profit_loss" column.
        dictionary_path: Path of the data-dictionary CSV used by select_data.
        target_column: Name of the label column.
        random_state: Seed for the stratified split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, train_instance_weight,
        test_instance_weight)``; each weight is the absolute value of
        "fundtap_profit_loss" for the rows of the matching split.

    Raises:
        KeyError: If ``data`` has no "fundtap_profit_loss" column.
    """
    selected = select_data(data, dictionary_path, target_column)
    X_train, X_test, y_train, y_test = balanced_train_validation_test(selected, target_column, random_state)

    # Read the weights from the caller's frame, not from `selected`:
    # select_data drops hold-out columns, so "fundtap_profit_loss" is not
    # guaranteed to survive the selection. Column selection keeps the index,
    # so the split indices still address the right rows.
    profit_loss = data["fundtap_profit_loss"]
    train_instance_weight = np.abs(profit_loss.loc[X_train.index])
    test_instance_weight = np.abs(profit_loss.loc[X_test.index])

    return X_train, X_test, y_train, y_test, train_instance_weight, test_instance_weight



In [None]:
#hp_tuning_helper#

def learning_rate_decay_power_0995(current_iter):
    """Return the learning rate for boosting iteration ``current_iter``.

    The rate starts at 0.1, is multiplied by 0.995 per iteration and never
    drops below 1e-3.
    """
    floor = 1e-3
    decayed = 0.1 * np.power(.995, current_iter)
    if decayed > floor:
        return decayed
    return floor


def hp_tuning_init_param(X_train, y_train, metric, k_folds, feval, random_state):
    """Run Optuna's step-wise LightGBM CV tuner and return the best parameters.

    Args:
        X_train: Training features.
        y_train: Training labels.
        metric: Value for LightGBM's "metric" parameter.
        k_folds: Number of stratified CV folds.
        feval: Custom evaluation function forwarded to the tuner.
        random_state: Seed used for both LightGBM and the Optuna sampler.

    Returns:
        ``tuner.best_params`` after the tuner has run.
    """
    # Only warnings and above from Optuna; the tuner is otherwise very chatty.
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    dtrain = opt_lgb.Dataset(X_train, label=y_train)

    # Parameters held constant while the tuner searches the remaining ones.
    fixed_param = {
        "objective": "binary",
        "metric": metric,
        "boosting_type": "gbdt",
        'random_state': random_state,
        'verbose': -1,
        'is_unbalance': 'true'
    }

    # NOTE(review): `verbose_eval` and `early_stopping_rounds` are passed as
    # keyword arguments here, while train_profit_loss_binary configures early
    # stopping through `lgb.early_stopping` in `callbacks` — confirm these
    # keywords are accepted by the installed Optuna/LightGBM versions.
    tuner = opt_lgb.LightGBMTunerCV(
        fixed_param,
        dtrain,
        verbose_eval=False,
        early_stopping_rounds=100,
        nfold=k_folds,
        stratified=True,
        show_progress_bar=False,
        optuna_seed=random_state,
        feval=feval,
        callbacks=[lgb.reset_parameter(learning_rate=learning_rate_decay_power_0995)],
    )

    tuner.run()
    return tuner.best_params


In [None]:
#result_interpretation_helper#

def accuracy_analysis(classifier, X_train, X_test, y_train, y_test, test_instance_weight):
    """Summarise binary-classification quality as a one-row DataFrame.

    ``classifier.predict`` is expected to return scores that are thresholded
    at 0.5 (below -> class 0, otherwise class 1).

    Args:
        classifier: Fitted model exposing ``predict``.
        X_train, y_train: Training features and labels (confusion counts only).
        X_test, y_test: Test features and labels (all metrics).
        test_instance_weight: Per-row weights (a pandas Series) for the
            weighted metrics and the loss totals.

    Returns:
        DataFrame with one row holding weighted and unweighted MCC, log loss
        and Brier score, confusion counts and false-positive/false-negative
        rates for test and train. When the weights are not all identical it
        also holds "total_loss", "fp_loss" and "fn_loss": the summed weight of
        misclassified test rows overall, predicted 1, and predicted 0.
    """
    def _threshold(scores):
        # Scores below 0.5 become class 0, everything else class 1.
        return np.where(scores < 0.5, 0, 1)

    def _counts_and_rates(actual, predicted):
        # (tn, fp, fn, tp, false-positive rate, false-negative rate)
        tn_, fp_, fn_, tp_ = confusion_matrix(actual, predicted).ravel()
        fpr_ = (fp_) / float(fp_ + tn_)
        fnr_ = (fn_) / float(fn_ + tp_)
        return tn_, fp_, fn_, tp_, fpr_, fnr_

    pred_proba = classifier.predict(X_test)
    yhat = _threshold(pred_proba)

    # Unweighted metrics.
    mcc = matthews_corrcoef(y_test, yhat)
    logloss = log_loss(y_test, pred_proba)
    bs = brier_score_loss(y_test, pred_proba)

    # The same metrics with every row weighted by its instance weight.
    weighted_mcc = matthews_corrcoef(y_test, yhat, sample_weight=test_instance_weight)
    weighted_logloss = log_loss(y_test, pred_proba, sample_weight=test_instance_weight)
    weighted_bs = brier_score_loss(y_test, pred_proba, sample_weight=test_instance_weight)

    test_stats = _counts_and_rates(y_test, yhat)
    train_stats = _counts_and_rates(y_train, _threshold(classifier.predict(X_train)))

    column_names = [
        "weighted_mcc", "weighted_logloss", "weighted_bs",
        "mcc", "logloss", "brier_score_loss",
        "tn", "fp", "fn", "tp", "fpr", "fnr",
        "train_tn", 'train_fp', "train_fn", "train_tp", "train_fpr", "train_fnr",
    ]
    values = [weighted_mcc, weighted_logloss, weighted_bs, mcc, logloss, bs]
    values.extend(test_stats)
    values.extend(train_stats)

    # Built as a single column and transposed, so every value shares one dtype.
    accuracy_df = pd.DataFrame(values).transpose()
    accuracy_df.columns = column_names

    # Loss totals only carry information when the weights actually differ.
    if test_instance_weight.unique().size > 1:
        misclassified = yhat != y_test
        accuracy_df["total_loss"] = test_instance_weight[misclassified].sum()
        accuracy_df["fp_loss"] = test_instance_weight[misclassified & (yhat == 1)].sum()
        accuracy_df["fn_loss"] = test_instance_weight[misclassified & (yhat == 0)].sum()
    return accuracy_df


def multiclass_accuracy_analysis(classifier, X_test, y_test):
    """Return the confusion matrix of a multiclass classifier as a DataFrame.

    ``classifier.predict`` must return one score per class for every row; the
    predicted class is the column index with the highest score.
    """
    class_scores = classifier.predict(X_test)
    predicted_classes = list(class_scores.argmax(axis=1))
    matrix = confusion_matrix(y_test, predicted_classes)
    return pd.DataFrame(matrix)


    
def get_feature_importance(classifier):
    """Return a feature-importance table sorted from most to least important.

    Args:
        classifier: A trained LightGBM ``Booster`` (or any object exposing
            ``feature_name()`` and ``feature_importance()``).

    Returns:
        DataFrame with the columns "Features" and "Importances", ordered by
        descending importance.
    """
    # The Booster method is `feature_importance()`; `feature_importances()`
    # does not exist and raised AttributeError.
    feature_importance = pd.DataFrame({
        'Features': classifier.feature_name(),
        'Importances': classifier.feature_importance(),
    })
    return feature_importance.sort_values(by='Importances', ascending=False)

In [None]:
#training#
"""
TRAINING FUNCTIONS: this file is run in 'script mode' when `.fit` is called
from the notebook. `parse_args` and `train_fn` are called in the
`if __name__ =='__main__'` block.
"""
# Initialise SHAP's JavaScript support so its interactive plots can render in
# the notebook.
shap.initjs()


# Schema item types that get_numerical_idxs treats as numerical features.
NUMERICAL_TYPES = {"boolean", "integer", "number"}
# Schema item types that get_categorical_idxs treats as categorical features.
CATEGORICAL_TYPES = {"string"}


class AsTypeFloat32(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that casts its input to float32."""

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` converted with ``astype("float32")``."""
        converted = X.astype("float32")
        return converted


def get_numerical_idxs(data_schema):
    """Return the positions of the schema items whose type is in NUMERICAL_TYPES."""
    return get_idxs(data_schema, NUMERICAL_TYPES)


def get_categorical_idxs(data_schema):
    """Return the positions of the schema items whose type is in CATEGORICAL_TYPES."""
    return get_idxs(data_schema, CATEGORICAL_TYPES)


def get_idxs(data_schema, types):
    """Return the positions in ``data_schema.item_types`` whose value is in ``types``.

    Args:
        data_schema: Object exposing an ``item_types`` iterable.
        types: Container of accepted type names.

    Returns:
        List of matching positions in ascending order.
    """
    return [
        position
        for position, item_type in enumerate(data_schema.item_types)
        if item_type in types
    ]


def create_preprocessor(data_schema) -> ColumnTransformer:
    """Build the column transformer described by ``data_schema``.

    Numerical columns are cast to float32, categorical columns are one-hot
    encoded (unknown categories are ignored at transform time) and every
    other column is dropped.
    """
    numerical_step = (
        "numerical",
        AsTypeFloat32(),
        get_numerical_idxs(data_schema),
    )
    categorical_step = (
        "categorical",
        OneHotEncoder(handle_unknown="ignore"),
        get_categorical_idxs(data_schema),
    )
    # Numerical first, categorical second: transform_schema asserts this order.
    return ColumnTransformer(
        transformers=[numerical_step, categorical_step],
        remainder="drop",
    )


def preprocess_numerical_schema(preprocessor, data_schema):
    """Describe the numerical output features of ``preprocessor``.

    Returns:
        Tuple ``(num_idx, features)``: the position of the "numerical" step in
        ``preprocessor.transformers`` and one ``{"title", "description",
        "type"}`` dict per numerical schema item, with type "number".
    """
    step_names = [step[0] for step in preprocessor.transformers]
    num_idx = step_names.index("numerical")

    numerical_items = [data_schema.items[idx] for idx in get_numerical_idxs(data_schema)]
    features = [
        {
            "title": item["title"],
            "description": item["description"],
            "type": "number"
        }
        for item in numerical_items
    ]
    return num_idx, features


def preprocess_categorical_schema(preprocessor, data_schema):
    """Describe the one-hot output features of a fitted ``preprocessor``.

    Returns:
        Tuple ``(cat_idx, features)``: the position of the "categorical" step
        in ``preprocessor.transformers`` and one feature dict per
        (categorical item, learned category) pair, in encoder order.
    """
    step_names = [step[0] for step in preprocessor.transformers]
    cat_idx = step_names.index("categorical")

    categorical_items = [data_schema.items[idx] for idx in get_categorical_idxs(data_schema)]
    # `transformers_` (trailing underscore) only exists after fitting.
    encoder = preprocessor.transformers_[cat_idx][1]

    features = [
        {
            "title": "{}__{}".format(item["title"], category),
            "description": "{} is '{}' if value is 1.0.".format(
                item["description"].strip('.'), category
            ),
            "type": "number"
        }
        for item, categories in zip(categorical_items, encoder.categories_)
        for category in categories
    ]
    return cat_idx, features


def transform_schema(preprocessor, data_schema):
    """Build the array schema describing the preprocessor's output features.

    Numerical features come first, followed by the one-hot categorical ones.
    The title is copied from ``data_schema`` and its description is reused
    with "items" replaced by "features".
    """
    num_idx, num_features = preprocess_numerical_schema(preprocessor, data_schema)  # noqa
    cat_idx, cat_features = preprocess_categorical_schema(preprocessor, data_schema)  # noqa
    assert num_idx < cat_idx, "Ordering should be numerical, then categorical."

    features = num_features + cat_features
    feature_count = len(features)
    description = data_schema.description.replace("items", "features")

    array_schema = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "type": "array",
        "minItems": feature_count,
        "maxItems": feature_count,
        "items": features,
        "title": data_schema.title,
        "description": description,
    }
    # NOTE(review): `schemas` is not imported in the visible notebook cells —
    # confirm where it is expected to come from.
    return schemas.Schema(array_schema)


def load_schemas(schemas_folder):
    """Load "data.schema.json" and "label.schema.json" from ``schemas_folder``.

    Returns:
        Tuple ``(data_schema, label_schema)``.
    """
    # NOTE(review): `schemas` is not imported in the visible notebook cells —
    # confirm where it is expected to come from.
    data_schema = schemas.from_json_schema(Path(schemas_folder, "data.schema.json"))
    label_schema = schemas.from_json_schema(Path(schemas_folder, "label.schema.json"))
    return data_schema, label_schema


def log_cross_val_auc(clf, X, y, cv_splits, log_prefix):
    """Print the cross-validated ROC AUC as mean (+/- two standard deviations)."""
    fold_scores = cross_val_score(clf, X, y, cv=cv_splits, scoring='roc_auc')
    mean_auc = fold_scores.mean()
    # Two standard deviations across folds are reported as the error band.
    error_band = fold_scores.std() * 2
    print("{}_auc_cv: {:.5f} (+/- {:.5f})".format(log_prefix, mean_auc, error_band))


def log_auc(clf, X, y, log_prefix):
    """Print the ROC AUC of ``clf`` on ``(X, y)``, prefixed with ``log_prefix``."""
    # Column 1 of predict_proba is used as the score.
    positive_scores = clf.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, positive_scores)
    print('{}_auc: {:.5f}'.format(log_prefix, auc))


def train_pipeline(pipeline, X, y, cv_splits):
    """Optionally log cross-validated AUC, then fit ``pipeline`` on all data.

    Cross-validation is only run when ``cv_splits`` is greater than 1. The
    training AUC of the final fit is always logged.

    Returns:
        The fitted pipeline (the same object that was passed in).
    """
    run_cross_validation = cv_splits > 1
    if run_cross_validation:
        log_cross_val_auc(pipeline, X, y, cv_splits, 'train')

    # Final fit on the complete training data.
    pipeline.fit(X, y)
    log_auc(pipeline, X, y, 'train')
    return pipeline


def test_pipeline(pipeline, X, y):
    """Log the ROC AUC of a fitted ``pipeline`` on held-out data."""
    log_auc(clf=pipeline, X=X, y=y, log_prefix='test')


def parse_args(sys_args):
    """Parse the training script's command-line arguments.

    Directory and channel options default to the matching ``SM_*`` environment
    variables. Unknown arguments are ignored.

    Args:
        sys_args: List of argument strings (e.g. ``sys.argv[1:]``).

    Returns:
        ``argparse.Namespace`` with ``tree_boosting_type``, ``cv_splits``,
        ``model_dir``, ``schemas``, ``data_train``, ``hyperparameters``,
        ``label_train``, ``data_test``, ``label_test`` and ``random_state``
        (int). ``ramdom_state`` is kept as an alias of ``random_state`` for
        callers written against the original, misspelled option.
    """
    parser = argparse.ArgumentParser()

    # (flag, type, default) of every plain option.
    plain_options = [
        ("--tree-boosting-type", str, "gbdt"),
        ("--cv-splits", int, 5),
        ("--model-dir", str, os.environ.get("SM_MODEL_DIR")),
        ("--schemas", str, os.environ.get("SM_CHANNEL_SCHEMAS")),
        ("--data-train", str, os.environ.get("SM_CHANNEL_DATA_TRAIN")),
        ("--hyperparameters", str, ""),
        ("--label-train", str, os.environ.get("SM_CHANNEL_LABEL_TRAIN")),
        ("--data-test", str, os.environ.get("SM_CHANNEL_DATA_TEST")),
        ("--label-test", str, os.environ.get("SM_CHANNEL_LABEL_TEST")),
    ]
    for flag, value_type, default in plain_options:
        parser.add_argument(flag, type=value_type, default=default)

    # The option was originally misspelled "--ramdom-state" and parsed as str
    # although its default is an int. Accept both spellings and always parse
    # the value as an int.
    parser.add_argument(
        "--random-state",
        "--ramdom-state",
        dest="random_state",
        type=int,
        default=456,
    )

    args, _ = parser.parse_known_args(sys_args)
    # Backward-compatible alias for code that reads the misspelled attribute.
    args.ramdom_state = args.random_state
    return args

def get_shap_values(X_train, X_test, final_clf, model_dir, file_prefix):
    """Compute SHAP values over the train and test rows and save the plots.

    Files written into ``model_dir``:
      * ``<file_prefix>.png``: SHAP summary plot (up to 40 features),
      * ``<file_prefix>X.csv``: the numeric feature matrix that was explained,
      * ``<file_prefix>_<feature>.png``: one dependence plot for each of the
        30 features with the largest mean absolute SHAP value.

    Args:
        X_train, X_test: Feature DataFrames; they are not modified.
        final_clf: Trained tree model handed to ``shap.TreeExplainer``.
        model_dir: Output directory.
        file_prefix: Prefix of every written file name.

    Returns:
        None.
    """
    # Work on copies: the alignment loop below assigns columns, which would
    # otherwise rewrite the caller's frames as a side effect.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Give every categorical column the same category set in both frames.
    cat_cols = X_train.select_dtypes(include=['category', 'object']).columns
    for col in cat_cols:
        combined = pd.concat([X_train[col], X_test[col]], axis=0).astype('category')
        shared_categories = combined.cat.categories
        X_train[col] = X_train[col].astype('category').cat.set_categories(shared_categories)
        X_test[col] = X_test[col].astype('category').cat.set_categories(shared_categories)

    # NOTE(review): to_numeric(errors='coerce') turns values it cannot parse
    # into NaN and fillna(0) then replaces them with 0, so non-numeric
    # category labels all end up as 0 — confirm this is the intended encoding.
    X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
    X_test = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

    # Explain train and test rows together.
    X = pd.concat([X_train, X_test])

    explainer = shap.TreeExplainer(final_clf, feature_perturbation='interventional')
    shap_values = explainer.shap_values(X)

    print(f"Type of shap_values: {type(shap_values)}")
    print(f"Shape of shap_values: {shap_values.shape if isinstance(shap_values, np.ndarray) else [v.shape for v in shap_values]}")

    if isinstance(shap_values, list):
        # A list holds one array per class; only the entry for class 1 is used.
        # NOTE(review): the only visible caller trains a 5-class model —
        # confirm that explaining class 1 alone is intended.
        shap_values = shap_values[1]
    shap_values = np.asarray(shap_values)
    print(f"Shape of shap_values after conversion: {shap_values.shape}")

    # Global importance = mean absolute SHAP value per feature.
    vals = np.abs(shap_values).mean(axis=0)
    print(f"Shape of vals: {vals.shape}")

    feature_importance = pd.DataFrame(
        list(zip(X_train.columns, vals)),
        columns=['col_name', 'feature_importance_vals'],
    ).sort_values(by=['feature_importance_vals'], ascending=False)

    shap.summary_plot(shap_values, X, max_display=40, show=False)
    plt.savefig(Path(model_dir, file_prefix + ".png"), bbox_inches='tight')
    plt.close()
    X.to_csv(Path(model_dir, file_prefix + "X.csv"))

    # Dependence plots for the 30 most important features; every figure is
    # closed after saving so figures do not accumulate in memory.
    for name in list(feature_importance.col_name)[0:30]:
        shap.dependence_plot(name, shap_values, X, display_features=X, show=False)
        plt.savefig(Path(model_dir, file_prefix + "_" + name + ".png"), bbox_inches='tight')
        plt.close()



def train_profit_loss_binary(df, dictionary_path, hyperparameters, model_dir, new_customer):
    """Tune, evaluate and save the binary profit/loss LightGBM classifier.

    The label is ``fundtap_profit_loss >= 0``. Hyper-parameters are searched
    with Optuna (10 trials, warm-started from known-good values), scored by
    5-fold CV log loss (mean plus standard deviation). Accuracy is measured
    on the 20% test split, after which the model is refitted on all rows.

    Files written into ``model_dir`` (prefix "new_customer_profitloss" or
    "existing_customer_profitloss"): ``<prefix>classifier.joblib``,
    ``<prefix>accuracy.csv`` and ``<prefix>hyperparamter.json``.

    Args:
        df: DataFrame with a "fundtap_profit_loss" column; it is not modified.
        dictionary_path: Path of the data-dictionary CSV.
        hyperparameters: Path of a warm-start parameter file, or "" to use the
            built-in defaults.
        model_dir: Output directory (created when missing).
        new_customer: When true, the prior-funding-history columns are dropped
            before training.

    Returns:
        None.
    """
    def _cv_series(cv_results, name):
        # LightGBM 4 prefixes cv() result keys with "valid "; older releases
        # use the bare metric name. Accept either so trials are really scored.
        for key in ("valid " + name, name):
            if key in cv_results:
                return cv_results[key]
        return None

    # assign() builds a new frame, so the caller's DataFrame does not gain or
    # lose a "label" column as a side effect of training.
    df = df.assign(label=df.fundtap_profit_loss >= 0)

    # Warm-start values enqueued as the first Optuna trial.
    if hyperparameters != "":
        # NOTE(review): `datasets` is not imported in the visible notebook
        # cells — this branch raises NameError unless it is defined elsewhere.
        warm_starting_param = datasets.read_hyper_parameters(hyperparameters)
    else:
        warm_starting_param = {"bagging_fraction": 0.5492535456145099, "bagging_freq": 2, "feature_fraction": 0.88, "lambda_l1": 0.00011548574578690704, "lambda_l2": 1.3199945533897172e-06, "max_depth": 12, "min_child_samples": 10, "num_leaves": 48}
    random_state = 456
    fixed_param = {
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting_type': 'gbdt',
        'random_state': random_state,
        'verbose': -1,
        'feature_pre_filter': False
    }

    if new_customer:
        # New customers have no funding history, so those columns are removed.
        columns_to_drop = ["funded_outstanding", "priorfundtaphistoryfundedsum", "priorfundtaphistorycompletedsum", "priorfundtaphistoryduesum", "priorfundtaphistorypendingsum"]
        df = df.drop(columns=columns_to_drop, errors='ignore')

    X_train, X_test, y_train, y_test, train_instance_weight, test_instance_weight = fundtap_train_test_split(df, dictionary_path, target_column="label", random_state=456)
    weight = train_instance_weight

    def objective(trial):
        max_depth = trial.suggest_int('max_depth', 2, 14)
        # A tree of depth d has at most 2**d - 1 useful leaves in this search.
        max_num_leaves = (2 ** max_depth) - 1
        dtrain = lgb.Dataset(X_train, label=y_train, weight=weight)
        param = {
            'objective': 'binary',
            'metric': "binary_logloss",
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'max_depth': max_depth,
            'num_leaves': trial.suggest_int('num_leaves', 2, max_num_leaves),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 0, 15),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
            'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-3, 10),
            'seed': random_state
        }

        lgbcv = lgb.cv(
            param,
            dtrain,
            nfold=5,
            stratified=True,
            num_boost_round=10000,
            callbacks=[
                lgb.early_stopping(100),
                lgb.reset_parameter(learning_rate=learning_rate_decay_power_0995)
            ]
        )

        score_mean = "binary_logloss-mean"
        score_stdv = "binary_logloss-stdv"
        mean_series = _cv_series(lgbcv, score_mean)
        stdv_series = _cv_series(lgbcv, score_stdv)
        if mean_series is None or stdv_series is None:
            print(f"Keys {score_mean} and {score_stdv} not found in CV results.")
            # A large value ensures this trial is never selected.
            return float('inf')

        # Penalise unstable folds: last recorded mean plus its standard deviation.
        return mean_series[-1] + stdv_series[-1]

    study = optuna.create_study(direction="minimize")  # default TPE sampler
    study.enqueue_trial({**fixed_param, **warm_starting_param})
    n_trials = 10
    study.optimize(objective, n_trials=n_trials, gc_after_trial=True, n_jobs=3)

    best_params = study.best_params

    # Evaluation model: fitted on the training split only.
    dtrain = lgb.Dataset(X_train, label=y_train, weight=weight)
    classifier = lgb.train({**fixed_param, **best_params}, dtrain)
    accuracy = accuracy_analysis(
        classifier, X_train, X_test, y_train, y_test, test_instance_weight)

    # Final model: refitted on train and test rows together.
    final_dtrain = lgb.Dataset(pd.concat([X_train, X_test]), label=pd.concat([y_train, y_test]), weight=pd.concat([train_instance_weight, test_instance_weight]))
    final_clf = lgb.train({**fixed_param, **best_params}, final_dtrain)

    model_dir = Path(model_dir)
    model_dir.mkdir(exist_ok=True, parents=True)
    if new_customer:
        file_prefix = "new_customer_profitloss"
    else:
        file_prefix = "existing_customer_profitloss"
    joblib.dump(final_clf, Path(model_dir, file_prefix + "classifier.joblib"))
    accuracy.to_csv(Path(model_dir, file_prefix + "accuracy.csv"))
    with open(Path(model_dir, file_prefix + "hyperparamter.json"), 'w') as fp:
        json.dump({**best_params}, fp)
        
    


def train_profit_loss_multi(train_data_path, hyperparameters, model_dir, new_customer, dictionary_path=None):
    """Tune, evaluate and save the 5-class "weeks past due" LightGBM model.

    The label is ``weekspastdue`` with every value that is not below 4 mapped
    to class 4. Rows whose ``fundtap_profit_loss`` is null or zero are
    excluded. Hyper-parameters are searched with Optuna (10 trials) using
    3-fold CV multi-class log loss (mean plus standard deviation).

    Files written into ``model_dir`` (prefix "new_customer_overdue" or
    "existing_customer_overdue"): ``<prefix>classifier.joblib``,
    ``<prefix>accuracy.csv`` (confusion matrix), ``<prefix>hyperparamter.json``
    and the SHAP outputs of get_shap_values.

    Args:
        train_data_path: Location of the training CSV.
        hyperparameters: Path of a warm-start parameter file, or "" to use the
            built-in defaults.
        model_dir: Output directory (created when missing).
        new_customer: When true, the prior-funding-history columns are dropped
            before training.
        dictionary_path: Path of the data-dictionary CSV that
            fundtap_train_test_split needs to select features.

    Returns:
        1 when the data has no "weekspastdue" column, otherwise None.

    Raises:
        ValueError: If ``dictionary_path`` is not supplied.
    """
    def encoding_label(x):
        # Values below 4 keep their own class; everything else is class 4.
        return x if x < 4 else 4

    def _cv_series(cv_results, name):
        # LightGBM 4 prefixes cv() result keys with "valid "; older releases
        # use the bare metric name. Accept either.
        for key in ("valid " + name, name):
            if key in cv_results:
                return cv_results[key]
        raise KeyError(name)

    # NOTE(review): `datasets` and `preprocessing` are not defined or imported
    # in the visible notebook cells — confirm where they come from.
    df = datasets.read_csv_dataset(train_data_path)

    # Parentheses are required: `&` binds tighter than `!=`, so the unbracketed
    # form compared the AND of both operands with 0 instead of filtering on
    # "not null and not zero".
    profit_loss = df.fundtap_profit_loss
    df = df[profit_loss.notnull() & (profit_loss != 0)]
    df = preprocessing(df, method="multi")
    if "weekspastdue" not in df.columns:
        return 1
    df = df.assign(label=df.weekspastdue.apply(encoding_label))

    # Warm-start values enqueued as the first Optuna trial.
    if hyperparameters != "":
        warm_starting_param = datasets.read_hyper_parameters(hyperparameters)
    else:
        warm_starting_param = {'max_depth': 14,
                               'num_leaves': 3445,
                               'lambda_l1': 0.0024719526504884707,
                               'lambda_l2': 8.956806049714798e-06,
                               'feature_fraction': 0.15930975035202272,
                               'bagging_fraction': 0.24596777131151706,
                               'bagging_freq': 0,
                               'min_child_samples': 49,
                               'min_sum_hessian_in_leaf': 1.5433498652572106}

    random_state = 456
    fixed_param = {
        'objective': 'multiclass',
        'metric': "multi_logloss",
        'num_classes': 5,
        'boosting_type': 'gbdt',
        'random_state': random_state,
        'verbose': -1,
        'feature_pre_filter': False
    }

    if new_customer:
        # New customers have no funding history, so those columns are removed.
        columns_to_drop = ["funded_outstanding", "priorfundtaphistoryfundedsum", "priorfundtaphistorycompletedsum", "priorfundtaphistoryduesum", "priorfundtaphistorypendingsum"]
        df = df.drop(columns=columns_to_drop, errors='ignore')

    # fundtap_train_test_split takes the data-dictionary path and has no
    # `hold_out_columns` parameter; hold-outs are read from the dictionary.
    if dictionary_path is None:
        raise ValueError("dictionary_path is required to select the training columns")
    X_train, X_test, y_train, y_test, train_instance_weight, test_instance_weight = fundtap_train_test_split(df, dictionary_path, target_column="label", random_state=random_state)
    weight = train_instance_weight

    def objective(trial):
        max_depth = trial.suggest_int('max_depth', 2, 14)
        max_num_leaves = (2 ** max_depth) - 1
        dtrain = lgb.Dataset(X_train, label=y_train, weight=weight)
        param = {
            'objective': 'multiclass',
            'metric': "multi_logloss",
            'num_classes': 5,
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'max_depth': max_depth,
            'num_leaves': trial.suggest_int('num_leaves', 2, max_num_leaves),
            # suggest_float replaces the deprecated suggest_loguniform and
            # suggest_uniform, matching train_profit_loss_binary.
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 0, 15),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
            'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-3, 10),
            'seed': random_state
        }

        # Early stopping is configured through callbacks (as in the binary
        # trainer) instead of the `early_stopping_rounds` / `verbose_eval`
        # keyword arguments.
        lgbcv = lgb.cv(
            param,
            dtrain,
            nfold=3,
            stratified=True,
            num_boost_round=10000,
            callbacks=[
                lgb.early_stopping(100, verbose=False),
                lgb.reset_parameter(learning_rate=learning_rate_decay_power_0995),
            ]
        )
        # Penalise unstable folds: last recorded mean plus its standard deviation.
        mean_series = _cv_series(lgbcv, "multi_logloss-mean")
        stdv_series = _cv_series(lgbcv, "multi_logloss-stdv")
        return mean_series[-1] + stdv_series[-1]

    study = optuna.create_study(direction="minimize")  # default TPE sampler
    study.enqueue_trial({**fixed_param, **warm_starting_param})
    n_trials = 10
    study.optimize(objective, n_trials=n_trials, gc_after_trial=True, n_jobs=3)

    best_params = study.best_params

    # Evaluation model: fitted on the training split only.
    dtrain = lgb.Dataset(X_train, label=y_train, weight=weight)
    classifier = lgb.train({**fixed_param, **best_params}, dtrain)
    accuracy = multiclass_accuracy_analysis(classifier, X_test, y_test)

    # Final model: refitted on train and test rows together.
    final_dtrain = lgb.Dataset(pd.concat([X_train, X_test]), label=pd.concat([y_train, y_test]), weight=pd.concat([train_instance_weight, test_instance_weight]))
    final_clf = lgb.train({**fixed_param, **best_params}, final_dtrain)

    model_dir = Path(model_dir)
    model_dir.mkdir(exist_ok=True, parents=True)
    if new_customer:
        file_prefix = "new_customer_overdue"
    else:
        file_prefix = "existing_customer_overdue"
    joblib.dump(final_clf, Path(model_dir, file_prefix + "classifier.joblib"))
    accuracy.to_csv(Path(model_dir, file_prefix + "accuracy.csv"))
    with open(Path(model_dir, file_prefix + "hyperparamter.json"), 'w') as fp:
        json.dump({**best_params}, fp)
    get_shap_values(X_train, X_test, final_clf, model_dir, file_prefix)

def train_fn(args):
    """Train all four models: binary and multiclass, new and existing customers.

    Garbage collection is forced after every training run.
    """
    # NOTE(review): train_profit_loss_binary is defined in this notebook as
    # (df, dictionary_path, hyperparameters, model_dir, new_customer); the
    # calls below pass a data path and omit the dictionary path — confirm the
    # intended arguments before running this entry point.
    for trainer in (train_profit_loss_binary, train_profit_loss_multi):
        for is_new_customer in (True, False):
            trainer(args.data_train, args.hyperparameters, args.model_dir, new_customer=is_new_customer)
            gc.collect()

In [None]:
# Function to clean column names #
def clean_column_names(df):
    """Strip every character except letters, digits and "_" from the column names.

    The frame is renamed in place and the same object is returned.
    """
    cleaned_names = df.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)
    df.columns = cleaned_names
    return df

# Load the raw training data and clean its column names.
# clean_column_names renames in place and returns the same object, so
# `processed_data` and `train_data` refer to one DataFrame.
train_data = pd.read_csv('../data/train.csv')
processed_data = clean_column_names(train_data)

# Save the cleaned DataFrame to a CSV file (without the index column); the
# later cells read their input from this file.
processed_data_path = '../data/processed_data.csv'
processed_data.to_csv(processed_data_path, index=False)

In [None]:
# Show the dtype of every cleaned column.
print(processed_data.dtypes)

In [None]:
# Features selection #
def select_features(data_path, dictionary_path):
    """Load a dataset and keep only the columns flagged for training.

    Args:
        data_path: Path of the dataset CSV.
        dictionary_path: Path of the data-dictionary CSV with the columns
            "columns_cleaned" and "use_for_training".

    Returns:
        DataFrame restricted to the columns whose "use_for_training" flag is
        "Y", in dictionary order.
    """
    data = pd.read_csv(data_path)
    data_dictionary = pd.read_csv(dictionary_path)

    # Only rows whose flag is exactly "Y" contribute a column name.
    flagged = data_dictionary["use_for_training"] == "Y"
    columns_to_use = data_dictionary.loc[flagged, "columns_cleaned"].tolist()

    return data[columns_to_use]


# Inputs: the cleaned dataset written by the column-cleaning cell and the data
# dictionary that flags which columns are used for training.
data_path = "../data/processed_data.csv"  # Adjust to your actual data path
dictionary_path = "../data/fundtap-data-dictionary.csv"  # Adjust to your actual dictionary path

# Load the dataset and keep only the columns flagged for training.
feature_data = select_features(data_path, dictionary_path)


# Preview the first three rows of the selected features.
print(feature_data.head(3))

In [None]:
# Training script: configure the inputs and train the binary profit/loss model.
data_path = "../data/processed_data.csv"
dictionary_path = "../data/fundtap-data-dictionary.csv"
# An empty string makes train_profit_loss_binary use its built-in warm-start
# parameters.
hyperparameters = ""
# NOTE(review): absolute, machine-specific Windows path — this cell only runs
# unchanged on a machine with this directory layout; consider a path relative
# to the repository.
model_dir = Path(r"C:\Users\1\gitrepo\FundTapMLOps\model_output_100")
model_dir.mkdir(exist_ok=True, parents=True)
# True drops the prior-funding-history columns before training.
new_customer = True

# Load the data
df = pd.read_csv(data_path)
# train_profit_loss_binary derives the same label itself, so this assignment
# only matters for code that inspects `df` in the notebook.
df["label"] = df.fundtap_profit_loss >= 0  # Ensure the label is created before passing it

# Run the training function
train_profit_loss_binary(df, dictionary_path, hyperparameters, model_dir, new_customer)


In [None]:
# pandas is already imported in the first cell; this repeated import only
# matters when the cell is run on its own.
import pandas as pd

data_path = "../data/processed_data.csv"  # Adjust to your actual data path

# Load the dataset (this rebinds the module-level `df` used by the training cell)
df = pd.read_csv(data_path)

# Print the column names to verify
print(df.columns)
