In [7]:
# Core libraries
import pandas as pd
import numpy as np
import os
import json
import gc
import argparse
import joblib
from pathlib import Path

# Machine Learning
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss, f1_score, matthews_corrcoef, roc_auc_score, confusion_matrix, brier_score_loss

# Hyperparameter optimization
import optuna
from optuna.integration import lightgbm as opt_lgb

# Visualization and analysis
import matplotlib.pyplot as plt
import shap

# Configuration
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)


In [2]:
#training_helper#

def load_dictionary(dictionary_path):
    data_dictionary = pd.read_csv(dictionary_path)
    training_columns = data_dictionary[data_dictionary["use_for_training"] == "Y"]["columns_cleaned"].tolist()
    hold_out_columns = data_dictionary[data_dictionary["hold_out_columns"] == "Y"]["columns_cleaned"].tolist()
    return training_columns, hold_out_columns

def select_data(data, dictionary_path, target_column):
    # Get the columns to use for training and the columns to hold out
    training_columns, hold_out_columns = load_dictionary(dictionary_path)

    # Ensure only training columns are used, excluding hold-out columns and the target column
    training_columns = [col for col in training_columns if col not in hold_out_columns + [target_column]]
    
    # Filter data to include only training columns plus the target column
    data = data[training_columns+ [target_column]]

    # Define catgorical features #
    cat_features = list(data.select_dtypes(include=['object']).columns)
    data[cat_features] = data[cat_features].astype("category") 
    return data

def balanced_train_validation_test(data, target_column,random_state):
    X = data.loc[:, data.columns != target_column]
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=random_state, stratify=y)
    return X_train, X_test, y_train, y_test


def fundtap_train_test_split(data, dictionary_path, target_column="label",random_state=456):            
    data = select_data(data, dictionary_path, target_column)
    X_train, X_test, y_train, y_test = balanced_train_validation_test(data, target_column, random_state)
    
    train_instance_weight = np.abs(data.loc[X_train.index]["fundtap_profit_loss"])
    test_instance_weight = np.abs(data.loc[X_test.index]["fundtap_profit_loss"])

    return X_train, X_test, y_train, y_test, train_instance_weight, test_instance_weight



In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def load_and_select_data(dictionary_path, data, target_column):
    # Load data dictionary and extract relevant columns for training
    data_dictionary = pd.read_csv(dictionary_path)
    training_columns = data_dictionary[data_dictionary["use_for_training"] == "Y"]["columns_cleaned"].tolist()
    hold_out_columns = data_dictionary[data_dictionary["hold_out_columns"] == "Y"]["columns_cleaned"].tolist()
    
    # Exclude hold-out columns and the target column from the training data
    training_columns = [col for col in training_columns if col not in hold_out_columns + [target_column]]
    
    # Select and type-cast categorical features
    data = data[training_columns + [target_column]]
    cat_features = data.select_dtypes(include=['object']).columns
    data[cat_features] = data[cat_features].astype("category")
    
    return data

def balanced_train_validation_test(data, target_column, random_state=456):
    # Split data into training and test sets
    X = data.drop(columns=[target_column])
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=random_state, stratify=y)
    return X_train, X_test, y_train, y_test

def fundtap_train_test_split(data, dictionary_path, target_column="label"):
    # Process and split the data
    processed_data = load_and_select_data(dictionary_path, data, target_column)
    X_train, X_test, y_train, y_test = balanced_train_validation_test(processed_data, target_column)
    
    # Calculate instance weights based on specific column
    train_instance_weight = np.abs(data.loc[X_train.index, "fundtap_profit_loss"])
    test_instance_weight = np.abs(data.loc[X_test.index, "fundtap_profit_loss"])
    
    return X_train, X_test, y_train, y_test, train_instance_weight, test_instance_weight


In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import matthews_corrcoef, log_loss, brier_score_loss, confusion_matrix

def accuracy_analysis(classifier, X_train, X_test, y_train, y_test, test_instance_weight):
    pred_proba = classifier.predict_proba(X_test)[:, 1]
    yhat = np.where(pred_proba < 0.5, 0, 1)
    
    metrics = {
        "mcc": matthews_corrcoef(y_test, yhat),
        "logloss": log_loss(y_test, pred_proba),
        "brier_score_loss": brier_score_loss(y_test, pred_proba),
        "weighted_mcc": matthews_corrcoef(y_test, yhat, sample_weight=test_instance_weight),
        "weighted_logloss": log_loss(y_test, pred_proba, sample_weight=test_instance_weight),
        "weighted_bs": brier_score_loss(y_test, pred_proba, sample_weight=test_instance_weight)
    }
    
    tn, fp, fn, tp = confusion_matrix(y_test, yhat).ravel()
    train_tn, train_fp, train_fn, train_tp = confusion_matrix(y_train, np.where(classifier.predict_proba(X_train)[:, 1] < 0.5, 0, 1)).ravel()

    metrics.update({
        "tn": tn, "fp": fp, "fn": fn, "tp": tp,
        "fpr": fp / (fp + tn), "fnr": fn / (fn + tp),
        "train_tn": train_tn, "train_fp": train_fp, "train_fn": train_fn, "train_tp": train_tp,
        "train_fpr": train_fp / (train_fp + train_tn), "train_fnr": train_fn / (train_fn + train_tp)
    })
    
    if test_instance_weight.unique().size > 1:
        incorrect_preds = yhat != y_test
        metrics.update({
            "total_loss": test_instance_weight[incorrect_preds].sum(),
            "fp_loss": test_instance_weight[incorrect_preds & (yhat == 1)].sum(),
            "fn_loss": test_instance_weight[incorrect_preds & (yhat == 0)].sum()
        })
    
    return pd.DataFrame([metrics])

def multiclass_accuracy_analysis(classifier, X_test, y_test):
    pred_proba = classifier.predict_proba(X_test)
    yhat = pred_proba.argmax(axis=1)
    return pd.DataFrame(confusion_matrix(y_test, yhat))

def get_feature_importance(classifier):
    feature_importance = pd.DataFrame({
        'Features': classifier.feature_name(),
        'Importances': classifier.feature_importances_
    })
    feature_importance.sort_values(by='Importances', ascending=False, inplace=True)
    return feature_importance


In [12]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
import joblib
from pathlib import Path
import json
import gc
from sklearn.metrics import roc_auc_score
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

def preprocess_data(df, columns_to_drop):
    """Remove specified columns and fill NaN values."""
    df = df.drop(columns=columns_to_drop, errors='ignore').fillna(0)
    return df

def perform_training(df, label_column, objective, metric, boosting_type, hyperparameters, random_state, model_dir, file_prefix):
    """Generalized training function for both binary and multiclass tasks."""
    X_train, X_test, y_train, y_test = train_test_split(df.drop(label_column, axis=1), df[label_column], test_size=0.2, random_state=random_state)
    dtrain = lgb.Dataset(X_train, label=y_train)
    classifier = lgb.train({**hyperparameters, 'objective': objective, 'metric': metric, 'boosting_type': boosting_type}, dtrain)
    accuracy = calculate_accuracy(classifier, X_test, y_test)
    save_model(classifier, model_dir, file_prefix)
    accuracy.to_csv(Path(model_dir, file_prefix + "accuracy.csv"))
    return classifier

def calculate_accuracy(classifier, X_test, y_test):
    """Calculate accuracy and other metrics."""
    pred = classifier.predict(X_test)
    accuracy = roc_auc_score(y_test, pred)
    return pd.DataFrame({'accuracy': [accuracy]})

def save_model(model, model_dir, file_prefix):
    """Save the model to the disk."""
    joblib.dump(model, Path(model_dir, file_prefix + "classifier.joblib"))

def train_profit_loss_binary(df, hyperparameters, model_dir, new_customer):
    """Specific function for binary classification."""
    if new_customer:
        df = preprocess_data(df, ["funded_outstanding", "priorfundtaphistoryfundedsum", "priorfundtaphistorycompletedsum", "priorfundtaphistoryduesum", "priorfundtaphistorypendingsum"])
    classifier = perform_training(df, 'label', 'binary', 'binary_logloss', 'gbdt', hyperparameters, 456, model_dir, "new_customer_profitloss" if new_customer else "existing_customer_profitloss")
    get_shap_values(classifier, df.drop('label', axis=1), model_dir, "SHAP_new_customer" if new_customer else "SHAP_existing_customer")

def main():
    """Main function to run training processes."""
    args = parse_args(sys.argv[1:])
    train_profit_loss_binary(args.data_train, args.hyperparameters, args.model_dir, new_customer=True)
    gc.collect()
    train_profit_loss_binary(args.data_train, args.hyperparameters, args.model_dir, new_customer=False)
    gc.collect()

if __name__ == '__main__':
    main()


NameError: name 'sys' is not defined

In [13]:
import pandas as pd

# Clean column names and save the DataFrame
def clean_and_save(data_path, output_path):
    df = pd.read_csv(data_path)
    df.columns = df.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)
    df.to_csv(output_path, index=False)

# Execute the cleaning and saving process
clean_and_save('../data/train.csv', '../data/processed_data.csv')


FileNotFoundError: [Errno 2] No such file or directory: '../data/train.csv'

In [None]:
print(processed_data.dtypes)

In [14]:
import pandas as pd

def select_features(data_path, dictionary_path):
    data = pd.read_csv(data_path)
    columns_to_use = pd.read_csv(dictionary_path)[pd.read_csv(dictionary_path)["use_for_training"] == "Y"]["columns_cleaned"].tolist()
    return data[columns_to_use]

# 读取和筛选数据
feature_data = select_features("../data/processed_data.csv", "../data/fundtap-data-dictionary.csv")

# 输出前三行以检查结果
print(feature_data.head(3))


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed_data.csv'

In [15]:
import pandas as pd
from pathlib import Path

# Define paths and parameters
data_path = "../data/processed_data.csv"
dictionary_path = "../data/fundtap-data-dictionary.csv"
hyperparameters = ""
model_dir = Path(r"C:\Users\1\gitrepo\FundTapMLOps\model_output_100")
new_customer = True

# Load data and set up label
df = pd.read_csv(data_path)
df["label"] = df.fundtap_profit_loss >= 0

# Prepare model directory and train
model_dir.mkdir(exist_ok=True, parents=True)
train_profit_loss_binary(df, dictionary_path, hyperparameters, model_dir, new_customer)


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed_data.csv'

In [None]:
import pandas as pd

data_path = "../data/processed_data.csv"  # Adjust to your actual data path

# Load the dataset
df = pd.read_csv(data_path)

# Print the column names to verify
print(df.columns)
