In [None]:
import pandas as pd
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from datetime import datetime
import time

# Show every column when a wide DataFrame is displayed instead of truncating it.
pd.set_option('display.max_columns', None)

Replace the placeholder filename in the next cell (`mlbac_data.csv`) with the path to your data file. The data must be provided as a CSV file that has already been preprocessed into the expected format. The dataset consists of the attributes 'USER' and 'TCODE' (transaction permission), along with the respective labels for UPA_I, UPA_T, and UPA_R (the code reads the label columns `Target_UPA_I` and `Target_UPA_T` and derives the UPA_R target from `Target_UPA_T`). In addition, the six user attributes ('COMPANY', 'CLASS', 'KOSTL', 'USTYP', 'DEPARTMENT', 'FUNCTION') and the permission attribute 'SUBCOMPONENT' are provided.

In [None]:
# Load the preprocessed data and shuffle the rows once with a fixed seed so the
# row order is reproducible; the index is rebuilt after shuffling.
# (Fix: the original line ended with an unmatched ')' and raised a SyntaxError.)
mlbac_data = (
    pd.read_csv('mlbac_data.csv', low_memory=False)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Columns passed to CatBoost as categorical features: the user/permission
# identifiers, the six user attributes and the permission attribute.
cat_features_all_attributes = [
    'USER',
    'TCODE',
    'COMPANY',
    'CLASS',
    'KOSTL',
    'USTYP',
    'DEPARTMENT',
    'FUNCTION',
    'SUBCOMPONENT',
]

In [None]:
# UPA_I: every row, labelled with Target_UPA_I.
UPA_I = mlbac_data.drop('Target_UPA_T', axis=1).rename(columns={'Target_UPA_I': 'Target'})

# UPA_T: only the rows whose Target_UPA_T is 0 or 1, labelled with Target_UPA_T.
UPA_T = (
    mlbac_data.loc[mlbac_data["Target_UPA_T"].isin([0, 1])]
    .drop('Target_UPA_I', axis=1)
    .rename(columns={'Target_UPA_T': 'Target'})
)

# UPA_R: every row, labelled with Target_UPA_T where the value 2 is mapped to 0.
# Fix: the replacement is restricted to the Target column. The original
# `UPA_R.replace(2, 0, regex=True)` ran over the whole frame, so any feature
# column containing the value 2 was rewritten as well.
UPA_R = (
    mlbac_data.drop('Target_UPA_I', axis=1)
    .rename(columns={'Target_UPA_T': 'Target'})
    .assign(Target=lambda frame: frame['Target'].replace(2, 0))
)

# Maps each dataset name to (DataFrame, categorical feature columns).
datasets_dict = {
    'UPA_I': (UPA_I, cat_features_all_attributes),
    'UPA_T': (UPA_T, cat_features_all_attributes),
    'UPA_R': (UPA_R, cat_features_all_attributes)
}

## UserSplit

In [None]:
def save_split_data(X_train, X_val, X_test, y_train, y_val, y_test, y_test_pred, y_test_pred_proba, tn, fp, fn, tp, dataset_name, run):
    """Write the train/validation/test rows of one run to a single CSV file.

    Every split gets three extra columns: ``True_Target`` (the label),
    ``Predicted_Target`` (the model prediction, only filled for the test split)
    and ``Split`` ('Train', 'Validation' or 'Test'). The splits are stacked in
    that order and saved as ``{dataset_name}_run_{run+1}_Usersplit.csv`` in the
    current working directory.

    ``y_test_pred_proba``, ``tn``, ``fp``, ``fn`` and ``tp`` are accepted but
    not used by this function.
    """
    def _annotate(features, truth, predicted, split_label):
        # assign() works on a copy, so the caller's feature frame stays untouched.
        return features.assign(
            True_Target=truth,
            Predicted_Target=predicted,
            Split=split_label,
        )

    labelled_splits = [
        _annotate(X_train, y_train, None, 'Train'),      # no predictions for training data
        _annotate(X_val, y_val, None, 'Validation'),     # no predictions for validation data
        _annotate(X_test, y_test, y_test_pred, 'Test'),
    ]

    # Stack the three splits into one frame with a fresh 0..n-1 index.
    combined_data = pd.concat(labelled_splits, axis=0, ignore_index=True)

    file_name = f'{dataset_name}_run_{run+1}_Usersplit.csv'
    combined_data.to_csv(file_name, index=False)
    print(f'Dataset saved as {file_name}')


In [None]:
def run_model(datasets_dict, num_runs, train_size, test_size, use_custom_split=True):
    """Train and evaluate one CatBoost classifier per dataset for several runs.

    For each run (seeded with ``run * 10``) and each entry of ``datasets_dict``
    the data is split into train/validation/test, a ``CatBoostClassifier`` is
    fitted with early stopping on the validation set, and AUC, F1 and the
    confusion matrix are computed on the test set. The labelled splits are
    written with ``save_split_data`` and the per-run metrics are collected in
    ``Results_Usersplit.csv``.

    Parameters
    ----------
    datasets_dict : dict
        Maps a dataset name to ``(DataFrame, cat_features)``. Each DataFrame
        must contain a ``Target`` column and, for the user split, a ``USER`` column.
    num_runs : int
        Number of repetitions.
    train_size : float
        Fraction of the users (user split) or rows (standard split) used for training.
    test_size : float
        Fraction of the remaining, non-training part that becomes the test set;
        the rest is the validation set.
    use_custom_split : bool, default True
        If True, split by user so that all rows of a user end up in the same
        split. If False, split the rows directly. The default was added
        because the notebook's call site omits this argument.

    Returns
    -------
    pandas.DataFrame
        One row per (run, dataset) with the collected metrics; the same content
        that is written to ``Results_Usersplit.csv``.
    """
    results = []

    for run in range(num_runs):
        random_state = run * 10

        # The user partition is drawn once per run, from the first dataset that
        # is processed, and reused for the remaining datasets of that run.
        users_train, users_val, users_test = None, None, None

        for dataset_name, (dataset, cat_features) in datasets_dict.items():
            print(f"\nDataset: {dataset_name} | Run: {run + 1} | Random State: {random_state} | Number of Data Points: {len(dataset)}")

            X = dataset.drop("Target", axis=1)
            y = dataset["Target"]

            if use_custom_split:
                if users_train is None:
                    unique_users = X['USER'].unique()
                    users_train, users_temp = train_test_split(unique_users, test_size=1-train_size, random_state=random_state)
                    users_val, users_test = train_test_split(users_temp, test_size=test_size, random_state=random_state)

                # Compute each membership mask once and apply it to X and y alike.
                train_mask = X['USER'].isin(users_train)
                val_mask = X['USER'].isin(users_val)
                test_mask = X['USER'].isin(users_test)

                X_train, y_train = X[train_mask], y[train_mask]
                X_val, y_val = X[val_mask], y[val_mask]
                X_test, y_test = X[test_mask], y[test_mask]

            else:
                X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=1-train_size, random_state=random_state)
                X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=test_size, random_state=random_state)

            num_ones = int((y_test == 1).sum())
            num_zeros = int((y_test == 0).sum())
            print(f"Number of 1s in the test set: {num_ones}")
            print(f"Number of 0s in the test set: {num_zeros}")

            params = {
                'iterations': 1000,
                'eval_metric': 'AUC',
                'cat_features': cat_features,
                'early_stopping_rounds': 100,
                'verbose': 50,
                'random_seed': random_state
            }

            # The timer covers training, prediction and metric computation.
            start_time = time.time()

            cbc = CatBoostClassifier(**params)
            cbc.fit(X_train, y_train,
                    eval_set=(X_val, y_val),
                    use_best_model=True,
                    plot=False
                   )

            y_test_pred_proba = cbc.predict_proba(X_test)[:, 1]

            auc_score = roc_auc_score(y_test, y_test_pred_proba)
            print(f'AUC Score Test-Data: {auc_score:.4f}')

            y_test_pred = cbc.predict(X_test)

            f1_model = f1_score(y_test, y_test_pred)
            print(f'F1 Score Test-Data (Model): {f1_model:.4f}')

            # labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking below
            # also works when only one class occurs in the test fold.
            conf_matrix = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
            print(f'Confusion Matrix:\n{conf_matrix}')

            tn, fp, fn, tp = conf_matrix.ravel()

            feature_importance = dict(zip(X_train.columns, cbc.get_feature_importance()))

            elapsed_time = time.time() - start_time  # seconds

            results.append({
                'Dataset_Name': dataset_name,
                'Run_Number': run + 1,
                'Random_State': random_state,
                'Number_of_Data_Points': len(dataset),
                'Number_of_1s_in_Test_Set': num_ones,
                'Number_of_0s_in_Test_Set': num_zeros,
                'AUC_Score': auc_score,
                'F1_Score': f1_model,
                'TP': tp,
                'FP': fp,
                'TN': tn,
                'FN': fn,
                'Feature_Importance': feature_importance,
                'Elapsed_Time': elapsed_time
            })

            # Persist the labelled splits of this run for the later score calculation.
            save_split_data(X_train, X_val, X_test, y_train, y_val, y_test, y_test_pred, y_test_pred_proba, tn, fp, fn, tp, dataset_name, run)

    results_df = pd.DataFrame(results)
    results_df.to_csv('Results_Usersplit.csv', index=False)
    return results_df

In [None]:
# Experiment configuration.
num_runs = 10      # number of repetitions; run r is seeded with r * 10
train_size = 0.6   # fraction of users used for training
test_size = 0.5    # fraction of the remaining users used for testing (rest: validation)

# Fix: run_model requires `use_custom_split`; the original call omitted it and
# raised a TypeError. This section ("UserSplit") uses the user-based split.
run_model(datasets_dict, num_runs, train_size, test_size, use_custom_split=True)

## Calculate Scores

In [None]:
def merge_and_delete_datasets(run):
    """Combine the three per-target split files of one run into a single CSV.

    Reads ``UPA_I``, ``UPA_T`` and ``UPA_R`` ``_run_{run+1}_Usersplit.csv``,
    keeps ``USER``, ``TCODE``, ``Split``, ``True_Target`` and
    ``Predicted_Target`` from each, and left-joins UPA_T and UPA_R onto UPA_I
    on (``USER``, ``TCODE``). The value columns carry the target name as a
    suffix (e.g. ``Split_UPA_T``). The result is written to
    ``Combined_dataset_run_{run+1}_Usersplit.csv`` and the three input files
    are then deleted.
    """
    join_keys = ['USER', 'TCODE']
    value_columns = ['Split', 'True_Target', 'Predicted_Target']
    targets = ('UPA_I', 'UPA_T', 'UPA_R')

    source_files = {target: f'{target}_run_{run+1}_Usersplit.csv' for target in targets}

    # Load all three inputs up front, keeping only the columns that are needed.
    loaded = {target: pd.read_csv(path) for target, path in source_files.items()}

    # Suffix the value columns with their target name before joining, so the
    # joins never see overlapping column names.
    suffixed = {
        target: frame[join_keys + value_columns].rename(
            columns={column: f'{column}_{target}' for column in value_columns}
        )
        for target, frame in loaded.items()
    }

    merged_df = suffixed['UPA_I']
    for target in ('UPA_T', 'UPA_R'):
        merged_df = merged_df.merge(suffixed[target], on=join_keys, how='left')

    file_name = f'Combined_dataset_run_{run+1}_Usersplit.csv'
    merged_df.to_csv(file_name, index=False)
    print(f'Combined dataset saved as {file_name}')

    # The per-target files are no longer needed once the combined file exists.
    for target in targets:
        os.remove(source_files[target])

In [None]:
# Build the combined per-run file for every run (run indices are 0-based here).
for run in range(num_runs):
    merge_and_delete_datasets(run)

In [None]:
# Load each combined per-run CSV into a notebook-level variable named after the
# file (without the .csv extension), e.g. Combined_dataset_run_1_Usersplit.
notebook_namespace = globals()
for run in range(1, num_runs + 1):
    frame_name = f"Combined_dataset_run_{run}_Usersplit"
    notebook_namespace[frame_name] = pd.read_csv(f"{frame_name}.csv", low_memory=False)

In [None]:
# For every evaluated target: the columns of the combined per-run dataset that
# hold the split marker, the true label and the predicted label.
# "UPA_R_Assumption" scores the UPA_R columns again after filling missing
# labels and predictions with 0.
datasets_info = [
    {"split_col": "Split_UPA_I", "true_col": "True_Target_UPA_I", "pred_col": "Predicted_Target_UPA_I", "name": "UPA_I"},
    {"split_col": "Split_UPA_R", "true_col": "True_Target_UPA_R", "pred_col": "Predicted_Target_UPA_R", "name": "UPA_R"},
    {"split_col": "Split_UPA_T", "true_col": "True_Target_UPA_T", "pred_col": "Predicted_Target_UPA_T", "name": "UPA_T"},
    {"split_col": "Split_UPA_R", "true_col": "True_Target_UPA_R", "pred_col": "Predicted_Target_UPA_R", "name": "UPA_R_Assumption", "assumption": True}
]

# Must match the number of runs for which combined datasets were loaded.
num_runs = 10
f1_scores_over_runs = {dataset['name']: {"f1_pos": [], "f1_neg": [], "f1_macro": [], "f1_weighted": [], "baseline_f1": []} for dataset in datasets_info}

# Seeded generator so the random baseline is reproducible across executions
# (the original drew from the unseeded global NumPy state).
baseline_rng = np.random.default_rng(42)

for run in range(1, num_runs + 1):
    combined_dataset_name = f"Combined_dataset_run_{run}_Usersplit"
    combined_dataset = globals()[combined_dataset_name]

    print(f"\n===== Ergebnisse für Run {run} =====")

    for dataset in datasets_info:
        true_col = dataset['true_col']
        pred_col = dataset['pred_col']
        scores = f1_scores_over_runs[dataset['name']]

        # Only test rows are scored. The explicit copy lets the fillna below
        # modify `results` without touching (or warning about) combined_dataset.
        is_test_row = combined_dataset[dataset['split_col']] == "Test"
        results = combined_dataset.loc[is_test_row, [true_col, pred_col]].copy()

        if dataset.get("assumption"):
            results[true_col] = results[true_col].fillna(0.0)
            results[pred_col] = results[pred_col].fillna(0.0)

        y_true = results[true_col]
        y_pred = results[pred_col]

        # Baseline: random labels drawn with the class frequencies of the test set.
        class_distribution = y_true.value_counts(normalize=True)
        p_0 = class_distribution.get(0, 0)
        p_1 = class_distribution.get(1, 0)

        baseline_predictions = baseline_rng.choice([0, 1], size=len(results), p=[p_0, p_1])
        baseline_f1 = f1_score(y_true, baseline_predictions)

        print(f"Results for {dataset['name']} in Run {run}:")
        print(f"Baseline F1 Score für {dataset['name']} in Run {run}: {baseline_f1:.4f}")

        f1_pos = f1_score(y_true, y_pred, pos_label=1)
        f1_neg = f1_score(y_true, y_pred, pos_label=0)
        f1_macro = f1_score(y_true, y_pred, average="macro")
        f1_weighted = f1_score(y_true, y_pred, average="weighted")

        cm = confusion_matrix(y_true, y_pred)

        print("Pos F1: " + str(f1_pos))
        print("Neg F1: " + str(f1_neg))
        print("Macro F1: " + str(f1_macro))
        print("Weighted F1: " + str(f1_weighted))
        print("Confusion Matrix:")
        print(cm)
        print("\n")

        # Fix: each metric is recorded exactly once per run. The original
        # appended baseline_f1 twice, so that list had 2 * num_runs entries.
        scores['baseline_f1'].append(baseline_f1)
        scores['f1_pos'].append(f1_pos)
        scores['f1_neg'].append(f1_neg)
        scores['f1_macro'].append(f1_macro)
        scores['f1_weighted'].append(f1_weighted)

print("\n===== Avg and Std per Run =====")
for dataset in datasets_info:
    name = dataset['name']

    avg_baseline_f1 = np.mean(f1_scores_over_runs[name]['baseline_f1'])
    avg_f1_pos = np.mean(f1_scores_over_runs[name]['f1_pos'])
    avg_f1_neg = np.mean(f1_scores_over_runs[name]['f1_neg'])
    avg_f1_macro = np.mean(f1_scores_over_runs[name]['f1_macro'])
    avg_f1_weighted = np.mean(f1_scores_over_runs[name]['f1_weighted'])

    # np.std uses its default settings here (population standard deviation).
    std_baseline_f1 = np.std(f1_scores_over_runs[name]['baseline_f1'])
    std_f1_pos = np.std(f1_scores_over_runs[name]['f1_pos'])
    std_f1_neg = np.std(f1_scores_over_runs[name]['f1_neg'])
    std_f1_macro = np.std(f1_scores_over_runs[name]['f1_macro'])
    std_f1_weighted = np.std(f1_scores_over_runs[name]['f1_weighted'])

    print(f"\nAvg and Std for {name}:")
    print(f"Avg Baseline F1: {avg_baseline_f1:.4f} (Std: {std_baseline_f1:.4f})")
    print(f"Avg Pos F1: {avg_f1_pos:.4f} (Std: {std_f1_pos:.4f})")
    print(f"Avg Neg F1: {avg_f1_neg:.4f} (Std: {std_f1_neg:.4f})")
    print(f"Avg Macro F1: {avg_f1_macro:.4f} (Std: {std_f1_macro:.4f})")
    print(f"Avg Weighted F1: {avg_f1_weighted:.4f} (Std: {std_f1_weighted:.4f})")