In [None]:
"""
This script trains an XGBoost model as a baseline for EC number/product/substrate prediction in low-data regimes.
"""

import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from rdkit import Chem, RDLogger
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from rdkit.Chem import rdChemReactions
import xgboost as xgb
from xgboost.callback import TrainingCallback

# Importing data

In [2]:
def load_and_parse_data(file_path, task_type):
    """
    Load a JSON dataset and turn it into a DataFrame for the given task.

    The JSON file must contain a list of entries, each with the keys
    'raw_input' and 'raw_output'.

    Parameters:
        file_path (str): Path to the JSON file.
        task_type (str): One of 'ec_prediction', 'product_prediction' or
            'substrate_prediction'.

    Returns:
        pd.DataFrame: Columns 'input', 'output', 'raw_input', 'raw_output'.
        For product/substrate prediction an extra 'ec_number' column holds
        the part of 'raw_input' after the last '|', and 'input' holds the
        part before it.

    Raises:
        ValueError: If task_type is not one of the supported tasks.
    """
    supported_tasks = ('ec_prediction', 'product_prediction', 'substrate_prediction')
    if task_type not in supported_tasks:
        # Previously an unknown task silently produced an empty DataFrame.
        raise ValueError(
            f"Unknown task_type {task_type!r}; expected one of {supported_tasks}"
        )
    is_ec_task = task_type == 'ec_prediction'

    # JSON is UTF-8 by specification, so do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    inputs = []
    outputs = []
    raw_inputs = []
    raw_outputs = []
    ec_numbers = []  # Only filled for product/substrate prediction

    for entry in data:
        raw_input = entry['raw_input']
        raw_output = entry['raw_output']

        if is_ec_task:
            # The full "reactant>>product" string is the model input.
            if ">>" not in raw_input:
                continue  # Skip invalid rows
            model_input = raw_input
        else:
            # Input is "<SMILES>|<EC number>".
            if "|" not in raw_input:
                continue  # Skip invalid rows
            # Split on the LAST separator only: a plain split('|') raised
            # "too many values to unpack" whenever the SMILES part itself
            # contained a '|'.
            model_input, ec_number = raw_input.rsplit('|', 1)
            ec_numbers.append(ec_number)

        raw_inputs.append(raw_input)
        inputs.append(model_input)
        outputs.append(raw_output)
        raw_outputs.append(raw_output)

    df = pd.DataFrame({
        'input': inputs,
        'output': outputs,
        'raw_input': raw_inputs,
        'raw_output': raw_outputs
    })

    if not is_ec_task:
        df['ec_number'] = ec_numbers

    return df


### Load all datasets

In [3]:
# Each task has a train and a test JSON file; both splits of a task are parsed
# with the same task_type, train first and test second.

# EC prediction (reaction -> EC number)
ec_train, ec_test = (
    load_and_parse_data(json_path, task_type='ec_prediction')
    for json_path in ("Data_json/ec_train_set.json", "Data_json/ec_test_set.json")
)

# Product prediction (substrate + EC number -> product)
product_train, product_test = (
    load_and_parse_data(json_path, task_type='product_prediction')
    for json_path in ("Data_json/product_train_set.json", "Data_json/product_test_set.json")
)

# Substrate prediction (product + EC number -> substrate)
substrate_train, substrate_test = (
    load_and_parse_data(json_path, task_type='substrate_prediction')
    for json_path in ("Data_json/substrate_train_set.json", "Data_json/substrate_test_set.json")
)

# Preprocessing for the three tasks

### Hierarchical Encoding & Standardization

In [4]:
def extract_ec_levels(df, column_name='raw_output'):
    """
    Split dotted EC strings into four float columns 'L1'..'L4'.

    The columns are written onto `df` itself (the input is modified in place)
    and the same DataFrame object is returned.
    """
    level_columns = ['L1', 'L2', 'L3', 'L4']
    # One column per dot-separated component of the EC string.
    components = df[column_name].str.split('.', expand=True)
    df[level_columns] = components.astype(float)
    return df

def fit_ec_encoders_and_scaler(train_df, test_df):
    """
    Label-encode and standardize the 'L1'..'L4' columns of both DataFrames.

    The per-level LabelEncoders are fit on the unique level rows of train and
    test combined; the StandardScaler is fit on the label-encoded TRAIN rows
    only and then applied to both frames. The inputs are not modified; copies
    are returned.

    Returns:
        tuple: (train_df, test_df, encoders, scaler, ec_lookup_dict) where
        `encoders` maps each level column to its fitted LabelEncoder and
        `ec_lookup_dict` maps each label-encoded (L1, L2, L3, L4) tuple to the
        row of original level values it was derived from.
    """
    level_columns = ['L1', 'L2', 'L3', 'L4']

    def unique_level_rows(first, second):
        # Stack both frames' level columns and keep the first occurrence of each row.
        stacked = pd.concat([first[level_columns], second[level_columns]], axis=0)
        return stacked.drop_duplicates().reset_index(drop=True)

    # Original (pre-encoding) level rows seen in either split.
    combined_ec = unique_level_rows(train_df, test_df)

    # One encoder per hierarchy level, fit once on the combined values.
    encoders = {}
    for level in level_columns:
        encoders[level] = LabelEncoder().fit(combined_ec[level])

    # Work on copies so the caller's frames keep their raw level values.
    train_df, test_df = train_df.copy(), test_df.copy()
    for frame in (train_df, test_df):
        for level in level_columns:
            frame[level] = encoders[level].transform(frame[level])

    # The lookup must be built while the columns still hold integer labels,
    # i.e. before standardization. Rows line up with `combined_ec` because the
    # same stack / de-duplicate procedure is applied to the encoded frames.
    combined_encoded = unique_level_rows(train_df, test_df)
    ec_lookup_dict = {
        tuple(encoded_row): original_row
        for encoded_row, original_row in zip(combined_encoded.values, combined_ec.values)
    }

    scaler = StandardScaler().fit(train_df[level_columns])
    for frame in (train_df, test_df):
        frame[level_columns] = scaler.transform(frame[level_columns])

    return train_df, test_df, encoders, scaler, ec_lookup_dict

### Final Processed Data for Training (for Each Task)

In [5]:
# --- Extract EC levels for each task ---
ec_train = extract_ec_levels(ec_train, column_name='raw_output')
ec_test = extract_ec_levels(ec_test, column_name='raw_output')

product_train = extract_ec_levels(product_train, column_name='ec_number')
product_test = extract_ec_levels(product_test, column_name='ec_number')

substrate_train = extract_ec_levels(substrate_train, column_name='ec_number')
substrate_test = extract_ec_levels(substrate_test, column_name='ec_number')

# --- Apply Unified EC Encoding ---
ec_train, ec_test, ec_encoders, ec_scaler, ec_lookup_dict = fit_ec_encoders_and_scaler(ec_train, ec_test)
product_train, product_test, product_encoders, product_scaler, product_lookup_dict = fit_ec_encoders_and_scaler(product_train, product_test)
substrate_train, substrate_test, substrate_encoders, substrate_scaler, substrate_lookup_dict = fit_ec_encoders_and_scaler(substrate_train, substrate_test)

# Utility functions

In [6]:
def smiles_to_morgan(smiles, radius=2, n_bits=256):
    """
    Turn a SMILES string into a Morgan fingerprint array of length `n_bits`.

    SMILES that RDKit cannot parse (MolFromSmiles returns None) yield an
    all-zero vector instead of raising.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        # MorganGenerator API (rdFingerprintGenerator) with the requested size.
        fp_generator = GetMorganGenerator(radius=radius, fpSize=n_bits)
        bit_vector = fp_generator.GetFingerprint(molecule)
        return np.array(bit_vector)

    # Unparseable SMILES: fall back to a zero vector.
    return np.zeros(n_bits)



def reaction_to_fingerprint(reaction_smiles, bit_vector_size=1024):
    """
    Build a binary reaction fingerprint from a reaction SMILES.

    Parameters:
        reaction_smiles (str): Reaction SMILES string ("reactants>>products").
        bit_vector_size (int): Length of the returned vector (default: 1024).

    Returns:
        np.ndarray: uint8 array of shape (bit_vector_size,). A position is 1
        when the RDKit difference fingerprint has a non-zero entry at that
        index; non-zero entries at indices >= bit_vector_size are ignored.
    """
    rxn = rdChemReactions.ReactionFromSmarts(reaction_smiles, useSmiles=True)

    # Sparse difference fingerprint; only the indices of its non-zero
    # elements are used below, the stored counts are discarded.
    difference_fp = rdChemReactions.CreateDifferenceFingerprintForReaction(rxn)

    # Keep only indices that fit into the requested vector length.
    kept_indices = [
        index for index in difference_fp.GetNonzeroElements()
        if index < bit_vector_size
    ]

    bits = np.zeros(bit_vector_size, dtype=np.uint8)
    bits[np.array(kept_indices, dtype=np.intp)] = 1
    return bits



def prepare_features(df, task):
    """
    Build the (X, y) arrays for XGBoost for one task.

    Parameters:
        df (pd.DataFrame): Must contain 'input' and the level columns
            'L1'..'L4'; product/substrate tasks also need 'output'.
        task (str): 'ec_prediction', 'product_prediction' or
            'substrate_prediction'.

    Returns:
        tuple: (X, y)
            - ec_prediction: X is the stacked reaction fingerprints of
              'input'; y is the 'L1'..'L4' columns (multi-output target).
            - product/substrate prediction: X is the Morgan fingerprint of
              'input' with the four level columns appended as the LAST four
              features; y is the Morgan fingerprint of 'output'.

    Raises:
        ValueError: If `task` is not one of the supported tasks (previously
            this surfaced as an UnboundLocalError at the return statement).
    """
    level_columns = ['L1', 'L2', 'L3', 'L4']
    molecule_tasks = ("product_prediction", "substrate_prediction")

    if task != "ec_prediction" and task not in molecule_tasks:
        raise ValueError(
            f"Unknown task {task!r}; expected 'ec_prediction', "
            "'product_prediction' or 'substrate_prediction'"
        )

    if task == "ec_prediction":
        # Inputs: reaction fingerprint only (reactant >> product).
        X = np.vstack(df['input'].apply(reaction_to_fingerprint).values)
        y = df[level_columns].values  # Multi-output for EC hierarchy
    else:
        # Inputs: molecule fingerprint + EC hierarchical encoding.
        mol_fp = np.vstack(df['input'].apply(smiles_to_morgan).values)
        ec_levels = df[level_columns].to_numpy(dtype=float)
        # Stack column-wise in one step. The former row-wise df.apply with
        # np.concatenate pulled the levels out of a mixed-type row and so
        # produced object-dtype feature vectors.
        X = np.hstack([mol_fp, ec_levels])
        y = np.vstack(df['output'].apply(smiles_to_morgan).values)  # Predicting Morgan fingerprint

    return X, y



def inverse_transform_ec(y_standardized, scaler):
    """
    Undo the standardization of EC level encodings and snap to integers.

    `scaler.inverse_transform` is applied first; the result is rounded to the
    nearest integer and returned as an integer array.
    """
    destandardized = scaler.inverse_transform(y_standardized)
    return np.rint(destandardized).astype(int)



class TQDMProgressBar(TrainingCallback):
    """
    XGBoost training callback that shows a tqdm progress bar.

    The bar advances by one per boosting iteration and displays the latest
    value of the first metric of the evaluation set as its postfix.
    """

    def __init__(self, total_rounds):
        """Create the bar; `total_rounds` is the bar's total (expected iterations)."""
        super().__init__()
        self.progress_bar = tqdm(total=total_rounds, desc="Training Progress", position=0, leave=True)

    def after_iteration(self, model, epoch, evals_log):
        """Advance the bar and show the latest validation metric, if any."""
        if evals_log:
            # Prefer "validation_0" as before, but fall back to the first
            # logged dataset instead of raising KeyError when the
            # evaluation set carries a different name.
            dataset_log = evals_log.get("validation_0")
            if dataset_log is None:
                dataset_log = next(iter(evals_log.values()))
            if dataset_log:
                metric_name = next(iter(dataset_log))  # First metric name
                latest_score = dataset_log[metric_name][-1]
                self.progress_bar.set_postfix({metric_name: latest_score})

        self.progress_bar.update(1)
        return False  # False = do not request early termination

    def after_training(self, model):
        """Close the progress bar and hand the model back unchanged."""
        self.progress_bar.close()
        return model
    


def convert_hierarchical_to_ec(hierarchical_encoding, lookup_dict):
    """
    Round a hierarchical (L1, L2, L3, L4) encoding to integer labels.

    Parameters:
    - hierarchical_encoding: Sequence of (L1, L2, L3, L4) values.
    - lookup_dict: Accepted for interface compatibility; it is NOT consulted,
      no lookup back to an EC string is performed.

    Returns:
    - Tuple with each value rounded to the nearest integer.
    """
    rounded_levels = np.rint(np.asarray(hierarchical_encoding)).astype(int)
    return tuple(rounded_levels)



def compute_ec_accuracy(y_test_ec, y_pred_ec):
    """
    Compute accuracy at each EC hierarchy level (L1, L1+L2, L1+L2+L3, Full L1+L2+L3+L4).

    A level counts as correct only if it and every level above it match, so
    the four accuracies are non-increasing from L1 to the full EC number.

    Parameters:
    - y_test_ec: List of actual EC encodings (each a sequence of 4 values).
    - y_pred_ec: List of predicted EC encodings (each a sequence of 4 values).

    Returns:
    - A dictionary with accuracy at each EC level. For empty input every
      value is NaN (previously this raised ZeroDivisionError).
    """
    # Preview of the first few true / predicted encodings (kept from the
    # original so the notebook output is unchanged).
    print(y_test_ec[:5])
    print(y_pred_ec[:5])

    level_names = ("L1 Accuracy", "L1+L2 Accuracy", "L1+L2+L3 Accuracy", "Full EC Accuracy")
    total = len(y_test_ec)
    if total == 0:
        # Accuracy is undefined without samples.
        return {name: float("nan") for name in level_names}

    correct = [0, 0, 0, 0]
    for true_ec, pred_ec in zip(y_test_ec, y_pred_ec):
        # Walk down the hierarchy and stop at the first mismatch.
        for depth in range(4):
            if true_ec[depth] != pred_ec[depth]:
                break
            correct[depth] += 1

    return {name: hits / total for name, hits in zip(level_names, correct)}



def compute_ec_micro_avg_accuracy(y_test_ec, y_pred_ec):
    """
    Compute class-averaged accuracy at each EC hierarchy level
    (L1, L1+L2, L1+L2+L3, Full L1+L2+L3+L4).

    Samples are grouped by their TRUE L1 value; the four hierarchical
    accuracies are computed inside each group and then averaged with equal
    weight over all L1 groups that occur in `y_test_ec`.

    NOTE: despite the function name, giving every class equal weight is what
    is usually called a macro-average; the name is kept for compatibility.

    Parameters:
    - y_test_ec: List of actual EC encodings (each a sequence of 4 values).
    - y_pred_ec: List of predicted EC encodings (each a sequence of 4 values).

    Returns:
    - A dictionary with the class-averaged accuracy at each EC level. For
      empty input every value is NaN (previously this raised
      ZeroDivisionError).
    """
    from collections import defaultdict

    level_names = ("L1 Accuracy", "L1+L2 Accuracy", "L1+L2+L3 Accuracy", "Full EC Accuracy")

    # Per true-L1 class: number of samples and prefix-match counts per level.
    class_totals = defaultdict(int)
    class_hits = defaultdict(lambda: [0, 0, 0, 0])

    for true_ec, pred_ec in zip(y_test_ec, y_pred_ec):
        main_class = true_ec[0]
        class_totals[main_class] += 1
        hits = class_hits[main_class]
        # A level only counts if all levels above it matched as well.
        for depth in range(4):
            if true_ec[depth] != pred_ec[depth]:
                break
            hits[depth] += 1

    encountered_classes = len(class_totals)
    if encountered_classes == 0:
        # No samples, hence no classes to average over.
        return {name: float("nan") for name in level_names}

    accuracies = {}
    for depth, name in enumerate(level_names):
        per_class = (
            class_hits[main_class][depth] / class_totals[main_class]
            for main_class in class_totals
        )
        accuracies[name] = sum(per_class) / encountered_classes

    return accuracies

# XGBoost functions

In [None]:
def train_xgboost(X, y, scaler, train_size=2000, task = 'ec_encoding', num_boost_round=100, lookup_dict=None, scale_ec_factor = None):
    """
    Train an XGBoost model and evaluate it on an internal validation split.

    The data is split 90/10 with a fixed random seed; the 10% split is used
    both for early stopping and for the metrics reported here. Only the first
    `train_size` rows of the 90% split are used for fitting.

    Parameters:
    - X: Feature matrix. For product/substrate tasks the last four columns
      must be the EC level encoding.
    - y: Targets (4 EC level columns for EC prediction, fingerprint bits
      otherwise).
    - scaler: Object with `inverse_transform`, used to de-standardize EC
      encodings (EC prediction only).
    - train_size: Maximum number of training rows to use.
    - task: "ec_prediction" trains a regressor on the EC encoding; ANY other
      value (including the default 'ec_encoding') trains the per-bit
      fingerprint classifier.
    - num_boost_round: Maximum boosting rounds. Passed to the model as
      `n_estimators` and used as the progress bar total.
    - lookup_dict: Forwarded to convert_hierarchical_to_ec (EC prediction
      only). When None, the returned EC lists are empty.
    - scale_ec_factor: Product/substrate tasks only. If given, the last four
      feature columns are multiplied by it; if None, those columns are
      dropped.

    Returns:
    - EC prediction: (model, evals_result, y_test_hierarchical,
      y_pred_hierarchical, y_test_ec, y_pred_ec)
    - Otherwise: (model, evals_result, y_test, y_pred_binary)
    """

    # Fixed seed keeps the validation split identical across calls.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)
    X_train, y_train = X_train[:train_size], y_train[:train_size]  # Allow smaller training set

    # EC Prediction (Reaction -> EC Encoding)
    if task == "ec_prediction":
        model = xgb.XGBRegressor(
            objective="reg:squarederror",
            eval_metric="rmse",
            # Previously num_boost_round only sized the progress bar and the
            # model silently used the library default number of rounds.
            n_estimators=num_boost_round,
            early_stopping_rounds=10,
            callbacks=[TQDMProgressBar(num_boost_round)]
        )

        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],  # Validation split drives early stopping
            verbose=False
        )

        # Predictions are in standardized space; undo the scaling.
        y_pred = model.predict(X_test)
        y_test_hierarchical = scaler.inverse_transform(y_test)
        y_pred_hierarchical = scaler.inverse_transform(y_pred)

        if lookup_dict is not None:
            y_test_ec = [convert_hierarchical_to_ec(tuple(vec), lookup_dict) for vec in y_test_hierarchical]
            y_pred_ec = [convert_hierarchical_to_ec(tuple(vec), lookup_dict) for vec in y_pred_hierarchical]
        else:
            y_test_ec = []
            y_pred_ec = []

        return model, model.evals_result(), y_test_hierarchical, y_pred_hierarchical, y_test_ec, y_pred_ec

    # Product/Substrate Prediction (Molecule FP + EC Encoding -> Molecule FP)
    else:
        if scale_ec_factor is not None:
            # Scale on private copies so no array shared with the caller can
            # be modified by the in-place multiplication.
            X_train = X_train.copy()
            X_test = X_test.copy()
            X_train[:, -4:] *= scale_ec_factor
            X_test[:, -4:] *= scale_ec_factor
        else:
            X_train = X_train[:, :-4]  # Remove EC features
            X_test = X_test[:, :-4]

        # Each fingerprint bit is treated as its own binary target.
        model = xgb.XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            n_estimators=num_boost_round,
            early_stopping_rounds=10,
            callbacks=[TQDMProgressBar(num_boost_round)]
        )

        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            verbose=False,
        )

        y_pred_prob = model.predict_proba(X_test)
        y_pred_binary = (y_pred_prob > 0.5).astype(int)  # Threshold at 0.5

        # A sample is correct only if every fingerprint bit matches.
        fingerprint_accuracy = np.mean(np.all(y_pred_binary == y_test, axis=1))
        print(f"{task} - Fingerprint Accuracy: {fingerprint_accuracy:.4f}")

        return model, model.evals_result(), y_test, y_pred_binary
    



def test_xgboost(model, X_test, y_test, scaler, lookup_dict=None, task="ec_prediction", scale_ec_factor = None):
    """
    Validate a trained XGBoost model on a held-out test set.
    
    Parameters:
    - model: Trained XGBoost model.
    - X_test: Features of held-out test set.
    - y_test: True labels (Hierarchical EC encodings or Morgan Fingerprints).
    - scaler: StandardScaler used during training (only for EC prediction).
    - lookup_dict: Dictionary to map hierarchical encodings back to EC numbers (only for EC prediction).
    - task: Task type ('ec_prediction' or 'product_prediction'/'substrate_prediction').
    """

    # 1EC PREDICTION (Reaction → EC Encoding)
    if task == "ec_prediction":
        # Predict standardized hierarchical encodings
        y_pred = model.predict(X_test)

        # Convert back to original hierarchical encodings
        y_test_hierarchical = scaler.inverse_transform(y_test)
        y_pred_hierarchical = scaler.inverse_transform(y_pred)

        # Convert hierarchical encodings back to EC numbers
        y_test_ec = [convert_hierarchical_to_ec(tuple(vec), lookup_dict) for vec in y_test_hierarchical]
        y_pred_ec = [convert_hierarchical_to_ec(tuple(vec), lookup_dict) for vec in y_pred_hierarchical]

        # Compute accuracy at each EC level
        ec_accuracy = compute_ec_accuracy(y_test_ec, y_pred_ec)
        ec_micro_accuracy = compute_ec_micro_avg_accuracy(y_test_ec, y_pred_ec)
        print()
        print('########### test micro-average', ec_micro_accuracy)


        # Print results
        print("\nXGBoost EC Prediction Accuracy on Held-Out Test Set:")
        for level, acc in ec_accuracy.items():
            print(f"{level}: {acc:.4f}")

        return ec_accuracy

    # PRODUCT/SUBSTRATE PREDICTION (Molecule FP + EC Encoding → Molecule FP)
    else:
        if scale_ec_factor is not None:
            X_test[:, -4:] *= scale_ec_factor
        else:
            X_test = X_test[:, :-4]  # Remove EC encoding from input feature
        # Predict fingerprint bit vectors
        y_pred_prob = model.predict_proba(X_test)  # Get probability outputs per bit
        y_pred_binary = (y_pred_prob > 0.5).astype(int)  # Convert to binary (threshold at 0.5)

        # Compute exact fingerprint match accuracy
        fingerprint_accuracy = np.mean(np.all(y_pred_binary == y_test, axis=1))

        print(f"\nXGBoost {task} Accuracy on Held-Out Test Set:")
        print(f"Exact Fingerprint Match Accuracy: {fingerprint_accuracy:.4f}")

        return fingerprint_accuracy


# Prepare data for all three tasks

In [15]:
if __name__ == "__main__":

    # Silence RDKit parser warnings while fingerprints are computed.
    RDLogger.DisableLog('rdApp.*')

    # --- Features / targets for the training sets of all three tasks ---
    print("Preparing EC Prediction Data...")
    X_ec, y_ec = prepare_features(ec_train, task="ec_prediction")
    print("Preparing Product Prediction Data...")
    X_product, y_product = prepare_features(product_train, task="product_prediction")
    print("Preparing Substrate Prediction Data...")
    X_substrate, y_substrate = prepare_features(substrate_train, task="substrate_prediction")

    # --- Same for the held-out test sets ---
    print("Preparing EC Prediction Data...")
    X_ec_test, y_ec_test = prepare_features(ec_test, task="ec_prediction")
    print("Preparing Product Prediction Data...")
    X_product_test, y_product_test = prepare_features(product_test, task="product_prediction")
    print("Preparing Substrate Prediction Data...")
    X_substrate_test, y_substrate_test = prepare_features(substrate_test, task="substrate_prediction")

    # EC prediction training (1024-bit reaction fingerprint).
    # The same train / validate / test sequence is repeated for shrinking
    # training-set sizes to probe the low-data regime; after the loop the
    # variables hold the results of the last (smallest) run.
    print("\nTraining EC Prediction Model...")
    for train_size in (2000, 600, 200):
        model, evals_result, true, pred, c_true, c_pred = train_xgboost(
            X=X_ec, y=y_ec, scaler=ec_scaler, train_size=train_size,
            task='ec_prediction', lookup_dict=ec_lookup_dict,
        )
        # Accuracy on the internal validation split.
        ec_accuracy = compute_ec_accuracy(c_true, c_pred)
        for level, acc in ec_accuracy.items():
            print(f"{level}: {acc:.4f}")
        # Accuracy on the held-out test set.
        test_acc = test_xgboost(
            model, X_test=X_ec_test, y_test=y_ec_test, scaler=ec_scaler,
            lookup_dict=ec_lookup_dict, task='ec_prediction',
        )

    # Product/substrate prediction with 256-bit Morgan fingerprint
    print("\nTraining Product Prediction Model...")
    product_model, product_evals_result, product_true, product_pred = train_xgboost(
        X=X_product, y=y_product, scaler=product_scaler, train_size=2000,
        task="product_prediction", lookup_dict=product_lookup_dict, scale_ec_factor=1,
    )
    print("\nTraining Substrate Prediction Model...")
    substrate_model, substrate_evals_result, substrate_true, substrate_pred = train_xgboost(
        X=X_substrate, y=y_substrate, scaler=substrate_scaler, train_size=2000,
        task="substrate_prediction", lookup_dict=substrate_lookup_dict, scale_ec_factor=1,
    )

    # Held-out evaluation; scale_ec_factor must match the value used in training.
    test_acc = test_xgboost(
        product_model, X_test=X_product_test, y_test=y_product_test, scaler=product_scaler,
        lookup_dict=product_lookup_dict, task='product_prediction', scale_ec_factor=1,
    )
    test_acc2 = test_xgboost(
        substrate_model, X_test=X_substrate_test, y_test=y_substrate_test, scaler=substrate_scaler,
        lookup_dict=substrate_lookup_dict, task='substrate_prediction', scale_ec_factor=1,
    )
        

Preparing EC Prediction Data...
Preparing Product Prediction Data...
Preparing Substrate Prediction Data...
Preparing EC Prediction Data...
Preparing Product Prediction Data...
Preparing Substrate Prediction Data...

Training EC Prediction Model...


Training Progress:  51%|█████     | 51/100 [00:00<00:00, 83.61it/s, rmse=0.67] 


[(2, 0, 2, 24), (1, 0, 0, 139), (0, 2, 0, 71), (0, 12, 10, 170), (1, 4, 0, 81)]
[(2, 4, 2, 25), (1, 0, 0, 151), (0, 12, 9, 12), (0, 5, 3, 134), (1, 4, 1, 78)]
L1 Accuracy: 0.8090
L1+L2 Accuracy: 0.3467
L1+L2+L3 Accuracy: 0.2161
Full EC Accuracy: 0.0000
[(0, 1, 0, 80), (2, 0, 1, 19), (1, 0, 0, 217), (0, 0, 0, 0), (3, 0, 0, 75)]
[(0, 2, 1, 117), (2, 0, 1, 21), (1, -1, 0, 176), (0, 1, 1, 93), (2, 1, 1, 14)]

########### test micro-average {'L1 Accuracy': 0.5397754713322064, 'L1+L2 Accuracy': 0.23710122922301835, 'L1+L2+L3 Accuracy': 0.1592114018355814, 'Full EC Accuracy': 0.0}

XGBoost EC Prediction Accuracy on Held-Out Test Set:
L1 Accuracy: 0.7485
L1+L2 Accuracy: 0.2772
L1+L2+L3 Accuracy: 0.1719
Full EC Accuracy: 0.0000


Training Progress:  16%|█▌        | 16/100 [00:00<00:00, 101.73it/s, rmse=0.783]


[(2, 0, 2, 24), (1, 0, 0, 139), (0, 2, 0, 71), (0, 12, 10, 170), (1, 4, 0, 81)]
[(2, 3, 3, 42), (1, 1, 1, 104), (1, 10, 14, 33), (0, 2, 2, 87), (2, 3, 1, 71)]
L1 Accuracy: 0.6784
L1+L2 Accuracy: 0.0603
L1+L2+L3 Accuracy: 0.0101
Full EC Accuracy: 0.0000
[(0, 1, 0, 80), (2, 0, 1, 19), (1, 0, 0, 217), (0, 0, 0, 0), (3, 0, 0, 75)]
[(1, 3, 1, 82), (3, 2, 0, 44), (1, 1, 0, 104), (0, 2, 2, 80), (2, 2, 1, 26)]

########### test micro-average {'L1 Accuracy': 0.40851211895022504, 'L1+L2 Accuracy': 0.05987364211035982, 'L1+L2+L3 Accuracy': 0.017093603475717294, 'Full EC Accuracy': 0.0}

XGBoost EC Prediction Accuracy on Held-Out Test Set:
L1 Accuracy: 0.6807
L1+L2 Accuracy: 0.1076
L1+L2+L3 Accuracy: 0.0339
Full EC Accuracy: 0.0000


Training Progress:  14%|█▍        | 14/100 [00:00<00:00, 113.23it/s, rmse=0.867]


[(2, 0, 2, 24), (1, 0, 0, 139), (0, 2, 0, 71), (0, 12, 10, 170), (1, 4, 0, 81)]
[(0, 4, 7, 18), (2, 2, 1, 180), (1, 10, 12, 48), (0, 3, 5, 77), (1, 3, 1, 70)]
L1 Accuracy: 0.5678
L1+L2 Accuracy: 0.0503
L1+L2+L3 Accuracy: 0.0050
Full EC Accuracy: 0.0000
[(0, 1, 0, 80), (2, 0, 1, 19), (1, 0, 0, 217), (0, 0, 0, 0), (3, 0, 0, 75)]
[(0, 1, 1, 79), (1, 3, 1, 55), (1, 2, 1, 180), (0, 3, 2, 27), (2, 3, 2, 22)]

########### test micro-average {'L1 Accuracy': 0.32700450970917977, 'L1+L2 Accuracy': 0.04482406331999828, 'L1+L2+L3 Accuracy': 0.0015786983266658065, 'Full EC Accuracy': 0.0}

XGBoost EC Prediction Accuracy on Held-Out Test Set:
L1 Accuracy: 0.6082
L1+L2 Accuracy: 0.0807
L1+L2+L3 Accuracy: 0.0023
Full EC Accuracy: 0.0000

Training Product Prediction Model...


Training Progress:  37%|███▋      | 37/100 [00:15<00:25,  2.45it/s, logloss=0.146]


product_prediction - Fingerprint Accuracy: 0.0914

Training Substrate Prediction Model...


Training Progress:  35%|███▌      | 35/100 [00:14<00:26,  2.48it/s, logloss=0.161]

substrate_prediction - Fingerprint Accuracy: 0.0704

XGBoost product_prediction Accuracy on Held-Out Test Set:
Exact Fingerprint Match Accuracy: 0.0513

XGBoost substrate_prediction Accuracy on Held-Out Test Set:
Exact Fingerprint Match Accuracy: 0.0358



