In [1]:
import pandas as pd
import numpy as np
import os
import json
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier

In [2]:
def load_data(file_name, dataset):
    """
    Read a feature table from Parquet, expand its lineage labels, and split it on the 'Train' flag.

    Every value of the "Target" column is passed through `split_and_replace` together with the
    notebook-level `mapping`; the returned segments are concatenated back into a single string.
    Rows whose expanded target still contains '[' or '*' are discarded.

    Parameters:
    - file_name (str): Name of the file to load (without '.parquet' extension and path).
    - dataset (str): Name of the sub-directory of '../../../data/features/' that holds the file.

    Returns:
    - X_train (DataFrame): Features of the rows where 'Train' == 0.
    - y_train (Series): 'Target' values of the rows where 'Train' == 0.
    - X_val (DataFrame): Features of the rows where 'Train' == 1.
    - y_val (Series): 'Target' values of the rows where 'Train' == 1.
    """
    frame = pd.read_parquet(f'../../../data/features/{dataset}/{file_name}.parquet')

    # Expand each label and glue the resulting segments back together
    frame["Target"] = [
        "".join(split_and_replace(lineage, mapping)) for lineage in frame["Target"]
    ]

    # Keep only rows whose expanded label has no '[' or '*' left in it
    unresolved = frame["Target"].astype(str).str.contains(r"\[|\*")
    frame = frame[~unresolved]

    train_rows = frame[frame['Train'] == 0]
    val_rows = frame[frame['Train'] == 1]

    return (
        train_rows.drop(columns=["Target", "Train"]),
        train_rows['Target'],
        val_rows.drop(columns=["Target", "Train"]),
        val_rows['Target'],
    )

def get_root_children(predictions_df):
    """Return the column names of `predictions_df` that contain no '.', in column order."""
    roots = []
    for name in predictions_df.columns:
        if '.' in name:
            continue  # a dotted name sits below some other node
        roots.append(name)
    return roots

def get_children(node, predictions_df):
    """
    Return the columns that are direct children of `node`, in column order.

    A direct child starts with `node + '.'` and has exactly one more '.' than `node`.
    When `node` is falsy the root-level columns are returned instead.
    """
    if not node:
        return get_root_children(predictions_df)

    child_prefix = node + '.'
    child_dots = node.count('.') + 1

    direct_children = []
    for name in predictions_df.columns:
        if name.startswith(child_prefix) and name.count('.') == child_dots:
            direct_children.append(name)
    return direct_children


def navigate_tree(predictions_df, threshold=0.5):
    """
    Walk the label hierarchy top-down for every row and report the deepest accepted node.

    Column names encode the hierarchy in dot notation ('A', 'A.1', 'A.1.2', ...). Starting from
    the columns without a '.', a node is accepted when its value is strictly greater than
    `threshold`; only the direct children of accepted nodes are examined at the next level.
    The prediction for a row is the last node accepted at the deepest level reached.

    Parameters:
    - predictions_df (DataFrame): One column per node, one row per sample.
    - threshold (float): Value a node must exceed to be accepted. Defaults to 0.5.

    Returns:
    - DataFrame: Same index as `predictions_df`, with a single 'prediction' column. Rows where
      no root-level node exceeds the threshold get None instead of raising IndexError.
    """
    columns = list(predictions_df.columns)
    position = {name: i for i, name in enumerate(columns)}

    # The tree depends only on the column names, so build it once instead of once per row.
    roots = [name for name in columns if '.' not in name]
    children = {name: [] for name in columns}
    for name in columns:
        if '.' in name:
            parent = name.rsplit('.', 1)[0]
            if parent in children:
                children[parent].append(name)

    predictions = []
    for scores in predictions_df.to_numpy():
        frontier = roots
        deepest = None  # stays None when nothing is accepted at the root level

        while frontier:
            next_frontier = []
            for node in frontier:
                if scores[position[node]] > threshold:
                    deepest = node
                    next_frontier.extend(children[node])
            # An empty next_frontier (nothing accepted, or only leaves accepted) ends the walk
            frontier = next_frontier

        predictions.append(deepest)

    return pd.DataFrame({'prediction': predictions}, index=predictions_df.index)

def get_class_path(col):
    """
    Build the chain of ancestors leading to a class, ending with the class itself.

    Parameters:
    - col (str): Class name in hierarchical dot notation, e.g. 'A.1.2'.

    Returns:
    - List[str]: Every dot-separated prefix of `col`, shortest first
      (e.g. ['A', 'A.1', 'A.1.2']).
    """
    parts = col.split('.')
    # Prefix of length 1, then 2, ... up to the full name
    return ['.'.join(parts[:depth]) for depth in range(1, len(parts) + 1)]

def compute_class_path_products(predictions_df):
    """
    For every column, multiply the values of all columns on its class path, row by row.

    The class path of a column comes from `get_class_path`; every name on that path must be
    a column of `predictions_df`.

    Returns:
    - DataFrame: Same columns as `predictions_df`, each holding the row-wise path product.
    """
    # Collect all product columns first so the frame is assembled in a single step
    path_products = {
        label: predictions_df[get_class_path(label)].prod(axis=1)
        for label in predictions_df.columns
    }
    return pd.DataFrame(path_products)

def select_deepest_classification(product_df, threshold=0.5):
    """
    Pick, for every row, the deepest class whose path product reaches the threshold.

    Depth is the number of '.' separators in the column name. Name length is used only to
    order classes of equal depth, so a shallow class with a long name (e.g. 'A.1000') no
    longer outranks a deeper one (e.g. 'A.1.1').

    Parameters:
    - product_df (DataFrame): One column per class, holding path products.
    - threshold (float): Minimum product for a class to be eligible. Defaults to 0.5.

    Returns:
    - Series (dtype object): Indexed like `product_df`; the selected column name per row,
      or NaN when no class is eligible. A class is eligible when its product is non-zero
      and >= `threshold`.
    """
    # Deepest first; equal-depth classes keep the name-length ordering, then column order
    ordered = sorted(
        product_df.columns,
        key=lambda name: (name.count('.'), len(name)),
        reverse=True,
    )

    chosen = np.full(len(product_df), np.nan, dtype=object)

    if ordered and len(product_df):
        scores = product_df[ordered].to_numpy(dtype=float)
        eligible = (scores >= threshold) & (scores != 0)

        # argmax on a boolean row gives the first True, i.e. the deepest eligible class;
        # rows with no True at all are masked out through `has_hit`
        has_hit = eligible.any(axis=1)
        first_hit = eligible.argmax(axis=1)
        labels = np.array(ordered, dtype=object)
        chosen[has_hit] = labels[first_hit[has_hit]]

    return pd.Series(chosen, index=product_df.index, dtype="object")

def apply_multiplicative_rule(predictions_df, threshold=0.5):
    """
    Turn per-class values into one class per row via path products.

    The values along each class path are multiplied (`compute_class_path_products`) and the
    result is handed to `select_deepest_classification` with the given `threshold`.
    """
    return select_deepest_classification(
        compute_class_path_products(predictions_df),
        threshold,
    )

In [3]:
def split_string(input_string):
    """
    Tokenise a string into alphanumeric runs and single-character delimiters.

    Whitespace is dropped without ending the current run (so 'A B' yields ['AB']); any other
    non-alphanumeric character ends the run and is emitted as its own token.
    """
    tokens = []
    run = []  # characters of the alphanumeric run being collected

    for ch in input_string:
        if ch.isalnum():
            run.append(ch)
            continue
        if ch.isspace():
            continue

        # A delimiter: flush the pending run, then emit the delimiter itself
        if run:
            tokens.append("".join(run))
            run = []
        tokens.append(ch)

    if run:
        tokens.append("".join(run))

    return tokens

def split_and_replace(input_string, mapping):
    """
    Tokenise `input_string` and recursively substitute every token found in `mapping`.

    A token that is a key of `mapping` is replaced by the tokens of its mapped value
    (obtained with `split_string`), and those tokens are expanded in turn. Tokens that are
    not keys are kept as they are.

    Parameters:
    - input_string (str): Text to tokenise and expand.
    - mapping (dict): Token -> replacement text.

    Returns:
    - list[str]: The fully expanded token sequence, in order.

    Raises:
    - ValueError: If expanding a token leads back to that same token (a self-referential or
      cyclic mapping). Without this check the expansion would never terminate.
    """
    def _expand(segment, active):
        # `active` holds the keys currently being expanded on this branch
        if segment not in mapping:
            return [segment]
        if segment in active:
            raise ValueError(f"Cyclic alias detected while expanding {segment!r}")

        expanded = []
        # NOTE(review): a key mapped to an empty string expands to nothing, i.e. the token
        # is dropped from the output — confirm this is intended for such entries.
        for part in split_string(mapping[segment]):
            expanded.extend(_expand(part, active | {segment}))
        return expanded

    segments = []
    for segment in split_string(input_string):
        segments.extend(_expand(segment, frozenset()))
    return segments

In [4]:
# Load the alias table used by split_and_replace to expand lineage labels
with open('alias_key.json', 'r') as f:
    mapping = json.load(f)

# List-valued entries are flattened to the text "[a, b, ...]"; load_data later drops any
# target that still contains '[' after expansion.
mapping = {
    key: ('[' + ', '.join(value) + ']') if isinstance(value, list) else value
    for key, value in mapping.items()
}

In [None]:
# Stems (file name without directory and extension) of every Parquet file in the SARS feature
# directory. Within this notebook the list is only referenced by the commented-out print below.
file_names = [os.path.splitext(os.path.basename(file))[0] for file in glob.glob("../../../data/features/SARS/*.parquet")]
# Dataset directory and feature file actually used for the experiments below
dataset = "SARS"
file_name = "FCGR_remove_256"
# print(file_names)

# Requires `mapping` and `split_and_replace` to be defined already: load_data reads both
X_train, y_train, X_val, y_val = load_data(file_name, dataset)

In [6]:
def expand_labels(labels):
    """
    Replace each dot-notation label by the list of its ancestors plus itself.

    For example 'A.1.2' becomes ['A', 'A.1', 'A.1.2']. The output has one list per input
    label, in input order.
    """
    expanded = []
    for label in labels:
        pieces = label.split('.')
        lineage = ['.'.join(pieces[:end]) for end in range(1, len(pieces) + 1)]
        expanded.append(lineage)
    return expanded

# One list of hierarchy nodes (every ancestor plus the label itself) per sample
y_train_expanded = expand_labels(y_train)
y_val_expanded = expand_labels(y_val)

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

# Binarise the expanded labels: the set of classes is learned from the training labels only,
# and the validation labels are encoded with that same fitted binarizer.
mlb = MultiLabelBinarizer()
y_train_mlb = mlb.fit_transform(y_train_expanded)
y_val_mlb = mlb.transform(y_val_expanded)

### Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, hamming_loss

# Initialize the Random Forest classifier (fixed seed for repeatable runs)
rf = RandomForestClassifier(random_state=42)

# Fit a single forest on the binarised multi-label targets
rf.fit(X_train, y_train_mlb)

# Predict class probabilities on the validation split (X_val)
y_prob_mlb = rf.predict_proba(X_val)

# Take index 1 of the last axis for every label, then transpose so that rows line up with
# X_val and columns with mlb.classes_.
# NOTE(review): assumes predict_proba yields one (n_samples, 2) array per label, i.e. every
# label has both classes in the training targets — TODO confirm.
confidences = np.array(y_prob_mlb)[:, :, 1].T

# One column per hierarchy node, named after the binarizer's classes
confidences_df = pd.DataFrame(confidences, columns=mlb.classes_)

In [24]:
# Decode the per-node confidences into one label per sample by walking the hierarchy top-down
y_pred_rf_BSLL = navigate_tree(confidences_df)
# Evaluate the overall performance
accuracy = accuracy_score(y_val, y_pred_rf_BSLL)
f1 = f1_score(y_val, y_pred_rf_BSLL, average='weighted')  # Consider weighted if class imbalance is present

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

# to_csv does not create missing directories, so make sure the output folder exists first
os.makedirs('predictions', exist_ok=True)
y_pred_rf_BSLL.to_csv('predictions/multi-label_random_forest_BSLL.csv')

Accuracy: 0.975107006082451
F1 Score: 0.975827162992359


In [47]:
# Candidate thresholds 0.30, 0.32, ..., 0.68 (np.arange stops before 0.7)
thresholds = np.arange(0.3, 0.7, 0.02)
best_accuracy = 0
best_threshold = 0

print("Thresholds and their corresponding accuracies:")

# NOTE(review): the threshold is selected and scored on the same validation labels (y_val),
# so the best accuracy printed below is an optimistic estimate.
for threshold in thresholds:
    y_pred_mult = apply_multiplicative_rule(confidences_df, threshold=threshold)
    accuracy = accuracy_score(y_val, y_pred_mult)
    print(f"Threshold: {threshold:.2f}, Accuracy: {accuracy:.4f}")

    # Strict '>' keeps the first (lowest) threshold when several reach the same accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

print(f"\nBest Threshold: {best_threshold:.2f} with Accuracy: {best_accuracy:.4f}")
# Recompute the predictions at the selected threshold and save them
y_pred_rf_mult = apply_multiplicative_rule(confidences_df, threshold=best_threshold)
y_pred_rf_mult.to_csv('predictions/multi-label_random_forest_mult.csv')

Thresholds and their corresponding accuracies:
Threshold: 0.30, Accuracy: 0.9674
Threshold: 0.32, Accuracy: 0.9683
Threshold: 0.34, Accuracy: 0.9696
Threshold: 0.36, Accuracy: 0.9705
Threshold: 0.38, Accuracy: 0.9716
Threshold: 0.40, Accuracy: 0.9715
Threshold: 0.42, Accuracy: 0.9721
Threshold: 0.44, Accuracy: 0.9727
Threshold: 0.46, Accuracy: 0.9729
Threshold: 0.48, Accuracy: 0.9725
Threshold: 0.50, Accuracy: 0.9718
Threshold: 0.52, Accuracy: 0.9709
Threshold: 0.54, Accuracy: 0.9691
Threshold: 0.56, Accuracy: 0.9674
Threshold: 0.58, Accuracy: 0.9655
Threshold: 0.60, Accuracy: 0.9628
Threshold: 0.62, Accuracy: 0.9590
Threshold: 0.64, Accuracy: 0.9555
Threshold: 0.66, Accuracy: 0.9503
Threshold: 0.68, Accuracy: 0.9454

Best Threshold: 0.46 with Accuracy: 0.9729


### Binary Relevance

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, hamming_loss

# Initialize the RandomForest classifier used as the per-label base estimator
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Wrap the classifier with MultiOutputClassifier, which implements the binary relevance method
multi_target_classifier = MultiOutputClassifier(classifier, n_jobs=-1)

# Fit the model on the multi-label transformed training data
multi_target_classifier.fit(X_train, y_train_mlb)

# Predict class probabilities on the validation split (X_val); this reuses the name y_prob_mlb
y_prob_mlb = multi_target_classifier.predict_proba(X_val)

In [11]:
# Take index 1 of the last axis for every label, then transpose so that rows line up with
# X_val and columns with mlb.classes_.
# NOTE(review): assumes one (n_samples, 2) probability array per label — TODO confirm.
confidences = np.array(y_prob_mlb)[:, :, 1].T

confidences_df = pd.DataFrame(confidences, columns=mlb.classes_)

# Decode by walking the hierarchy top-down
y_pred_rf_BSLL = navigate_tree(confidences_df)
# Evaluate the overall performance
accuracy = accuracy_score(y_val, y_pred_rf_BSLL)
f1 = f1_score(y_val, y_pred_rf_BSLL, average='weighted')  # Consider weighted if class imbalance is present

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

y_pred_rf_BSLL.to_csv('predictions/multi-label_binary_relevance_BSLL.csv')

# Candidate thresholds 0.30, 0.32, ..., 0.68 (np.arange stops before 0.7)
thresholds = np.arange(0.3, 0.7, 0.02)
best_accuracy = 0
best_threshold = 0

print("Thresholds and their corresponding accuracies:")

# NOTE(review): the threshold is selected and scored on the same validation labels (y_val),
# so the best accuracy printed below is an optimistic estimate.
for threshold in thresholds:
    y_pred_mult = apply_multiplicative_rule(confidences_df, threshold=threshold)
    accuracy = accuracy_score(y_val, y_pred_mult)
    print(f"Threshold: {threshold:.2f}, Accuracy: {accuracy:.4f}")

    # Strict '>' keeps the first (lowest) threshold when several reach the same accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

print(f"\nBest Threshold: {best_threshold:.2f} with Accuracy: {best_accuracy:.4f}")
# Recompute the predictions at the selected threshold and save them
y_pred_rf_mult = apply_multiplicative_rule(confidences_df, threshold=best_threshold)
y_pred_rf_mult.to_csv('predictions/multi-label_binary_relevance_mult.csv')

Accuracy: 0.9782608695652174
F1 Score: 0.9789409060351784
Thresholds and their corresponding accuracies:
Threshold: 0.30, Accuracy: 0.9699
Threshold: 0.32, Accuracy: 0.9713
Threshold: 0.34, Accuracy: 0.9724
Threshold: 0.36, Accuracy: 0.9729
Threshold: 0.38, Accuracy: 0.9743
Threshold: 0.40, Accuracy: 0.9763
Threshold: 0.42, Accuracy: 0.9767
Threshold: 0.44, Accuracy: 0.9762
Threshold: 0.46, Accuracy: 0.9765
Threshold: 0.48, Accuracy: 0.9763
Threshold: 0.50, Accuracy: 0.9754
Threshold: 0.52, Accuracy: 0.9745
Threshold: 0.54, Accuracy: 0.9739
Threshold: 0.56, Accuracy: 0.9721
Threshold: 0.58, Accuracy: 0.9704
Threshold: 0.60, Accuracy: 0.9680
Threshold: 0.62, Accuracy: 0.9668
Threshold: 0.64, Accuracy: 0.9637
Threshold: 0.66, Accuracy: 0.9602
Threshold: 0.68, Accuracy: 0.9569

Best Threshold: 0.42 with Accuracy: 0.9767


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import accuracy_score, hamming_loss

# Initialize the base classifier
base_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Creating a Classifier Chain with random order
chain = ClassifierChain(base_classifier, order='random', random_state=42)

# Fit the Classifier Chain model on the multi-label transformed training data
chain.fit(X_train, y_train_mlb)

# Probability estimates on the validation split. predict() would return hard predictions,
# but y_prob_mlb is consumed downstream as per-label confidences.
y_prob_mlb = chain.predict_proba(X_val)

In [17]:
# (Re)compute the chain's probability estimates for the validation split; the evaluation
# cell below reads y_prob_mlb as the confidence matrix.
y_prob_mlb =  chain.predict_proba(X_val)

In [21]:
# Display the probability matrix for a quick visual check
y_prob_mlb

array([[0.  , 1.  , 0.97, ..., 0.  , 0.02, 0.  ],
       [0.  , 0.99, 1.  , ..., 0.  , 0.01, 0.  ],
       [0.  , 1.  , 1.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 1.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 1.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 1.  , 1.  , ..., 0.  , 0.  , 0.  ]])

In [22]:
# y_prob_mlb is used as-is here: unlike the earlier sections, no per-label slicing is applied
confidences = y_prob_mlb

confidences_df = pd.DataFrame(confidences, columns=mlb.classes_)

# Decode by walking the hierarchy top-down
y_pred_rf_BSLL = navigate_tree(confidences_df)
# Evaluate the overall performance
accuracy = accuracy_score(y_val, y_pred_rf_BSLL)
f1 = f1_score(y_val, y_pred_rf_BSLL, average='weighted')  # Consider weighted if class imbalance is present

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

y_pred_rf_BSLL.to_csv('predictions/multi-label_classifier_chain_BSLL.csv')

# Candidate thresholds 0.30, 0.32, ..., 0.68 (np.arange stops before 0.7)
thresholds = np.arange(0.3, 0.7, 0.02)
best_accuracy = 0
best_threshold = 0

print("Thresholds and their corresponding accuracies:")

# NOTE(review): the threshold is selected and scored on the same validation labels (y_val),
# so the best accuracy printed below is an optimistic estimate.
for threshold in thresholds:
    y_pred_mult = apply_multiplicative_rule(confidences_df, threshold=threshold)
    accuracy = accuracy_score(y_val, y_pred_mult)
    print(f"Threshold: {threshold:.2f}, Accuracy: {accuracy:.4f}")

    # Strict '>' keeps the first (lowest) threshold when several reach the same accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

print(f"\nBest Threshold: {best_threshold:.2f} with Accuracy: {best_accuracy:.4f}")
# Recompute the predictions at the selected threshold and save them
y_pred_rf_mult = apply_multiplicative_rule(confidences_df, threshold=best_threshold)
y_pred_rf_mult.to_csv('predictions/multi-label_classifier_chain_mult.csv')

Accuracy: 0.9765712998423068
F1 Score: 0.9774655423367556
Thresholds and their corresponding accuracies:
Threshold: 0.30, Accuracy: 0.9697
Threshold: 0.32, Accuracy: 0.9704
Threshold: 0.34, Accuracy: 0.9718
Threshold: 0.36, Accuracy: 0.9731
Threshold: 0.38, Accuracy: 0.9743
Threshold: 0.40, Accuracy: 0.9750
Threshold: 0.42, Accuracy: 0.9753
Threshold: 0.44, Accuracy: 0.9752
Threshold: 0.46, Accuracy: 0.9743
Threshold: 0.48, Accuracy: 0.9745
Threshold: 0.50, Accuracy: 0.9744
Threshold: 0.52, Accuracy: 0.9740
Threshold: 0.54, Accuracy: 0.9734
Threshold: 0.56, Accuracy: 0.9725
Threshold: 0.58, Accuracy: 0.9707
Threshold: 0.60, Accuracy: 0.9692
Threshold: 0.62, Accuracy: 0.9674
Threshold: 0.64, Accuracy: 0.9642
Threshold: 0.66, Accuracy: 0.9607
Threshold: 0.68, Accuracy: 0.9564

Best Threshold: 0.42 with Accuracy: 0.9753
