In [1]:
import pandas as pd
import numpy as np
import os
import json
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict

In [2]:
def load_data(file_name, dataset):
    """
    Read one feature table, expand aliased target labels, and return its train/validation partitions.

    The table is read from '../../../data/features/{dataset}/{file_name}.parquet'.
    Each value of its "Target" column is passed through `split_and_replace` together
    with the notebook-level `mapping` dictionary, and the returned segments are
    concatenated back into a single string. Rows whose expanded target contains
    '[' or '*' are then discarded.

    Parameters:
    - file_name (str): Name of the file to load (without '.parquet' extension and path).
    - dataset (str): Name of the sub-directory of the features folder that holds the file.

    Returns:
    - X_train (DataFrame): Features of the rows where the "Train" column equals 0.
    - y_train (Series): "Target" values of the rows where the "Train" column equals 0.
    - X_val (DataFrame): Features of the rows where the "Train" column equals 1.
    - y_val (Series): "Target" values of the rows where the "Train" column equals 1.
    """
    frame = pd.read_parquet(f'../../../data/features/{dataset}/{file_name}.parquet')

    # Expand every alias in the target labels and glue the segments back together
    frame["Target"] = ["".join(split_and_replace(lineage, mapping)) for lineage in frame["Target"]]

    # Drop rows whose expanded label still carries a '[' or '*' marker
    has_marker = frame["Target"].astype(str).str.contains(r"\[|\*")
    frame = frame[~has_marker]

    non_feature_columns = ["Target", "Train"]
    train_rows = frame[frame['Train'] == 0]
    val_rows = frame[frame['Train'] == 1]

    X_train, y_train = train_rows.drop(columns=non_feature_columns), train_rows['Target']
    X_val, y_val = val_rows.drop(columns=non_feature_columns), val_rows['Target']

    return X_train, y_train, X_val, y_val

def extract_nodes(y_train):
    """
    Return the set of all dot-separated prefixes of the given labels.

    For a label such as 'B.1.7' the prefixes 'B', 'B.1' and 'B.1.7' are all added,
    so the result contains every node on the path of every label.

    Parameters:
    - y_train (iterable of str): Dot-separated hierarchical labels.

    Returns:
    - set of str: Every prefix (including the full label) of every input label.
    """
    seen = set()
    for label in y_train:
        prefix = None
        # Grow the prefix one part at a time and record each intermediate node
        for part in label.split('.'):
            prefix = part if prefix is None else prefix + '.' + part
            seen.add(prefix)
    return seen

In [3]:
def split_string(input_string):
    """
    Tokenise a string into alphanumeric runs and single-character delimiters.

    Whitespace is dropped without ending the current run (so 'A B' yields ['AB']).
    Any other non-alphanumeric character ends the current run and is emitted as
    its own one-character token.

    Parameters:
    - input_string (str): Text to tokenise.

    Returns:
    - list of str: Tokens in their original order.
    """
    tokens = []
    run = []  # pieces of the alphanumeric token currently being built

    def flush_run():
        # Emit the pending alphanumeric token, if there is one
        if run:
            tokens.append("".join(run))
            run.clear()

    for ch in input_string:
        if ch.isspace():
            continue  # whitespace is ignored and does not terminate a run
        if ch.isalnum():
            run.append(ch)
        else:
            flush_run()
            tokens.append(ch)  # the delimiter becomes a token of its own

    flush_run()
    return tokens

def split_and_replace(input_string, mapping):
    """
    Tokenise a string and expand every token that is a key of `mapping`.

    The string is tokenised with `split_string`. Any token found in `mapping` is
    replaced by the tokens of its mapped value, and those replacement tokens are
    examined again, so chained aliases are expanded until no token is a key.
    A mapping whose expansions lead back to one of its own keys would therefore
    never terminate.

    Parameters:
    - input_string (str): Text to expand.
    - mapping (dict): Maps a token to the string that replaces it.

    Returns:
    - list of str: Fully expanded tokens in left-to-right order.
    """
    # Tokens still to be examined, stored reversed so pop() yields the leftmost one
    pending = split_string(input_string)[::-1]
    expanded = []

    while pending:
        token = pending.pop()
        if token in mapping:
            # Put the replacement tokens back at the front so they are re-checked next
            pending.extend(reversed(split_string(mapping[token])))
        else:
            expanded.append(token)

    return expanded

In [4]:
# Read the alias table (token -> replacement) from the JSON file next to the notebook
with open('alias_key.json', 'r') as alias_file:
    mapping = json.load(alias_file)

# List-valued entries are flattened into one bracketed, comma-separated string
mapping.update({
    key: '[' + ', '.join(value) + ']'
    for key, value in mapping.items()
    if isinstance(value, list)
})

In [5]:
# Feature table to train on: <dataset>/<file_name>.parquet inside the features directory
dataset = "SARS"
file_name = "FCGR_remove_256"

# Names (without extension) of every feature file available for the selected dataset.
# The pattern is derived from `dataset` so the listing always matches what is loaded.
file_names = [os.path.splitext(os.path.basename(file))[0] for file in glob.glob(f"../../../data/features/{dataset}/*.parquet")]

X_train, y_train, X_val, y_val = load_data(file_name, dataset)

In [6]:
# Depth (number of dot-separated parts) of the deepest training label
max_level = max(label.count('.') + 1 for label in y_train)

# Labels shallower than max_level get a single '.e' part appended; the deepest labels are kept as-is
y_train_e = [
    label if label.count('.') + 1 >= max_level else label + '.e'
    for label in y_train
]

# NOTE: `classifiers` is reassigned to an empty dict at the top of the training cell further down
classifiers = defaultdict(lambda: RandomForestClassifier(random_state=42, n_jobs=-1))

In [7]:
# Every prefix of every padded training label, i.e. all nodes of the label hierarchy ('.e' nodes included)
unique_nodes = extract_nodes(y_train_e)

In [8]:
def get_labels_by_parent(X_train, y_train, parent_node):
    """
    Select the samples that lie under `parent_node` and label each with its direct child node.

    A sample is selected when the first parts of its label (as many as `parent_node` has)
    equal `parent_node`. Its new label is the label truncated to one part more than the
    parent; a sample whose label is not deeper than the parent keeps `parent_node` itself.

    Parameters:
    - X_train (DataFrame, Series or array-like): Features, row-aligned with `y_train`.
      pandas objects are filtered by position (`.iloc`), anything else by list indexing.
    - y_train (iterable of str): Dot-separated hierarchical labels.
    - parent_node (str): Dot-separated path of the parent node.

    Returns:
    - X_train_filtered: Rows of `X_train` for the selected samples.
    - ndarray: Child-level label of each selected sample, in the same order.
    """
    depth = parent_node.count('.') + 1  # number of parts in the parent path

    positions = []
    child_labels = []
    for position, label in enumerate(y_train):
        parts = label.split('.')
        # Skip samples whose leading parts do not match the parent path exactly
        if '.'.join(parts[:depth]) != parent_node:
            continue
        positions.append(position)
        if len(parts) > depth:
            child_labels.append('.'.join(parts[:depth + 1]))
        else:
            # The sample sits on the parent itself, so there is no deeper part to use
            child_labels.append(parent_node)

    if isinstance(X_train, (pd.DataFrame, pd.Series)):
        selected = X_train.iloc[positions]
    else:
        selected = X_train[positions]

    return selected, np.array(child_labels)

def get_root_labels(X_train, y_train):
    """
    Reduce every hierarchical label to its first (root) part.

    Parameters:
    - X_train: Features; returned unchanged.
    - y_train (iterable of str): Dot-separated hierarchical labels.

    Returns:
    - X_train: The same object that was passed in.
    - ndarray: Root part of each label, in input order.
    """
    # Everything before the first '.' is the root; labels without a '.' are their own root
    roots = np.array([label.partition('.')[0] for label in y_train])
    return X_train, roots

# Dictionary to hold one trained classifier per parent node (the top level is stored under 'root')
classifiers = {}

# Root labels present in the hierarchy (not used by the training below)
root_nodes = set(node.split('.')[0] for node in unique_nodes)

# Train the root classifier on the first part of every padded label
X_train_root, y_train_root = get_root_labels(X_train, y_train_e)

if len(np.unique(y_train_root)) > 1:  # Ensure there are at least two classes
    clf_root = RandomForestClassifier(random_state=42, n_jobs=-1)
    clf_root.fit(X_train_root, y_train_root)
    classifiers['root'] = clf_root  # Store the root classifier

# Walk the hierarchy one depth at a time. `level` is zero-based: level 0 yields the
# one-part parents, level 1 the two-part parents, and so on. The deepest level is
# skipped because nodes at max_level have no children to predict.
for level in range(0, max_level - 1):
    parent_nodes = set('.'.join(node.split('.')[:level + 1]) for node in unique_nodes if len(node.split('.')) > level)
    print(parent_nodes)

    # Iterate in sorted order: the iteration order of a set of strings changes between
    # kernel sessions (string hash randomisation), which would otherwise change the
    # insertion order of `classifiers` from run to run.
    for parent_node in sorted(parent_nodes):
        # Generate labels and filter X_train for the current parent node
        X_train_parent, y_train_parent = get_labels_by_parent(X_train, y_train_e, parent_node)

        if len(np.unique(y_train_parent)) > 1:  # Check there's more than one class to predict
            # Train a classifier for this parent node
            clf = RandomForestClassifier(random_state=42, n_jobs=-1)
            clf.fit(X_train_parent, y_train_parent)  # Use the filtered and correctly labeled training data

            # Store the trained classifier, keyed by the parent node
            classifiers[parent_node] = clf

{'A', 'B'}
{'B.e', 'A.e', 'B.1', 'B.40'}
{'B.1.8', 'B.1.462', 'B.1.381', 'B.1.417', 'B.1.351', 'B.1.237', 'B.1.525', 'B.1.467', 'B.1.638', 'B.40.e', 'B.1.140', 'B.1.617', 'B.1.1', 'B.1.e'}
{'B.1.1.84', 'B.1.1.448', 'B.1.1.386', 'B.1.417.e', 'B.1.381.e', 'B.1.1.487', 'B.1.1.117', 'B.1.1.382', 'B.1.237.e', 'B.1.638.e', 'B.1.1.40', 'B.1.1.7', 'B.1.1.459', 'B.1.1.383', 'B.1.1.10', 'B.1.1.54', 'B.1.1.1', 'B.1.351.e', 'B.1.8.e', 'B.1.1.57', 'B.1.462.e', 'B.1.1.456', 'B.1.467.e', 'B.1.1.273', 'B.1.1.53', 'B.1.140.e', 'B.1.617.2', 'B.1.1.412', 'B.1.1.99', 'B.1.1.e', 'B.1.1.529', 'B.1.1.528', 'B.1.617.e', 'B.1.1.62', 'B.1.1.254', 'B.1.1.34', 'B.1.1.507', 'B.1.1.56', 'B.1.525.e', 'B.1.1.318'}
{'B.1.1.529.4', 'B.1.1.40.e', 'B.1.1.383.e', 'B.1.1.459.e', 'B.1.1.529.3', 'B.1.617.2.32', 'B.1.1.56.e', 'B.1.617.2.120', 'B.1.617.2.19', 'B.1.1.34.e', 'B.1.1.1.6', 'B.1.1.318.e', 'B.1.1.487.e', 'B.1.617.2.122', 'B.1.617.2.116', 'B.1.1.1.1', 'B.1.1.382.e', 'B.1.1.529.5', 'B.1.1.507.e', 'B.1.1.456.e', 'B.1.1

In [9]:
# Display the trained classifiers, keyed by 'root' or by the parent node they were trained for
classifiers

{'root': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.617': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.1': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.1.10': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.1.1': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.617.2': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.1.529': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.1.529.4': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.617.2.120': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.1.1.1': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.1.529.5': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.1.529.2': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.1.529.1': RandomForestClassifier(n_jobs=-1, random_state=42),
 'B.1.1.529.2.86': RandomF

In [10]:
# One probability table per trained classifier, all computed on the full validation set
classifier_predictions_dfs = []

for clf in classifiers.values():
    # Name the probability columns after the classifier's class labels
    classifier_predictions_dfs.append(
        pd.DataFrame(clf.predict_proba(X_val), columns=clf.classes_)
    )

In [11]:
# Place the per-classifier probability tables side by side: one row per validation sample, one column per node
predictions_df = pd.concat(classifier_predictions_dfs, axis=1)

### Top Down to Leaf

In [14]:
def get_root_children(predictions_df):
    """
    Return the column names of `predictions_df` that contain no '.', in column order.

    Such columns are the top-level nodes of the hierarchy.
    """
    return list(filter(lambda name: '.' not in name, predictions_df.columns))

def get_children(node, predictions_df):
    """
    Return the columns of `predictions_df` that are direct children of `node`.

    A direct child starts with `node + '.'` and has exactly one more '.' than `node`.
    When `node` is empty or None, the top-level columns are returned instead.
    """
    if not node:
        # No node given: fall back to the top-level nodes
        return get_root_children(predictions_df)

    child_prefix = node + '.'
    child_dots = node.count('.') + 1
    return [
        name
        for name in predictions_df.columns
        if name.startswith(child_prefix) and name.count('.') == child_dots
    ]
    
def navigate_tree_top_down(predictions_df):
    """
    Predict one node per row by greedily descending the label hierarchy.

    Starting from the top-level columns, the column with the highest value in the
    row is chosen, then the highest-valued direct child of that column, and so on
    until the chosen node has no child columns. The last node reached is the
    prediction for the row.

    Parameters:
    - predictions_df (DataFrame): One column per hierarchy node (dot-separated
      names), one row per sample, holding the per-node scores.

    Returns:
    - DataFrame: Same index as `predictions_df`, with a 'prediction' column holding
      the last node reached for each row (None when no node could be selected).
    """
    results_df = pd.DataFrame(index=predictions_df.index)

    # The hierarchy depends only on the column names, not on the row, so the root
    # nodes are looked up once and each node's children are cached after first use
    # instead of rescanning every column for every step of every row.
    root_children = get_root_children(predictions_df)
    children_by_node = {}

    for index, row in predictions_df.iterrows():
        current_nodes = root_children
        predicted_path = []

        while current_nodes:
            try:
                # Attempt to find the node with the maximum probability
                max_conf_node = max(current_nodes, key=lambda node: row[node])
                predicted_path.append(max_conf_node)
                if max_conf_node not in children_by_node:
                    children_by_node[max_conf_node] = get_children(max_conf_node, predictions_df)
                current_nodes = children_by_node[max_conf_node]
            except ValueError as e:
                # Report the failing row and stop descending; the path found so far is kept
                print(f"ValueError: {e}")
                print(f"Index: {index}, Current nodes: {current_nodes}")
                print(f"Row data: {row}")
                break

        # Assign the last node in the predicted path as the final prediction
        results_df.at[index, 'prediction'] = predicted_path[-1] if predicted_path else None

    return results_df

In [15]:
# Descend the hierarchy for every validation row, then strip the '.e' padding from the predicted labels
leaf_predictions = navigate_tree_top_down(predictions_df)['prediction']
y_pred = leaf_predictions.str.replace(".e", "", regex=False)

# Overall performance against the unpadded validation labels (F1 is weighted by class support)
accuracy, f1 = (
    accuracy_score(y_val, y_pred),
    f1_score(y_val, y_pred, average='weighted'),
)

for report_line in (f"Accuracy: {accuracy}", f"F1 Score: {f1}"):
    print(report_line)

Accuracy: 0.9662086055417887
F1 Score: 0.9669571041096796


In [16]:
# Create the output directory first so the export does not fail when 'predictions/' is missing
os.makedirs('predictions', exist_ok=True)
y_pred.to_csv('predictions/LPP.csv')