In [1]:
import pandas as pd
import numpy as np
import os
import json
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict

In [2]:
def load_data(file_name, dataset):
    """
    Read a feature table from Parquet, expand lineage aliases in the target
    column, discard ambiguous targets, and split the rows by the 'Train' flag.

    Relies on the notebook-level names `split_and_replace` and `mapping`,
    which must be defined before this function is called.

    Parameters:
    - file_name (str): Name of the file to load (without '.parquet' extension and path).
    - dataset (str): Name of the sub-folder of '../../../data/features/' that holds the file.

    Returns:
    - X_train (DataFrame): Training features.
    - y_train (Series): Training labels.
    - X_val (DataFrame): Validation features.
    - y_val (Series): Validation labels.
    """
    data = pd.read_parquet(f'../../../data/features/{dataset}/{file_name}.parquet')

    # Rewrite every target with its aliases expanded, re-joining the segments
    # into a single string.
    data["Target"] = [
        "".join(split_and_replace(lineage, mapping))
        for lineage in data["Target"]
    ]

    # Drop rows whose expanded target still contains '[' or '*'.
    ambiguous = data["Target"].astype(str).str.contains(r"\[|\*")
    data = data[~ambiguous]

    # NOTE(review): rows with Train == 0 become the training split and rows
    # with Train == 1 the validation split — confirm this flag convention.
    train_rows = data[data['Train'] == 0]
    val_rows = data[data['Train'] == 1]

    feature_drop = ["Target", "Train"]
    X_train, y_train = train_rows.drop(columns=feature_drop), train_rows['Target']
    X_val, y_val = val_rows.drop(columns=feature_drop), val_rows['Target']

    return X_train, y_train, X_val, y_val

def extract_nodes(y_train):
    """
    Collect every node of the label hierarchy.

    Each label is a dot-separated path; every leading prefix of that path
    (including the full path itself) counts as a node.

    Parameters:
    - y_train (iterable of str): Dot-separated labels.

    Returns:
    - set of str: All distinct prefixes found across the labels.
    """
    return {
        '.'.join(parts[:depth])
        for parts in (path.split('.') for path in y_train)
        for depth in range(1, len(parts) + 1)
    }

In [3]:
def split_string(input_string):
    """
    Tokenize a string into alphanumeric runs and single-character delimiters.

    Whitespace is dropped without ending the current run, so alphanumeric
    characters separated only by whitespace end up in the same segment.
    Every other non-alphanumeric character closes the current run and is
    emitted as its own segment.

    Parameters:
    - input_string (str): Text to tokenize.

    Returns:
    - list of str: Segments in their original order.
    """
    tokens = []
    pending_chars = []

    for ch in input_string:
        if ch.isalnum():
            pending_chars.append(ch)
            continue
        if ch.isspace():
            # Whitespace is ignored and does not flush the pending run
            continue
        # Any other character is a delimiter: flush the run, then emit it
        if pending_chars:
            tokens.append("".join(pending_chars))
            pending_chars = []
        tokens.append(ch)

    # Emit whatever run is still open at the end of the input
    if pending_chars:
        tokens.append("".join(pending_chars))

    return tokens

def split_and_replace(input_string, mapping):
    """
    Tokenize a string and expand every segment that is an alias, recursively.

    The input is tokenized with `split_string`. Any segment that is a key of
    `mapping` is replaced by the tokenized form of its mapped value, and the
    replacement segments are themselves checked again, so chained aliases
    are fully expanded. Segments are expanded left to right and the order of
    the output follows the order of the input.

    Parameters:
    - input_string (str): Text whose alias segments should be expanded.
    - mapping (dict): Maps an alias segment to the string that replaces it.

    Returns:
    - list of str: The fully expanded segments.

    Raises:
    - ValueError: If an alias expands (directly or through other aliases)
      to itself. Such a mapping can never be fully expanded and would
      otherwise make the expansion loop forever.
    """
    expanded = []

    # Work stack of (segment, aliases currently being expanded to reach it).
    # Segments are pushed in reverse so they are popped in reading order.
    pending = [(segment, ()) for segment in reversed(split_string(input_string))]

    while pending:
        segment, ancestors = pending.pop()

        if segment not in mapping:
            # Not an alias: the segment is final
            expanded.append(segment)
            continue

        if segment in ancestors:
            chain = " -> ".join(ancestors + (segment,))
            raise ValueError(f"Cyclic alias mapping: {chain}")

        # Replace the alias by its tokenized value; the new segments are
        # re-examined because they may be aliases themselves.
        lineage = ancestors + (segment,)
        replacement = split_string(mapping[segment])
        pending.extend((child, lineage) for child in reversed(replacement))

    return expanded

In [4]:
# Load the alias table from disk
with open('alias_key.json', 'r') as alias_file:
    mapping = json.load(alias_file)

# Render list-valued entries as one bracketed, comma-separated string
# (e.g. ['X', 'Y'] -> '[X, Y]'); all other values are kept unchanged.
mapping = {
    alias: '[' + ', '.join(target) + ']' if isinstance(target, list) else target
    for alias, target in mapping.items()
}

In [5]:
# Dataset folder and feature file to load
dataset = "SARS"
file_name = "FCGR_remove_256"

# Feature files available for the selected dataset (base names without the
# '.parquet' extension). The folder is derived from `dataset` so the listing
# always matches the dataset that is actually loaded below.
file_names = [os.path.splitext(os.path.basename(file))[0] for file in glob.glob(f"../../../data/features/{dataset}/*.parquet")]
# print(file_names)

X_train, y_train, X_val, y_val = load_data(file_name, dataset)

In [6]:
# Inspect the training labels returned by load_data
y_train

0             B.1.1.529.5.1
1              B.1.617.2.19
2                 B.1.1.1.1
3             B.1.1.529.4.6
5             B.1.617.2.107
                ...        
10089       B.1.1.529.4.1.1
10090         B.1.1.529.2.3
10091      B.1.1.529.2.86.2
10092          B.1.617.2.38
10093    B.1.1.529.5.3.1.13
Name: Target, Length: 8792, dtype: object

In [7]:
# Depth of the deepest training label, counted in dot-separated parts
max_level = max(len(label.split('.')) for label in y_train)

# Labels shallower than max_level get a '.e' part appended, so that a path
# that ends early has an explicit end-marker child at the next level.
y_train_e = [label + '.e' if len(label.split('.')) < max_level else label for label in y_train]

# One fitted classifier per level, keyed by level index. A plain dict is used
# on purpose: looking up a level that was never trained raises KeyError
# instead of silently handing back a new, unfitted RandomForestClassifier.
classifiers = {}

In [8]:
# Every node (dotted prefix) occurring in the end-marked training labels
unique_nodes = extract_nodes(y_train_e)

In [9]:
def get_labels_by_policy(level, X_train, y_train):
    """
    Build the training set for one level of the label hierarchy.

    Samples whose label has no part at position `level` are dropped; the
    remaining labels are truncated to their first `level + 1` dot-separated
    parts.

    Parameters:
    - level (int): Zero-based depth in the hierarchy.
    - X_train (DataFrame, Series, or array-like): Features, row-aligned with
      `y_train` by position. Non-pandas inputs must support indexing with a
      list of integer positions.
    - y_train (iterable of str): Dot-separated labels. Labels are read in
      iteration order, so a pandas Series with a non-contiguous index is
      handled by position rather than by index label.

    Returns:
    - X_train_filtered: Rows of `X_train` for the retained samples.
    - numpy.ndarray: Truncated labels for the retained samples.
    """
    kept_positions = []
    truncated_labels = []

    # Use the enumerated value itself instead of y_train[i]: on a pandas
    # Series y_train[i] is a label-based lookup and fails (or returns the
    # wrong row) when the index is not 0..n-1.
    for position, label in enumerate(y_train):
        parts = label.split('.')
        if len(parts) > level:
            kept_positions.append(position)
            truncated_labels.append('.'.join(parts[:level + 1]))

    if isinstance(X_train, (pd.DataFrame, pd.Series)):
        # Positional selection keeps features aligned with the labels
        X_train_filtered = X_train.iloc[kept_positions]
    else:
        # Assuming X_train is a NumPy array or similar
        X_train_filtered = X_train[kept_positions]

    return X_train_filtered, np.array(truncated_labels)

for level in range(max_level):
    # Keep only the samples that reach this depth, with labels cut to it
    level_features, level_targets = get_labels_by_policy(level, X_train, y_train_e)

    # Show the nodes that compete against each other at this depth
    print(np.unique(level_targets))

    # One random forest per level; fit() returns the fitted estimator,
    # which is stored under its level index.
    classifiers[level] = RandomForestClassifier(random_state=42, n_jobs=-1).fit(level_features, level_targets)

['A' 'B']
['A.e' 'B.1' 'B.40' 'B.e']
['B.1.1' 'B.1.140' 'B.1.237' 'B.1.351' 'B.1.381' 'B.1.417' 'B.1.462'
 'B.1.467' 'B.1.525' 'B.1.617' 'B.1.638' 'B.1.8' 'B.1.e' 'B.40.e']
['B.1.1.1' 'B.1.1.10' 'B.1.1.117' 'B.1.1.254' 'B.1.1.273' 'B.1.1.318'
 'B.1.1.34' 'B.1.1.382' 'B.1.1.383' 'B.1.1.386' 'B.1.1.40' 'B.1.1.412'
 'B.1.1.448' 'B.1.1.456' 'B.1.1.459' 'B.1.1.487' 'B.1.1.507' 'B.1.1.528'
 'B.1.1.529' 'B.1.1.53' 'B.1.1.54' 'B.1.1.56' 'B.1.1.57' 'B.1.1.62'
 'B.1.1.7' 'B.1.1.84' 'B.1.1.99' 'B.1.1.e' 'B.1.140.e' 'B.1.237.e'
 'B.1.351.e' 'B.1.381.e' 'B.1.417.e' 'B.1.462.e' 'B.1.467.e' 'B.1.525.e'
 'B.1.617.2' 'B.1.617.e' 'B.1.638.e' 'B.1.8.e']
['B.1.1.1.1' 'B.1.1.1.2' 'B.1.1.1.6' 'B.1.1.1.9' 'B.1.1.1.e' 'B.1.1.10.3'
 'B.1.1.10.e' 'B.1.1.117.e' 'B.1.1.254.e' 'B.1.1.273.e' 'B.1.1.318.e'
 'B.1.1.34.e' 'B.1.1.382.e' 'B.1.1.383.e' 'B.1.1.386.e' 'B.1.1.40.e'
 'B.1.1.412.e' 'B.1.1.448.e' 'B.1.1.456.e' 'B.1.1.459.e' 'B.1.1.487.e'
 'B.1.1.507.e' 'B.1.1.528.e' 'B.1.1.529.1' 'B.1.1.529.2' 'B.1.1.529.3'
 '

In [10]:
import pandas as pd

# Per-level probability tables, collected in level order
level_predictions_dfs = []

for level in range(max_level):
    # Levels without a stored classifier contribute nothing
    if level not in classifiers:
        continue

    level_model = classifiers[level]

    # One column per class the level's model was fitted on, one row per
    # validation sample (rows are numbered positionally).
    level_predictions_dfs.append(
        pd.DataFrame(level_model.predict_proba(X_val), columns=level_model.classes_)
    )

In [11]:
# Put the per-level probability tables side by side: one row per validation
# sample, one column per class of every level's classifier.
predictions_df = pd.concat(level_predictions_dfs, axis=1)

In [12]:
# Display the combined probability table
predictions_df

Unnamed: 0,A,B,A.e,B.1,B.40,B.e,B.1.1,B.1.140,B.1.237,B.1.351,...,B.1.1.529.2.75.3.4.1.1.1.1.31.1,B.1.1.529.2.75.3.4.1.1.1.1.31.e,B.1.1.529.5.3.1.1.1.1.1.1.1.e,B.1.1.529.5.3.1.1.1.1.1.1.10.e,B.1.1.529.5.3.1.1.1.1.1.1.3.e,B.1.1.529.5.3.1.1.1.1.1.1.38.1,B.1.1.529.5.3.1.1.1.1.1.1.38.3,B.1.1.529.5.3.1.1.1.1.1.1.38.e,B.1.1.529.5.3.1.1.1.1.1.1.64.e,B.1.1.529.5.3.1.1.1.1.1.1.67.e
0,0.0,1.0,0.0,0.99,0.0,0.01,0.00,0.0,0.01,0.00,...,0.04,0.12,0.09,0.15,0.21,0.02,0.02,0.05,0.13,0.17
1,0.0,1.0,0.0,1.00,0.0,0.00,0.02,0.0,0.00,0.98,...,0.04,0.11,0.11,0.06,0.10,0.06,0.07,0.07,0.30,0.08
2,0.0,1.0,0.0,1.00,0.0,0.00,1.00,0.0,0.00,0.00,...,0.02,0.16,0.10,0.14,0.27,0.00,0.07,0.06,0.07,0.11
3,0.0,1.0,0.0,1.00,0.0,0.00,0.99,0.0,0.00,0.01,...,0.06,0.09,0.16,0.12,0.10,0.04,0.07,0.14,0.05,0.17
4,0.0,1.0,0.0,1.00,0.0,0.00,0.00,0.0,0.00,0.95,...,0.04,0.10,0.09,0.15,0.21,0.02,0.03,0.07,0.16,0.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8873,0.0,1.0,0.0,0.99,0.0,0.01,0.00,0.0,0.00,0.00,...,0.14,0.08,0.07,0.15,0.13,0.02,0.03,0.07,0.19,0.12
8874,0.0,1.0,0.0,1.00,0.0,0.00,0.01,0.0,0.00,0.94,...,0.03,0.10,0.11,0.10,0.21,0.02,0.07,0.06,0.19,0.11
8875,0.0,1.0,0.0,1.00,0.0,0.00,0.00,0.0,0.00,0.00,...,0.06,0.10,0.08,0.13,0.20,0.02,0.02,0.06,0.20,0.13
8876,0.0,1.0,0.0,1.00,0.0,0.00,1.00,0.0,0.00,0.00,...,0.04,0.17,0.13,0.11,0.26,0.00,0.03,0.06,0.09,0.11


In [13]:
# def extract_levels(labels):
#     max_level = max(len(label.split('.')) for label in labels)
#     level_data = {level: [] for level in range(max_level)}
    
#     for label in labels:
#         parts = label.split('.')
#         for level in range(max_level):
#             if level < len(parts):
#                 level_data[level].append('.'.join(parts[:level+1]))
#             else:
#                 # If the current label has fewer levels, repeat the last available part
#                 level_data[level].append(level_data[level-1][-1])
    
#     return level_data

# # Extract levels from y_train and y_val
# train_levels = extract_levels(y_train)
# val_levels = extract_levels(y_val)

# # Now, let's ensure that the length matches
# for level in train_levels:
#     assert len(train_levels[level]) == len(y_train), f"Length mismatch in training level {level}"
#     assert len(val_levels[level]) == len(y_val), f"Length mismatch in validation level {level}"
    
# def create_confidence_df(models, X_val, train_levels):
#     """
#     Creates a DataFrame with confidence scores for each node.

#     :param models: Dictionary of trained models for each level.
#     :param X_val: Validation features.
#     :param train_levels: Dictionary containing the labels for each level used during training.
#     :return: DataFrame with confidence scores.
#     """
#     # Dictionary to collect confidence scores
#     confidences = {}

#     for level, model in models.items():
#         # Predict the probability for each class
#         probas = model.predict_proba(X_val)

#         # Match the predicted probabilities to the corresponding nodes
#         for idx, label in enumerate(model.classes_):
#             # Each class label is itself a hierarchy node, so store that
#             # class's probability column (column idx) under the node's name
#             confidences[label] = probas[:, idx]

#     # Create the DataFrame from the dictionary
#     confidences_df = pd.DataFrame(confidences)

#     return confidences_df

# # Initialize models, predictions, and accuracies
# models = {}
# predictions = {}
# val_accuracies = {}

# # Unique labels (classes) for each level
# level_labels = {level: set(labels) for level, labels in train_levels.items()}

# for level in train_levels:
#     model = RandomForestClassifier(random_state=42, n_jobs=-1)
#     model.fit(X_train, train_levels[level])
#     models[level] = model
    
#     # Validate the model
#     y_pred = model.predict(X_val)
#     predictions[level] = y_pred
#     val_accuracies[level] = accuracy_score(val_levels[level], y_pred)
    
#     # Create the confidence DataFrame
# confidence_df = create_confidence_df(models, X_val, train_levels)

### Top Down to Leaf

In [14]:
def get_root_children(predictions_df):
    """
    Return the top-level nodes of the hierarchy.

    A column is a top-level node when its name contains no '.' separator.

    Parameters:
    - predictions_df (DataFrame): Table whose column names are node names.

    Returns:
    - list of str: Top-level column names, in column order.
    """
    roots = []
    for name in predictions_df.columns:
        if '.' in name:
            continue
        roots.append(name)
    return roots

def get_children(node, predictions_df):
    """
    Return the direct children of a node among the table's columns.

    A column is a direct child when its name starts with `node + '.'` and
    contains exactly one more '.' than `node`. An empty or missing node
    yields the top-level nodes instead.

    Parameters:
    - node (str or None): Name of the parent node.
    - predictions_df (DataFrame): Table whose column names are node names.

    Returns:
    - list of str: Child column names, in column order.
    """
    if not node:
        # No parent given: fall back to the top-level nodes
        return get_root_children(predictions_df)

    child_prefix = node + '.'
    child_depth = node.count('.') + 1
    return [
        name
        for name in predictions_df.columns
        if name.startswith(child_prefix) and name.count('.') == child_depth
    ]
    
def navigate_tree_top_down(predictions_df):
    """
    Decode one leaf prediction per row by greedy top-down descent.

    Column names are dot-separated node names. For each row the descent
    starts at the top-level nodes (names without '.'), picks the candidate
    with the highest value in that row, and then repeats among that node's
    direct children until the chosen node has no children. Ties go to the
    candidate that appears first in column order.

    Parameters:
    - predictions_df (DataFrame): One row per sample, one column per node,
      holding the confidence for that node.

    Returns:
    - DataFrame: Same index as `predictions_df`, with a single column
      'prediction' holding the name of the last node reached (None when the
      table has no top-level columns). The column is present even when the
      input has no rows.
    """
    columns = list(predictions_df.columns)

    # Build the parent -> direct-children lookup once instead of rescanning
    # all columns for every row and level. Top-level nodes are stored under
    # the key None. A name's parent is everything before its last '.'.
    children_of = {}
    for name in columns:
        parent = name.rsplit('.', 1)[0] if '.' in name else None
        children_of.setdefault(parent, []).append(name)

    column_position = {name: position for position, name in enumerate(columns)}
    scores = predictions_df.to_numpy()

    predictions = []
    for row_scores in scores:
        chosen = None
        candidates = children_of.get(None, [])
        while candidates:
            # Follow the most confident candidate at this level
            chosen = max(candidates, key=lambda name: row_scores[column_position[name]])
            candidates = children_of.get(chosen, [])
        predictions.append(chosen)

    # Build the result in one go rather than filling it cell by cell
    return pd.DataFrame({'prediction': predictions}, index=predictions_df.index)

In [15]:
# Decode one leaf per validation sample, then remove the '.e' end marker so
# the predictions use the same naming as the true labels
leaf_predictions = navigate_tree_top_down(predictions_df)['prediction']
y_pred = leaf_predictions.str.replace(".e", "", regex=False)

# Score the decoded predictions against the validation labels
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
f1 = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')  # Consider weighted if class imbalance is present

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.9704888488398288
F1 Score: 0.9709417148247697


In [16]:
# Create the output folder if needed: to_csv does not create missing
# directories and would fail on a fresh checkout.
os.makedirs('predictions', exist_ok=True)
y_pred.to_csv('predictions/LPL.csv')