In [1]:
import pandas as pd
import numpy as np
import os
import json
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict

In [2]:
def load_data(file_name, dataset):
    """
    Read a feature table from Parquet, expand the aliases in its 'Target' column, and split it by the 'Train' flag.

    The alias expansion relies on the module-level ``mapping`` dictionary, which must be
    defined before this function is called.

    Parameters:
    - file_name (str): Name of the file to load (without '.parquet' extension and path).
    - dataset (str): Name of the sub-folder of '../../../data/features/' that holds the file.

    Returns:
    - X_train (DataFrame): Features of the rows where 'Train' == 0.
    - y_train (Series): Expanded targets of the rows where 'Train' == 0.
    - X_val (DataFrame): Features of the rows where 'Train' == 1.
    - y_val (Series): Expanded targets of the rows where 'Train' == 1.
    """
    parquet_path = f'../../../data/features/{dataset}/{file_name}.parquet'
    frame = pd.read_parquet(parquet_path)

    # Rewrite every target with its aliases expanded, glued back into one string.
    frame["Target"] = [
        "".join(split_and_replace(lineage, mapping))
        for lineage in frame["Target"]
    ]

    # Drop rows whose expanded target still contains '[' or '*'.
    has_marker = frame["Target"].astype(str).str.contains(r"\[|\*")
    frame = frame[~has_marker]

    # NOTE(review): 'Train' == 0 selects the training rows and 'Train' == 1 the
    # validation rows -- confirm this polarity against the feature pipeline.
    train_rows = frame[frame['Train'] == 0]
    val_rows = frame[frame['Train'] == 1]

    non_feature_columns = ["Target", "Train"]
    X_train = train_rows.drop(columns=non_feature_columns)
    X_val = val_rows.drop(columns=non_feature_columns)

    return X_train, train_rows['Target'], X_val, val_rows['Target']

def split_string(input_string):
    """
    Tokenise a string into alphanumeric runs and single-character delimiters.

    Whitespace is discarded without ending the current run, so "a b" yields
    ["ab"]. Every other non-alphanumeric character ends the current run and is
    emitted as its own token.

    Parameters:
    - input_string (str): Text to tokenise.

    Returns:
    - list: Tokens in their original order.
    """
    tokens = []
    pending = []  # characters of the alphanumeric run being built

    def flush():
        # Emit the run collected so far, if there is one.
        if pending:
            tokens.append("".join(pending))
            pending.clear()

    for ch in input_string:
        if ch.isalnum():
            pending.append(ch)
            continue
        if ch.isspace():
            continue
        flush()
        tokens.append(ch)  # the delimiter itself is a token

    flush()
    return tokens

def split_and_replace(input_string, mapping):
    """
    Tokenise a string and recursively replace every token that has an alias in ``mapping``.

    Each token produced by ``split_string`` is looked up in ``mapping``. When it
    has a non-empty replacement, the replacement is tokenised as well and its
    tokens are expanded in turn, so chained aliases are fully resolved.

    Two cases are handled explicitly:
    - A token whose mapped value is empty is kept as-is instead of being
      deleted. Deleting it would make different tokens that both map to an
      empty value indistinguishable in the output.
    - A self-referential or cyclic alias raises ``ValueError`` instead of
      looping forever.

    Parameters:
    - input_string (str): Text to expand.
    - mapping (dict): Alias -> replacement string.

    Returns:
    - list: Expanded tokens in order; join them with "" to rebuild a string.

    Raises:
    - ValueError: If expanding an alias leads back to the same alias.
    """
    def expand(segment, active):
        # `active` holds the aliases currently being expanded on this path.
        replacement = mapping[segment] if segment in mapping else None
        if not replacement:
            # Not an alias, or an alias with nothing to substitute: keep it.
            return [segment]
        if segment in active:
            raise ValueError(f"Cyclic alias definition involving '{segment}'")

        expanded = []
        for part in split_string(replacement):
            expanded.extend(expand(part, active | {segment}))
        return expanded

    segments = []
    for segment in split_string(input_string):
        segments.extend(expand(segment, frozenset()))
    return segments

def extract_nodes(y_train):
    """
    Collect every node of the hierarchy implied by dot-separated paths.

    For a path such as 'B.1.135' the nodes 'B', 'B.1' and 'B.1.135' are produced.

    Parameters:
    - y_train (iterable of str): Dot-separated paths.

    Returns:
    - set: Every prefix (at '.' boundaries) of every path, including the full path.
    """
    return {
        '.'.join(levels[:depth])
        for levels in (path.split('.') for path in y_train)
        for depth in range(1, len(levels) + 1)
    }

def is_descendant(node, label):
    """
    Tell whether ``label`` lies strictly below ``node`` in the dot-separated hierarchy.

    Parameters:
    - node (str): The candidate ancestor (e.g., 'B.1').
    - label (str): The label being tested (e.g., 'B.1.135').

    Returns:
    - bool: True only when label starts with node followed by a '.'; a node is
      not its own descendant, and 'B.11' is not a descendant of 'B.1'.
    """
    # Requiring the '.' right after the node rules out both the node itself
    # and labels that merely share its leading characters.
    return label.startswith(node + '.')

# def get_siblings(node, all_nodes):
#     parent = '.'.join(node.split('.')[:-1])
#     siblings = [n for n in all_nodes if n.startswith(parent) and n != node]
#     return siblings

# def get_descendants(node, all_nodes):
#     return [n for n in all_nodes if n.startswith(node) and n != node]

# def get_siblings_and_their_descendants(node, all_nodes):
#     siblings = get_siblings(node, all_nodes)
#     siblings_descendants = []
#     for sibling in siblings:
#         siblings_descendants.extend(get_descendants(sibling, all_nodes))
#     return siblings + siblings_descendants

def is_ancestor(ancestor, label):
    """
    Check if ``ancestor`` is a strict ancestor of ``label`` in the dot-separated hierarchy.

    Parameters:
    - ancestor (str): The candidate ancestor (e.g., 'B.1').
    - label (str): The label being checked (e.g., 'B.1.135').

    Returns:
    - bool: True if label starts with ancestor followed by a '.', False otherwise.
    """
    # A bare prefix test would call 'B.1' an ancestor of 'B.11'; the match has
    # to end on a '.' boundary. This also excludes the case label == ancestor.
    return label.startswith(ancestor + '.')

def exclusive_policy(node, y_train):
    """
    Exclusive policy: only samples labelled exactly ``node`` are positive.

    Parameters:
    - node (str): The node the binary labels are built for.
    - y_train (Series): Original labels.

    Returns:
    - Series: 1 where the label equals node, -1 everywhere else.
    """
    def to_binary(label):
        if label == node:
            return 1
        return -1

    return y_train.apply(to_binary)

def less_exclusive_policy(node, y_train):
    """
    Less exclusive policy: exact matches are positive, descendants are left out, the rest is negative.

    Parameters:
    - node (str): The node the binary labels are built for.
    - y_train (Series): Original dot-separated labels.

    Returns:
    - Series: 1 for labels equal to node, 0 for its descendants (to be excluded
      by the caller), -1 for everything else. Indexed like y_train.
    """
    child_prefix = node + '.'

    def is_exact(label):
        return label == node

    def is_unrelated(label):
        # Neither the node itself nor anything below it in the hierarchy.
        return label != node and not label.startswith(child_prefix)

    # Everything starts at 0; descendants are the only entries that stay there.
    result = pd.Series([0] * len(y_train), index=y_train.index)
    result[y_train.apply(is_exact)] = 1
    result[y_train.apply(is_unrelated)] = -1

    return result


def less_inclusive_policy(node, y_train):
    """
    Less inclusive policy: the node and its descendants are positive, every other sample is negative.

    Parameters:
    - node (str): The node the binary labels are built for.
    - y_train (Series): Original dot-separated labels.

    Returns:
    - Series: 1 for labels equal to node or below it, -1 for all others.
      Indexed like y_train; no sample is labelled 0.
    """
    # Membership in the node's subtree must end on a '.' boundary so that
    # 'B.11' is not mistaken for part of the 'B.1' subtree.
    in_subtree = y_train.apply(
        lambda x: x == node or x.startswith(node + '.')
    ).astype(bool)

    # Negatives are defined as "not positive". A bare prefix test would leave
    # prefix-sharing labels such as 'B.11' at 0 and drop them from training.
    labels = pd.Series([-1] * len(y_train), index=y_train.index)
    labels[in_subtree] = 1

    return labels

def inclusive_policy(node, y_train):
    """
    Inclusive policy: the node and its descendants are positive, its ancestors are left out, the rest is negative.

    Parameters:
    - node (str): The node the binary labels are built for.
    - y_train (Series): Original dot-separated labels.

    Returns:
    - Series: 1 for labels equal to node or below it, 0 for ancestors of node
      (to be excluded by the caller), -1 for everything else. Indexed like y_train.
    """
    labels = pd.Series([0] * len(y_train), index=y_train.index)

    # Positives: the node or anything below it. The '.' boundary keeps 'B.11'
    # out of the 'B.1' subtree.
    in_subtree = y_train.apply(
        lambda x: x == node or x.startswith(node + '.')
    ).astype(bool)

    # Ancestors of the node: labels that are a proper '.'-bounded prefix of it.
    # The test runs from the label to the node, not the other way around.
    above_node = y_train.apply(lambda x: node.startswith(x + '.')).astype(bool)

    labels[in_subtree] = 1
    # Negatives: everything that is neither in the subtree nor an ancestor.
    labels[~in_subtree & ~above_node] = -1

    return labels

def my_policy(node, y_train):
    """
    Label a sample positive when its path, truncated to the node's depth, equals the node.

    Parameters:
    - node (str): The node the binary labels are built for.
    - y_train (Series): Original dot-separated labels.

    Returns:
    - Series: 1 where the first len(node.split('.')) levels of the label spell
      out node, -1 otherwise.
    """
    depth = len(node.split('.'))

    def matches_node(label):
        truncated = '.'.join(label.split('.')[:depth])
        return int(truncated == node)

    # The comparison yields 1/0; the zeros are then turned into -1.
    return y_train.apply(matches_node).replace(0, -1)


# def siblings_policy(node, y_train):
#     all_nodes = y_train.unique()
#     siblings_and_their_descendants = get_siblings_and_their_descendants(node, all_nodes)

#     labels = pd.Series([0] * len(y_train), index=y_train.index)

#     # Assigning positive labels to the node or its descendants
#     labels[y_train.apply(lambda x: x == node or x.startswith(node + '.'))] = 1

#     # Assigning negative labels for sibling nodes and their descendants
#     labels[y_train.isin(siblings_and_their_descendants)] = -1

#     return labels

# def exclusive_siblings_policy(node, y_train):
#     all_nodes = y_train.unique()
#     siblings = get_siblings(node, all_nodes)

#     labels = pd.Series([0] * len(y_train), index=y_train.index)

#     # Assigning positive labels for the node
#     labels[y_train == node] = 1

#     # Assigning negative labels for sibling nodes
#     labels[y_train.isin(siblings)] = -1

#     return labels


def get_labels_by_policy(node, y_train, policy):
    """
    Wrapper function to get labels by chosen policy.

    Parameters:
    - node (str): The node the binary labels are built for.
    - y_train (Series): Original labels.
    - policy (str): One of 'exclusive', 'less_exclusive', 'less_inclusive',
      'inclusive', 'mine', or -- only while the corresponding function is
      defined -- 'siblings' / 'exclusive_siblings'.

    Returns:
    - Series: Labels produced by the selected policy function.

    Raises:
    - ValueError: If the policy is unknown, or names a policy whose
      implementation is not currently defined.
    """
    if policy in ('siblings', 'exclusive_siblings'):
        # These implementations may be disabled (commented out). Resolve them
        # lazily so a missing one gives a clear error instead of a NameError,
        # while re-enabling the function makes the policy work again.
        function_name = f'{policy}_policy'
        policy_function = globals().get(function_name)
        if policy_function is None:
            raise ValueError(
                f"Policy '{policy}' is not available: {function_name} is not defined"
            )
        return policy_function(node, y_train)

    if policy == 'exclusive':
        return exclusive_policy(node, y_train)
    elif policy == 'less_exclusive':
        return less_exclusive_policy(node, y_train)
    elif policy == 'less_inclusive':
        return less_inclusive_policy(node, y_train)
    elif policy == 'inclusive':
        return inclusive_policy(node, y_train)
    elif policy == 'mine':
        return my_policy(node, y_train)
    else:
        raise ValueError(f"Unknown policy: {policy!r}")

In [3]:
# Read the alias table used by load_data() to expand the 'Target' column.
with open('alias_key.json', 'r') as alias_file:
    mapping = json.load(alias_file)

# List-valued entries are flattened into a single "[a, b, ...]" string so that
# every value in the mapping is a plain string.
mapping = {
    alias: ('[' + ', '.join(target) + ']') if isinstance(target, list) else target
    for alias, target in mapping.items()
}

# Base names (no directory, no extension) of the available SARS feature files.
file_names = [
    os.path.splitext(os.path.basename(parquet_path))[0]
    for parquet_path in glob.glob("../../../data/features/SARS/*.parquet")
]
# print(file_names)

# Feature file used for this run.
dataset = "SARS"
file_name = "FCGR_remove_256"

X_train, y_train, X_val, y_val = load_data(file_name, dataset)

In [4]:
unique_nodes = extract_nodes(y_train)
labeling_policy = 'inclusive'

# One binary classifier per hierarchy node, keyed by node name. A node only
# gets an entry once its classifier has actually been fitted.
classifiers = {}

# Training a binary classifier for each node according to the chosen policy.
# The nodes are sorted because iterating over a set of strings is not stable
# between kernel sessions, which would shuffle the columns of predictions_df.
for node in tqdm(sorted(unique_nodes), desc="Training classifiers"):
    labels = get_labels_by_policy(node, y_train, labeling_policy)

    # Samples labelled 0 are excluded from this node's training set
    keep = labels != 0
    X_train_filtered = X_train[keep]
    y_train_filtered = labels[keep]

    # Report nodes whose training set contains a single class
    class_counts = y_train_filtered.value_counts()
    if len(class_counts) == 1:
        print(f"{node}: {class_counts}")

    if len(y_train_filtered) > 0:
        clf = RandomForestClassifier(random_state=42, n_jobs=-1)
        clf.fit(X_train_filtered, y_train_filtered)
        classifiers[node] = clf


def positive_class_proba(clf, X):
    """
    Return the predicted probability of class 1 for every row of X.

    predict_proba has one column per class seen during fit, ordered like
    clf.classes_. A classifier fitted on a single class therefore has only one
    column, so the column is looked up by class value instead of assuming
    that index 1 exists.
    """
    proba = clf.predict_proba(X)
    positive_columns = np.flatnonzero(clf.classes_ == 1)
    if positive_columns.size == 0:
        # Class 1 never appeared in training: its probability is 0 everywhere.
        return np.zeros(len(X))
    return proba[:, positive_columns[0]]


# Per-node probability of the positive class on the validation set
predictions = {node: positive_class_proba(clf, X_val) for node, clf in tqdm(classifiers.items(), desc="Predicting")}
predictions_df = pd.DataFrame(predictions)

Training classifiers: 100%|██████████| 150/150 [02:40<00:00,  1.07s/it]
Predicting: 100%|██████████| 150/150 [01:28<00:00,  1.70it/s]


In [9]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing the per-node probabilities.
os.makedirs('predictions', exist_ok=True)
predictions_df.to_csv(f'predictions/LPN_{labeling_policy}.csv')