In [1]:
import pandas as pd
import numpy as np
import os
import json
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict

In [2]:
def determine_and_store_classes(row):
    """
    Pick the most specific positively-predicted node for one observation.

    Parameters:
    - row (pd.Series): One row of binarised predictions, indexed by node name in
      hierarchical dot notation; a value of 1 marks a positive prediction.

    Returns:
    - Tuple[str, List[str]]: ``(most_specific_node, sorted_nodes)`` where
      ``sorted_nodes`` holds every positive node, deepest first. When no node is
      positive, the sentinel pair ``('None', 'None')`` is returned instead.
    """
    # Read the node names from the row itself instead of the notebook-level
    # `predictions_df`, so the function does not rely on hidden global state.
    positive_nodes = [col for col in row.index if row[col] == 1]

    if not positive_nodes:
        return 'None', 'None'

    # Deepest first (number of "."), ties broken by reverse lexicographic order.
    # A single positive node needs no special case: sorting yields a one-element
    # list (the former `[].append(node)` evaluated to None, not to a list).
    sorted_nodes = sorted(positive_nodes, key=lambda name: (name.count('.'), name), reverse=True)

    return sorted_nodes[0], sorted_nodes

In [3]:
# Label policy whose per-node predictions are analysed; it selects the CSV read
# below and is reused in the names of the files written by later cells.
label_policy = "less_inclusive"

# Per-node predictions; the first CSV column becomes the index.
# NOTE(review): presumably one column per hierarchy node, named in dot notation
# (e.g. "a.b.c"), as the functions in this notebook assume -- confirm.
predictions_df = pd.read_csv(f'local_per_node_predictions/LPN_{label_policy}.csv', index_col=0)
# Ground-truth labels for the same observations; the first CSV column becomes the index.
y_val = pd.read_csv('y_val.csv', index_col=0)

### Binary Structured Label Learning

In [4]:
def get_root_children(predictions_df):
    """Return the top-level nodes: column names without a '.', in column order."""
    root_nodes = []
    for name in predictions_df.columns:
        if '.' in name:
            continue  # dotted names sit below some root child
        root_nodes.append(name)
    return root_nodes

def get_children(node, predictions_df):
    """
    List the direct children of `node` among the prediction columns.

    Parameters:
    - node (str): Parent node in dot notation; a falsy value means "the root".
    - predictions_df (pd.DataFrame): Frame whose column names are the node names.

    Returns:
    - List[str]: Columns that start with ``node + '.'`` and contain exactly one
      more '.' than `node`, in column order. For a falsy `node`, the root children.
    """
    if not node:
        return get_root_children(predictions_df)

    child_prefix = node + '.'
    child_depth = node.count('.') + 1

    children = []
    for name in predictions_df.columns:
        # Same prefix AND exactly one level deeper -> direct child, not a grandchild.
        if name.startswith(child_prefix) and name.count('.') == child_depth:
            children.append(name)
    return children


def navigate_tree(predictions_df, threshold=0.5):
    """
    Walk the class hierarchy top-down for every observation.

    Starting from the root children, every node whose score exceeds `threshold`
    is accepted and its direct children are queued for the next level. The walk
    stops at the first level where no node is accepted. The prediction is the
    last node accepted on the deepest level reached.

    Parameters:
    - predictions_df (pd.DataFrame): One row per observation and one column per
      node (dot notation) holding that node's score.
    - threshold (float): Score a node must exceed to be accepted (default 0.5).

    Returns:
    - pd.DataFrame: Indexed like `predictions_df`, with a single 'prediction'
      column. Observations for which no root child is accepted get the string
      'None' (previously this case raised IndexError).
    """
    # The hierarchy depends only on the column names, so resolve it once
    # instead of re-scanning all columns for every row and every node.
    root_nodes = get_root_children(predictions_df)
    children_of = {node: get_children(node, predictions_df) for node in predictions_df.columns}

    predictions = []
    for _, row in predictions_df.iterrows():
        current_nodes = root_nodes
        last_accepted = 'None'  # sentinel when nothing is accepted at the root level

        while current_nodes:
            accepted = [node for node in current_nodes if row[node] > threshold]
            if not accepted:  # No positive classifications at the current level
                break
            last_accepted = accepted[-1]
            current_nodes = [child for node in accepted for child in children_of[node]]

        predictions.append(last_accepted)

    # Build the result in one go rather than growing it cell by cell.
    return pd.DataFrame({'prediction': predictions}, index=predictions_df.index)

In [5]:
# Top-down hierarchy walk for every validation observation.
y_pred_BSLL = navigate_tree(predictions_df)

# Overall performance; the F1 score is support-weighted across classes.
accuracy, f1 = (
    accuracy_score(y_val, y_pred_BSLL),
    f1_score(y_val, y_pred_BSLL, average='weighted'),
)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.9778103176391079
F1 Score: 0.9784948884278595


In [6]:
# to_csv does not create missing directories; make sure the target exists so the
# notebook also runs on a fresh checkout.
os.makedirs('predictions', exist_ok=True)
y_pred_BSLL.to_csv(f'predictions/LPN_{label_policy}_BSLL.csv')

### Multiplicative Probabilities

In [7]:
def get_class_path(col):
    """
    Expand a dotted class name into the chain of its ancestors, root first.

    Parameters:
    - col (str): Class name in hierarchical dot notation, e.g. "a.b.c".

    Returns:
    - List[str]: Every dot-delimited prefix of `col`, from the root node down to
      `col` itself, e.g. ["a", "a.b", "a.b.c"].
    """
    parts = col.split('.')
    # Re-join the first 1, 2, ..., len(parts) segments to name each level.
    return ['.'.join(parts[:depth]) for depth in range(1, len(parts) + 1)]

def compute_class_path_products(predictions_df):
    """
    Multiply each node's score with the scores of all its ancestors.

    Parameters:
    - predictions_df (pd.DataFrame): One column per node (dot notation); every
      ancestor named by `get_class_path` must also be a column.

    Returns:
    - pd.DataFrame: Same columns, where each column holds the row-wise product
      over the node's path from the root down to the node itself.
    """
    # Collect all per-node products first and build the frame in a single call.
    return pd.DataFrame({
        node: predictions_df[get_class_path(node)].prod(axis=1)
        for node in predictions_df.columns
    })

def select_deepest_classification(product_df, threshold=0.5):
    """
    Choose, per observation, the deepest node whose path product passes the threshold.

    Parameters:
    - product_df (pd.DataFrame): One column per node (dot notation) holding the
      product of scores along the node's path.
    - threshold (float): Minimum product a node needs to qualify (default 0.5).

    Returns:
    - pd.Series: Object-dtype Series indexed like `product_df` with the selected
      node name, or NaN where no node has a non-zero product >= `threshold`.
    """
    # Depth is the number of '.' separators. Name length alone is not a reliable
    # proxy: a long root name would outrank a short but deeper node. Length is
    # kept as the secondary key and the sort is stable, so ties behave as before.
    sorted_columns = sorted(
        product_df.columns,
        key=lambda name: (name.count('.'), len(name)),
        reverse=True,
    )
    sorted_product_df = product_df[sorted_columns]

    # A node qualifies when its product is non-zero and reaches the threshold.
    qualifies = ((sorted_product_df >= threshold) & (sorted_product_df != 0)).to_numpy()

    # Start from "no classification" (NaN) and fill in the rows that have a hit.
    final_values = np.full(len(sorted_product_df), np.nan, dtype=object)
    if qualifies.size:  # argmax is undefined on an array with zero columns
        has_hit = qualifies.any(axis=1)
        # Columns are ordered deepest first, so the first True is the deepest hit.
        first_hit = qualifies.argmax(axis=1)
        labels = np.array(sorted_columns, dtype=object)
        final_values[has_hit] = labels[first_hit[has_hit]]

    return pd.Series(final_values, index=sorted_product_df.index, dtype="object")

def apply_multiplicative_rule(predictions_df, threshold=0.5):
    """
    Classify each observation with the multiplicative-probability rule.

    Computes the path products for `predictions_df` and returns, per observation,
    the result of `select_deepest_classification` at the given `threshold`.
    """
    return select_deepest_classification(
        compute_class_path_products(predictions_df),
        threshold,
    )

In [8]:
# Candidate thresholds 0.30, 0.32, ..., 0.68 -- np.arange excludes the stop value (0.7).
thresholds = np.arange(0.3, 0.7, 0.02)
best_accuracy = 0
best_threshold = 0

# The path products do not depend on the threshold, so compute them once
# instead of on every iteration of the sweep.
path_products = compute_class_path_products(predictions_df)

# NOTE(review): the threshold is selected on the same y_val it is scored on, so
# the reported best accuracy is likely optimistic -- confirm this is intended.
print("Thresholds and their corresponding accuracies:")

for threshold in thresholds:
    y_pred_mult = select_deepest_classification(path_products, threshold)
    accuracy = accuracy_score(y_val, y_pred_mult)
    print(f"Threshold: {threshold:.2f}, Accuracy: {accuracy:.4f}")

    # Strict '>' keeps the first (lowest) threshold when accuracies tie.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

print(f"\nBest Threshold: {best_threshold:.2f} with Accuracy: {best_accuracy:.4f}")

Thresholds and their corresponding accuracies:
Threshold: 0.30, Accuracy: 0.9646
Threshold: 0.32, Accuracy: 0.9669
Threshold: 0.34, Accuracy: 0.9681
Threshold: 0.36, Accuracy: 0.9704
Threshold: 0.38, Accuracy: 0.9716
Threshold: 0.40, Accuracy: 0.9742
Threshold: 0.42, Accuracy: 0.9748
Threshold: 0.44, Accuracy: 0.9751
Threshold: 0.46, Accuracy: 0.9756
Threshold: 0.48, Accuracy: 0.9757
Threshold: 0.50, Accuracy: 0.9749
Threshold: 0.52, Accuracy: 0.9741
Threshold: 0.54, Accuracy: 0.9733
Threshold: 0.56, Accuracy: 0.9720
Threshold: 0.58, Accuracy: 0.9700
Threshold: 0.60, Accuracy: 0.9680
Threshold: 0.62, Accuracy: 0.9668
Threshold: 0.64, Accuracy: 0.9637
Threshold: 0.66, Accuracy: 0.9600
Threshold: 0.68, Accuracy: 0.9557

Best Threshold: 0.48 with Accuracy: 0.9757


In [9]:
# Re-run the multiplicative rule at the best threshold found by the sweep.
y_pred_mult = apply_multiplicative_rule(predictions_df, threshold=best_threshold)
# to_csv does not create missing directories; make sure the target exists so the
# notebook also runs on a fresh checkout.
os.makedirs('predictions', exist_ok=True)
y_pred_mult.to_csv(f'predictions/LPN_{label_policy}_mult.csv')

### Longest Positive

In [10]:
# Binarise the scores at 0.5, then resolve each row to its positive nodes.
thresholded_df = (predictions_df > 0.5).astype(int)
results = thresholded_df.apply(determine_and_store_classes, axis=1)

# Each result is a (most specific node, sorted positive nodes) pair; split it
# into the prediction and the supporting list.
y_pred_longest = results.map(lambda pair: pair[0])
sorted_nodes_list = results.map(lambda pair: pair[1])

In [11]:
# Overall performance of the longest-positive rule; the F1 score is
# support-weighted across classes.
f1_kwargs = {'average': 'weighted'}
accuracy = accuracy_score(y_val, y_pred_longest)
f1 = f1_score(y_val, y_pred_longest, **f1_kwargs)

print(f"Accuracy: {accuracy}", f"F1 Score: {f1}", sep="\n")

Accuracy: 0.977247127731471
F1 Score: 0.9778176570733454


In [12]:
# to_csv does not create missing directories; make sure the target exists so the
# notebook also runs on a fresh checkout.
os.makedirs('predictions', exist_ok=True)
y_pred_longest.to_csv(f'predictions/LPN_{label_policy}_longest.csv')

In [13]:
# y_val = y_val.reset_index(drop=True)
# y_pred_longest = y_pred_longest.reset_index(drop=True)
# sorted_nodes_list = sorted_nodes_list.reset_index(drop=True)

# mismatches = y_val != y_pred_longest

# # Filtering the instances with mismatches
# mismatched_targets = y_val[mismatches]
# mismatched_predictions = sorted_nodes_list[mismatches]

# # Ensure that mismatched_predictions is a list or a Series of lists
# if not isinstance(mismatched_predictions.iloc[0], list):
#     mismatched_predictions = mismatched_predictions.apply(eval)

# # Create the formatted output using a list comprehension
# output = [f"Target: {y}, LC: {pred}" for y, pred in zip(mismatched_targets, mismatched_predictions)]

# # Display the output
# for item in output:
#     print(item)