In [2]:
import pandas as pd
from collections import Counter
from sklearn.metrics import f1_score, accuracy_score, recall_score

# Label mappings
label_to_id = {"Negative": 0, "Neutral": 1, "Positive": 2}
id_to_label = {v: k for k, v in label_to_id.items()}


def _normalize_vote(value):
    """Return *value* as an integer class id, or ``None`` when it is missing.

    Accepts a label name (looked up in ``label_to_id``; an unknown name
    raises ``KeyError``) or a numeric id. ``None`` and NaN, which is how
    pandas represents an empty CSV cell, both count as "no vote".
    """
    if value is None:
        return None
    if isinstance(value, str):
        return label_to_id[value]
    if pd.isna(value):
        return None
    # Columns containing NaN are read as float, so ids may arrive as 1.0.
    return int(value)


def _first_value_by_key(df, key_col, value_col):
    """Map every key in ``df[key_col]`` to the ``value_col`` of its first row.

    Built once so each lookup is a dict access instead of a full-frame
    boolean-mask scan; "first row wins" matches taking ``.values[0]`` of
    the filtered frame.
    """
    first_rows = df.drop_duplicates(subset=key_col, keep="first")
    return dict(zip(first_rows[key_col], first_rows[value_col]))


def majority_vote(votes, tie_label=label_to_id["Neutral"]):
    """Return the strict-majority class id among *votes*.

    Args:
        votes: Iterable of hashable class ids (missing votes already removed).
        tie_label: Id returned when the top two classes have the same count,
            or when *votes* is empty.

    Returns:
        The winning class id, or *tie_label* when there is no clear winner.
    """
    ranked = Counter(votes).most_common(2)
    if not ranked:
        return tie_label
    if len(ranked) == 1 or ranked[0][1] > ranked[1][1]:
        return ranked[0][0]
    return tie_label


def fuse_predictions(audio_df, video_df, mlp_df):
    """Fuse per-video predictions from the three models by majority vote.

    Args:
        audio_df: Frame with ``video_id``, ``gt_label`` and
            ``majority_prediction`` columns; one output record per row.
        video_df: Frame with ``video_id`` and ``majority_label`` (label name).
        mlp_df: Frame with ``segment_id`` and ``predicted_label``.

    Returns:
        A list of dicts with the keys ``video_id``, ``final_prediction``,
        ``final_prediction_name``, ``ground_truth`` and ``ground_truth_name``.

    Raises:
        KeyError: If a label name or a ground-truth id is not in the label
            mappings.
    """
    video_votes = _first_value_by_key(video_df, "video_id", "majority_label")
    # NOTE(review): the MLP frame is keyed by "segment_id" but matched against
    # video ids, and only the first matching row is used -- confirm that the
    # segment ids really coincide with video ids.
    mlp_votes = _first_value_by_key(mlp_df, "segment_id", "predicted_label")

    fused = []
    for video_id, gt_label, audio_vote in zip(
        audio_df["video_id"],
        audio_df["gt_label"],
        audio_df["majority_prediction"],
    ):
        candidates = (
            video_votes.get(video_id),
            mlp_votes.get(video_id),
            audio_vote,
        )
        # Drop absent/NaN votes so they cannot create a spurious tie.
        votes = [v for v in map(_normalize_vote, candidates) if v is not None]
        final_pred = majority_vote(votes)

        fused.append({
            "video_id": video_id,
            "final_prediction": final_pred,
            "final_prediction_name": id_to_label[final_pred],
            "ground_truth": gt_label,
            "ground_truth_name": id_to_label[gt_label],
        })
    return fused


# A notebook cell and `python file.py` both run with __name__ == "__main__",
# so the pipeline below still executes there and keeps its variables global;
# importing the helpers above does not touch the CSV files.
if __name__ == "__main__":
    # Load CSVs
    video_majority_df = pd.read_csv("/data/home/huixian/Documents/Homeworks/535_project/late_fusion/video_level_majority_predictions.csv")
    mlp_segment_df = pd.read_csv("/data/home/huixian/Documents/Homeworks/535_project/late_fusion/mlp_segment_predictions.csv")
    audio_majority_df = pd.read_csv("/data/home/huixian/Documents/Homeworks/535_project/late_fusion/audio_majority_pred.csv")

    # Perform majority voting
    results = fuse_predictions(audio_majority_df, video_majority_df, mlp_segment_df)

    # Convert to DataFrame
    result_df = pd.DataFrame(results)

    # Extract prediction and ground truth lists
    y_true = result_df["ground_truth"]
    y_pred = result_df["final_prediction"]

    # Compute metrics
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    micro_f1 = f1_score(y_true, y_pred, average="micro")
    accuracy = accuracy_score(y_true, y_pred)
    recall_per_class = recall_score(y_true, y_pred, average=None, labels=[0, 1, 2])

    # Print results
    print("📊 Evaluation Metrics:")
    print(f"Macro-F1:  {macro_f1:.4f}")
    print(f"Micro-F1:  {micro_f1:.4f}")
    print(f"Accuracy:  {accuracy:.4f}")
    print("Recall per class:")
    print(f"  Negative (0): {recall_per_class[0]:.4f}")
    print(f"  Neutral  (1): {recall_per_class[1]:.4f}")
    print(f"  Positive (2): {recall_per_class[2]:.4f}")


📊 Evaluation Metrics:
Macro-F1:  0.5408
Micro-F1:  0.6053
Accuracy:  0.6053
Recall per class:
  Negative (0): 0.3750
  Neutral  (1): 0.6207
  Positive (2): 0.6522
