# OmniMedVQA Data Cleaning for Disease Diagnosis

In [None]:
import pandas as pd

import os

from src.data import load_omnimed_dataset

In [None]:
# Load the three dataset splits and stack them into one frame with a
# fresh 0..n-1 index.
train_df, val_df, test_df = load_omnimed_dataset()
split_frames = [train_df, val_df, test_df]
full_df = pd.concat(split_frames, ignore_index=True)

# Create gt_label column by mapping gt_answer to the correct option
def add_gt_label(df):
    """Add a ``gt_label`` column naming the option column that holds the answer.

    For every row, ``gt_answer`` is compared with ``option_A`` .. ``option_D``
    after normalisation (surrounding whitespace stripped, lower-cased, trailing
    periods removed). The name of the first matching option column is stored
    in ``gt_label``; rows without a match get ``None``.

    Missing values (``None``/``NaN``) never match. Without this guard a
    missing ``gt_answer`` would be stringified and spuriously "match" the
    first missing option.

    Args:
        df: DataFrame containing ``gt_answer`` and the four option columns.

    Returns:
        The same DataFrame object, modified in place with ``gt_label`` added.
    """
    option_cols = ["option_A", "option_B", "option_C", "option_D"]

    def _normalize(value):
        # Return the comparison key for a cell, or None when it is missing.
        if pd.isna(value):
            return None
        return str(value).strip().lower().rstrip(".")

    def find_correct_option(row):
        # Normalise the answer once per row rather than once per option.
        target = _normalize(row["gt_answer"])
        if target is None:
            return None
        for col in option_cols:
            if _normalize(row[col]) == target:
                return col  # store the key: option_A, option_B, ...
        return None

    df["gt_label"] = df.apply(find_correct_option, axis=1)
    return df

# Apply after loading
full_df = add_gt_label(full_df)

# Rows whose gt_answer matched none of the options have a null gt_label.
# They cannot be looked up by label downstream, so drop them here; this also
# makes the "samples remain" count below meaningful.
n_before_mapping = len(full_df)
full_df = full_df.dropna(subset=["gt_label"]).reset_index(drop=True)
print(f"Dropped {n_before_mapping - len(full_df)} samples with no matching option")

# Example usage (skipped when nothing survived the mapping):
if not full_df.empty:
    row = full_df.iloc[0]
    print("Correct option key:", row["gt_label"])
    print("Correct answer text:", row[row["gt_label"]])

# Columns relevant for cleaning
cols_to_clean = ['gt_label', 'option_A', 'option_B', 'option_C', 'option_D']
full_df = full_df[cols_to_clean].copy()

print(full_df.head())
print(f"After mapping: {len(full_df)} samples remain")

In [None]:
# Folder for exploratory outputs
export_dir = os.path.join("data", "exploration_outputs")
os.makedirs(export_dir, exist_ok=True)

# Count how often each ground-truth answer text occurs. Rows without a
# gt_label are skipped: there is no option column to look the answer up in,
# and indexing a row with a null key would raise.
labeled_df = full_df.dropna(subset=["gt_label"])
answer_counts = labeled_df.apply(lambda r: r[r["gt_label"]], axis=1).value_counts()
answer_counts_file = os.path.join(export_dir, "gt_answer_counts.csv")
# Name the value column explicitly so the header matches the options export
# below instead of depending on the Series' name.
answer_counts.to_csv(answer_counts_file, header=["count"])
print(f"Exported gt_answer counts to {answer_counts_file}")

# Export all unique options (normalised: lower-cased, stripped, no trailing period)
option_cols = ['option_A', 'option_B', 'option_C', 'option_D']
all_options = pd.concat([full_df[col].str.lower().str.strip().str.rstrip(".") for col in option_cols])

# Count unique options
option_counts = all_options.value_counts()

# Export to CSV
option_counts_file = os.path.join(export_dir, "all_unique_options.csv")
option_counts.to_csv(option_counts_file, header=["count"])
print(f"Exported all unique options to {option_counts_file}")

In [None]:
# Show the 50 most frequent answers, then the number of distinct answers.
print(answer_counts.iloc[:50])
print(answer_counts.shape[0])

In [None]:
# Answers that occur at most 50 times.
is_rare = answer_counts.le(50)
rare_answers = answer_counts.loc[is_rare]
print(rare_answers)

### Part 1: Converting "No" answers to No Finding and "Yes" answers to specific labels

### Part 2: Removing punctuation/grammar/useless words in answer choices

### Part 3: Removing rare or problematic labels/options, combining similar labels (if any), and deciding the minimum-frequency threshold below which a label is dropped once all other cleaning is done