In [None]:
import pandas as pd

def run_phase_c():
    """
    Execute Phase C: label conflict detection and reporting.

    Reads the module-level DataFrame ``df_clean``, groups its rows by
    ``canonical_smiles`` and flags every molecule whose ``toxicity_label``
    column holds more than one distinct value. A summary report is printed,
    followed by the observed labels of the first five conflicting molecules
    (in the key order produced by ``groupby``).

    Note:
        ``nunique()`` ignores missing values by default, so a molecule whose
        labels differ only by "labelled vs. NaN" is not counted as a conflict.

    Returns:
        None. All results are written to stdout.

    Raises:
        AssertionError: if ``df_clean`` is not defined at module level, or if
            it lacks the ``canonical_smiles`` or ``toxicity_label`` column.
    """
    print("--- Starting PHASE C: Label Conflict Detection and Reporting ---")

    # Pre-condition checks. These are raised explicitly instead of using the
    # ``assert`` statement so they still run under ``python -O``; the
    # exception type is kept as AssertionError for existing callers.
    if 'df_clean' not in globals():
        raise AssertionError("CRITICAL ERROR: df_clean not found. Run Phase B first.")
    for required_column in ('canonical_smiles', 'toxicity_label'):
        if required_column not in df_clean.columns:
            raise AssertionError(
                f"CRITICAL ERROR: '{required_column}' missing in df_clean."
            )

    print("[Analysis] Analyzing label consistency per molecule...")

    # Number of distinct labels recorded for each canonical SMILES.
    label_counts = df_clean.groupby('canonical_smiles')['toxicity_label'].nunique()

    # A molecule is conflicting when it carries more than one distinct label.
    conflicting_smiles = label_counts[label_counts > 1].index.tolist()
    num_conflicting_mols = len(conflicting_smiles)

    # Every row (not just one per molecule) that belongs to a conflicting molecule.
    conflicting_rows = df_clean[df_clean['canonical_smiles'].isin(conflicting_smiles)]
    num_conflicting_rows = len(conflicting_rows)

    total_rows = len(df_clean)
    # Guard against division by zero when df_clean is empty.
    pct_affected = (num_conflicting_rows / total_rows * 100) if total_rows > 0 else 0.0

    # Summary report.
    separator = "=" * 45
    print("\n" + separator)
    print("           LABEL CONFLICT REPORT")
    print(separator)
    print(f"Total Rows in df_clean:           {total_rows}")
    print(f"Conflicting Molecules (Unique):   {num_conflicting_mols}")
    print(f"Total Rows Affected:              {num_conflicting_rows}")
    print(f"Dataset Percentage Affected:      {pct_affected:.4f}%")
    print(separator)

    # Inspection table.
    if num_conflicting_mols > 0:
        print("\n[Inspection] First 5 Conflicting Molecules:")

        # One row per conflicting SMILES, listing every label observed for it.
        inspection_df = (
            conflicting_rows
            .groupby('canonical_smiles')['toxicity_label']
            .apply(list)
            .reset_index()
        )
        inspection_df.columns = ['canonical_smiles', 'observed_labels']

        # Only the first five molecules are displayed.
        print(inspection_df.head(5).to_string(index=False))
    else:
        print("\n[Inspection] No conflicting labels detected in df_clean.")

# --- EXECUTION ---

# Run the Phase C conflict analysis against the module-level df_clean.
run_phase_c()

# Reached only when run_phase_c() returned without raising.
print("\n--- PHASE C COMPLETED SUCCESSFULLY ---")

--- Starting PHASE C: Label Conflict Detection and Reporting ---
[Analysis] Analyzing label consistency per molecule...

           LABEL CONFLICT REPORT
Total Rows in df_clean:           1461
Conflicting Molecules (Unique):   0
Total Rows Affected:              0
Dataset Percentage Affected:      0.0000%

[Inspection] No conflicting labels detected in df_clean.

--- PHASE C COMPLETED SUCCESSFULLY ---
