In [2]:
import os
import json
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup

# Define the paths to the notebooks
file_paths = [
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/Gemini/compilance-asthma.ipynb",
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/Gemini/compilance-diabetes.ipynb",
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/Gemini/compilance-cholesterol.ipynb",
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/GPT/compilance-asthma.ipynb",
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/GPT/compilance-diabetes.ipynb",
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/GPT/compilance-cholesterol.ipynb"
]

def extract_table_from_notebook(file_path):
    """
    Parses a Jupyter notebook to find the 'Average compliance' table 
    and returns it as a DataFrame.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            nb_data = json.load(f)
    except FileNotFoundError:
        print(f"‚ö†Ô∏è File not found: {file_path}")
        return None

    # Determine Model and Disease from file path
    path_parts = file_path.split('/')
    model = path_parts[1]  # 'Gemini' or 'GPT'
    # Extract disease from filename (e.g., 'compilance-asthma.ipynb' -> 'Asthma')
    filename = path_parts[-1]
    disease = filename.replace('compilance-', '').replace('.ipynb', '').capitalize()

    for cell in nb_data.get('cells', []):
        if cell.get('cell_type') == 'code':
            outputs = cell.get('outputs', [])
            
            # Check if this cell contains the target output
            has_summary_text = False
            html_content = None
            
            for output in outputs:
                # Check for the specific print statement indicating the summary table
                if output.get('output_type') == 'stream':
                    text = "".join(output.get('text', []))
                    if "Average compliance" in text:
                        has_summary_text = True
                
                # Capture the HTML table data if present
                if output.get('output_type') in ['display_data', 'execute_result']:
                    data = output.get('data', {})
                    if 'text/html' in data:
                        html_content = "".join(data['text/html'])

            # If we found the text marker and have HTML content, parse it
            if has_summary_text and html_content:
                try:
                    # Parse HTML table into DataFrame
                    df = pd.read_html(StringIO(html_content))[0]
                    
                    # Normalize column names
                    df.columns = [c.replace(' ', '_').replace('(', '').replace(')', '') for c in df.columns]
                    
                    # Add metadata columns
                    df['Model'] = model
                    df['Disease'] = disease
                    
                    # Select and rename relevant columns to ensure consistency
                    # Note: Handling potential slight variations in column names across files
                    rename_map = {
                        'meal_plan': 'Plan',
                        'Avg_Strict_Compliance%': 'Strict_Compliance_%',
                        'Avg_Partial_Compliance%': 'Partial_Compliance_%'
                    }
                    df = df.rename(columns=rename_map)
                    
                    # Reorder for readability
                    cols = ['Model', 'Disease', 'Plan'] + [c for c in df.columns if c not in ['Model', 'Disease', 'Plan']]
                    return df[cols]
                    
                except Exception as e:
                    print(f"Error parsing table in {file_path}: {e}")

    print(f"‚ö†Ô∏è No summary table found in {file_path}")
    return None

# --- Main Execution ---

all_data = []

print("üîÑ Starting dynamic extraction...")

for path in file_paths:
    df_result = extract_table_from_notebook(path)
    if df_result is not None:
        all_data.append(df_result)
        print(f"‚úÖ Successfully extracted data from {path}")

if all_data:
    # Combine all extracted dataframes
    final_df = pd.concat(all_data, ignore_index=True)
    
    # Save to CSV
    output_filename = "overall_compliance_from_both_models.csv"
    final_df.to_csv(output_filename, index=False)
    
    print("\nüìä Consolidated Data:")
    print(final_df)
    print(f"\nüìÅ File saved successfully as: {output_filename}")
else:
    print("\n‚ùå No data could be extracted.")

üîÑ Starting dynamic extraction...
‚úÖ Successfully extracted data from /Users/kshitizsikriwal/Kshitiz/evaluation-2025/Gemini/compilance-asthma.ipynb
‚úÖ Successfully extracted data from /Users/kshitizsikriwal/Kshitiz/evaluation-2025/Gemini/compilance-diabetes.ipynb
‚úÖ Successfully extracted data from /Users/kshitizsikriwal/Kshitiz/evaluation-2025/Gemini/compilance-cholesterol.ipynb
‚úÖ Successfully extracted data from /Users/kshitizsikriwal/Kshitiz/evaluation-2025/GPT/compilance-asthma.ipynb
‚úÖ Successfully extracted data from /Users/kshitizsikriwal/Kshitiz/evaluation-2025/GPT/compilance-diabetes.ipynb
‚úÖ Successfully extracted data from /Users/kshitizsikriwal/Kshitiz/evaluation-2025/GPT/compilance-cholesterol.ipynb

üìä Consolidated Data:
    Model      Disease    Plan  Unnamed:_0  Strict_Compliance_%  \
0   Users       Asthma  Plan A           0            88.570000   
1   Users       Asthma  Plan B           1            81.430000   
2   Users     Diabetes  Plan A           0 

In [None]:
import json
import pandas as pd
import io
from bs4 import BeautifulSoup

# List of all notebook paths to process
file_paths = [
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/Gemini/compilance-asthma.ipynb",
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/Gemini/compilance-diabetes.ipynb",
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/Gemini/compilance-cholesterol.ipynb",
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/GPT/compilance-asthma.ipynb",
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/GPT/compilance-diabetes.ipynb",
    "/Users/kshitizsikriwal/Kshitiz/evaluation-2025/GPT/compilance-cholesterol.ipynb"
]

def extract_compliance_data(file_path):
    """
    Reads a notebook, finds the 'Average compliance' table, 
    and returns a DataFrame with Model, Disease, Plan, and Compliance scores.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            notebook = json.load(f)
    except FileNotFoundError:
        print(f"‚ö†Ô∏è File not found: {file_path}")
        return None

    # Identify Model and Disease from the file path
    # Assumes structure: evaluation-2025/ModelName/filename.ipynb
    path_parts = file_path.split('/')
    model_name = path_parts[1]  # 'Gemini' or 'GPT'
    disease_name = path_parts[-1].replace('compilance-', '').replace('.ipynb', '').capitalize()

    for cell in notebook.get('cells', []):
        # We only care about code cells with outputs
        if cell.get('cell_type') == 'code':
            outputs = cell.get('outputs', [])
            for output in outputs:
                # Check if this output contains our target HTML table
                # We look for specific column headers usually found in the HTML
                if 'data' in output and 'text/html' in output['data']:
                    html_content = "".join(output['data']['text/html'])
                    
                    # Quick check if this is the right table
                    if "Avg_Strict_Compliance" in html_content or "strict_compliance_%" in html_content:
                        try:
                            # Parse the HTML table
                            df = pd.read_html(io.StringIO(html_content))[0]
                            
                            # Normalize Column Names (remove %, (), spaces)
                            # Expected original: meal_plan, Avg_Strict_Compliance(%), Avg_Partial_Compliance(%)
                            df.columns = [
                                c.replace(' ', '_')
                                 .replace('(', '')
                                 .replace(')', '')
                                 .replace('%', '')
                                 .replace('Avg_', '') # Standardize naming
                                 .strip() 
                                for c in df.columns
                            ]
                            
                            # Standardize specific column names to a common format
                            # We want: Plan, Strict, Partial
                            rename_map = {}
                            for col in df.columns:
                                if 'meal_plan' in col.lower() or 'plan' in col.lower():
                                    rename_map[col] = 'Plan'
                                elif 'strict' in col.lower():
                                    rename_map[col] = 'Strict_Compliance'
                                elif 'partial' in col.lower():
                                    rename_map[col] = 'Partial_Compliance'
                            
                            df = df.rename(columns=rename_map)
                            
                            # Filter only necessary columns if table is larger
                            if 'Plan' in df.columns and 'Strict_Compliance' in df.columns:
                                df['Model'] = model_name
                                df['Disease'] = disease_name
                                return df[['Model', 'Disease', 'Plan', 'Strict_Compliance', 'Partial_Compliance']]
                                
                        except Exception as e:
                            pass # Continue looking if parsing fails
    return None

# --- Main Execution ---

all_dataframes = []

print("üîÑ Processing files...")
for path in file_paths:
    df = extract_compliance_data(path)
    if df is not None:
        all_dataframes.append(df)
        print(f"‚úÖ Extracted: {path}")
    else:
        print(f"‚ùå Failed to extract: {path}")

if all_dataframes:
    # 1. Combine all raw data
    full_df = pd.concat(all_dataframes, ignore_index=True)
    
    # 2. Group by Model and Plan to get the Mean (Average) across all diseases
    final_aggregation = full_df.groupby(['Model', 'Plan'])[['Strict_Compliance', 'Partial_Compliance']].mean().reset_index()
    
    # Formatting for cleaner display (round to 2 decimal places)
    final_aggregation['Strict_Compliance'] = final_aggregation['Strict_Compliance'].round(2)
    final_aggregation['Partial_Compliance'] = final_aggregation['Partial_Compliance'].round(2)

    print("\n" + "="*60)
    print("üìä FINAL AGGREGATED RESULTS (Average across Asthma, Diabetes, Cholesterol)")
    print("="*60)
    print(final_aggregation.to_string(index=False))
    print("="*60)

    # 3. Save to CSV
    output_file = "overall_compliance_summary.csv"
    final_aggregation.to_csv(output_file, index=False)
    print(f"\nüìÅ Final aggregated file saved as: {output_file}")

else:
    print("No data found to aggregate.")