# Comparator Analysis: ML vs Non-ML Organizations

This notebook generates outputs for the comparator datasets:
- **ML Comparators**: Established ML organizations (positive controls)
- **Non-ML Comparators**: Non-ML organizations (negative controls)

For each comparator group, we generate:
1. Company table (CSV)
2. ML estimates plot (PNG) — limited to organizations whose lower bound (q10) is above zero
3. ML landscape plot (PNG)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle, Patch
import matplotlib.ticker as mticker
import warnings
import re
from matplotlib.lines import Line2D
from matplotlib.patches import FancyBboxPatch
from matplotlib.colors import LinearSegmentedColormap
warnings.filterwarnings('ignore')

# =============================================================================
# DESIGN SYSTEM — Academic/Scientific Style
# =============================================================================

# Preferred typeface; matplotlib falls back through the list given below.
FONT_FAMILY = 'Helvetica Neue'

# Point sizes for each text role used by the figures in this notebook.
FONT_SIZES = dict(
    title=14,
    subtitle=11,
    axis_label=11,
    tick_label=9,
    legend=9,
    annotation=8,
    org_label=7,
)

# Global matplotlib defaults, grouped by concern and applied in one update.
_RC_TYPOGRAPHY = {
    'font.family': 'sans-serif',
    'font.sans-serif': [FONT_FAMILY, 'Helvetica', 'Arial', 'DejaVu Sans'],
    'font.size': FONT_SIZES['tick_label'],
    'axes.titlesize': FONT_SIZES['title'],
    'axes.labelsize': FONT_SIZES['axis_label'],
    'xtick.labelsize': FONT_SIZES['tick_label'],
    'ytick.labelsize': FONT_SIZES['tick_label'],
    'legend.fontsize': FONT_SIZES['legend'],
    'figure.titlesize': FONT_SIZES['title'],
    'axes.titleweight': 'medium',
    'axes.labelweight': 'regular',
}
_RC_AXES_AND_GRID = {
    'axes.linewidth': 0.8,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': True,
    'grid.alpha': 0.25,
    'grid.linewidth': 0.5,
    'grid.linestyle': '-',
}
_RC_FIGURE = {
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'savefig.facecolor': 'white',
    'savefig.dpi': 300,
}

plt.rcParams.update({**_RC_TYPOGRAPHY, **_RC_AXES_AND_GRID, **_RC_FIGURE})

# =============================================================================
# COLOR PALETTE
# =============================================================================

# Base hues shared by the estimator markers.
PALETTE = dict(
    blue='#3C5488',
    red='#DC3220',
    green='#009988',
    gold='#E68613',
    purple='#7B4B94',
    gray='#868686',
)

# Semantic colors: interval source, confidence category, and general UI roles.
COLORS = dict(
    ci_pure_probit='#2C6E49',
    ci_adjusted_synthetic='#7B4B94',
    probable='#2C6E49',
    possible='#E68613',
    nonzero='#7EB5D6',
    not_detected='#868686',
    primary='#3C5488',
    secondary='#009988',
    muted='#868686',
    background='#F5F5F5',
    gridline='#E0E0E0',
)

# One row per confidence category: (display label, COLORS key, marker glyph).
_LANDSCAPE_STYLE_ROWS = (
    ("Probable", 'probable', "D"),
    ("Possible", 'possible', "s"),
    ("Non-zero", 'nonzero', "^"),
    ("Not Detected", 'not_detected', "o"),
)

LANDSCAPE_PALETTE = {label: COLORS[key] for label, key, _ in _LANDSCAPE_STYLE_ROWS}

LANDSCAPE_MARKERS = {label: glyph for label, _, glyph in _LANDSCAPE_STYLE_ROWS}

# One row per estimator column: (column, legend label, color, marker, size).
_ESTIMATOR_ROWS = (
    ('filter_broad_yes', 'Keyword: Broad Yes', PALETTE['gray'], 'o', 24),
    ('filter_strict_no', 'Keyword: Strict No', PALETTE['gray'], 'v', 24),
    ('filter_broad_yes_strict_no', 'Keyword: Broad+Strict', '#5A5A5A', 's', 24),
    ('claude_total_accepted', 'Claude (sonnet-4)', PALETTE['blue'], 'D', 36),
    ('gpt5_total_accepted', 'GPT-5-mini', PALETTE['green'], '^', 36),
    ('gemini_total_accepted', 'Gemini 2.5 Flash', PALETTE['gold'], 'P', 40),
)

ESTIMATOR_STYLES = {
    column: {'color': color, 'marker': glyph, 'size': size}
    for column, _, color, glyph, size in _ESTIMATOR_ROWS
}

ESTIMATOR_LABELS = {column: label for column, label, *_ in _ESTIMATOR_ROWS}

# =============================================================================
# CONFIGURATION
# =============================================================================

# Master switch for writing CSV/PNG artifacts; input CSVs are read from DATA_DIR.
SAVE_OUTPUTS = True
DATA_DIR = Path('..')

# Every save in this notebook targets DATA_DIR / 'output'. DataFrame.to_csv and
# Figure.savefig do not create missing parent directories, so make sure the
# folder exists before any cell tries to write into it.
if SAVE_OUTPUTS:
    (DATA_DIR / 'output').mkdir(parents=True, exist_ok=True)

print(f"Data directory: {DATA_DIR.resolve()}")
print(f"Output saving: {'enabled' if SAVE_OUTPUTS else 'disabled'}")

In [None]:
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def assign_confidence_category(q10, q50, q90):
    """
    Map an organization's quantile estimates to a confidence category.

    The first quantile (checked in the order q10, q50, q90) that is above zero
    decides the label: "Probable", "Possible" or "Non-zero" respectively.
    If any quantile is missing, or none is above zero, the result is
    "Not Detected".
    """
    quantiles = (q10, q50, q90)
    if any(pd.isna(q) for q in quantiles):
        return "Not Detected"

    for value, label in zip(quantiles, ("Probable", "Possible", "Non-zero")):
        if value > 0:
            return label
    return "Not Detected"


def format_log_axis(ax, axis='y', limits=(1, 10000)):
    """
    Switch one axis of *ax* to a log scale with plain-number decade ticks.

    Args:
        ax: Matplotlib axes to modify in place.
        axis: 'y' formats the y-axis; any other value formats the x-axis.
        limits: (low, high) axis limits; only the decades 1 to 10,000 that
            fall inside them receive a tick.
    """
    which = 'y' if axis == 'y' else 'x'

    getattr(ax, f'set_{which}scale')('log')
    getattr(ax, f'set_{which}lim')(limits)

    low, high = limits[0], limits[1]
    decade_ticks = [tick for tick in [1, 10, 100, 1000, 10000] if low <= tick <= high]
    # Thousands separators only from 1,000 upwards; smaller ticks print as-is.
    tick_text = [f'{tick:,}' if tick >= 1000 else str(tick) for tick in decade_ticks]

    getattr(ax, f'set_{which}ticks')(decade_ticks)
    getattr(ax, f'set_{which}ticklabels')(tick_text)
    # Suppress the minor ticks a log scale would otherwise draw.
    getattr(ax, f'{which}axis').set_minor_locator(mticker.NullLocator())


def format_int_iso(n):
    """
    Render *n* as an integer with a space between each group of three digits.

    Missing input (None, NaN, or a non-finite float) yields an empty string.
    Other values are truncated to an integer with int() before formatting.
    """
    if n is None:
        return ""
    if isinstance(n, float) and not np.isfinite(n):
        return ""
    if pd.isna(n):
        return ""

    # Let the format mini-language do the grouping, then swap the separator.
    return f"{int(n):,}".replace(",", " ")

In [None]:
# =============================================================================
# COMPANY TABLE FUNCTION
# =============================================================================

def _finite_or_nan(value):
    """Coerce *value* to a number; unparseable, missing or infinite values become NaN."""
    number = pd.to_numeric(value, errors='coerce')
    if pd.isna(number) or not np.isfinite(number):
        return np.nan
    return number


def create_company_table(df_input, subgroup_name=""):
    """
    Create detailed company table with ML estimates.

    For each row the pure-probit quantiles (`q10`/`q50`/`q90`) are used when
    `q50` holds a finite number; otherwise the `adjusted_synthetic_*`
    quantiles are used and the formatted estimate is suffixed with " *".
    Columns that are absent from *df_input* are treated as missing values.

    Args:
        df_input: DataFrame with organization data
        subgroup_name: Name of the subgroup for display (currently not used
            by this function; kept so existing callers keep working)

    Returns:
        DataFrame with formatted company data. The output columns are always
        present, even when *df_input* has no rows.
    """
    estimator_cols = [
        'filter_broad_yes_strict_no',
        'filter_strict_no',
        'filter_broad_yes',
        'claude_total_accepted',
        'gpt5_total_accepted',
        'gemini_total_accepted',
    ]
    # Fixed output schema, so an empty input still yields a usable table
    # (callers index 'ML Talent q50 (q10 - q90)' right after this call).
    table_columns = [
        'Company Name',
        'Founded',
        'Country',
        'Total Staff (LinkedIn)',
        'Individual Estimates [broad+strict, strict, broad, claude, gpt5, gemini]',
        'ML Talent q50 (q10 - q90)',
        'ML % of Total',
        'Category',
    ]

    rows = []
    for _, row in df_input.iterrows():
        # Extract founding year
        founded_year = ""
        if 'Founded Date' in row.index and pd.notna(row['Founded Date']):
            try:
                dt = pd.to_datetime(row['Founded Date'], errors='coerce')
                if pd.notna(dt):
                    founded_year = str(dt.year)
            except (ValueError, TypeError):
                # Best effort: an unparseable date leaves the year blank.
                pass

        # Individual estimator values, "-" where missing or not a finite number
        estimator_values = []
        for col in estimator_cols:
            val = _finite_or_nan(row.get(col, np.nan))
            estimator_values.append("-" if pd.isna(val) else str(int(val)))

        # Fall back to the synthetic estimate when there is no pure-probit q50
        q50_pure = _finite_or_nan(row.get('q50', np.nan))
        use_synthetic = pd.isna(q50_pure)
        prefix = 'adjusted_synthetic_' if use_synthetic else ''
        ml_q10 = _finite_or_nan(row.get(f'{prefix}q10', np.nan))
        ml_q50 = _finite_or_nan(row.get(f'{prefix}q50', np.nan))
        ml_q90 = _finite_or_nan(row.get(f'{prefix}q90', np.nan))

        # Format ML talent estimate with CI
        if pd.notna(ml_q50):
            ml_str = f"{format_int_iso(int(ml_q50))}"
            if pd.notna(ml_q10) and pd.notna(ml_q90):
                ml_str += f" ({format_int_iso(int(ml_q10))} - {format_int_iso(int(ml_q90))})"
            if use_synthetic:
                ml_str += " *"
        else:
            ml_str = "-"

        # Get headcount and calculate ML share
        headcount = _finite_or_nan(row.get('total_headcount', np.nan))

        if pd.notna(ml_q50) and pd.notna(headcount) and headcount > 0:
            ml_pct = 100.0 * ml_q50 / headcount
            ml_pct_str = f"{ml_pct:.2f}%"

            if pd.notna(ml_q10) and pd.notna(ml_q90):
                pct_low = 100.0 * ml_q10 / headcount
                pct_high = 100.0 * ml_q90 / headcount
                ml_pct_str += f" ({pct_low:.2f}% - {pct_high:.2f}%)"
        else:
            ml_pct_str = "-"

        # Determine confidence category
        category = assign_confidence_category(ml_q10, ml_q50, ml_q90)

        # Country: first candidate column that is present AND non-missing
        country = ''
        for country_col in ('country', 'Country', 'headquarters_location'):
            candidate = row.get(country_col)
            if candidate is not None and pd.notna(candidate):
                country = candidate
                break

        rows.append({
            'Company Name': row.get('organization_name', ''),
            'Founded': founded_year,
            'Country': country,
            'Total Staff (LinkedIn)': format_int_iso(int(headcount)) if pd.notna(headcount) else "-",
            'Individual Estimates [broad+strict, strict, broad, claude, gpt5, gemini]': f"[{', '.join(estimator_values)}]",
            'ML Talent q50 (q10 - q90)': ml_str,
            'ML % of Total': ml_pct_str,
            'Category': category,
        })

    return pd.DataFrame(rows, columns=table_columns)

print("Company table function loaded")

In [None]:
# =============================================================================
# ML ESTIMATES PLOT FUNCTION
# =============================================================================

def create_ml_estimates_plot_all_orgs(df_plot, figsize=(16, 8), title_suffix=""):
    """
    Create visualization of ML estimates, filtering to confident organizations.

    Each organization gets a central estimate with a q10-q90 error bar: the
    pure-probit quantiles (`q10`/`q50`/`q90`) when `q50` is numeric, otherwise
    the `adjusted_synthetic_*` quantiles. The individual keyword-filter and
    LLM estimates are drawn as scatter layers underneath. Quantile or
    estimator columns that are absent from *df_plot* are treated as missing.

    Args:
        df_plot: DataFrame with organization data (will be filtered to q10 > 0)
        figsize: Figure size tuple
        title_suffix: Optional suffix for plot title

    Returns:
        fig, ax, df_sorted (filtered to q10 > 0 only, sorted by central
        estimate, with helper columns `_use_pure_probit`, `_central`,
        `_lower`, `_upper` and `_sort_key` added)
    """
    filter_cols = ['filter_broad_yes', 'filter_strict_no', 'filter_broad_yes_strict_no']
    llm_cols = ['gemini_total_accepted', 'claude_total_accepted', 'gpt5_total_accepted']
    filter_fallback = {'color': PALETTE['gray'], 'marker': 'o', 'size': 24}
    llm_fallback = {'color': PALETTE['blue'], 'marker': 'D', 'size': 36}

    df_sorted = df_plot.copy()

    def _numeric(col):
        # A missing column behaves like an all-NaN column instead of raising.
        if col in df_sorted.columns:
            return pd.to_numeric(df_sorted[col], errors='coerce')
        return pd.Series(np.nan, index=df_sorted.index)

    # Determine which CI to use for each org
    q50_pure = _numeric('q50')
    has_pure = q50_pure.notna()
    df_sorted['_use_pure_probit'] = has_pure
    df_sorted['_central'] = np.where(has_pure, q50_pure, _numeric('adjusted_synthetic_q50'))
    df_sorted['_lower'] = np.where(has_pure, _numeric('q10'), _numeric('adjusted_synthetic_q10'))
    df_sorted['_upper'] = np.where(has_pure, _numeric('q90'), _numeric('adjusted_synthetic_q90'))

    # Filter to companies where q10 > 0 (CI excludes zero); NaN bounds drop out too
    df_sorted = df_sorted[df_sorted['_lower'] > 0].copy()

    # Sort by central estimate
    df_sorted['_sort_key'] = df_sorted['_central'].fillna(0)
    df_sorted = df_sorted.sort_values('_sort_key').reset_index(drop=True)

    fig, ax = plt.subplots(figsize=figsize)
    x = np.arange(len(df_sorted))

    offset_step = 0.10

    def _scatter_estimators(cols, fallback_style, **scatter_kwargs):
        # Draw one scatter series per available column; return (label, style)
        # for each series drawn so the legend mirrors exactly what was plotted.
        plotted = []
        for slot, col in enumerate(cols):
            if col not in df_sorted.columns:
                continue
            y = pd.to_numeric(df_sorted[col], errors='coerce').values
            mask = np.isfinite(y) & (y > 0)  # log axis: only positive values
            x_pos = x + (slot - 1) * offset_step  # small horizontal jitter per series

            style = ESTIMATOR_STYLES.get(col, fallback_style)
            label = ESTIMATOR_LABELS.get(col, col)
            ax.scatter(
                x_pos[mask], y[mask],
                s=style['size'], marker=style['marker'],
                c=style['color'],
                label=label,
                **scatter_kwargs
            )
            plotted.append((label, style))
        return plotted

    # Layer 1: Keyword filter estimates
    filter_entries = _scatter_estimators(
        filter_cols, filter_fallback,
        alpha=0.35, linewidths=0, zorder=1,
    )

    # Layer 2: LLM estimates
    llm_entries = _scatter_estimators(
        llm_cols, llm_fallback,
        alpha=0.85, edgecolors='white', linewidths=0.5, zorder=2,
    )

    # Layer 3: Confidence intervals
    use_pure = df_sorted['_use_pure_probit'].values
    upper = df_sorted['_upper'].values

    # Keep the values strictly positive so they can be placed on the log axis
    eps = 0.5
    lower = np.maximum(df_sorted['_lower'].values, eps)
    central = np.maximum(df_sorted['_central'].values, eps)

    yerr_lower = np.clip(central - lower, 0, None)
    yerr_upper = np.clip(upper - central, 0, None)

    mask_valid = np.isfinite(central) & (central > 0)

    ci_entries = []
    ci_sources = (
        (mask_valid & use_pure, COLORS['ci_pure_probit'], 'Pure Probit 80% CI'),
        (mask_valid & (~use_pure), COLORS['ci_adjusted_synthetic'], 'Adjusted Synthetic 80% CI'),
    )
    for source_mask, color, label in ci_sources:
        if not np.any(source_mask):
            continue
        ax.errorbar(
            x[source_mask], central[source_mask],
            yerr=np.vstack([yerr_lower[source_mask], yerr_upper[source_mask]]),
            fmt='o',
            mfc='white', mec=color, mew=1.8, ms=5,
            ecolor=color, elinewidth=1.2, capsize=2.5, capthick=1.2,
            zorder=4
        )
        ci_entries.append((label, color))

    # Axis formatting
    format_log_axis(ax, axis='y', limits=(1, 10000))

    ax.set_xlabel('Organizations (sorted by ML estimate)', fontsize=FONT_SIZES['axis_label'])
    ax.set_ylabel('Estimated ML Talent', fontsize=FONT_SIZES['axis_label'])
    ax.set_title(f'ML Talent Estimates by Organization{title_suffix}', fontsize=FONT_SIZES['title'], fontweight='medium', pad=10)

    # X-axis labels
    if 'organization_name' in df_sorted.columns:
        ax.set_xticks(x)
        ax.set_xticklabels(
            df_sorted['organization_name'].astype(str).tolist(),
            rotation=45, ha='right',
            fontsize=FONT_SIZES['org_label']
        )

    # Legend: three columns (CI / LLM / keyword), each a header plus its items
    ci_items = [
        Line2D(
            [0], [0], marker='o', color=color,
            markerfacecolor='white', markeredgecolor=color,
            markeredgewidth=1.8, markersize=6,
            linestyle='-', linewidth=1.2,
            label=f'  {label}'
        )
        for label, color in ci_entries
    ]

    llm_items = [
        Line2D(
            [0], [0], marker=style['marker'], color='w',
            markerfacecolor=style['color'], markeredgecolor='white',
            markersize=7, linestyle='None',
            label=f'  {label}'
        )
        for label, style in llm_entries
    ]

    keyword_items = [
        Line2D(
            [0], [0], marker=style['marker'], color='w',
            markerfacecolor=style['color'], markeredgecolor='none',
            markersize=5, linestyle='None', alpha=0.5,
            label=f'  {label}'
        )
        for label, style in filter_entries
    ]

    ci_header = Line2D([0], [0], color='none', label='Confidence Intervals:')
    llm_header = Line2D([0], [0], color='none', label='LLM Estimates:')
    keyword_header = Line2D([0], [0], color='none', label='Keyword Filters:')
    spacer = Line2D([0], [0], color='none', linestyle='None', label=' ')

    # Pad every group to the same length so each one fills exactly one of the
    # three legend columns.
    max_len = max(len(ci_items), len(llm_items), len(keyword_items))
    legend_elements = []
    for header, items in (
        (ci_header, ci_items),
        (llm_header, llm_items),
        (keyword_header, keyword_items),
    ):
        legend_elements.append(header)
        legend_elements.extend(items)
        legend_elements.extend([spacer] * (max_len - len(items)))

    ax.legend(
        handles=legend_elements,
        loc='upper center',
        bbox_to_anchor=(0.5, -0.35),
        ncol=3,
        fontsize=FONT_SIZES['legend'],
        frameon=False,
        columnspacing=2.5,
        handletextpad=0.5,
    )

    ax.grid(True, which='major', alpha=0.20, linewidth=0.4, color=COLORS['gridline'])
    ax.set_axisbelow(True)

    # Leave room below the axes for rotated tick labels and the legend
    plt.subplots_adjust(left=0.05, right=0.95, top=0.92, bottom=0.35)

    return fig, ax, df_sorted

print("ML estimates plot function loaded")

In [None]:
# =============================================================================
# LANDSCAPE PLOT FUNCTION
# =============================================================================

try:
    from adjustText import adjust_text
    HAS_ADJUSTTEXT = True
except ImportError:
    HAS_ADJUSTTEXT = False
    print("Note: Install adjustText for better label placement: pip install adjustText")


def create_landscape_plot(df_input, title_suffix="", x_max=100, figsize=(11, 7), max_labels=25, log_x=True, label_all=False):
    """
    Create ML talent landscape scatter plot.

    Each organization is placed at (ML share of headcount in %, ML staff count
    q50) and styled by its confidence category. Pure-probit quantiles
    (`q10`/`q50`/`q90`) are used when `q50` is numeric, otherwise the
    `adjusted_synthetic_*` quantiles; quantile columns that are absent are
    treated as missing. `organization_name` and `total_headcount` must be
    present. Rows without a usable estimate or a positive headcount are
    dropped. A one-line summary of the estimate sources is printed.

    Args:
        df_input: DataFrame with organization data
        title_suffix: Optional suffix for plot title
        x_max: Maximum x-axis value (ML share %)
        figsize: Figure size tuple
        max_labels: Maximum number of organization labels to show
        log_x: Use logarithmic x-axis (default True)
        label_all: If True, label all organizations (not just Probable)

    Returns:
        fig, ax, plot_df
    """
    def _numeric_column(name):
        # A missing column behaves like an all-NaN column instead of raising.
        if name in df_input.columns:
            return pd.to_numeric(df_input[name], errors='coerce')
        return pd.Series(np.nan, index=df_input.index)

    # Prepare data - use pure probit if available, else adjusted synthetic
    q50_pure = _numeric_column('q50')
    has_pure = q50_pure.notna()

    ml_q10 = _numeric_column('q10').where(has_pure, _numeric_column('adjusted_synthetic_q10'))
    ml_q50 = q50_pure.where(has_pure, _numeric_column('adjusted_synthetic_q50'))
    ml_q90 = _numeric_column('q90').where(has_pure, _numeric_column('adjusted_synthetic_q90'))

    plot_df = pd.DataFrame({
        'org': df_input['organization_name'].astype(str),
        'ml_n': ml_q50,
        'ml_q10': ml_q10,
        'ml_q90': ml_q90,
        'emp': pd.to_numeric(df_input['total_headcount'], errors='coerce'),
        'used_pure_probit': has_pure
    })

    plot_df['ml_pct'] = (plot_df['ml_n'] / plot_df['emp']) * 100.0
    plot_df['ml_pct'] = plot_df['ml_pct'].clip(lower=0, upper=100)

    plot_df = plot_df.replace([np.inf, -np.inf], np.nan).dropna(subset=['ml_n', 'emp', 'ml_pct'])
    plot_df = plot_df[(plot_df['ml_n'] >= 0) & (plot_df['emp'] > 0)]

    plot_df['cluster'] = [assign_confidence_category(q10, q50, q90)
                          for q10, q50, q90 in plot_df[['ml_q10', 'ml_n', 'ml_q90']].values]

    fig, ax = plt.subplots(figsize=figsize)

    marker_sizes = {
        'Probable': 80,
        'Possible': 64,
        'Non-zero': 48,
        'Not Detected': 36,
    }
    highlighted = ('Probable', 'Possible')

    # Draw the least confident categories first so the confident ones sit on top
    for cluster_name in ['Not Detected', 'Non-zero', 'Possible', 'Probable']:
        sub_df = plot_df[plot_df['cluster'] == cluster_name]
        if len(sub_df) == 0:
            continue

        is_highlight = cluster_name in highlighted

        ax.scatter(
            sub_df['ml_pct'], sub_df['ml_n'],
            s=marker_sizes[cluster_name],
            c=LANDSCAPE_PALETTE[cluster_name],
            marker=LANDSCAPE_MARKERS[cluster_name],
            alpha=0.85 if is_highlight else 0.45,
            edgecolors='white' if is_highlight else 'none',
            linewidths=0.6 if is_highlight else 0,
            label=cluster_name,
            zorder=3 if is_highlight else 2
        )

    # Labels for organizations: the max_labels largest by ML staff count
    if label_all:
        labeled_df = plot_df.copy()
    else:
        labeled_df = plot_df[plot_df['cluster'] == 'Probable'].copy()
    labeled_df = labeled_df.sort_values('ml_n', ascending=False).head(max_labels)

    if len(labeled_df) > 0:
        # Simple approach: place labels directly next to points with small offset
        texts = []
        for _, data_row in labeled_df.iterrows():
            txt = ax.annotate(
                data_row['org'],
                xy=(data_row['ml_pct'], data_row['ml_n']),
                xytext=(5, 5),  # Small offset in points
                textcoords='offset points',
                fontsize=FONT_SIZES['org_label'],
                ha='left', va='bottom',
                color='#404040',
                arrowprops=dict(
                    arrowstyle='-',
                    lw=0.4,
                    color='#909090',
                    alpha=0.5,
                ),
            )
            texts.append(txt)

        # Use adjustText if available to avoid overlaps
        if HAS_ADJUSTTEXT and len(texts) > 1:
            adjust_text(texts, ax=ax,
                       arrowprops=dict(arrowstyle='-', color='#909090', alpha=0.5, lw=0.4),
                       expand_points=(1.5, 1.5),
                       force_points=(0.5, 0.5))

    # Axis formatting
    if log_x:
        ax.set_xscale('log')
        ax.set_xlim(0.001, x_max)
        # set_xticks widens the view to include every tick it is given, so only
        # pass ticks inside the requested range; otherwise an x_max below 100
        # would be silently overridden.
        decade_ticks = (
            (0.001, '0.001%'), (0.01, '0.01%'), (0.1, '0.1%'),
            (1, '1%'), (10, '10%'), (100, '100%'),
        )
        visible_ticks = [(tick, text) for tick, text in decade_ticks if tick <= x_max]
        ax.set_xticks([tick for tick, _ in visible_ticks])
        ax.set_xticklabels([text for _, text in visible_ticks])
    else:
        ax.set_xlim(0, x_max)
    format_log_axis(ax, axis='y', limits=(1, 50000))

    ax.set_xlabel('ML Share (%)', fontsize=FONT_SIZES['axis_label'])
    ax.set_ylabel('ML Staff Count (q50)', fontsize=FONT_SIZES['axis_label'])
    ax.set_title(f'ML Talent Landscape{title_suffix}', fontsize=FONT_SIZES['title'], fontweight='medium', pad=10)

    # Legend: one proxy marker per category with its organization count
    cluster_counts = plot_df['cluster'].value_counts()

    legend_handles = []
    for cluster in ['Probable', 'Possible', 'Non-zero', 'Not Detected']:
        count = cluster_counts.get(cluster, 0)
        is_highlight = cluster in highlighted
        legend_handles.append(
            # Empty scatter on this function's own axes (not pyplot's current
            # axes) so the proxy always lands on the figure created above.
            ax.scatter([], [],
                s=marker_sizes[cluster] * 0.8,
                c=LANDSCAPE_PALETTE[cluster],
                marker=LANDSCAPE_MARKERS[cluster],
                alpha=0.85 if is_highlight else 0.45,
                edgecolors='white' if is_highlight else 'none',
                linewidths=0.6 if is_highlight else 0,
                label=f'{cluster} (n={count})'
            )
        )

    ax.legend(
        handles=legend_handles,
        loc='center left',
        bbox_to_anchor=(1.02, 0.5),
        frameon=True,
        framealpha=0.95,
        edgecolor='#E0E0E0',
        fontsize=FONT_SIZES['legend'],
    )

    ax.grid(True, which='major', alpha=0.20, linewidth=0.4, color=COLORS['gridline'])
    ax.set_axisbelow(True)

    # Reserve the right margin for the legend placed outside the axes
    plt.subplots_adjust(left=0.08, right=0.78, top=0.92, bottom=0.08)

    n_pure = plot_df['used_pure_probit'].sum()
    n_synthetic = len(plot_df) - n_pure
    print(f"Estimate source: {n_pure} pure probit, {n_synthetic} adjusted synthetic")

    return fig, ax, plot_df

print("Landscape plot function loaded")

---
## Step 1: Load Comparator Data

In [None]:
# Read both comparator result files from DATA_DIR
df_ml = pd.read_csv(DATA_DIR / 'final_results_comparator_ml.csv')
df_non_ml = pd.read_csv(DATA_DIR / 'final_results_comparator_non_ml.csv')

_loaded_groups = (("ML Comparator", df_ml), ("Non-ML Comparator", df_non_ml))

# Sizes first, then the organization names of each group
for _group_name, _group_df in _loaded_groups:
    print(f"{_group_name}s: {len(_group_df)} organizations")

for _group_name, _group_df in _loaded_groups:
    print(f"\n{_group_name} organizations:")
    print(_group_df['organization_name'].tolist())

---
## Step 2: ML Comparators Analysis

In [None]:
# Create company table for ML comparators (the positive-control group).
# NOTE: subgroup_name is accepted by create_company_table, but the function
# body never reads it, so it has no effect on the resulting table.
company_table_ml = create_company_table(df_ml, subgroup_name="ML Comparators")

# Sort by ML talent estimate
def extract_ml_for_sort(ml_str):
    """
    Return the leading integer of a formatted ML-estimate string, for sorting.

    Spaces (the digit-group separator) are stripped before matching, so
    "1 234 (1 000 - 2 000) *" yields 1234. The placeholder "-" and strings
    that do not start with a digit yield 0.
    """
    if ml_str == "-":
        return 0
    leading_number = re.match(r'([\d\s]+)', ml_str.replace(' ', ''))
    return int(leading_number.group(1).replace(' ', '')) if leading_number else 0

# Order rows by the numeric q50 parsed out of the formatted estimate column,
# largest first; the helper key is dropped again afterwards.
company_table_ml = (
    company_table_ml
    .assign(_sort_key=company_table_ml['ML Talent q50 (q10 - q90)'].apply(extract_ml_for_sort))
    .sort_values('_sort_key', ascending=False)
    .drop(columns=['_sort_key'])
)

# Save to output
if SAVE_OUTPUTS:
    output_path = DATA_DIR / 'output' / 'company_table_comparator_ml.csv'
    company_table_ml.to_csv(output_path, index=False)
    print(f"Saved company table to {output_path}")

display(company_table_ml)

In [None]:
# ML estimates plot for the ML comparators
fig_ml, ax_ml, df_ml_sorted = create_ml_estimates_plot_all_orgs(
    df_ml,
    title_suffix=" - ML Comparators (Established ML Orgs)",
    figsize=(14, 7),
)

# Summary counts. Everything the plot function filtered out is counted as
# excluded, which includes rows whose lower bound is missing.
n_total, n_plotted = len(df_ml), len(df_ml_sorted)
n_excluded = n_total - n_plotted
n_pure = df_ml_sorted['_use_pure_probit'].sum() if n_plotted > 0 else 0
n_synthetic = n_plotted - n_pure

print("Plot Summary (ML Comparators):")
for _caption, _value in (
    ("Total organizations", n_total),
    ("Plotted (q10 > 0)", n_plotted),
    ("Excluded (q10 = 0)", n_excluded),
    ("Using Pure Probit CI", n_pure),
    ("Using Adjusted Synthetic CI", n_synthetic),
):
    print(f"  {_caption}: {_value}")

if SAVE_OUTPUTS:
    _estimates_png = DATA_DIR / 'output' / 'ml_estimates_comparator_ml.png'
    fig_ml.savefig(_estimates_png, dpi=200, bbox_inches='tight')
    print(f"\nSaved: {_estimates_png}")

plt.show()
plt.close(fig_ml)

In [None]:
# Landscape plot for the ML comparators, labelling every organization
fig_landscape_ml, ax_landscape_ml, plot_ml = create_landscape_plot(
    df_ml,
    title_suffix=" - ML Comparators",
    figsize=(12, 8),
    max_labels=50,
    label_all=True,
)

# Number of plotted organizations per confidence category
_ml_cluster_counts = plot_ml['cluster'].value_counts()
print(f"\nCluster distribution (ML Comparators, N={len(plot_ml)}):")
for cluster in ('Probable', 'Possible', 'Non-zero', 'Not Detected'):
    count = _ml_cluster_counts.get(cluster, 0)
    print(f"  {cluster}: {count}")

if SAVE_OUTPUTS:
    _landscape_png = DATA_DIR / 'output' / 'ml_landscape_comparator_ml.png'
    fig_landscape_ml.savefig(_landscape_png, dpi=200, bbox_inches='tight')
    print(f"\nSaved: {_landscape_png}")

plt.show()
plt.close(fig_landscape_ml)

---
## Step 3: Non-ML Comparators Analysis

In [None]:
# Company table for the Non-ML comparators (negative controls)
company_table_non_ml = create_company_table(df_non_ml, subgroup_name="Non-ML Comparators")

# Order rows by the numeric q50 parsed out of the formatted estimate column,
# largest first; the helper key is dropped again afterwards.
company_table_non_ml = (
    company_table_non_ml
    .assign(_sort_key=company_table_non_ml['ML Talent q50 (q10 - q90)'].apply(extract_ml_for_sort))
    .sort_values('_sort_key', ascending=False)
    .drop(columns=['_sort_key'])
)

# Save to output
if SAVE_OUTPUTS:
    output_path = DATA_DIR / 'output' / 'company_table_comparator_non_ml.csv'
    company_table_non_ml.to_csv(output_path, index=False)
    print(f"Saved company table to {output_path}")

display(company_table_non_ml)

In [None]:
# ML estimates plot for the Non-ML comparators
fig_non_ml, ax_non_ml, df_non_ml_sorted = create_ml_estimates_plot_all_orgs(
    df_non_ml,
    title_suffix=" - Non-ML Comparators (Negative Controls)",
    figsize=(14, 7),
)

# Summary counts. Everything the plot function filtered out is counted as
# excluded, which includes rows whose lower bound is missing.
n_total, n_plotted = len(df_non_ml), len(df_non_ml_sorted)
n_excluded = n_total - n_plotted
n_pure = df_non_ml_sorted['_use_pure_probit'].sum() if n_plotted > 0 else 0
n_synthetic = n_plotted - n_pure

print("Plot Summary (Non-ML Comparators):")
for _caption, _value in (
    ("Total organizations", n_total),
    ("Plotted (q10 > 0)", n_plotted),
    ("Excluded (q10 = 0)", n_excluded),
    ("Using Pure Probit CI", n_pure),
    ("Using Adjusted Synthetic CI", n_synthetic),
):
    print(f"  {_caption}: {_value}")

if SAVE_OUTPUTS:
    _estimates_png = DATA_DIR / 'output' / 'ml_estimates_comparator_non_ml.png'
    fig_non_ml.savefig(_estimates_png, dpi=200, bbox_inches='tight')
    print(f"\nSaved: {_estimates_png}")

plt.show()
plt.close(fig_non_ml)

In [None]:
# Landscape plot for the Non-ML comparators, labelling every organization
fig_landscape_non_ml, ax_landscape_non_ml, plot_non_ml = create_landscape_plot(
    df_non_ml,
    title_suffix=" - Non-ML Comparators",
    figsize=(12, 8),
    max_labels=50,
    label_all=True,
)

# Number of plotted organizations per confidence category
_non_ml_cluster_counts = plot_non_ml['cluster'].value_counts()
print(f"\nCluster distribution (Non-ML Comparators, N={len(plot_non_ml)}):")
for cluster in ('Probable', 'Possible', 'Non-zero', 'Not Detected'):
    count = _non_ml_cluster_counts.get(cluster, 0)
    print(f"  {cluster}: {count}")

if SAVE_OUTPUTS:
    _landscape_png = DATA_DIR / 'output' / 'ml_landscape_comparator_non_ml.png'
    fig_landscape_non_ml.savefig(_landscape_png, dpi=200, bbox_inches='tight')
    print(f"\nSaved: {_landscape_png}")

plt.show()
plt.close(fig_landscape_non_ml)

---
## Summary

Generated outputs:

**ML Comparators (Established ML Orgs):**
- `output/company_table_comparator_ml.csv`
- `output/ml_estimates_comparator_ml.png`
- `output/ml_landscape_comparator_ml.png`

**Non-ML Comparators (Negative Controls):**
- `output/company_table_comparator_non_ml.csv`
- `output/ml_estimates_comparator_non_ml.png`
- `output/ml_landscape_comparator_non_ml.png`

In [None]:
# Final recap: row and point counts for every artifact, per comparator group
_banner = "=" * 60
print(_banner)
print("COMPARATOR ANALYSIS COMPLETE")
print(_banner)

_recap_groups = (
    ("ML Comparators", df_ml, company_table_ml, df_ml_sorted, plot_ml),
    ("Non-ML Comparators", df_non_ml, company_table_non_ml, df_non_ml_sorted, plot_non_ml),
)
for _group_name, _raw_df, _table_df, _estimates_df, _landscape_df in _recap_groups:
    print(f"\n{_group_name} ({len(_raw_df)} organizations):")
    print(f"  - Company Table: {len(_table_df)} rows")
    print(f"  - ML Estimates Plot: {len(_estimates_df)} organizations with q10 > 0")
    print(f"  - Landscape Plot: {len(_landscape_df)} valid data points")

if SAVE_OUTPUTS:
    print(f"\nAll outputs saved to: {(DATA_DIR / 'output').resolve()}")