# Human vs Model UX Assessment Comparison

This notebook compares human evaluations with model evaluations for dark patterns, focusing on UX KPI metrics and pattern assessments.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import os

# Set up visual style
plt.style.use('ggplot')
sns.set_theme(style="whitegrid")

# Create output directory
output_dir = "comparison_output"
os.makedirs(output_dir, exist_ok=True)

## Load Data

First, let's load both human evaluations and model evaluations.

In [2]:
# Load human data
human_data = pd.read_csv("Formatting Human Survey Data/raw_participant_evaluations.csv")

# Fix column name for consistency
if 'score_addictive_non-addictive' in human_data.columns:
    human_data = human_data.rename(columns={'score_addictive_non-addictive': 'score_addictive_non_addictive'})

# Add data source column
human_data['data_source'] = 'Human'

print(f"Human data shape: {human_data.shape}")
human_data.head()

Human data shape: (348, 19)


Unnamed: 0,metadata_participant_id,metadata_timestamp,metadata_pattern_type,metadata_interface_id,score_inefficient_efficient,score_interesting_not_interesting,score_clear_confusing,score_enjoyable_annoying,score_organized_cluttered,score_addictive_non_addictive,score_supportive_obstructive,score_pressuring_suggesting,score_boring_exciting,score_revealed_covert,score_complicated_easy,score_unpredictable_predictable,score_friendly_unfriendly,score_deceptive_benevolent,data_source
0,P002,2025-04-04T10:06:23,Overcomplicated Process,interface_002,6,5,3,2,6,4,6,6,4,5,5,3,3,5,Human
1,P002,2025-04-04T10:06:25,Sneaking Bad Default,interface_004,3,4,4,1,6,3,4,5,3,3,6,4,1,4,Human
2,P002,2025-04-04T10:06:29,Toying With Emotion,interface_008,2,6,7,3,4,7,5,2,6,2,6,1,3,4,Human
3,P002,2025-04-04T10:06:35,Endlessness,interface_014,5,2,5,1,6,4,3,4,7,2,5,4,2,3,Human
4,P003,2025-04-04T10:06:26,Expectation Result Mismatch,interface_005,4,4,4,4,6,4,4,4,1,4,3,4,2,4,Human


In [3]:
# Load model data (combining all model results)
model_files = [
    "model_analysis_test/results_anthropic_opus_neutral_description.csv",
    "model_analysis_test/results_ollama_gemma3.csv",
    "model_analysis_test/results_openai_gpt4-vision-neutral.csv",
    "model_analysis_test/results_qwen-vl-max.csv"
]

model_data_list = []
for file in model_files:
    try:
        # First try normal loading
        df = pd.read_csv(file)
        # Check if first row contains statistical headers
        if any(col in str(df.iloc[0]).lower() for col in ['mean', 'std', 'min', 'max']):
            print(f"File {file} has statistical headers, skipping first row")
            df = pd.read_csv(file, skiprows=1)
        model_data_list.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")

# Combine all model data
model_data = pd.concat(model_data_list, ignore_index=True)

# Fix column name for consistency
if 'score_addictive_non-addictive' in model_data.columns:
    model_data = model_data.rename(columns={'score_addictive_non-addictive': 'score_addictive_non_addictive'})

# Add data source column
model_data['data_source'] = 'AI Model'

print(f"Model data shape: {model_data.shape}")
model_data.head()

File model_analysis_test/results_qwen-vl-max.csv has statistical headers, skipping first row
Model data shape: (59, 44)


Unnamed: 0,metadata_timestamp,metadata_ai_service,metadata_model,metadata_pattern_type,metadata_interface_id,score_inefficient_efficient,score_interesting_not_interesting,score_clear_confusing,score_enjoyable_annoying,score_organized_cluttered,...,6.1,7,3.2,2.1,4.1,5.1,6.2,6.3,"The popup is perceived as inefficient and obstructive due to its repetitive nature. It is also seen as pressuring rather than suggesting, making it unfriendly and annoying. The feature is not particularly interesting or exciting, and it is somewhat confusing and complicated.",data_source
0,2025-04-01T15:35:41.758887,anthropic,claude-3-opus-20240229,Nagging,interface_001,2.0,6.0,2.0,6.0,3.0,...,,,,,,,,,,AI Model
1,2025-04-01T15:35:52.881322,anthropic,claude-3-opus-20240229,Overcomplicated Process,interface_002,2.0,6.0,6.0,6.0,7.0,...,,,,,,,,,,AI Model
2,2025-04-01T15:36:04.711931,anthropic,claude-3-opus-20240229,Hindering Account Deletion,interface_003,2.0,6.0,2.0,6.0,2.0,...,,,,,,,,,,AI Model
3,2025-04-01T15:36:16.000749,anthropic,claude-3-opus-20240229,Sneaking Bad Default,interface_004,7.0,3.0,3.0,3.0,3.0,...,,,,,,,,,,AI Model
4,2025-04-01T15:36:27.400215,anthropic,claude-3-opus-20240229,Expectation Result Mismatch,interface_005,2.0,5.0,6.0,6.0,3.0,...,,,,,,,,,,AI Model


## Calculate UX KPI

Calculate the UX KPI for both human and model assessments based on negative UX aspects.

In [4]:
def calculate_ux_kpi(df):
    """Calculate UX KPI based on negative UX aspects."""
    # Create a copy to avoid modifying the input
    result_df = df.copy()
    
    # Define UX KPI components
    ux_kpi_items = {
        'boring': 'score_boring_exciting',           # Low = boring
        'not_interesting': 'score_interesting_not_interesting',  # High = not interesting
        'complicated': 'score_complicated_easy',     # Low = complicated
        'confusing': 'score_clear_confusing',        # High = confusing
        'inefficient': 'score_inefficient_efficient', # Low = inefficient
        'cluttered': 'score_organized_cluttered',     # High = cluttered
        'unpredictable': 'score_unpredictable_predictable', # Low = unpredictable
        'obstructive': 'score_supportive_obstructive'  # High = obstructive
    }
    
    # Create columns with standardized values (higher = worse UX)
    for ux_item, column in ux_kpi_items.items():
        if column in result_df.columns:
            if ux_item in ['not_interesting', 'confusing', 'cluttered', 'obstructive']:
                # These are already oriented so high values = negative aspect
                result_df[f'ux_{ux_item}'] = result_df[column]
            else:
                # Invert these so high values = negative aspect
                result_df[f'ux_{ux_item}'] = 8 - result_df[column]  # 8 - value (1-7 scale)
    
    # Calculate UX KPI (mean of all items)
    ux_items = [f'ux_{item}' for item in ux_kpi_items.keys() if f'ux_{item}' in result_df.columns]
    
    if ux_items:
        result_df['ux_kpi'] = result_df[ux_items].mean(axis=1)
        
        # Also determine worst aspect for each row
        result_df['worst_aspect'] = result_df[ux_items].idxmax(axis=1).str.replace('ux_', '')
        result_df['worst_value'] = result_df[ux_items].max(axis=1)
    
    return result_df

# Calculate UX KPI for human data
human_data_with_kpi = calculate_ux_kpi(human_data)

# Calculate UX KPI for model data
model_data_with_kpi = calculate_ux_kpi(model_data)

# Show sample results
print("Human Data with UX KPI:")
print(human_data_with_kpi[['metadata_pattern_type', 'ux_kpi', 'worst_aspect', 'worst_value']].head())

print("\nModel Data with UX KPI:")
print(model_data_with_kpi[['metadata_pattern_type', 'metadata_ai_service', 'metadata_model', 'ux_kpi', 'worst_aspect', 'worst_value']].head())

Human Data with UX KPI:
         metadata_pattern_type  ux_kpi worst_aspect  worst_value
0      Overcomplicated Process   4.250    cluttered            6
1         Sneaking Bad Default   4.250    cluttered            6
2          Toying With Emotion   4.875    confusing            7
3                  Endlessness   3.375    cluttered            6
4  Expectation Result Mismatch   4.750       boring            7

Model Data with UX KPI:
         metadata_pattern_type metadata_ai_service          metadata_model  \
0                      Nagging           anthropic  claude-3-opus-20240229   
1      Overcomplicated Process           anthropic  claude-3-opus-20240229   
2   Hindering Account Deletion           anthropic  claude-3-opus-20240229   
3         Sneaking Bad Default           anthropic  claude-3-opus-20240229   
4  Expectation Result Mismatch           anthropic  claude-3-opus-20240229   

   ux_kpi     worst_aspect  worst_value  
0   4.250      obstructive          7.0  
1   6.00

  result_df['worst_aspect'] = result_df[ux_items].idxmax(axis=1).str.replace('ux_', '')


## Compare Human and Model Assessments

Now let's generate a comparison of the average UX KPI for each pattern type between humans and models.

In [5]:
# Calculate average UX KPI for each pattern type - Human data
human_pattern_kpi = human_data_with_kpi.groupby('metadata_pattern_type')['ux_kpi'].mean().reset_index()
human_pattern_kpi = human_pattern_kpi.rename(columns={'ux_kpi': 'human_ux_kpi'})

# Calculate average UX KPI for each pattern type and model
model_pattern_kpi = model_data_with_kpi.groupby(['metadata_pattern_type', 'metadata_ai_service', 'metadata_model'])['ux_kpi'].mean().reset_index()

# Create a pivot table for model UX KPI
model_pivot = model_pattern_kpi.pivot_table(
    index='metadata_pattern_type',
    columns=['metadata_ai_service', 'metadata_model'],
    values='ux_kpi'
)

# Merge human and model data
combined_kpi = human_pattern_kpi.merge(model_pivot, left_on='metadata_pattern_type', right_index=True, how='outer')

# Sort by human UX KPI (worst to best)
combined_kpi = combined_kpi.sort_values('human_ux_kpi', ascending=False)

# Display comparison table
print("UX KPI Comparison (Higher values = Worse UX):")
combined_kpi

MergeError: Not allowed to merge between different levels. (1 levels on the left, 2 on the right)

## Visualize the Comparison

Let's create a bar chart to compare human vs. model average UX KPI for each pattern type.

In [None]:
# Prepare data for visualization
# Melt the pivot table to get it in a format suitable for plotting
plot_data = combined_kpi.melt(
    id_vars=['metadata_pattern_type', 'human_ux_kpi'],
    var_name=['ai_service', 'model'],
    value_name='model_ux_kpi'
)

# Create a long-format dataframe for easier plotting
comparison_data = []

for idx, row in plot_data.iterrows():
    pattern = row['metadata_pattern_type']
    
    # Add human data
    comparison_data.append({
        'pattern_type': pattern,
        'source': 'Human',
        'model': 'Human',
        'ux_kpi': row['human_ux_kpi']
    })
    
    # Add model data
    if not pd.isna(row['model_ux_kpi']):
        comparison_data.append({
            'pattern_type': pattern,
            'source': row['ai_service'],
            'model': row['model'],
            'ux_kpi': row['model_ux_kpi']
        })

comparison_df = pd.DataFrame(comparison_data)

# Create bar chart
plt.figure(figsize=(16, 10))
ax = sns.barplot(x='pattern_type', y='ux_kpi', hue='source', data=comparison_df)

# Customize chart
plt.title('UX KPI Comparison: Human vs. AI Models', fontsize=16)
plt.xlabel('Pattern Type', fontsize=14)
plt.ylabel('UX KPI (Higher = Worse UX)', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Source', fontsize=12)
plt.tight_layout()

# Save figure
plt.savefig(os.path.join(output_dir, 'human_model_ux_kpi_comparison.png'))
plt.show()

## Create Gauge Visualizations

Now let's create gauge visualizations for both human and model assessments.

In [None]:
# Create gauge visualizations for human assessments
human_gauges_dir = os.path.join(output_dir, 'human_gauges')
os.makedirs(human_gauges_dir, exist_ok=True)

# Calculate average ux values, worst aspect, and UX KPI for each pattern type
human_pattern_avg = human_data_with_kpi.groupby('metadata_pattern_type').agg({
    'ux_kpi': 'mean',
    'worst_aspect': lambda x: x.mode()[0] if not x.mode().empty else None,
    'worst_value': 'mean'
}).reset_index()

# Create gauge for each pattern type
for _, row in human_pattern_avg.iterrows():
    pattern = row['metadata_pattern_type']
    worst_aspect = row['worst_aspect']
    worst_value = row['worst_value']
    ux_kpi = row['ux_kpi']
    
    # Determine color based on worst_value
    if worst_value > 5:
        text_color = "lightcoral"
    elif worst_value > 3:
        text_color = "orange"
    else:
        text_color = "lightgreen"
    
    # Create gauge visualization
    fig = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=worst_value,
        domain={'x': [0, 1], 'y': [0, 0.9]},
        delta={
            'reference': ux_kpi,
            'font': {'size': 1},
            'position': "bottom",
            'relative': False,
            'increasing': {'symbol': " "},
            'decreasing': {'symbol': " ", 'color': "white"},
            'valueformat': " "
        },
        title={
            'text': f"<span style='font-size:1em;color:gray'>{pattern} (Human)</span><br>" +
                   f"<span style='font-size:1em;color:black'>UX KPI: {ux_kpi:.2f}</span>",
            'font': {'size': 24}
        },
        number={
            'font': {'size': 80, 'color': text_color},
            'suffix': f"<br><b><span style='font-size:1.0em;color:{text_color}'>{worst_aspect}</span>",
        },
        gauge={
            'axis': {'range': [1, 7]},
            'bar': {'color': "green" if worst_value < 3 else "orange" if worst_value < 5 else "red"},
            'steps': [
                {'range': [1, 3], 'color': "lightgreen"},
                {'range': [3, 5], 'color': "lightyellow"},
                {'range': [5, 7], 'color': "lightcoral"}
            ],
            'threshold': {
                'line': {'color': "black", 'width': 4},
                'thickness': 0.75,
                'value': ux_kpi
            }
        }
    ))
    
    fig.update_layout(
        margin=dict(l=20, r=20, t=50, b=100),
        height=600
    )
    
    # Save gauge visualization
    file_path = os.path.join(human_gauges_dir, f"{pattern.replace(' ', '_')}.png")
    fig.write_image(file_path)
    
    # Display a few gauges
    if _ < 3:
        fig.show()

In [None]:
# Create gauge visualizations for model assessments (one per model for each pattern)
for service in model_data_with_kpi['metadata_ai_service'].unique():
    for model in model_data_with_kpi[model_data_with_kpi['metadata_ai_service'] == service]['metadata_model'].unique():
        # Create directory for this model
        model_gauges_dir = os.path.join(output_dir, f"{service}_{model}_gauges")
        os.makedirs(model_gauges_dir, exist_ok=True)
        
        # Filter data for this model
        model_filter = (model_data_with_kpi['metadata_ai_service'] == service) & \
                       (model_data_with_kpi['metadata_model'] == model)
        this_model_data = model_data_with_kpi[model_filter]
        
        # Calculate average ux values, worst aspect, and UX KPI for each pattern type
        model_pattern_avg = this_model_data.groupby('metadata_pattern_type').agg({
            'ux_kpi': 'mean',
            'worst_aspect': lambda x: x.mode()[0] if not x.mode().empty else None,
            'worst_value': 'mean'
        }).reset_index()
        
        # Create gauge for each pattern type
        for _, row in model_pattern_avg.iterrows():
            pattern = row['metadata_pattern_type']
            worst_aspect = row['worst_aspect']
            worst_value = row['worst_value']
            ux_kpi = row['ux_kpi']
            
            # Skip if any values are NaN
            if pd.isna(pattern) or pd.isna(worst_aspect) or pd.isna(worst_value) or pd.isna(ux_kpi):
                continue
            
            # Determine color based on worst_value
            if worst_value > 5:
                text_color = "lightcoral"
            elif worst_value > 3:
                text_color = "orange"
            else:
                text_color = "lightgreen"
            
            # Create gauge visualization
            fig = go.Figure(go.Indicator(
                mode="gauge+number+delta",
                value=worst_value,
                domain={'x': [0, 1], 'y': [0, 0.9]},
                delta={
                    'reference': ux_kpi,
                    'font': {'size': 1},
                    'position': "bottom",
                    'relative': False,
                    'increasing': {'symbol': " "},
                    'decreasing': {'symbol': " ", 'color': "white"},
                    'valueformat': " "
                },
                title={
                    'text': f"<span style='font-size:1em;color:gray'>{pattern} ({service} {model})</span><br>" +
                           f"<span style='font-size:1em;color:black'>UX KPI: {ux_kpi:.2f}</span>",
                    'font': {'size': 24}
                },
                number={
                    'font': {'size': 80, 'color': text_color},
                    'suffix': f"<br><b><span style='font-size:1.0em;color:{text_color}'>{worst_aspect}</span>",
                },
                gauge={
                    'axis': {'range': [1, 7]},
                    'bar': {'color': "green" if worst_value < 3 else "orange" if worst_value < 5 else "red"},
                    'steps': [
                        {'range': [1, 3], 'color': "lightgreen"},
                        {'range': [3, 5], 'color': "lightyellow"},
                        {'range': [5, 7], 'color': "lightcoral"}
                    ],
                    'threshold': {
                        'line': {'color': "black", 'width': 4},
                        'thickness': 0.75,
                        'value': ux_kpi
                    }
                }
            ))
            
            fig.update_layout(
                margin=dict(l=20, r=20, t=50, b=100),
                height=600
            )
            
            # Save gauge visualization
            file_path = os.path.join(model_gauges_dir, f"{pattern.replace(' ', '_')}.png")
            fig.write_image(file_path)

## Analyze Worst Aspects

Compare which aspects were rated as worst by humans vs. models for each pattern.

In [None]:
# Create a comparison table of worst aspects
worst_aspect_comparison = human_pattern_avg[['metadata_pattern_type', 'worst_aspect']].rename(
    columns={'worst_aspect': 'human_worst_aspect'}
)

# Add model worst aspects
for service in model_data_with_kpi['metadata_ai_service'].unique():
    for model in model_data_with_kpi[model_data_with_kpi['metadata_ai_service'] == service]['metadata_model'].unique():
        model_filter = (model_data_with_kpi['metadata_ai_service'] == service) & \
                       (model_data_with_kpi['metadata_model'] == model)
        this_model_data = model_data_with_kpi[model_filter]
        
        # Calculate most common worst aspect for each pattern type
        model_worst_aspects = this_model_data.groupby('metadata_pattern_type')['worst_aspect'].agg(
            lambda x: x.mode()[0] if not x.mode().empty else None
        ).reset_index()
        
        model_worst_aspects = model_worst_aspects.rename(
            columns={'worst_aspect': f'{service}_{model}_worst_aspect'}
        )
        
        # Add to comparison table
        worst_aspect_comparison = worst_aspect_comparison.merge(
            model_worst_aspects, on='metadata_pattern_type', how='outer'
        )

# Display comparison table
print("Comparison of Worst UX Aspects:")
worst_aspect_comparison

## Analyze Agreement Between Humans and Models

Calculate how closely models' assessments align with human assessments.

In [None]:
# Calculate correlation between human and model UX KPI scores
correlation_results = []

for service in model_data_with_kpi['metadata_ai_service'].unique():
    for model in model_data_with_kpi[model_data_with_kpi['metadata_ai_service'] == service]['metadata_model'].unique():
        # Get model UX KPI for each pattern
        model_filter = (model_data_with_kpi['metadata_ai_service'] == service) & \
                       (model_data_with_kpi['metadata_model'] == model)
        
        model_kpi = model_data_with_kpi[model_filter].groupby('metadata_pattern_type')['ux_kpi'].mean()
        
        # Merge with human data
        merged_data = pd.DataFrame({
            'human_ux_kpi': human_data_with_kpi.groupby('metadata_pattern_type')['ux_kpi'].mean()
        })
        merged_data[f'{service}_{model}_ux_kpi'] = model_kpi
        
        # Calculate correlation
        try:
            corr = merged_data['human_ux_kpi'].corr(merged_data[f'{service}_{model}_ux_kpi'])
            
            # Calculate mean absolute error
            mae = abs(merged_data['human_ux_kpi'] - merged_data[f'{service}_{model}_ux_kpi']).mean()
            
            # Calculate agreement on worst aspect
            human_worst = human_pattern_avg.set_index('metadata_pattern_type')['worst_aspect']
            
            model_worst = this_model_data.groupby('metadata_pattern_type')['worst_aspect'].agg(
                lambda x: x.mode()[0] if not x.mode().empty else None
            )
            
            # Count matching worst aspects
            combined = pd.DataFrame({
                'human': human_worst,
                'model': model_worst
            })
            
            matching = (combined['human'] == combined['model']).sum()
            total = len(combined.dropna())
            
            aspect_agreement = matching / total if total > 0 else 0
            
            correlation_results.append({
                'service': service,
                'model': model,
                'correlation': corr,
                'mean_abs_error': mae,
                'worst_aspect_agreement': aspect_agreement,
                'worst_aspect_matching': f"{matching}/{total}"
            })
        except Exception as e:
            print(f"Error calculating correlation for {service} {model}: {e}")

# Create dataframe and sort by correlation
correlation_df = pd.DataFrame(correlation_results).sort_values('correlation', ascending=False)

# Display results
print("Human-Model Agreement:")
correlation_df

## Create Summary Table of UX KPI By Pattern

Create a clean, easy-to-read table of UX KPI scores for each pattern, comparing humans and models.

In [None]:
# Create a clean summary table
summary_table = pd.DataFrame({
    'Pattern Type': human_pattern_avg['metadata_pattern_type'],
    'Human UX KPI': human_pattern_avg['ux_kpi'],
    'Human Worst Aspect': human_pattern_avg['worst_aspect']
})

# Add model data
for service in model_data_with_kpi['metadata_ai_service'].unique():
    for model in model_data_with_kpi[model_data_with_kpi['metadata_ai_service'] == service]['metadata_model'].unique():
        if pd.isna(model):
            continue
            
        model_filter = (model_data_with_kpi['metadata_ai_service'] == service) & \
                      (model_data_with_kpi['metadata_model'] == model)
        this_model_data = model_data_with_kpi[model_filter]
        
        model_pattern_avg = this_model_data.groupby('metadata_pattern_type').agg({
            'ux_kpi': 'mean',
            'worst_aspect': lambda x: x.mode()[0] if not x.mode().empty else None
        }).reset_index()
        
        # Skip if empty
        if len(model_pattern_avg) == 0:
            continue
            
        # Create a temporary dataframe
        temp_df = pd.DataFrame({
            'Pattern Type': model_pattern_avg['metadata_pattern_type'],
            f'{service} {model} UX KPI': model_pattern_avg['ux_kpi'],
            f'{service} {model} Worst Aspect': model_pattern_avg['worst_aspect']
        })
        
        # Merge with summary table
        summary_table = summary_table.merge(temp_df, on='Pattern Type', how='outer')

# Sort by Human UX KPI (worst to best)
summary_table = summary_table.sort_values('Human UX KPI', ascending=False)

# Display summary table
print("UX KPI Summary by Pattern Type:")
summary_table

In [None]:
# Save summary table to CSV
summary_table.to_csv(os.path.join(output_dir, 'ux_kpi_summary.csv'), index=False)
print(f"Summary saved to {os.path.join(output_dir, 'ux_kpi_summary.csv')}")

## Conclusion

This notebook has provided:
1. A clear comparison of UX KPI metrics between human and model assessments
2. Gauge visualizations for each pattern type showing worst aspects
3. Analysis of agreement between humans and different AI models
4. Summary tables that can be easily referenced for presentations or papers