# Prompt Variations Analysis

This notebook analyzes the performance of different prompt variations for generating PR descriptions and their detectability by AI detection tools.

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import glob
import os
from pathlib import Path

sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

## Data Loading and Processing

In [37]:
# Define paths
generation_path = "../generation/datasets/"
detection_path = "../detection/datasets/prompt_variations/prompt_variations-detection.csv"

# Load detection results
print("Loading detection results...")
detection_df = pd.read_csv(detection_path)
print(f"Detection data shape: {detection_df.shape}")
print(f"Detection data columns: {detection_df.columns.tolist()}")
detection_df.head()

Loading detection results...
Detection data shape: (704, 6)
Detection data columns: ['pr_id', 'prompt_variation', 'entry_key', 'entry_type', 'input_text', 'zerogpt_response']


Unnamed: 0,pr_id,prompt_variation,entry_key,entry_type,input_text,zerogpt_response
0,PR_kwDOAQ0TF85oN6RH,P-7_Template_Plus_Title,PR_kwDOAQ0TF85oN6RH_P-7_Template_Plus_Title_or...,original,<!-- \nDescribe the changes you have made here...,"{""success"": true, ""code"": 200, ""message"": ""det..."
1,PR_kwDOAQ0TF85oN6RH,P-7_Template_Plus_Title,PR_kwDOAQ0TF85oN6RH_P-7_Template_Plus_Title_ge...,generated,Fixed a modularity issue with the HTML convert...,"{""success"": true, ""code"": 200, ""message"": ""det..."
2,MDExOlB1bGxSZXF1ZXN0NTcwMzYyODc2,P-7_Template_Plus_Title,MDExOlB1bGxSZXF1ZXN0NTcwMzYyODc2_P-7_Template_...,original,Add zbmath to the public databases which can b...,"{""success"": true, ""code"": 200, ""message"": ""det..."
3,MDExOlB1bGxSZXF1ZXN0NTcwMzYyODc2,P-7_Template_Plus_Title,MDExOlB1bGxSZXF1ZXN0NTcwMzYyODc2_P-7_Template_...,generated,Added a new fetcher for ZbMATH to JabRef to en...,"{""success"": true, ""code"": 200, ""message"": ""det..."
4,PR_kwDOAQ0TF86DGkyK,P-7_Template_Plus_Title,PR_kwDOAQ0TF86DGkyK_P-7_Template_Plus_Title_or...,original,Fixes https://github.com/JabRef/jabref/issues/...,"{""success"": true, ""code"": 200, ""message"": ""det..."


In [38]:
# Find all prompt variation CSV files
prompt_variation_files = glob.glob(os.path.join(generation_path, "prompt_variation_P-*_generated.csv"))
prompt_variation_files.sort()

print(f"Found {len(prompt_variation_files)} prompt variation files:")
for file in prompt_variation_files:
    print(f"  {os.path.basename(file)}")

Found 11 prompt variation files:
  prompt_variation_P-10_Full_Plus_One_Shot_generated.csv
  prompt_variation_P-11_Full_Plus_Few_Shot_generated.csv
  prompt_variation_P-1_Minimal_generated.csv
  prompt_variation_P-2_Basic_generated.csv
  prompt_variation_P-3_Diffs_Only_generated.csv
  prompt_variation_P-4_Diffs_Plus_Title_generated.csv
  prompt_variation_P-5_Code_Only_generated.csv
  prompt_variation_P-6_Issue_Only_generated.csv
  prompt_variation_P-7_Template_Plus_Title_generated.csv
  prompt_variation_P-8_Full_Context_generated.csv
  prompt_variation_P-9_Basic_One_Shot_generated.csv


In [39]:
# Load and combine all prompt variation data
print("\nLoading prompt variation data...")
all_generation_data = []

for file_path in prompt_variation_files:
    print(f"Loading {os.path.basename(file_path)}...")
    df = pd.read_csv(file_path)
    
    # Extract prompt variation from filename
    filename = os.path.basename(file_path)
    prompt_var = filename.split('_')[2] + '_' + filename.split('_')[3]  # e.g., P-1_Minimal
    
    # Add prompt variation if not present
    if 'prompt_variation' not in df.columns:
        df['prompt_variation'] = prompt_var
    
    print(f"  Shape: {df.shape}, Prompt variation: {df['prompt_variation'].iloc[0] if len(df) > 0 else 'N/A'}")
    all_generation_data.append(df)

# Combine all generation data
generation_df = pd.concat(all_generation_data, ignore_index=True)
print(f"\nCombined generation data shape: {generation_df.shape}")
print(f"Prompt variations found: {sorted(generation_df['prompt_variation'].unique())}")


Loading prompt variation data...
Loading prompt_variation_P-10_Full_Plus_One_Shot_generated.csv...
  Shape: (240, 26), Prompt variation: P-10_Full_Plus_One_Shot
Loading prompt_variation_P-11_Full_Plus_Few_Shot_generated.csv...
  Shape: (240, 26), Prompt variation: P-11_Full_Plus_Few_Shot
Loading prompt_variation_P-1_Minimal_generated.csv...
  Shape: (240, 26), Prompt variation: P-1_Minimal
Loading prompt_variation_P-2_Basic_generated.csv...
  Shape: (240, 26), Prompt variation: P-11_Full_Plus_Few_Shot
Loading prompt_variation_P-1_Minimal_generated.csv...
  Shape: (240, 26), Prompt variation: P-1_Minimal
Loading prompt_variation_P-2_Basic_generated.csv...
  Shape: (240, 26), Prompt variation: P-2_Basic
Loading prompt_variation_P-3_Diffs_Only_generated.csv...
  Shape: (240, 26), Prompt variation: P-3_Diffs_Only
Loading prompt_variation_P-4_Diffs_Plus_Title_generated.csv...
  Shape: (240, 26), Prompt variation: P-2_Basic
Loading prompt_variation_P-3_Diffs_Only_generated.csv...
  Shape: (

In [40]:
# Check the structure of generation data
print("Generation data columns:")
print(generation_df.columns.tolist())
print("\nGeneration data sample:")
generation_df.head()

Generation data columns:
['id', 'title', 'description', 'state', 'repository', 'pr_number', 'filename', 'status', 'additions', 'deletions', 'changes', 'sha', 'blob_url', 'raw_url', 'patch', 'file_size_bytes', 'file_content', 'pr_total_size_bytes', 'issue_titles', 'issue_bodies', 'issue_comments', 'generated_description', 'prompt_variation', 'total_input_tokens', 'total_output_tokens', 'total_tokens']

Generation data sample:


Unnamed: 0,id,title,description,state,repository,pr_number,filename,status,additions,deletions,...,file_content,pr_total_size_bytes,issue_titles,issue_bodies,issue_comments,generated_description,prompt_variation,total_input_tokens,total_output_tokens,total_tokens
0,PR_kwDOAQ0TF85oN6RH,Fix modularity issue with html converter,<!-- \nDescribe the changes you have made here...,MERGED,JabRef/jabref,10943,build.gradle,modified,4,2,...,import org.gradle.internal.os.OperatingSystem\...,28005,issue #10942: Fix: abstract field loses markdo...,"issue #10942: After PR #10896, the abstract fi...",Comment #1 by LoayGhreeb in issue #10942: Ther...,This PR fixes a modularity issue related to th...,P-10_Full_Plus_One_Shot,10278,179,10457
1,MDExOlB1bGxSZXF1ZXN0NTcwMzYyODc2,Zbmath fetcher,Add zbmath to the public databases which can b...,MERGED,JabRef/jabref,7440,CHANGELOG.md,modified,1,0,...,# Changelog\n\nAll notable changes to this pro...,97657,issue #7437: Enhance bibliographic information...,issue #7437: It is possible to enhance bibliog...,Comment #1 by Siedlerchr in issue #7437: Sound...,This PR adds support for fetching bibliographi...,P-10_Full_Plus_One_Shot,33439,662,34101
2,MDExOlB1bGxSZXF1ZXN0NTcwMzYyODc2,Zbmath fetcher,Add zbmath to the public databases which can b...,MERGED,JabRef/jabref,7440,src/main/java/org/jabref/logic/importer/EntryB...,modified,11,3,...,package org.jabref.logic.importer;\n\nimport j...,97657,issue #7437: Enhance bibliographic information...,issue #7437: It is possible to enhance bibliog...,Comment #1 by Siedlerchr in issue #7437: Sound...,This PR adds support for fetching bibliographi...,P-10_Full_Plus_One_Shot,33439,662,34101
3,MDExOlB1bGxSZXF1ZXN0NTcwMzYyODc2,Zbmath fetcher,Add zbmath to the public databases which can b...,MERGED,JabRef/jabref,7440,src/main/java/org/jabref/logic/importer/WebFet...,modified,1,0,...,package org.jabref.logic.importer;\n\nimport j...,97657,issue #7437: Enhance bibliographic information...,issue #7437: It is possible to enhance bibliog...,Comment #1 by Siedlerchr in issue #7437: Sound...,This PR adds support for fetching bibliographi...,P-10_Full_Plus_One_Shot,33439,662,34101
4,MDExOlB1bGxSZXF1ZXN0NTcwMzYyODc2,Zbmath fetcher,Add zbmath to the public databases which can b...,MERGED,JabRef/jabref,7440,src/main/java/org/jabref/logic/importer/fetche...,modified,62,9,...,package org.jabref.logic.importer.fetcher;\n\n...,97657,issue #7437: Enhance bibliographic information...,issue #7437: It is possible to enhance bibliog...,Comment #1 by Siedlerchr in issue #7437: Sound...,This PR adds support for fetching bibliographi...,P-10_Full_Plus_One_Shot,33439,662,34101


## Data Merging and Preparation

In [41]:
# Prepare generation data for merging
# Group by PR ID and prompt variation to get unique records (since multiple files per PR have same generated description)
generation_unique = generation_df.groupby(['id', 'prompt_variation']).first().reset_index()
print(f"Generation data after deduplication: {generation_unique.shape}")

# Prepare detection data for merging
print(f"\nDetection data entry types: {detection_df['entry_type'].value_counts()}")
print(f"Detection data prompt variations: {sorted(detection_df['prompt_variation'].unique())}")

Generation data after deduplication: (352, 26)

Detection data entry types: entry_type
original     352
generated    352
Name: count, dtype: int64
Detection data prompt variations: ['P-10_Full_Plus_One_Shot', 'P-11_Full_Plus_Few_Shot', 'P-1_Minimal', 'P-2_Basic', 'P-3_Diffs_Only', 'P-4_Diffs_Plus_Title', 'P-5_Code_Only', 'P-6_Issue_Only', 'P-7_Template_Plus_Title', 'P-8_Full_Context', 'P-9_Basic_One_Shot']


In [42]:
# Merge generation and detection data
# For generation data: use id as pr_id
generation_unique['pr_id'] = generation_unique['id']

# Merge on pr_id and prompt_variation
merged_df = pd.merge(
    generation_unique,
    detection_df,
    on=['pr_id', 'prompt_variation'],
    how='inner'
)

print(f"Merged data shape: {merged_df.shape}")
print(f"Merged data entry types: {merged_df['entry_type'].value_counts()}")
print(f"Merged data prompt variations: {sorted(merged_df['prompt_variation'].unique())}")

Merged data shape: (704, 31)
Merged data entry types: entry_type
original     352
generated    352
Name: count, dtype: int64
Merged data prompt variations: ['P-10_Full_Plus_One_Shot', 'P-11_Full_Plus_Few_Shot', 'P-1_Minimal', 'P-2_Basic', 'P-3_Diffs_Only', 'P-4_Diffs_Plus_Title', 'P-5_Code_Only', 'P-6_Issue_Only', 'P-7_Template_Plus_Title', 'P-8_Full_Context', 'P-9_Basic_One_Shot']


## Analysis Functions

In [43]:
def parse_zerogpt_response(response_str):
    """Parse ZeroGPT response to extract AI probability"""
    try:
        if pd.isna(response_str) or response_str == "":
            return None
        
        # Try to parse as JSON
        if isinstance(response_str, str):
            response = json.loads(response_str)
            
            # Handle nested structure - check if there's a 'data' field
            if 'data' in response:
                data = response['data']
                # Use fakePercentage if available, otherwise calculate from isHuman
                if 'fakePercentage' in data:
                    return data['fakePercentage']
                elif 'isHuman' in data:
                    return 100 - data['isHuman']  # Convert isHuman to AI percentage
                else:
                    return None
            else:
                # Direct structure
                if 'fakePercentage' in response:
                    return response['fakePercentage']
                elif 'isHuman' in response:
                    return 100 - response['isHuman']
                else:
                    return None
        else:
            # If it's already a number or can be converted
            return float(response_str)
    except Exception as e:
        print(f"Error parsing response: {e} | Response: {response_str[:100] if isinstance(response_str, str) else response_str}")
        return None

def calculate_detection_metrics(df, ai_threshold=50.0):  # Changed threshold to 50% since we're dealing with percentages
    """Calculate detection accuracy metrics including AI probability statistics"""
    # Parse AI probabilities
    df = df.copy()  # Avoid SettingWithCopyWarning
    df['ai_probability'] = df['zerogpt_response'].apply(parse_zerogpt_response)
    
    # Filter out rows where AI probability couldn't be parsed
    valid_df = df[df['ai_probability'].notna()].copy()
    
    if len(valid_df) == 0:
        return {
            'true_positive_pct': 0, 
            'false_negative_pct': 0, 
            'total_samples': len(df), 
            'valid_samples': 0,
            'mean_ai_score_generated': 0,
            'median_ai_score_generated': 0,
            'mean_ai_score_original': 0,
            'median_ai_score_original': 0
        }
    
    # Determine if detected as AI (above threshold)
    valid_df['detected_as_ai'] = valid_df['ai_probability'] > ai_threshold
    
    # Calculate metrics
    generated_entries = valid_df[valid_df['entry_type'] == 'generated']
    original_entries = valid_df[valid_df['entry_type'] == 'original']
    
    # True Positive Rate: Generated content correctly identified as AI
    tp_rate = 0
    if len(generated_entries) > 0:
        tp_rate = (generated_entries['detected_as_ai'].sum() / len(generated_entries)) * 100
    
    # False Positive Rate: Original content incorrectly identified as AI (this is what we call "false negative" in the context)
    fp_rate = 0
    if len(original_entries) > 0:
        fp_rate = (original_entries['detected_as_ai'].sum() / len(original_entries)) * 100
    
    # AI probability statistics
    mean_ai_generated = generated_entries['ai_probability'].mean() if len(generated_entries) > 0 else 0
    median_ai_generated = generated_entries['ai_probability'].median() if len(generated_entries) > 0 else 0
    mean_ai_original = original_entries['ai_probability'].mean() if len(original_entries) > 0 else 0
    median_ai_original = original_entries['ai_probability'].median() if len(original_entries) > 0 else 0
    
    return {
        'true_positive_pct': tp_rate,
        'false_negative_pct': fp_rate,  # FP on original = FN from human perspective
        'total_samples': len(df),
        'valid_samples': len(valid_df),
        'mean_ai_score_generated': mean_ai_generated,
        'median_ai_score_generated': median_ai_generated,
        'mean_ai_score_original': mean_ai_original,
        'median_ai_score_original': median_ai_original
    }

def calculate_text_metrics(text_series):
    """Calculate text-based metrics"""
    if len(text_series) == 0:
        return {'mean_length': 0, 'median_length': 0}
    
    lengths = text_series.str.len()
    return {
        'mean_length': lengths.mean(),
        'median_length': lengths.median()
    }

## Main Analysis

In [44]:
# Debug: Let's check the merged data structure
print("Debugging merged data structure:")
print(f"Merged data shape: {merged_df.shape}")
print(f"Entry types: {merged_df['entry_type'].value_counts()}")
print(f"Prompt variations: {sorted(merged_df['prompt_variation'].unique())}")

# Check if we have zerogpt_response data
print(f"\nZeroGPT response data availability:")
print(f"Non-null zerogpt_response: {merged_df['zerogpt_response'].notna().sum()}")
print(f"Empty zerogpt_response: {(merged_df['zerogpt_response'] == '').sum()}")

# Look at a sample of zerogpt responses
print("\nSample zerogpt responses:")
sample_responses = merged_df[merged_df['zerogpt_response'].notna() & (merged_df['zerogpt_response'] != '')]['zerogpt_response'].head(3)
for i, resp in enumerate(sample_responses):
    print(f"Sample {i+1}: {resp[:200]}...")

# Now re-analyze with this understanding
analysis_results = []

# Get all prompt variations plus 'Original'
prompt_variations = sorted([pv for pv in merged_df['prompt_variation'].unique() if pv.startswith('P-')])
print(f"\nAnalyzing prompt variations: {prompt_variations}")

# First, analyze 'Original' (using original descriptions from any prompt variation)
print("\nAnalyzing Original descriptions...")
original_data = merged_df[merged_df['entry_type'] == 'original']
print(f"Original data shape: {original_data.shape}")

if len(original_data) > 0:
    # Token metrics - Original doesn't have token usage, so set to 0
    original_metrics = {
        'prompt_variation': 'Original',
        'mean_prompt_tokens': 0,
        'median_prompt_tokens': 0,
        'mean_completion_tokens': 0,
        'median_completion_tokens': 0
    }
    
    # Text metrics
    text_metrics = calculate_text_metrics(original_data['input_text'])
    original_metrics.update({
        'mean_description_length': text_metrics['mean_length'],
        'median_description_length': text_metrics['median_length']
    })
    
    # Detection metrics (including new AI probability metrics)
    detection_metrics = calculate_detection_metrics(original_data)
    original_metrics.update({
        'true_positive_pct': detection_metrics['true_positive_pct'],
        'false_negative_pct': detection_metrics['false_negative_pct'],
        'mean_ai_score_generated': detection_metrics['mean_ai_score_generated'],
        'median_ai_score_generated': detection_metrics['median_ai_score_generated'],
        'mean_ai_score_original': detection_metrics['mean_ai_score_original'],
        'median_ai_score_original': detection_metrics['median_ai_score_original'],
        'total_samples': detection_metrics['total_samples'],
        'valid_samples': detection_metrics['valid_samples']
    })
    
    analysis_results.append(original_metrics)
    print(f"  Total samples: {detection_metrics['total_samples']}")
    print(f"  Valid samples: {detection_metrics['valid_samples']}")
    print(f"  Mean description length: {text_metrics['mean_length']:.1f}")
    print(f"  False negative rate: {detection_metrics['false_negative_pct']:.1f}%")
    print(f"  Mean AI score (original): {detection_metrics['mean_ai_score_original']:.1f}%")
    print(f"  Median AI score (original): {detection_metrics['median_ai_score_original']:.1f}%")

Debugging merged data structure:
Merged data shape: (704, 31)
Entry types: entry_type
original     352
generated    352
Name: count, dtype: int64
Prompt variations: ['P-10_Full_Plus_One_Shot', 'P-11_Full_Plus_Few_Shot', 'P-1_Minimal', 'P-2_Basic', 'P-3_Diffs_Only', 'P-4_Diffs_Plus_Title', 'P-5_Code_Only', 'P-6_Issue_Only', 'P-7_Template_Plus_Title', 'P-8_Full_Context', 'P-9_Basic_One_Shot']

ZeroGPT response data availability:
Non-null zerogpt_response: 704
Empty zerogpt_response: 0

Sample zerogpt responses:
Sample 1: {"success": true, "code": 200, "message": "detection result passed to proxy", "data": {"sentences": [], "isHuman": 100, "additional_feedback": "", "h": [], "hi": [], "textWords": 77, "aiWords": 0, "fa...
Sample 2: {"success": true, "code": 200, "message": "detection result passed to proxy", "data": {"sentences": [], "isHuman": 100, "additional_feedback": "", "h": [], "hi": [], "textWords": 87, "aiWords": 0, "fa...
Sample 3: {"success": true, "code": 200, "message": "dete

In [45]:
# Now analyze each prompt variation
for pv in prompt_variations:
    print(f"\nAnalyzing {pv}...")
    
    # Get data for this prompt variation
    pv_data = merged_df[merged_df['prompt_variation'] == pv]
    
    if len(pv_data) == 0:
        print(f"  No data found for {pv}")
        continue
    
    # Get generation data for token metrics (only generated entries have token info)
    pv_generated = pv_data[pv_data['entry_type'] == 'generated']
    
    pv_metrics = {'prompt_variation': pv}
    
    # Token metrics
    if len(pv_generated) > 0 and 'total_input_tokens' in pv_generated.columns:
        pv_metrics.update({
            'mean_prompt_tokens': pv_generated['total_input_tokens'].mean(),
            'median_prompt_tokens': pv_generated['total_input_tokens'].median(),
            'mean_completion_tokens': pv_generated['total_output_tokens'].mean(),
            'median_completion_tokens': pv_generated['total_output_tokens'].median()
        })
    else:
        pv_metrics.update({
            'mean_prompt_tokens': 0,
            'median_prompt_tokens': 0,
            'mean_completion_tokens': 0,
            'median_completion_tokens': 0
        })
    
    # Text metrics (using generated descriptions)
    if len(pv_generated) > 0:
        text_metrics = calculate_text_metrics(pv_generated['input_text'])
        pv_metrics.update({
            'mean_description_length': text_metrics['mean_length'],
            'median_description_length': text_metrics['median_length']
        })
    else:
        pv_metrics.update({
            'mean_description_length': 0,
            'median_description_length': 0
        })
    
    # Detection metrics (using both original and generated, including new AI probability metrics)
    detection_metrics = calculate_detection_metrics(pv_data)
    pv_metrics.update({
        'true_positive_pct': detection_metrics['true_positive_pct'],
        'false_negative_pct': detection_metrics['false_negative_pct'],
        'mean_ai_score_generated': detection_metrics['mean_ai_score_generated'],
        'median_ai_score_generated': detection_metrics['median_ai_score_generated'],
        'mean_ai_score_original': detection_metrics['mean_ai_score_original'],
        'median_ai_score_original': detection_metrics['median_ai_score_original'],
        'total_samples': detection_metrics['total_samples']
    })
    
    analysis_results.append(pv_metrics)
    
    print(f"  Samples: {detection_metrics['total_samples']} (Generated: {len(pv_generated)})")
    print(f"  Mean prompt tokens: {pv_metrics['mean_prompt_tokens']:.0f}")
    print(f"  Mean completion tokens: {pv_metrics['mean_completion_tokens']:.0f}")
    print(f"  Mean description length: {pv_metrics['mean_description_length']:.1f}")
    print(f"  True positive rate: {detection_metrics['true_positive_pct']:.1f}%")
    print(f"  False negative rate: {detection_metrics['false_negative_pct']:.1f}%")
    print(f"  Mean AI score (generated): {detection_metrics['mean_ai_score_generated']:.1f}%")
    print(f"  Mean AI score (original): {detection_metrics['mean_ai_score_original']:.1f}%")


Analyzing P-10_Full_Plus_One_Shot...
  Samples: 64 (Generated: 32)
  Mean prompt tokens: 64351
  Mean completion tokens: 557
  Mean description length: 1112.8
  True positive rate: 28.1%
  False negative rate: 0.0%
  Mean AI score (generated): 27.0%
  Mean AI score (original): 13.5%

Analyzing P-11_Full_Plus_Few_Shot...
  Samples: 64 (Generated: 32)
  Mean prompt tokens: 74742
  Mean completion tokens: 451
  Mean description length: 922.2
  True positive rate: 15.6%
  False negative rate: 0.0%
  Mean AI score (generated): 22.5%
  Mean AI score (original): 13.5%

Analyzing P-1_Minimal...
  Samples: 64 (Generated: 32)
  Mean prompt tokens: 632
  Mean completion tokens: 190
  Mean description length: 500.2
  True positive rate: 3.1%
  False negative rate: 0.0%
  Mean AI score (generated): 4.7%
  Mean AI score (original): 13.5%

Analyzing P-2_Basic...
  Samples: 64 (Generated: 32)
  Mean prompt tokens: 681
  Mean completion tokens: 212
  Mean description length: 511.4
  True positive rate

## Results Summary Table

In [46]:
# Create results dataframe
results_df = pd.DataFrame(analysis_results)

# Reorder columns as requested, including new AI probability metrics
column_order = [
    'prompt_variation',
    'mean_prompt_tokens',
    'median_prompt_tokens', 
    'mean_completion_tokens',
    'median_completion_tokens',
    'mean_description_length',
    'median_description_length',
    'true_positive_pct',
    'false_negative_pct',
    'mean_ai_score_generated',
    'median_ai_score_generated',
    'mean_ai_score_original',
    'median_ai_score_original',
    'total_samples'
]

results_df = results_df[column_order]

# Round numeric columns for better display
numeric_cols = [col for col in results_df.columns if col != 'prompt_variation']
results_df[numeric_cols] = results_df[numeric_cols].round(1)

print("\n=== PROMPT VARIATIONS ANALYSIS RESULTS ===")
print("\nTarget Output Table with AI Probability Scores:")
print("=" * 160)

# Display the table
results_df


=== PROMPT VARIATIONS ANALYSIS RESULTS ===

Target Output Table with AI Probability Scores:


Unnamed: 0,prompt_variation,mean_prompt_tokens,median_prompt_tokens,mean_completion_tokens,median_completion_tokens,mean_description_length,median_description_length,true_positive_pct,false_negative_pct,mean_ai_score_generated,median_ai_score_generated,mean_ai_score_original,median_ai_score_original,total_samples
0,Original,0.0,0.0,0.0,0.0,1184.0,1187.5,0.0,0.0,0.0,0.0,13.5,9.9,352
1,P-10_Full_Plus_One_Shot,64351.4,28815.5,557.1,303.5,1112.8,983.0,28.1,0.0,27.0,16.8,13.5,9.9,64
2,P-11_Full_Plus_Few_Shot,74741.9,35300.0,451.0,246.5,922.2,761.5,15.6,0.0,22.5,18.1,13.5,9.9,64
3,P-1_Minimal,632.5,365.0,189.8,85.5,500.2,373.5,3.1,0.0,4.7,0.0,13.5,9.9,64
4,P-2_Basic,681.2,412.5,212.5,117.5,511.4,396.0,6.2,0.0,9.3,0.0,13.5,9.9,64
5,P-3_Diffs_Only,5574.5,2129.0,288.8,155.0,712.5,480.5,6.2,0.0,6.8,0.0,13.5,9.9,64
6,P-4_Diffs_Plus_Title,5619.8,2143.5,303.1,167.0,698.6,513.0,3.1,0.0,9.9,0.0,13.5,9.9,64
7,P-5_Code_Only,56585.1,24574.5,348.1,174.5,806.5,658.5,12.5,0.0,14.4,0.0,13.5,9.9,64
8,P-6_Issue_Only,2109.8,882.5,345.1,181.5,669.0,575.0,12.5,0.0,17.4,0.0,13.5,9.9,64
9,P-7_Template_Plus_Title,872.8,525.0,326.9,175.5,725.7,604.0,3.1,0.0,13.2,0.0,13.5,9.9,64


In [47]:
# Save results to CSV
output_file = "prompt_variations_analysis_results.csv"
results_df.to_csv(output_file, index=False)
print(f"\nResults saved to: {output_file}")

# Display summary statistics
print("\n=== SUMMARY STATISTICS ===")
print(f"Total prompt variations analyzed: {len(results_df)}")
print(f"Best performing prompt (highest true positive rate): {results_df.loc[results_df['true_positive_pct'].idxmax(), 'prompt_variation']}")
print(f"Most efficient prompt (lowest mean prompt tokens): {results_df[results_df['prompt_variation'] != 'Original'].loc[results_df[results_df['prompt_variation'] != 'Original']['mean_prompt_tokens'].idxmin(), 'prompt_variation']}")
print(f"Longest descriptions (highest mean length): {results_df.loc[results_df['mean_description_length'].idxmax(), 'prompt_variation']}")


Results saved to: prompt_variations_analysis_results.csv

=== SUMMARY STATISTICS ===
Total prompt variations analyzed: 12
Best performing prompt (highest true positive rate): P-10_Full_Plus_One_Shot
Most efficient prompt (lowest mean prompt tokens): P-1_Minimal
Longest descriptions (highest mean length): Original


## Additional Textual Analysis Features

Now let's add some additional textual analysis features similar to the jabref-prs-comparison notebook.

In [48]:
def extract_zerogpt_text_metrics(response_str):
    """Extract textual metrics from ZeroGPT response"""
    try:
        if pd.isna(response_str) or response_str == "":
            return {'textWords': None, 'aiWords': None, 'word_count': None}
        
        response = json.loads(response_str)
        
        if 'data' in response:
            data = response['data']
            return {
                'textWords': data.get('textWords', None),
                'aiWords': data.get('aiWords', None),
                'word_count': data.get('textWords', None)  # Same as textWords
            }
        else:
            return {'textWords': None, 'aiWords': None, 'word_count': None}
    except:
        return {'textWords': None, 'aiWords': None, 'word_count': None}

def calculate_extended_text_metrics(df):
    """Calculate extended textual metrics"""
    if len(df) == 0:
        return {}
    
    # Basic length metrics
    lengths = df['input_text'].str.len()
    
    # Extract ZeroGPT word counts
    zerogpt_metrics = df['zerogpt_response'].apply(extract_zerogpt_text_metrics)
    word_counts = pd.DataFrame(zerogpt_metrics.tolist())['textWords'].dropna()
    
    # Calculate sentences (approximate by counting periods, exclamation marks, question marks)
    sentence_counts = df['input_text'].str.count(r'[.!?]+')
    
    # Calculate newlines (as a proxy for paragraph structure)
    newline_counts = df['input_text'].str.count(r'\\n')
    
    return {
        'mean_char_length': lengths.mean(),
        'median_char_length': lengths.median(),
        'mean_word_count': word_counts.mean() if len(word_counts) > 0 else 0,
        'median_word_count': word_counts.median() if len(word_counts) > 0 else 0,
        'mean_sentence_count': sentence_counts.mean(),
        'median_sentence_count': sentence_counts.median(),
        'mean_newline_count': newline_counts.mean(),
        'median_newline_count': newline_counts.median()
    }

print("Extended textual analysis functions defined.")

Extended textual analysis functions defined.


In [49]:
# Extended textual analysis for each prompt variation
print("=== EXTENDED TEXTUAL ANALYSIS ===\\n")

extended_analysis_results = []

# Analyze Original
original_data = merged_df[merged_df['entry_type'] == 'original']
if len(original_data) > 0:
    ext_metrics = calculate_extended_text_metrics(original_data)
    ext_metrics['prompt_variation'] = 'Original'
    extended_analysis_results.append(ext_metrics)
    print(f"Original - Samples: {len(original_data)}")
    print(f"  Mean words: {ext_metrics['mean_word_count']:.1f}")
    print(f"  Mean sentences: {ext_metrics['mean_sentence_count']:.1f}")
    print(f"  Mean newlines: {ext_metrics['mean_newline_count']:.1f}\\n")

# Analyze each prompt variation
for pv in prompt_variations:
    pv_data = merged_df[merged_df['prompt_variation'] == pv]
    pv_generated = pv_data[pv_data['entry_type'] == 'generated']
    
    if len(pv_generated) > 0:
        ext_metrics = calculate_extended_text_metrics(pv_generated)
        ext_metrics['prompt_variation'] = pv
        extended_analysis_results.append(ext_metrics)
        print(f"{pv} - Samples: {len(pv_generated)}")
        print(f"  Mean words: {ext_metrics['mean_word_count']:.1f}")
        print(f"  Mean sentences: {ext_metrics['mean_sentence_count']:.1f}")
        print(f"  Mean newlines: {ext_metrics['mean_newline_count']:.1f}\\n")

=== EXTENDED TEXTUAL ANALYSIS ===\n
Original - Samples: 352
  Mean words: 160.7
  Mean sentences: 19.8
  Mean newlines: 0.0\n
P-10_Full_Plus_One_Shot - Samples: 32
  Mean words: 159.9
  Mean sentences: 9.8
  Mean newlines: 0.0\n
P-11_Full_Plus_Few_Shot - Samples: 32
  Mean words: 132.0
  Mean sentences: 8.2
  Mean newlines: 0.0\n
P-1_Minimal - Samples: 32
  Mean words: 62.0
  Mean sentences: 6.2
  Mean newlines: 0.0\n
P-2_Basic - Samples: 32
  Mean words: 67.8
  Mean sentences: 5.3
  Mean newlines: 0.0\n
P-3_Diffs_Only - Samples: 32
  Mean words: 93.9
  Mean sentences: 6.7
  Mean newlines: 0.0\n
P-4_Diffs_Plus_Title - Samples: 32
  Mean words: 94.4
  Mean sentences: 6.5
  Mean newlines: 0.0\n
P-5_Code_Only - Samples: 32
  Mean words: 109.9
  Mean sentences: 6.9
  Mean newlines: 0.0\n
P-6_Issue_Only - Samples: 32
  Mean words: 94.2
  Mean sentences: 6.4
  Mean newlines: 0.0\n
P-7_Template_Plus_Title - Samples: 32
  Mean words: 98.7
  Mean sentences: 8.0
  Mean newlines: 0.0\n
P-8_Full_C

In [50]:
# Create extended results DataFrame
extended_results_df = pd.DataFrame(extended_analysis_results)

# Merge with original results
final_results = pd.merge(results_df, extended_results_df, on='prompt_variation', how='left')

# Reorder columns for the final comprehensive table
final_column_order = [
    'prompt_variation',
    'mean_prompt_tokens',
    'median_prompt_tokens', 
    'mean_completion_tokens',
    'median_completion_tokens',
    'mean_description_length',
    'median_description_length',
    'mean_word_count',
    'median_word_count',
    'mean_sentence_count',
    'median_sentence_count',
    'true_positive_pct',
    'false_negative_pct',
    'mean_ai_score_generated',
    'median_ai_score_generated',
    'mean_ai_score_original',
    'median_ai_score_original',
    'total_samples'
]

final_results = final_results[final_column_order]

# Round numeric columns for better display
numeric_cols = [col for col in final_results.columns if col != 'prompt_variation']
final_results[numeric_cols] = final_results[numeric_cols].round(1)

print("\n=== COMPREHENSIVE ANALYSIS RESULTS ===\n")
print("Final Table with Textual Features and AI Probability Scores:")
print("=" * 180)

# Display the table
final_results


=== COMPREHENSIVE ANALYSIS RESULTS ===

Final Table with Textual Features and AI Probability Scores:


Unnamed: 0,prompt_variation,mean_prompt_tokens,median_prompt_tokens,mean_completion_tokens,median_completion_tokens,mean_description_length,median_description_length,mean_word_count,median_word_count,mean_sentence_count,median_sentence_count,true_positive_pct,false_negative_pct,mean_ai_score_generated,median_ai_score_generated,mean_ai_score_original,median_ai_score_original,total_samples
0,Original,0.0,0.0,0.0,0.0,1184.0,1187.5,160.7,169.0,19.8,19.0,0.0,0.0,0.0,0.0,13.5,9.9,352
1,P-10_Full_Plus_One_Shot,64351.4,28815.5,557.1,303.5,1112.8,983.0,159.9,140.0,9.8,9.5,28.1,0.0,27.0,16.8,13.5,9.9,64
2,P-11_Full_Plus_Few_Shot,74741.9,35300.0,451.0,246.5,922.2,761.5,132.0,116.0,8.2,7.0,15.6,0.0,22.5,18.1,13.5,9.9,64
3,P-1_Minimal,632.5,365.0,189.8,85.5,500.2,373.5,62.0,46.5,6.2,5.0,3.1,0.0,4.7,0.0,13.5,9.9,64
4,P-2_Basic,681.2,412.5,212.5,117.5,511.4,396.0,67.8,62.0,5.3,5.0,6.2,0.0,9.3,0.0,13.5,9.9,64
5,P-3_Diffs_Only,5574.5,2129.0,288.8,155.0,712.5,480.5,93.9,65.5,6.7,6.0,6.2,0.0,6.8,0.0,13.5,9.9,64
6,P-4_Diffs_Plus_Title,5619.8,2143.5,303.1,167.0,698.6,513.0,94.4,71.5,6.5,5.0,3.1,0.0,9.9,0.0,13.5,9.9,64
7,P-5_Code_Only,56585.1,24574.5,348.1,174.5,806.5,658.5,109.9,86.5,6.9,6.0,12.5,0.0,14.4,0.0,13.5,9.9,64
8,P-6_Issue_Only,2109.8,882.5,345.1,181.5,669.0,575.0,94.2,85.5,6.4,6.0,12.5,0.0,17.4,0.0,13.5,9.9,64
9,P-7_Template_Plus_Title,872.8,525.0,326.9,175.5,725.7,604.0,98.7,85.5,8.0,8.0,3.1,0.0,13.2,0.0,13.5,9.9,64


In [51]:
# Save comprehensive results
comprehensive_output_file = "comprehensive_prompt_variations_analysis.csv"
final_results.to_csv(comprehensive_output_file, index=False)
print(f"\nComprehensive results saved to: {comprehensive_output_file}")

# Final comprehensive summary
print("\n=== FINAL COMPREHENSIVE SUMMARY ===")
print(f"Total prompt variations analyzed: {len(final_results)}")

# Best performers
best_detection = final_results.loc[final_results['true_positive_pct'].idxmax()]
most_efficient = final_results[final_results['prompt_variation'] != 'Original'].loc[final_results[final_results['prompt_variation'] != 'Original']['mean_prompt_tokens'].idxmin()]
longest_content = final_results.loc[final_results['mean_description_length'].idxmax()]
most_words = final_results.loc[final_results['mean_word_count'].idxmax()]
highest_ai_score = final_results.loc[final_results['mean_ai_score_generated'].idxmax()]

print(f"\nBest detection performance: {best_detection['prompt_variation']} ({best_detection['true_positive_pct']:.1f}% true positive rate)")
print(f"Most efficient (lowest tokens): {most_efficient['prompt_variation']} ({most_efficient['mean_prompt_tokens']:.0f} mean prompt tokens)")
print(f"Longest descriptions: {longest_content['prompt_variation']} ({longest_content['mean_description_length']:.0f} chars)")
print(f"Most words per description: {most_words['prompt_variation']} ({most_words['mean_word_count']:.1f} words)")
print(f"Highest AI detectability: {highest_ai_score['prompt_variation']} ({highest_ai_score['mean_ai_score_generated']:.1f}% mean AI score)")

print("\n=== KEY INSIGHTS ===")
print("1. Detection Performance:")
generated_only = final_results[final_results['prompt_variation'] != 'Original']
print(f"   - Average true positive rate across all prompts: {generated_only['true_positive_pct'].mean():.1f}%")
print(f"   - Best performing prompts: {', '.join(generated_only.nlargest(3, 'true_positive_pct')['prompt_variation'].tolist())}")
print(f"   - Average AI score for generated content: {generated_only['mean_ai_score_generated'].mean():.1f}%")
print(f"   - Highest AI scores: {', '.join(generated_only.nlargest(3, 'mean_ai_score_generated')['prompt_variation'].tolist())}")

print("\n2. Token Efficiency:")
print(f"   - Most token-efficient prompts: {', '.join(generated_only.nsmallest(3, 'mean_prompt_tokens')['prompt_variation'].tolist())}")
print(f"   - Average prompt tokens: {generated_only['mean_prompt_tokens'].mean():.0f}")

print("\n3. Content Quality:")
original_row = final_results[final_results['prompt_variation'] == 'Original']
print(f"   - Original descriptions average {original_row['mean_word_count'].iloc[0]:.0f} words, AI score: {original_row['mean_ai_score_original'].iloc[0]:.1f}%")
print(f"   - Generated descriptions range from {generated_only['mean_word_count'].min():.0f} to {generated_only['mean_word_count'].max():.0f} words")
print(f"   - Generated AI scores range from {generated_only['mean_ai_score_generated'].min():.1f}% to {generated_only['mean_ai_score_generated'].max():.1f}%")
print(f"   - Average generated content length: {generated_only['mean_word_count'].mean():.0f} words")

print("\n4. AI Detection Score Analysis:")
print(f"   - Original (human) content: {original_row['mean_ai_score_original'].iloc[0]:.1f}% mean AI score (all below 50% threshold)")
print(f"   - Generated content: {generated_only['mean_ai_score_generated'].mean():.1f}% average mean AI score")
print(f"   - Detection gap: {generated_only['mean_ai_score_generated'].mean() - original_row['mean_ai_score_original'].iloc[0]:.1f} percentage points higher for AI content")


Comprehensive results saved to: comprehensive_prompt_variations_analysis.csv

=== FINAL COMPREHENSIVE SUMMARY ===
Total prompt variations analyzed: 12

Best detection performance: P-10_Full_Plus_One_Shot (28.1% true positive rate)
Most efficient (lowest tokens): P-1_Minimal (632 mean prompt tokens)
Longest descriptions: Original (1184 chars)
Most words per description: Original (160.7 words)
Highest AI detectability: P-9_Basic_One_Shot (27.9% mean AI score)

=== KEY INSIGHTS ===
1. Detection Performance:
   - Average true positive rate across all prompts: 11.1%
   - Best performing prompts: P-10_Full_Plus_One_Shot, P-9_Basic_One_Shot, P-11_Full_Plus_Few_Shot
   - Average AI score for generated content: 15.1%
   - Highest AI scores: P-9_Basic_One_Shot, P-10_Full_Plus_One_Shot, P-11_Full_Plus_Few_Shot

2. Token Efficiency:
   - Most token-efficient prompts: P-1_Minimal, P-2_Basic, P-7_Template_Plus_Title
   - Average prompt tokens: 24675

3. Content Quality:
   - Original descriptions a

## Debug: Understanding Original Detection Metrics

Let's examine why the Original descriptions show 0.0 for both true_positive_pct and false_negative_pct.

In [52]:
# Debug: Let's examine the Original descriptions detection logic step by step
print("=== DEBUGGING ORIGINAL DETECTION METRICS ===\n")

# Get original data again
original_data = merged_df[merged_df['entry_type'] == 'original']
print(f"1. Original data shape: {original_data.shape}")
print(f"   Entry types: {original_data['entry_type'].value_counts()}")

# Check zerogpt responses for original data
print(f"\n2. ZeroGPT response availability for Original data:")
print(f"   Non-null responses: {original_data['zerogpt_response'].notna().sum()}")
print(f"   Null/empty responses: {original_data['zerogpt_response'].isna().sum()}")

# Try parsing a few responses manually
print(f"\n3. Sample AI probabilities for Original descriptions:")
sample_original = original_data.head(5)
for idx, row in sample_original.iterrows():
    ai_prob = parse_zerogpt_response(row['zerogpt_response'])
    print(f"   Sample {idx}: AI probability = {ai_prob}")

# Now let's run the detection metrics function step by step
print(f"\n4. Running detection metrics calculation for Original:")

# Copy the function logic but with debug prints
original_debug = original_data.copy()
original_debug['ai_probability'] = original_debug['zerogpt_response'].apply(parse_zerogpt_response)

print(f"   AI probabilities parsed: {original_debug['ai_probability'].notna().sum()} out of {len(original_debug)}")
print(f"   AI probability range: {original_debug['ai_probability'].min():.1f} to {original_debug['ai_probability'].max():.1f}")

valid_df = original_debug[original_debug['ai_probability'].notna()].copy()
print(f"   Valid samples for analysis: {len(valid_df)}")

if len(valid_df) > 0:
    # Check detection threshold
    ai_threshold = 50.0
    valid_df['detected_as_ai'] = valid_df['ai_probability'] > ai_threshold
    print(f"   Using AI threshold: {ai_threshold}%")
    print(f"   Detected as AI: {valid_df['detected_as_ai'].sum()} out of {len(valid_df)}")
    
    # Break down by entry type
    generated_entries = valid_df[valid_df['entry_type'] == 'generated']
    original_entries = valid_df[valid_df['entry_type'] == 'original']
    
    print(f"   Generated entries: {len(generated_entries)} (detected as AI: {generated_entries['detected_as_ai'].sum() if len(generated_entries) > 0 else 0})")
    print(f"   Original entries: {len(original_entries)} (detected as AI: {original_entries['detected_as_ai'].sum() if len(original_entries) > 0 else 0})")
    
    # Calculate metrics
    tp_rate = 0
    if len(generated_entries) > 0:
        tp_rate = (generated_entries['detected_as_ai'].sum() / len(generated_entries)) * 100
    
    fp_rate = 0  
    if len(original_entries) > 0:
        fp_rate = (original_entries['detected_as_ai'].sum() / len(original_entries)) * 100
    
    print(f"   True Positive Rate: {tp_rate:.1f}% (Generated correctly identified as AI)")
    print(f"   False Positive Rate: {fp_rate:.1f}% (Original incorrectly identified as AI)")
    
else:
    print("   No valid samples to analyze!")

print(f"\n5. The issue explanation:")
print(f"   - For 'Original' analysis, we only have original (human-written) descriptions")
print(f"   - True Positive Rate = Generated content correctly identified as AI / Total Generated")
print(f"   - But we have 0 generated entries in the 'Original' dataset")
print(f"   - False Negative Rate = Original content incorrectly identified as AI / Total Original")
print(f"   - This should show the misclassification rate of human content")

=== DEBUGGING ORIGINAL DETECTION METRICS ===

1. Original data shape: (352, 31)
   Entry types: entry_type
original    352
Name: count, dtype: int64

2. ZeroGPT response availability for Original data:
   Non-null responses: 352
   Null/empty responses: 0

3. Sample AI probabilities for Original descriptions:
   Sample 0: AI probability = 0.0
   Sample 2: AI probability = 0.0
   Sample 4: AI probability = 0.0
   Sample 6: AI probability = 0.0
   Sample 8: AI probability = 0.0

4. Running detection metrics calculation for Original:
   AI probabilities parsed: 352 out of 352
   AI probability range: 0.0 to 48.4
   Valid samples for analysis: 352
   Using AI threshold: 50.0%
   Detected as AI: 0 out of 352
   Generated entries: 0 (detected as AI: 0)
   Original entries: 352 (detected as AI: 0)
   True Positive Rate: 0.0% (Generated correctly identified as AI)
   False Positive Rate: 0.0% (Original incorrectly identified as AI)

5. The issue explanation:
   - For 'Original' analysis, we on

In [53]:
print("\\n=== EXPLANATION OF ORIGINAL DETECTION METRICS ===\\n")

print("Why Original shows 0.0 for both metrics:")
print("\\n1. **True Positive Rate (0.0%)**:")
print("   - This measures: Generated content correctly identified as AI")
print("   - For 'Original' row: We have 0 generated entries (only human-written content)")
print("   - So: 0 generated entries detected as AI / 0 total generated entries = 0/0 = 0%")
print("   - This is correct but not meaningful for the Original baseline")

print("\\n2. **False Negative Rate (0.0%)**:")  
print("   - This measures: Original (human) content incorrectly identified as AI")
print("   - For 'Original' row: 0 out of 352 human descriptions were misclassified as AI")
print("   - So: 0 misclassified / 352 total human descriptions = 0%")
print("   - This means the detector correctly identified ALL original content as human-written!")

print("\\n=== CORRECTED INTERPRETATION ===\\n")
print("The Original row should be interpreted as:")
print("- True Positive Rate: N/A (no generated content to detect)")
print("- **Human Content Accuracy: 100%** (all 352 original descriptions correctly identified as human)")
print("- **AI Probability range: 0.0% to 48.4%** (all below 50% threshold)")

# Let's also check a few more statistics for Original
original_debug = merged_df[merged_df['entry_type'] == 'original'].copy()
original_debug['ai_probability'] = original_debug['zerogpt_response'].apply(parse_zerogpt_response)

print(f"\\n=== ORIGINAL CONTENT DETECTION STATISTICS ===")
print(f"Total original descriptions analyzed: {len(original_debug)}")
print(f"AI probability statistics:")
print(f"  Mean: {original_debug['ai_probability'].mean():.1f}%")
print(f"  Median: {original_debug['ai_probability'].median():.1f}%")
print(f"  Std Dev: {original_debug['ai_probability'].std():.1f}%")
print(f"  Max: {original_debug['ai_probability'].max():.1f}%")
print(f"  Min: {original_debug['ai_probability'].min():.1f}%")
print(f"\\nDistribution of AI probabilities:")
print(f"  0-10%: {((original_debug['ai_probability'] >= 0) & (original_debug['ai_probability'] < 10)).sum()} descriptions")
print(f"  10-20%: {((original_debug['ai_probability'] >= 10) & (original_debug['ai_probability'] < 20)).sum()} descriptions")
print(f"  20-30%: {((original_debug['ai_probability'] >= 20) & (original_debug['ai_probability'] < 30)).sum()} descriptions")
print(f"  30-40%: {((original_debug['ai_probability'] >= 30) & (original_debug['ai_probability'] < 40)).sum()} descriptions")
print(f"  40-50%: {((original_debug['ai_probability'] >= 40) & (original_debug['ai_probability'] < 50)).sum()} descriptions")
print(f"  Above 50%: {(original_debug['ai_probability'] >= 50).sum()} descriptions (would be misclassified)")

\n=== EXPLANATION OF ORIGINAL DETECTION METRICS ===\n
Why Original shows 0.0 for both metrics:
\n1. **True Positive Rate (0.0%)**:
   - This measures: Generated content correctly identified as AI
   - For 'Original' row: We have 0 generated entries (only human-written content)
   - So: 0 generated entries detected as AI / 0 total generated entries = 0/0 = 0%
   - This is correct but not meaningful for the Original baseline
\n2. **False Negative Rate (0.0%)**:
   - This measures: Original (human) content incorrectly identified as AI
   - For 'Original' row: 0 out of 352 human descriptions were misclassified as AI
   - So: 0 misclassified / 352 total human descriptions = 0%
   - This means the detector correctly identified ALL original content as human-written!
\n=== CORRECTED INTERPRETATION ===\n
The Original row should be interpreted as:
- True Positive Rate: N/A (no generated content to detect)
- **Human Content Accuracy: 100%** (all 352 original descriptions correctly identified as h

In [54]:
# VERIFY: Let's confirm the ZeroGPT responses for original entries are being used
print("=== VERIFICATION: ZeroGPT RESPONSES FOR ORIGINAL ENTRIES ===\n")

# Check the detection CSV file directly
print("1. Checking detection CSV structure:")
print(f"   Total entries in detection file: {len(detection_df)}")
print(f"   Entry types: {detection_df['entry_type'].value_counts()}")

# Check original entries specifically
original_detection_entries = detection_df[detection_df['entry_type'] == 'original']
print(f"\n2. Original entries in detection file:")
print(f"   Count: {len(original_detection_entries)}")
print(f"   Non-null zerogpt_response: {original_detection_entries['zerogpt_response'].notna().sum()}")
print(f"   Sample ZeroGPT responses for original entries:")

# Parse a few original entry responses
for i, (idx, row) in enumerate(original_detection_entries.head(3).iterrows()):
    response_sample = row['zerogpt_response'][:100] if isinstance(row['zerogpt_response'], str) else str(row['zerogpt_response'])
    ai_prob = parse_zerogpt_response(row['zerogpt_response'])
    print(f"     Sample {i+1}: AI probability = {ai_prob}% | Response: {response_sample}...")

# Now let's check what happens when we merge with generation data
print(f"\n3. After merging with generation data:")
original_merged = merged_df[merged_df['entry_type'] == 'original']
print(f"   Count: {len(original_merged)}")
print(f"   AI probabilities calculated:")
original_merged_copy = original_merged.copy()
original_merged_copy['ai_probability'] = original_merged_copy['zerogpt_response'].apply(parse_zerogpt_response)
print(f"     Mean: {original_merged_copy['ai_probability'].mean():.1f}%")
print(f"     Max: {original_merged_copy['ai_probability'].max():.1f}%")
print(f"     Above 50% threshold: {(original_merged_copy['ai_probability'] > 50).sum()}")

print(f"\n4. IMPORTANT INSIGHT:")
if (original_merged_copy['ai_probability'] > 50).sum() == 0:
    print(f"   ✅ The false_negative_pct is CORRECTLY 0.0% because:")
    print(f"   ✅ ALL {len(original_merged_copy)} original descriptions scored below 50% AI probability")
    print(f"   ✅ This means ZeroGPT correctly identified ALL original content as human-written")
    print(f"   ✅ No original content was misclassified as AI-generated")
else:
    misclassified = (original_merged_copy['ai_probability'] > 50).sum()
    total = len(original_merged_copy)
    print(f"   ❌ Something's wrong: {misclassified} out of {total} original entries scored > 50%")
    print(f"   ❌ False negative rate should be: {(misclassified/total)*100:.1f}%")

=== VERIFICATION: ZeroGPT RESPONSES FOR ORIGINAL ENTRIES ===

1. Checking detection CSV structure:
   Total entries in detection file: 704
   Entry types: entry_type
original     352
generated    352
Name: count, dtype: int64

2. Original entries in detection file:
   Count: 352
   Non-null zerogpt_response: 352
   Sample ZeroGPT responses for original entries:
     Sample 1: AI probability = 17.46% | Response: {"success": true, "code": 200, "message": "detection result passed to proxy", "data": {"sentences": ...
     Sample 2: AI probability = 26.09% | Response: {"success": true, "code": 200, "message": "detection result passed to proxy", "data": {"sentences": ...
     Sample 3: AI probability = 29.56% | Response: {"success": true, "code": 200, "message": "detection result passed to proxy", "data": {"sentences": ...

3. After merging with generation data:
   Count: 352
   AI probabilities calculated:
     Mean: 13.5%
     Max: 48.4%
     Above 50% threshold: 0

4. IMPORTANT INSIGHT:
 

In [55]:
print("=== FINAL CONFIRMATION OF DETECTION CALCULATION ===\n")

# Let's manually reproduce the exact calculation from our function
original_for_verification = merged_df[merged_df['entry_type'] == 'original'].copy()
original_for_verification['ai_probability'] = original_for_verification['zerogpt_response'].apply(parse_zerogpt_response)

print("Manual calculation verification:")
print(f"1. Total original entries: {len(original_for_verification)}")
print(f"2. AI probabilities successfully parsed: {original_for_verification['ai_probability'].notna().sum()}")
print(f"3. AI threshold used: 50.0%")
print(f"4. Entries above threshold (detected as AI): {(original_for_verification['ai_probability'] > 50.0).sum()}")
print(f"5. False Negative Rate calculation:")
print(f"   = Entries incorrectly identified as AI / Total original entries")
print(f"   = {(original_for_verification['ai_probability'] > 50.0).sum()} / {len(original_for_verification)}")
print(f"   = {((original_for_verification['ai_probability'] > 50.0).sum() / len(original_for_verification)) * 100:.1f}%")

print(f"\n6. Distribution of AI probabilities for original entries:")
ai_probs = original_for_verification['ai_probability']
print(f"   Min: {ai_probs.min():.1f}%")
print(f"   25th percentile: {ai_probs.quantile(0.25):.1f}%") 
print(f"   Median: {ai_probs.median():.1f}%")
print(f"   75th percentile: {ai_probs.quantile(0.75):.1f}%")
print(f"   Max: {ai_probs.max():.1f}%")

print(f"\n✅ CONFIRMATION: The ZeroGPT responses for original entries ARE being used!")
print(f"✅ The 0.0% false negative rate is CORRECT - it means perfect accuracy!")
print(f"✅ ZeroGPT successfully identified all original content as human-written.")

=== FINAL CONFIRMATION OF DETECTION CALCULATION ===

Manual calculation verification:
1. Total original entries: 352
2. AI probabilities successfully parsed: 352
3. AI threshold used: 50.0%
4. Entries above threshold (detected as AI): 0
5. False Negative Rate calculation:
   = Entries incorrectly identified as AI / Total original entries
   = 0 / 352
   = 0.0%

6. Distribution of AI probabilities for original entries:
   Min: 0.0%
   25th percentile: 0.0%
   Median: 9.9%
   75th percentile: 25.7%
   Max: 48.4%

✅ CONFIRMATION: The ZeroGPT responses for original entries ARE being used!
✅ The 0.0% false negative rate is CORRECT - it means perfect accuracy!
✅ ZeroGPT successfully identified all original content as human-written.
2. AI probabilities successfully parsed: 352
3. AI threshold used: 50.0%
4. Entries above threshold (detected as AI): 0
5. False Negative Rate calculation:
   = Entries incorrectly identified as AI / Total original entries
   = 0 / 352
   = 0.0%

6. Distribution o

## Side-by-Side Description Comparison

Let's create a side-by-side comparison of descriptions from all prompt variations plus the original for a single PR. This will be useful for presentation slides.

In [61]:
# Create side-by-side comparison for a single PR
print("=== SIDE-BY-SIDE DESCRIPTION COMPARISON ===\n")

# Let's pick a PR that has descriptions for all prompt variations
# First, find a PR ID that appears across all variations
pr_counts = merged_df.groupby('pr_id')['prompt_variation'].nunique().sort_values(ascending=False)
print(f"PR coverage across prompt variations:")
print(f"Max variations per PR: {pr_counts.max()}")
print(f"PRs with most variations: {pr_counts.head()}")

# Pick the first PR with good coverage
sample_pr_id = pr_counts.index[1]
print(f"\nUsing PR ID: {sample_pr_id}")

# Get all descriptions for this PR
sample_descriptions = merged_df[merged_df['pr_id'] == sample_pr_id].copy()

# Sort by entry_type and prompt_variation for consistent ordering
sample_descriptions = sample_descriptions.sort_values(['entry_type', 'prompt_variation'])

print(f"\nFound {len(sample_descriptions)} descriptions for PR {sample_pr_id}")
print(f"Entry types: {sample_descriptions['entry_type'].unique()}")
print(f"Prompt variations: {sorted(sample_descriptions['prompt_variation'].unique())}")

# Create the comparison
comparison_data = []

# First add the original description
original_desc = sample_descriptions[sample_descriptions['entry_type'] == 'original']
if len(original_desc) > 0:
    comparison_data.append({
        'Variation': 'Original (Human)',
        'Description': original_desc['input_text'].iloc[0][:500] + "..." if len(original_desc['input_text'].iloc[0]) > 500 else original_desc['input_text'].iloc[0],
        'Full_Description': original_desc['input_text'].iloc[0],
        'AI_Score': f"{parse_zerogpt_response(original_desc['zerogpt_response'].iloc[0]):.1f}%",
        'Word_Count': len(original_desc['input_text'].iloc[0].split())
    })

# Then add all generated descriptions
generated_descs = sample_descriptions[sample_descriptions['entry_type'] == 'generated']
for _, row in generated_descs.iterrows():
    desc = row['input_text']
    comparison_data.append({
        'Variation': row['prompt_variation'].replace('_', ' '),
        'Description': desc[:500] + "..." if len(desc) > 500 else desc,
        'Full_Description': desc,
        'AI_Score': f"{parse_zerogpt_response(row['zerogpt_response']):.1f}%",
        'Word_Count': len(desc.split())
    })

# Create DataFrame for easy viewing
comparison_df = pd.DataFrame(comparison_data)
print(f"\n=== SIDE-BY-SIDE COMPARISON FOR PR {sample_pr_id} ===")
print(f"Variation | AI Score | Words | Description Preview")
print("=" * 120)

for _, row in comparison_df.iterrows():
    print(f"{row['Variation']:<25} | {row['AI_Score']:>7} | {row['Word_Count']:>5} | {row['Description']}")
    print("-" * 120)

=== SIDE-BY-SIDE DESCRIPTION COMPARISON ===

PR coverage across prompt variations:
Max variations per PR: 11
PRs with most variations: pr_id
MDExOlB1bGxSZXF1ZXN0MTYwNDk0NTcy    11
MDExOlB1bGxSZXF1ZXN0MTYxNDI3MDY3    11
PR_kwDOAQ0TF86g3kgZ                 11
PR_kwDOAQ0TF86eA-bB                 11
PR_kwDOAQ0TF86awU2h                 11
Name: prompt_variation, dtype: int64

Using PR ID: MDExOlB1bGxSZXF1ZXN0MTYxNDI3MDY3

Found 22 descriptions for PR MDExOlB1bGxSZXF1ZXN0MTYxNDI3MDY3
Entry types: ['generated' 'original']
Prompt variations: ['P-10_Full_Plus_One_Shot', 'P-11_Full_Plus_Few_Shot', 'P-1_Minimal', 'P-2_Basic', 'P-3_Diffs_Only', 'P-4_Diffs_Plus_Title', 'P-5_Code_Only', 'P-6_Issue_Only', 'P-7_Template_Plus_Title', 'P-8_Full_Context', 'P-9_Basic_One_Shot']

=== SIDE-BY-SIDE COMPARISON FOR PR MDExOlB1bGxSZXF1ZXN0MTYxNDI3MDY3 ===
Variation | AI Score | Words | Description Preview
Original (Human)          |    0.0% |    90 | <!-- describe the changes you have made here: what, why, ... 

In [62]:
# Create a presentation-friendly format with full descriptions
print(f"\n\n=== FULL DESCRIPTIONS FOR PRESENTATION SLIDE ===")
print(f"PR ID: {sample_pr_id}")
print("=" * 100)

for i, row in comparison_df.iterrows():
    print(f"\n📝 **{row['Variation']}** (AI Score: {row['AI_Score']}, Words: {row['Word_Count']})")
    print("─" * 80)
    print(row['Full_Description'])
    print()

# Also save to a text file for easy copying
slide_content = f"Side-by-Side PR Description Comparison - PR {sample_pr_id}\n"
slide_content += "=" * 60 + "\n\n"

for _, row in comparison_df.iterrows():
    slide_content += f"{row['Variation']} (AI Score: {row['AI_Score']}, Words: {row['Word_Count']})\n"
    slide_content += "─" * 50 + "\n"
    slide_content += f"{row['Full_Description']}\n\n"

# Save to file
slide_file = f"side_by_side_comparison_PR_{sample_pr_id}.md"
with open(slide_file, 'w', encoding='utf-8') as f:
    f.write(slide_content)

print(f"💾 Slide content saved to: {slide_file}")

# Create a summary table for the slide
print(f"\n📊 **SUMMARY TABLE FOR SLIDE**")
summary_table = comparison_df[['Variation', 'AI_Score', 'Word_Count']].copy()
summary_table.columns = ['Prompt Variation', 'AI Detection Score', 'Word Count']
print(summary_table.to_string(index=False))



=== FULL DESCRIPTIONS FOR PRESENTATION SLIDE ===
PR ID: MDExOlB1bGxSZXF1ZXN0MTYxNDI3MDY3

📝 **Original (Human)** (AI Score: 0.0%, Words: 90)
────────────────────────────────────────────────────────────────────────────────
<!-- describe the changes you have made here: what, why, ... -->
The bug mentioned in #3145 was finally fixed in controlsfx. Thus we can reenable the validation messages.
Closes #3145.

----

- [x] Change in CHANGELOG.md described
- [ ] Tests created for changes
- [ ] Screenshots added (for bigger UI changes)
- [x] Manually tested changed features in running JabRef
- [ ] Check documentation status (Issue created for outdated help page at [help.jabref.org](https://github.com/JabRef/help.jabref.org/issues)?)
- [ ] If you changed the localization: Did you run `gradle localizationUpdate`?

  


📝 **P-10 Full Plus One Shot** (AI Score: 26.9%, Words: 196)
────────────────────────────────────────────────────────────────────────────────


📝 **P-11 Full Plus Few Shot** (AI S