# RAG System Ablation Study: Financial Question Answering

## Publication-Quality Analysis

This notebook analyzes results from a comprehensive experiment suite covering:
- **25 total experiments**
- Chunking strategies (standard vs element-based)
- Chunk sizes (500-3000 characters)
- Retrieval k values (3-30)
- Retrieval enhancements (hybrid, metadata, reranking)
- Embedding models (small vs large)

In [None]:
import json
import os
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Set publication style (global defaults for every figure in this notebook)
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
    'axes.labelsize': 14,
    'axes.titlesize': 16,
})

# Directory that holds the per-experiment result CSVs; the figures and the
# LaTeX table produced below are written back into the same directory.
# Set the RAG_RESULTS_DIR environment variable to run the notebook on another
# machine; when it is unset the original author's path is used unchanged.
RESULTS_DIR = Path(os.environ.get(
    'RAG_RESULTS_DIR',
    '/Users/hansonxiong/Desktop/algoverse/LLM_as_a_Judge/experiments/results',
)).expanduser()

In [None]:
# Load all experiment results

# Run-timestamp suffix of a result filename: an underscore, a date starting
# with "20" written as YYYYMMDD, YYYY-MM-DD or YYYY_MM_DD, and everything
# after it. NOTE(review): the exact timestamp format is assumed -- confirm
# against the files the experiment runner actually writes.
_TIMESTAMP_SUFFIX = re.compile(r'_20\d{2}(?:\d{4}|[-_]\d{2}[-_]\d{2}).*$')

# File-name prefixes that identify experiment result files.
_EXPERIMENT_PREFIXES = ('baseline', 'ablation', 'chunk', 'k_value', 'embedding', 'optimal')

# Column order of the summary frame (also used when nothing could be loaded).
_SUMMARY_COLUMNS = [
    'experiment', 'mean_score', 'std_score', 'median_score',
    'n_questions', 'perfect_scores', 'zero_scores',
]


def _experiment_name(stem):
    """Return a result file's stem with its trailing run timestamp removed.

    Splitting on the bare substring '_2' (the previous approach) also cut
    names such as 'k_value_20' or 'chunk_2000' short; anchoring on a full
    date keeps numeric parameters in the name intact. Stems without a
    recognizable timestamp are returned unchanged.
    """
    return _TIMESTAMP_SUFFIX.sub('', stem)


def load_all_results(results_dir=None):
    """Summarize every experiment result CSV into one row per file.

    Parameters
    ----------
    results_dir : path-like, optional
        Directory to scan for ``*.csv`` files. Defaults to the notebook-level
        ``RESULTS_DIR``.

    Returns
    -------
    pandas.DataFrame
        Columns ``experiment``, ``mean_score``, ``std_score``,
        ``median_score``, ``n_questions``, ``perfect_scores`` and
        ``zero_scores``, sorted by ``mean_score`` in descending order. The
        frame is empty (but keeps these columns) when no usable file exists.

    Notes
    -----
    A file is considered when its name contains ``_with_judge`` or starts
    with a known experiment prefix. Files without a ``judge_score``/``score``
    column, or without a single numeric score, are skipped.
    """
    directory = RESULTS_DIR if results_dir is None else Path(results_dir)

    rows = []
    # Sorted so that the result does not depend on filesystem listing order.
    for csv_file in sorted(directory.glob('*.csv')):
        is_result_file = ('_with_judge' in csv_file.name
                          or csv_file.name.startswith(_EXPERIMENT_PREFIXES))
        if not is_result_file:
            continue

        df = pd.read_csv(csv_file)

        # Normalize score column: prefer the judge's score when present.
        score_col = 'judge_score' if 'judge_score' in df.columns else 'score'
        if score_col not in df.columns:
            continue

        # Non-numeric entries become NaN and are dropped.
        scores = pd.to_numeric(df[score_col], errors='coerce').dropna()
        if scores.empty:
            # Nothing to aggregate; a NaN row would only pollute the figures.
            continue

        rows.append({
            'experiment': _experiment_name(csv_file.stem),
            'mean_score': scores.mean(),
            'std_score': scores.std(),
            'median_score': scores.median(),
            'n_questions': len(scores),
            'perfect_scores': int((scores == 1.0).sum()),
            'zero_scores': int((scores == 0.0).sum()),
        })

    # Passing the columns explicitly keeps sort_values valid for an empty list.
    summary = pd.DataFrame(rows, columns=_SUMMARY_COLUMNS)
    return summary.sort_values('mean_score', ascending=False, kind='stable')

# Aggregate every result file into one summary row per experiment.
results_df = load_all_results()

# Fail fast with a readable message: every later cell indexes into results_df
# and would otherwise break with a far less obvious error.
assert len(results_df) > 0, f'No experiment results found in {RESULTS_DIR}'

print(f'Loaded {len(results_df)} experiments')
# Preview the ten best-scoring configurations.
results_df.head(10)

## Figure 1: Overall Performance Comparison

In [None]:
# Publication-quality bar chart: one horizontal bar per configuration,
# with the standard deviation of its scores as the error bar.
fig, ax = plt.subplots(figsize=(14, 8))

# Color each bar by its mean score on a red-to-green gradient.
# Assumes mean scores lie in [0, 1] -- TODO confirm against the judge's scale.
colors = plt.cm.RdYlGn(results_df['mean_score'].values)

bars = ax.barh(results_df['experiment'], results_df['mean_score'] * 100,
               xerr=results_df['std_score'] * 100,
               color=colors, capsize=3)

# barh draws the first row at the bottom; results_df is sorted best-first,
# so flip the axis to put the best configuration at the top.
ax.invert_yaxis()

ax.set_xlabel('Mean Judge Score (%)')
ax.set_title('RAG Configuration Performance Comparison')
ax.axvline(x=50, color='red', linestyle='--', alpha=0.5, label='50% threshold')
# Without a legend the threshold line's label is never rendered.
ax.legend(loc='lower right')

fig.tight_layout()
# Vector (PDF) copy for the paper, raster (PNG) copy for slides and previews.
fig.savefig(RESULTS_DIR / 'fig1_overall_comparison.pdf', dpi=300, bbox_inches='tight')
fig.savefig(RESULTS_DIR / 'fig1_overall_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## Figure 2: Ablation Study Results

In [None]:
# Filter ablation experiments (the baseline runs are kept as the reference)
ablation_df = results_df[results_df['experiment'].str.contains('ablation|baseline')].copy()

# Work in percent throughout this figure.
ablation_mean_pct = ablation_df['mean_score'] * 100
ablation_std_pct = ablation_df['std_score'] * 100

fig, ax = plt.subplots(figsize=(10, 6))

x = range(len(ablation_df))
bars = ax.bar(x, ablation_mean_pct,
              yerr=ablation_std_pct,
              color=plt.cm.viridis(np.linspace(0.3, 0.9, len(ablation_df))),
              capsize=5)

# Drop the family prefix so the tick labels show only the variant name.
# regex=False makes the literal replacement explicit on every pandas version.
tick_labels = (
    ablation_df['experiment']
    .str.replace('ablation_', '', regex=False)
    .str.replace('baseline_', '', regex=False)
)
ax.set_xticks(x)
ax.set_xticklabels(tick_labels, rotation=45, ha='right')
ax.set_ylabel('Mean Judge Score (%)')
ax.set_title('Retrieval Enhancement Ablation Study')

# Add value labels above the error-bar caps so the text does not sit on top
# of the error bar. A std of NaN (single-question run) is treated as zero.
for bar, mean_pct, std_pct in zip(bars, ablation_mean_pct, ablation_std_pct.fillna(0)):
    ax.text(bar.get_x() + bar.get_width() / 2, mean_pct + std_pct + 2,
            f'{mean_pct:.1f}%', ha='center', va='bottom', fontsize=10)

fig.tight_layout()
fig.savefig(RESULTS_DIR / 'fig2_ablation_study.pdf', dpi=300, bbox_inches='tight')
fig.savefig(RESULTS_DIR / 'fig2_ablation_study.png', dpi=300, bbox_inches='tight')
plt.show()

## Figure 3: Chunk Size Analysis

In [None]:
# Filter chunk experiments and derive the chunk size and chunking strategy
# from the experiment name.
chunk_df = (
    results_df[results_df['experiment'].str.contains('chunk_')]
    # The first run of digits in the name is taken as the chunk size.
    # expand=False returns a Series; names without a number become NaN.
    .assign(chunk_size=lambda d: pd.to_numeric(
        d['experiment'].str.extract(r'(\d+)', expand=False), errors='coerce'))
    # A name without a size cannot be placed on the x-axis; casting NaN to
    # int would raise, so such rows are dropped first.
    .dropna(subset=['chunk_size'])
    .astype({'chunk_size': int})
    .assign(chunk_type=lambda d: np.where(
        d['experiment'].str.contains('element', regex=False),
        'Element-based', 'Standard'))
)

fig, ax = plt.subplots(figsize=(10, 6))

# One line per chunking strategy, ordered by chunk size.
for chunk_type, group in chunk_df.groupby('chunk_type'):
    group = group.sort_values('chunk_size')
    ax.errorbar(group['chunk_size'], group['mean_score'] * 100,
                yerr=group['std_score'] * 100,
                marker='o', markersize=8, capsize=5, label=chunk_type)

ax.set_xlabel('Chunk Size (characters)')
ax.set_ylabel('Mean Judge Score (%)')
ax.set_title('Effect of Chunk Size on Performance')
ax.legend()

fig.tight_layout()
fig.savefig(RESULTS_DIR / 'fig3_chunk_size.pdf', dpi=300, bbox_inches='tight')
fig.savefig(RESULTS_DIR / 'fig3_chunk_size.png', dpi=300, bbox_inches='tight')
plt.show()

## Figure 4: K-Value Sensitivity Analysis

In [None]:
# Filter k-value experiments and parse k from the experiment name.
k_df = (
    results_df[results_df['experiment'].str.contains('k_value')]
    # The first run of digits in the name is taken as k; names without a
    # number become NaN and are dropped before the integer cast.
    .assign(k=lambda d: pd.to_numeric(
        d['experiment'].str.extract(r'(\d+)', expand=False), errors='coerce'))
    .dropna(subset=['k'])
    .astype({'k': int})
    .sort_values('k')
)

fig, ax = plt.subplots(figsize=(10, 6))

ax.errorbar(k_df['k'], k_df['mean_score'] * 100,
            yerr=k_df['std_score'] * 100,
            marker='s', markersize=10, capsize=5,
            color='#2ecc71', linewidth=2)

ax.set_xlabel('Top-K Retrieved Chunks')
ax.set_ylabel('Mean Judge Score (%)')
ax.set_title('Retrieval Depth Sensitivity Analysis')

# Mark optimal k (highest mean score). idxmax() raises on an empty frame,
# so the marker is only drawn when at least one k-value run was loaded.
if not k_df.empty:
    best_k = k_df.loc[k_df['mean_score'].idxmax(), 'k']
    ax.axvline(x=best_k, color='red', linestyle='--', alpha=0.5,
               label=f'Optimal k={best_k}')
    ax.legend()

fig.tight_layout()
fig.savefig(RESULTS_DIR / 'fig4_k_sensitivity.pdf', dpi=300, bbox_inches='tight')
fig.savefig(RESULTS_DIR / 'fig4_k_sensitivity.png', dpi=300, bbox_inches='tight')
plt.show()

## Table 1: Summary Statistics

In [None]:
# Create publication-quality summary table: score statistics as percentages
# rounded to one decimal, with reader-facing column names.
summary_table = results_df.copy()
pct_columns = ['mean_score', 'std_score', 'median_score']
summary_table[pct_columns] = (summary_table[pct_columns] * 100).round(1)

summary_table = summary_table.rename(columns={
    'experiment': 'Configuration',
    'mean_score': 'Mean (%)',
    'std_score': 'Std (%)',
    'median_score': 'Median (%)',
    'n_questions': 'N',
    'perfect_scores': 'Perfect',
    'zero_scores': 'Zero'
})

# Export to LaTeX for paper.
# escape=True: since pandas 2.0 special characters are no longer escaped by
# default, so the '%' in the headers would start a LaTeX comment and the '_'
# in configuration names would break compilation.
# float_format: keep one decimal in the .tex output on every pandas version.
latex_table = summary_table.to_latex(index=False,
                                     escape=True,
                                     float_format='%.1f',
                                     caption='RAG Configuration Performance Summary',
                                     label='tab:results')

# Explicit encoding so the output does not depend on the platform default.
with open(RESULTS_DIR / 'table1_summary.tex', 'w', encoding='utf-8') as f:
    f.write(latex_table)

print('Top 10 Configurations:')
summary_table.head(10)

## Statistical Analysis

In [None]:
# Statistical significance tests
print('Statistical Significance Tests')
print('=' * 50)

# Compare best vs baseline. results_df is sorted by mean score in descending
# order, so its first row is the best configuration and the first row whose
# name contains 'baseline' is the best-scoring baseline run.
baseline_rows = results_df[results_df['experiment'].str.contains('baseline')]
if baseline_rows.empty:
    # Replaces the opaque "positional indexer is out-of-bounds" IndexError.
    raise ValueError("No experiment with 'baseline' in its name was loaded; "
                     "cannot compare the best configuration against a baseline.")

best_row = results_df.iloc[0]
baseline_row = baseline_rows.iloc[0]
best_exp = best_row['experiment']
baseline_exp = baseline_row['experiment']

print(f'\nBest configuration: {best_exp}')
print(f'Baseline: {baseline_exp}')
# The improvement is the absolute difference of the two mean scores times
# 100, i.e. percentage points rather than a relative change.
print(f'\nImprovement: {(best_row["mean_score"] - baseline_row["mean_score"]) * 100:.1f}%')