# NarrativeMind: Results Analysis

This notebook analyzes and visualizes the results from the NarrativeMind experiments.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from narrative_mind.utils import Visualizer
from narrative_mind.evaluation import NarrativeEvaluator
import numpy as np

## 1. Load Results

In [None]:
# Load the serialized test results.
# JSON is UTF-8 by specification; pin the encoding so any non-ASCII text in
# the file decodes the same way on every platform instead of following the
# locale's default encoding.
with open('results/test_results.json', encoding='utf-8') as f:
    results = json.load(f)

# Build one row per generated sample from each sample's 'metrics' dict
# (columns are the metric keys), then print summary statistics.
metrics_df = pd.DataFrame([s['metrics'] for s in results['generated_samples']])
print("\nMetrics Summary:")
print(metrics_df.describe())

## 2. Analyze BLEU Scores

In [None]:
# Histogram of per-sample BLEU scores, with the value labelled as the paper
# result drawn as a dashed vertical reference line.
bleu_scores = metrics_df['bleu']

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(bleu_scores, bins=30, ax=ax)
ax.axvline(x=29.8, color='r', linestyle='--', label='Paper Result (29.8)')
ax.set_title('Distribution of BLEU Scores')
ax.set_xlabel('BLEU Score')
ax.set_ylabel('Count')
ax.legend()
plt.show()

# Mean and standard deviation (pandas default estimator) of the same series
print(f"Mean BLEU Score: {bleu_scores.mean():.2f} ± {bleu_scores.std():.2f}")

## 3. Analyze Cultural Preservation

In [None]:
# Columns of metrics_df compared in this section
cultural_metrics = ['pattern_f1', 'cultural_elements_score', 'cultural_preservation']

# One box per metric on shared axes; the dashed horizontal line marks the
# value labelled as the paper result.
fig, ax = plt.subplots(figsize=(12, 6))
metrics_df[cultural_metrics].boxplot(ax=ax)
ax.axhline(y=0.752, color='r', linestyle='--', label='Paper Result (75.2%)')
ax.set_title('Cultural Preservation Metrics')
ax.set_ylabel('Score')
ax.legend()
plt.show()

# Text summary: mean ± standard deviation for each metric column
print("\nCultural Preservation Metrics:")
for name in cultural_metrics:
    column = metrics_df[name]
    print(f"{name}: {column.mean():.3f} ± {column.std():.3f}")

## 4. Analyze Dialect Accuracy

In [None]:
# Gather dialect accuracy/confidence for every sample that carries a
# non-empty 'dialect' value; samples without one are skipped.
dialect_rows = []
for sample in results['generated_samples']:
    dialect = sample.get('dialect')
    if not dialect:
        continue
    sample_metrics = sample['metrics']
    dialect_rows.append({
        'dialect': dialect,
        'accuracy': sample_metrics['dialect_accuracy'],
        'confidence': sample_metrics['dialect_confidence'],
    })
dialect_metrics = pd.DataFrame(dialect_rows)

# One accuracy box per dialect; the dashed horizontal line marks the value
# labelled as the paper result.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=dialect_metrics, x='dialect', y='accuracy', ax=ax)
ax.axhline(y=0.72, color='r', linestyle='--', label='Paper Result (κ=0.72)')
ax.set_title('Dialect Accuracy by Dialect')
ax.set_xlabel('Dialect')
ax.set_ylabel('Accuracy')
plt.xticks(rotation=45)  # tilt the tick labels on the x axis
ax.legend()
plt.show()

# Per-dialect summary statistics of the accuracy column
print("\nDialect Accuracy by Dialect:")
accuracy_by_dialect = dialect_metrics.groupby('dialect')['accuracy']
print(accuracy_by_dialect.describe())

## 5. Analyze Generated Stories

In [None]:
# Length of each generated story, measured in whitespace-delimited tokens
story_lengths = [len(s['generated_text'].split()) for s in results['generated_samples']]

plt.figure(figsize=(10, 6))
sns.histplot(story_lengths, bins=30)
plt.title('Distribution of Generated Story Lengths')
plt.xlabel('Length (words)')
plt.ylabel('Count')
plt.show()

# np.std defaults to the population estimator (ddof=0), whereas the other
# "mean ± std" summaries in this notebook use pandas .std(), which defaults to
# the sample estimator (ddof=1). Pass ddof=1 so all reported spreads agree.
print(f"\nAverage story length: {np.mean(story_lengths):.1f} ± {np.std(story_lengths, ddof=1):.1f} words")

# Display the three samples with the highest 'cultural_preservation' metric
print("\nExample High-Quality Generations:")
top_stories = sorted(
    results['generated_samples'],
    key=lambda x: x['metrics']['cultural_preservation'],
    reverse=True
)[:3]

for i, story in enumerate(top_stories, 1):
    print(f"\nStory {i}:")
    print(f"Text: {story['generated_text']}")
    print("\nMetrics:")
    for k, v in story['metrics'].items():
        # The '.3f' format spec only accepts real numbers; a metric value that
        # is a string, None, list or dict would raise. Print those unformatted
        # instead of aborting the whole listing.
        if isinstance(v, (int, float)):
            print(f"{k}: {v:.3f}")
        else:
            print(f"{k}: {v}")