# 01: Data Exploration

**小红帽 (Little Red Riding Hood) — Initial Data Survey**

This notebook provides an interactive overview of the extracted game data before running formal experiments.

In [None]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import matplotlib.pyplot as plt
from parsers import EventDumpParser, NarrativeExtractionParser, load_json_data
from visualization import setup_style, COLORS

# Presumably applies the project's shared matplotlib styling (defined in
# `visualization`); called once up front so later figures pick it up.
setup_style()
# Visible confirmation that the imports above resolved and setup finished.
print("Modules loaded successfully!")

## 1. Event Dump Overview

The `EventTextDump.txt` file contains raw RPG Maker event commands exported from the game.

In [None]:
# Parse the raw event dump; `parser.maps` is then walked as
# map -> 'events' -> event -> 'commands'.
parser = EventDumpParser('../data/EventTextDump.txt')
parser.parse()

# Tally events and commands in one pass over the parsed maps.
map_total = len(parser.maps)
event_total = 0
command_total = 0
for parsed_map in parser.maps.values():
    map_events = parsed_map['events']
    event_total += len(map_events)
    for parsed_event in map_events.values():
        command_total += len(parsed_event['commands'])

print(f"Total maps: {map_total}")
print(f"Total events: {event_total}")
print(f"Total commands: {command_total}")

In [None]:
# Preview the head of the raw dump without loading the whole file into memory.
PREVIEW_LINE_COUNT = 50

with open('../data/EventTextDump.txt', 'r', encoding='utf-8') as f:
    # Iterating the file object reads lazily; zip() stops as soon as range()
    # is exhausted, so at most PREVIEW_LINE_COUNT lines are consumed
    # (fewer if the file is shorter).
    lines = [raw_line for _, raw_line in zip(range(PREVIEW_LINE_COUNT), f)]

print(f"First {PREVIEW_LINE_COUNT} lines of EventTextDump.txt:")
print("=" * 60)
for line in lines:
    # Strip the trailing newline so print() does not double-space the output.
    print(line.rstrip())

## 2. Command Type Distribution

Let's see what types of commands are most common in the game.

In [None]:
# Tally how often each command type occurs anywhere in the parsed dump.
# Commands without a 'type' key are grouped under 'unknown'.
command_counts = {}
all_commands = (
    command
    for parsed_map in parser.maps.values()
    for parsed_event in parsed_map['events'].values()
    for command in parsed_event['commands']
)
for command in all_commands:
    kind = command.get('type', 'unknown')
    command_counts[kind] = command_counts.get(kind, 0) + 1

# Most frequent first; the sort is stable, so ties keep first-seen order.
sorted_counts = list(command_counts.items())
sorted_counts.sort(key=lambda pair: pair[1], reverse=True)

print("Top 20 Command Types:")
print("-" * 40)
for kind, occurrences in sorted_counts[:20]:
    print(f"{kind:30} {occurrences:>6}")

In [None]:
# Horizontal bar chart of the 15 most frequent command types.
top_cmds = sorted_counts[:15]
labels = [cmd_name for cmd_name, _count in top_cmds]
values = [cmd_count for _name, cmd_count in top_cmds]
y_positions = range(len(labels))

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(y_positions, values, color=COLORS['story'])
ax.set_yticks(y_positions)
ax.set_yticklabels(labels)
# barh draws index 0 at the bottom; flip so the most frequent type is on top.
ax.invert_yaxis()
ax.set(xlabel='Count', title='Top 15 Command Types in EventTextDump')
plt.tight_layout()
plt.show()

## 3. Narrative Extraction Overview

The `narrative_extraction.md` file contains the human-annotated narrative structure.

In [None]:
# Parse the annotated narrative; `narrative_parser.arcs` is then walked as
# arc -> 'sequences' -> sequence -> 'beats'.
narrative_parser = NarrativeExtractionParser('../data/narrative_extraction.md')
narrative_parser.parse()

# Tally sequences and beats in one pass over the arcs.
arc_total = len(narrative_parser.arcs)
sequence_total = 0
beat_total = 0
for arc in narrative_parser.arcs.values():
    arc_sequences = arc['sequences']
    sequence_total += len(arc_sequences)
    for sequence in arc_sequences.values():
        beat_total += len(sequence['beats'])

print(f"Arcs: {arc_total}")
print(f"Total sequences: {sequence_total}")
print(f"Total beats: {beat_total}")
print(f"Characters: {len(narrative_parser.characters)}")

In [None]:
# Print an indented outline: one line per arc, one nested line per sequence.
print("Narrative Arcs:")
print("=" * 60)
for arc_key, arc in narrative_parser.arcs.items():
    arc_title = arc.get('title', 'Untitled')
    print(f"\nArc {arc_key}: {arc_title}")
    for seq_key, seq in arc['sequences'].items():
        seq_title = seq.get('title', 'Untitled')
        beat_count = len(seq['beats'])
        print(f"  └─ Sequence {seq_key}: {seq_title} ({beat_count} beats)")

## 4. Dialogue Extraction

Let's extract and preview dialogue from the game.

In [None]:
# Extract all dialogue segments from the parsed event dump.
dialogues = parser.extract_dialogue()
print(f"Total dialogue segments: {len(dialogues)}")

# Preview the first 10 segments, each cut to PREVIEW_CHARS characters.
PREVIEW_CHARS = 50
print("\nSample dialogues:")
print("-" * 60)
for d in dialogues[:10]:
    # Prefer the map name; fall back to the map id, then to 'Unknown'.
    map_name = d.get('map_name', d.get('map_id', 'Unknown'))
    text = d.get('text', '')
    # Only mark the preview with an ellipsis when text was actually cut off,
    # so short lines are not mistaken for truncated ones.
    ellipsis = '...' if len(text) > PREVIEW_CHARS else ''
    print(f"[{map_name}] {text[:PREVIEW_CHARS]}{ellipsis}")

## 5. Context Keywords

Review the keyword lists used for semantic analysis.

In [None]:
# Load the keyword configuration used by the semantic analysis.
keywords = load_json_data('../data/context_keywords.json')

# Flat keyword lists: print a heading followed by the raw list
# (an empty list if the key is missing).
for heading, key in (("Entities:", 'entities'), ("\nThemes:", 'themes')):
    print(heading)
    print(keywords.get(key, []))

# Motif contexts map a context name to its terms; print one line per context.
print("\nMotif Contexts:")
motif_contexts = keywords.get('motif_contexts', {})
for context_name, context_terms in motif_contexts.items():
    print(f"  {context_name}: {context_terms}")

## 6. Quick Statistics

Summary statistics for the dataset.

In [None]:
# Calculate some basic statistics, grouped by data source.
stats = {
    'Event Dump': {
        'Maps': len(parser.maps),
        'Events': sum(len(m['events']) for m in parser.maps.values()),
        'Commands': sum(sum(len(e['commands']) for e in m['events'].values()) for m in parser.maps.values()),
        'Dialogues': len(dialogues)
    },
    'Narrative Structure': {
        'Arcs': len(narrative_parser.arcs),
        'Sequences': sum(len(arc['sequences']) for arc in narrative_parser.arcs.values()),
        'Beats': sum(sum(len(seq['beats']) for seq in arc['sequences'].values()) for arc in narrative_parser.arcs.values()),
        'Characters': len(narrative_parser.characters)
    },
    'Keywords': {
        'Entities': len(keywords.get('entities', [])),
        'Themes': len(keywords.get('themes', [])),
        'Motif Contexts': len(keywords.get('motif_contexts', {}))
    }
}

# Build a long-form table (one row per metric) rather than
# `pd.DataFrame(stats).T`: the groups have disjoint metric names, so the wide
# form aligns on the union of all metrics, fills most cells with NaN and
# upcasts the integer counts to float.
df_stats = pd.DataFrame(
    [
        (category, metric, count)
        for category, metrics in stats.items()
        for metric, count in metrics.items()
    ],
    columns=['Category', 'Metric', 'Count'],
).set_index(['Category', 'Metric'])

print("Dataset Summary:")
print(df_stats.to_string())

---

**Next**: Proceed to `02_alignment_deep_dive.ipynb` to explore the structural alignment analysis in detail.