# Narrative Consistency Analysis - Exploratory Notebook

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.utils import load_config
from src.narrative_analyzer import NarrativeAnalyzer

In [None]:
# Load the training and test splits from the data directory one level
# above the current working directory.
train_df, test_df = map(pd.read_csv, ('../data/train.csv', '../data/test.csv'))

# Quick sanity summary: row count of each split and the training columns.
print(f"Training data: {len(train_df)} examples")
print(f"Test data: {len(test_df)} examples")
print(f"Columns: {train_df.columns.tolist()}")

In [None]:
# Show how the training labels are balanced (table + bar chart) and what
# share of the training examples each book contributes (pie chart).
print("Label distribution in training data:")
label_counts = train_df['label'].value_counts()
print(label_counts)

# One row, two panels: labels on the left, books on the right.
fig, (label_ax, book_ax) = plt.subplots(1, 2, figsize=(10, 4))

label_counts.plot(kind='bar', ax=label_ax)
label_ax.set_title('Label Distribution')

book_counts = train_df['book_name'].value_counts()
book_counts.plot(kind='pie', autopct='%1.1f%%', ax=book_ax)
book_ax.set_title('Book Distribution')

fig.tight_layout()
plt.show()

In [None]:
# For every book (in order of first appearance in the training data),
# print its five most frequent characters with their example counts.
print("Top characters by book:")
for book in train_df['book_name'].unique():
    in_book = train_df['book_name'] == book
    top_five = train_df.loc[in_book, 'char'].value_counts().head(5)
    print(f"\n{book}:")
    for char, count in top_five.items():
        print(f"  {char}: {count} examples")

In [None]:
# Build the analyzer from the project configuration and run it once on the
# first test example, then print every score it reports.
config = load_config('../config.yaml')
model_settings = config.get('models', {})
narrative_settings = config.get('narratives', {})
analyzer = NarrativeAnalyzer(model_settings, narrative_settings)

sample = test_df.iloc[0]
analysis_inputs = {
    'backstory': str(sample['content']),
    'book_name': sample['book_name'],
    'character': sample['char'],
    'evidence': {'passages': []},  # no supporting passages are supplied here
}
result = analyzer.analyze(**analysis_inputs)

# Assemble the report first, then emit it in one write.
report_lines = [
    f"Analysis result for {sample['char']} in {sample['book_name']}:",
    f"  Temporal score: {result.temporal_score:.2f}",
    f"  Thematic score: {result.thematic_score:.2f}",
    f"  Character score: {result.character_score:.2f}",
    f"  Semantic score: {result.semantic_score:.2f}",
    f"  Factual score: {result.factual_score:.2f}",
    f"  Confidence: {result.confidence:.2f}",
]
print("\n".join(report_lines))

In [None]:
# Two side-by-side views of the training data:
#   left  - distribution of backstory length (in characters) per label group
#   right - share of 'consistent' examples for each book
plt.figure(figsize=(12, 6))

# Every row whose label is not exactly 'consistent' (missing labels
# included) is counted in the "contradict" group.
is_consistent = train_df['label'] == 'consistent'

# len(str(...)) keeps non-string cells (e.g. NaN) measurable instead of raising.
content_lengths = train_df['content'].map(lambda text: len(str(text)))
consistent_lengths = content_lengths[is_consistent].tolist()
contradict_lengths = content_lengths[~is_consistent].tolist()

plt.subplot(1, 2, 1)
plt.boxplot([consistent_lengths, contradict_lengths])
# Name the boxes through xticks rather than boxplot's `labels=` keyword,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`; this form
# works on both older and newer releases. Boxes sit at positions 1..N.
plt.xticks([1, 2], ['Consistent', 'Contradict'])
plt.title('Backstory Length by Label')
plt.ylabel('Character Count')

plt.subplot(1, 2, 2)
# groupby leaves out rows with a missing book name; filtering per book with
# `==` would select zero rows for a NaN name and then divide by zero.
# sort=False keeps the books in order of first appearance.
rate_by_book = is_consistent.groupby(train_df['book_name'], sort=False).mean()
books = rate_by_book.index.to_numpy()
consistency_rates = rate_by_book.tolist()

plt.bar(books, consistency_rates)
plt.title('Consistency Rate by Book')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Consistency Rate')

plt.tight_layout()
plt.show()

In [None]:
# Per-character share of 'consistent' labels, reported only for characters
# that have at least five training examples. Characters are visited in
# order of first appearance.
print("Character consistency rates:")
for char, labels in train_df.groupby('char', sort=False)['label']:
    total = len(labels)
    if total < 5:
        continue  # too few examples for a meaningful rate
    consistent = int((labels == 'consistent').sum())
    rate = consistent / total * 100
    print(f"  {char}: {consistent}/{total} ({rate:.1f}% consistent)")

In [None]:
# Preview the first test examples: character, book, and the first 150
# characters of the backstory text.
print("Sample backstories from test data:")
# head(3) yields at most three rows, so a test set with fewer than three
# examples is previewed in full instead of raising IndexError.
for i, (_, row) in enumerate(test_df.head(3).iterrows()):
    print(f"\n{i+1}. Character: {row['char']}")
    print(f"   Book: {row['book_name']}")
    print(f"   Content: {str(row['content'])[:150]}...")