# Historical Event Narrator - Dataset Analysis

This notebook explores the dataset generated for the Historical Event Narrator project. We analyze the distribution of instructions, the most common words in the event titles, the length of the narratives (measured in words), and overall data quality.

In [None]:
import json
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from wordcloud import WordCloud

In [None]:
# Load Data
# The training set is a JSON Lines file: one JSON object per line.
data_path = "../data/processed/train.jsonl"
data = []
# Read as UTF-8 explicitly. Without `encoding`, open() falls back to the
# platform default (e.g. cp1252 on Windows), which can fail on or garble
# non-ASCII characters in the text.
with open(data_path, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Tolerate blank lines (such as a trailing empty line at the end
            # of the file) instead of crashing inside json.loads.
            continue
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as exc:
            # Re-raise with the file name and line number so a corrupt record
            # can be located quickly. JSONDecodeError is itself a ValueError.
            raise ValueError(
                f"{data_path}: invalid JSON on line {line_number}: {exc}"
            ) from exc

# One row per example; the columns are whatever keys the JSON objects carry.
df = pd.DataFrame(data)
print(f"Total Examples: {len(df)}")
df.head()

## 1. Instruction Distribution
Let's see which instructions are most common.

In [None]:
# Count how often each distinct instruction occurs, then draw the counts as
# horizontal bars on an explicitly created figure/axes pair.
instruction_counts = df['instruction'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
instruction_counts.plot(kind='barh', color='skyblue', ax=ax)
ax.set_title("Instruction Distribution")
ax.set_xlabel("Count")
plt.show()

## 2. Word Cloud of Titles
What historical events are we covering? The word cloud below is built from the `input` field of every example.

In [None]:
# Concatenate every `input` value into one corpus for the word cloud.
# Missing values are dropped and the rest are cast to str first, because
# str.join raises TypeError as soon as it meets a non-string item (e.g. NaN).
text = " ".join(df['input'].dropna().astype(str).tolist())

# A fixed random_state makes the word placement and colours reproducible, so
# re-running the notebook yields the same figure.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    random_state=42,
).generate(text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # pixel axes carry no meaning for a word cloud image
plt.title("Common Words in Event Titles")
plt.show()

## 3. Length Analysis
How long are the narratives?

In [None]:
# Number of whitespace-separated words in each narrative.
# Missing outputs are counted as 0 words (and non-string values are cast to
# str) so a single bad record does not abort the analysis with an
# AttributeError; such rows show up in the left-most histogram bin.
df['length'] = df['output'].fillna("").astype(str).str.split().str.len()

plt.figure(figsize=(10, 6))
plt.hist(df['length'], bins=50, color='salmon', edgecolor='black')
plt.title("Narrative Length Distribution (Words)")
plt.xlabel("Word Count")
plt.ylabel("Frequency")
plt.show()