In [None]:
import sys
# Put '../src' at the front of the module search path; this must run before
# the `data_loader` import below (presumably that module lives in ../src --
# TODO confirm).
sys.path.insert(0, '../src')

from data_loader import HotpotQALoader
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's 'whitegrid' style for the figures drawn in this notebook.
sns.set_style('whitegrid')
# IPython line magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Load Dataset

In [None]:
# Build the loader: subset size of 100, random seed fixed at 42.
loader = HotpotQALoader(
    subset_size=100,
    random_seed=42,
)

# Load the validation split. Kept as the last expression of the cell so its
# return value (if any) is still echoed as the cell output.
loader.load_dataset(split='validation')

## 2. Dataset Statistics

In [None]:
# Grab the first five records of the loaded dataset for a quick look.
examples = [loader.dataset[idx] for idx in range(5)]

# Print the main fields of the very first record, one per line.
first_example = examples[0]
print("Example question:")
for label, key in (("Question", 'question'), ("Answer", 'answer'), ("Type", 'type')):
    print(f"{label}: {first_example[key]}")
print(f"\nSupporting facts: {first_example['supporting_facts']}")

In [None]:
# Analyze answer lengths
import statistics  # stdlib; imported here so this cell runs without editing the import cell

# Whitespace-separated word count of every answer in the loaded dataset.
answer_lengths = [len(ex['answer'].split()) for ex in loader.dataset]

# Histogram of the word counts.
plt.figure(figsize=(10, 5))
plt.hist(answer_lengths, bins=50, edgecolor='black')
plt.xlabel('Answer Length (words)')
plt.ylabel('Frequency')
plt.title('Distribution of Answer Lengths in HotpotQA')
plt.show()

print(f"Mean answer length: {sum(answer_lengths)/len(answer_lengths):.2f} words")
# statistics.median averages the two middle values when the number of answers
# is even; indexing sorted(...)[n // 2] would report the upper-middle value
# instead of the true median in that case.
print(f"Median answer length: {statistics.median(answer_lengths)} words")

In [None]:
# Count how often each question type occurs in the loaded dataset.
question_types = [ex['type'] for ex in loader.dataset]
type_counts = pd.Series(question_types).value_counts()

# Draw the counts as a bar chart on an explicitly created 8x5 axes.
types_fig, types_ax = plt.subplots(figsize=(8, 5))
type_counts.plot.bar(ax=types_ax)
types_ax.set_xlabel('Question Type')
types_ax.set_ylabel('Count')
types_ax.set_title('Distribution of Question Types')
plt.xticks(rotation=45)  # acts on the current axes, i.e. the one just created
plt.tight_layout()
plt.show()

## 3. Create and Analyze Subset

In [None]:
# Ask the loader for a subset using the 'random' strategy.
subset = loader.create_subset(strategy='random')

# Report how many examples the subset holds.
subset_size = len(subset)
print(f"Created subset with {subset_size} examples")

In [None]:
# Collect one summary record per subset example: its id, the whitespace word
# counts of question and answer, the question type, and the number of
# supporting-fact titles.
subset_records = []
for ex in subset:
    subset_records.append({
        'id': ex['id'],
        'question_length': len(ex['question'].split()),
        'answer_length': len(ex['answer'].split()),
        'type': ex['type'],
        'n_supporting_facts': len(ex['supporting_facts']['title']),
    })
subset_df = pd.DataFrame(subset_records)

# Descriptive statistics first, then the per-type counts.
print(subset_df.describe())
print("\nQuestion types in subset:")
print(subset_df['type'].value_counts())

In [None]:
# Two side-by-side histograms describing the subset.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: (column, number of bins, x-axis label, title).
panel_specs = [
    ('question_length', 20, 'Question Length (words)', 'Subset: Question Lengths'),
    ('n_supporting_facts', 10, 'Number of Supporting Facts', 'Subset: Supporting Facts per Question'),
]

for panel, (column, n_bins, x_label, panel_title) in zip(axes, panel_specs):
    panel.hist(subset_df[column], bins=n_bins, edgecolor='black')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

## 4. Prepare Corpus for Retrieval

In [None]:
# Have the loader build the retrieval corpus and report its size.
corpus = loader.prepare_corpus()
print(f"Corpus size: {len(corpus)} passages")

# Whitespace word count of each passage's 'text', plus the mean, computed once
# and reused for both the marker line and the printed figure.
passage_lengths = [len(passage['text'].split()) for passage in corpus]
mean_passage_length = sum(passage_lengths) / len(passage_lengths)

# Histogram of passage lengths with a dashed red line at the mean.
plt.figure(figsize=(10, 5))
plt.hist(passage_lengths, bins=50, edgecolor='black')
plt.xlabel('Passage Length (words)')
plt.ylabel('Frequency')
plt.title('Distribution of Passage Lengths in Corpus')
plt.axvline(mean_passage_length, color='red', linestyle='--', label='Mean')
plt.legend()
plt.show()

print(f"Mean passage length: {mean_passage_length:.2f} words")

## 5. Sample Questions

In [None]:
# Show five subset examples, drawn with a fixed seed so the pick is repeatable.
import random
random.seed(42)

samples = random.sample(subset, 5)

banner = '=' * 80  # horizontal rule printed above and below each heading
for number, sample in enumerate(samples, start=1):
    print(f"\n{banner}")
    print(f"Example {number}")
    print(banner)
    print(f"Question: {sample['question']}")
    print(f"Answer: {sample['answer']}")
    print(f"Type: {sample['type']}")
    # Only the first three supporting-fact titles are shown.
    shown_titles = sample['supporting_facts']['title'][:3]
    print(f"Supporting facts: {shown_titles}...")

## 6. Save Subset

In [None]:
# Persist the subset through the loader's own save method.
loader.save_subset('../data/hotpotqa_subset.json')

# Write the corpus next to it as JSON indented by two spaces.
import json
corpus_path = '../data/corpus.json'
with open(corpus_path, 'w') as corpus_file:
    json.dump(corpus, corpus_file, indent=2)

print("Dataset subset and corpus saved successfully!")

## Summary

- Loaded HotpotQA validation set
- Created random subset of 100 examples
- Prepared corpus with passages for retrieval
- Analyzed dataset characteristics
- Saved the subset (`../data/hotpotqa_subset.json`) and the corpus (`../data/corpus.json`) for experiments