In [None]:
# Data Exploration for Truyện Kiều

## 1. Load and Explore the Text

import sys
sys.path.append('..')  # Add parent directory to path
from src.preprocessor import KieuPreprocessor
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pandas as pd

# Load the poem.
# Input locations and the preview size are named here so they can be changed
# in one place; paths are relative to the notebook's working directory.
STOPWORDS_PATH = '../data/vietnamese_stopwords.txt'
POEM_PATH = '../data/truyen_kieu.txt'
N_PREVIEW = 5  # how many leading verses to print as a sanity check

preprocessor = KieuPreprocessor(stopwords_file=STOPWORDS_PATH)
verses = preprocessor.load_poem(POEM_PATH)

# Basic statistics
print(f"Total verses: {len(verses)}")
print("First few verses:")
# zip() stops at the shorter input, so a poem with fewer than N_PREVIEW
# verses is handled without an explicit min()/len() guard.
for line_no, verse_text in zip(range(1, N_PREVIEW + 1), verses):
    print(f"{line_no}: {verse_text}")

2


In [None]:
## 2. Analyze Verse Length

# Length of every verse as reported by len(); the x-axis label below treats
# this as a character count (assumes each verse is a str -- TODO confirm
# against KieuPreprocessor.load_poem).
verse_lengths = list(map(len, verses))

# Histogram of the length distribution, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(verse_lengths, bins=20)
ax.set_title('Distribution of Verse Lengths')
ax.set_xlabel('Characters per Verse')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
## 3. Word Frequency Analysis

# Tokenize every verse, then flatten the per-verse token sequences into a
# single list so occurrences can be counted across the whole poem.
tokenized_verses = preprocessor.preprocess_all_verses(verses)
all_words = []
for verse_tokens in tokenized_verses:
    all_words.extend(verse_tokens)
word_counts = Counter(all_words)

# Keep only the most frequent words and tabulate them for plotting
top_n = 20
top_words = word_counts.most_common(top_n)
df = pd.DataFrame(data=top_words, columns=['Word', 'Count'])

# Horizontal bar chart: one bar per word, bar length = occurrence count
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=df, x='Count', y='Word', ax=ax)
ax.set_title(f'Top {top_n} Words in Truyện Kiều')
plt.show()