In [None]:
# TF-IDF Visualization for Truyện Kiều

import sys
sys.path.append('..')
from src.preprocessor import KieuPreprocessor
from src.vectorizer import TfidfVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
import pandas as pd

# Input files (relative paths, resolved against the current working directory)
STOPWORDS_PATH = '../data/vietnamese_stopwords.txt'
POEM_PATH = '../data/truyen_kieu.txt'

# Build the preprocessor with its stopword list, read the poem into
# `verses`, then run the preprocessing step over every verse.
preprocessor = KieuPreprocessor(stopwords_file=STOPWORDS_PATH)
verses = preprocessor.load_poem(POEM_PATH)
tokenized_verses = preprocessor.preprocess_all_verses(verses)

In [None]:
# Create TF-IDF matrix (one row per verse)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tokenized_verses)

# Visualize TF-IDF weights for a sample verse
sample_idx = 100  # 0-based position in `verses` of the verse to analyze
sample_verse = verses[sample_idx]
print(f"Sample verse: {sample_verse}")

# Dense 1-D score vector for the sample verse and the columns that are non-zero
sample_vector = tfidf_matrix[sample_idx].toarray().flatten()
word_indices = np.where(sample_vector > 0)[0]

# Invert the word -> column mapping once, instead of rescanning the whole
# vocabulary for every non-zero column. `setdefault` keeps the first word
# seen for a given column, matching a first-match linear search.
index_to_word = {}
for vocab_word, vocab_idx in vectorizer.vocabulary.items():
    index_to_word.setdefault(vocab_idx, vocab_word)

# Get words and their TF-IDF scores
words = [index_to_word[int(idx)] for idx in word_indices]
scores = [sample_vector[idx] for idx in word_indices]

# Sort by score, highest first
sorted_indices = np.argsort(-np.array(scores))
words = [words[i] for i in sorted_indices]
scores = [scores[i] for i in sorted_indices]

In [None]:
# Plot the sample verse's TF-IDF scores as horizontal bars
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(words, scores)
# barh draws the first entry at the bottom of the axis; `words`/`scores`
# are sorted by descending score, so flip the y-axis to show the
# highest-scoring word at the top.
ax.invert_yaxis()
ax.set_title(f'TF-IDF Scores for Words in Verse {sample_idx+1}')
ax.set_xlabel('TF-IDF Score')
ax.set_ylabel('Words')
fig.tight_layout()
plt.show()

# Dimensionality reduction for visualization.
# TruncatedSVD's default solver is randomized, so pin random_state to make
# the 2-D projection (and therefore the scatter plot) reproducible across runs.
svd = TruncatedSVD(n_components=2, random_state=42)
tfidf_2d = svd.fit_transform(tfidf_matrix)

# Plot verses in 2D space: one point per verse, positioned by the two SVD components
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(tfidf_2d[:, 0], tfidf_2d[:, 1], alpha=0.5)
ax.set_title('Verses in 2D TF-IDF Space')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
plt.show()