In [None]:
# Authorship Attribution Analysis

import sys
sys.path.append('..')
from src.preprocessor import KieuPreprocessor
from src.vectorizer import TfidfVectorizer
from src.authorship import AuthorshipClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Load Truyện Kiều and tokenize every verse.
# Both paths are relative to the notebook's working directory.
stopwords_path = '../data/vietnamese_stopwords.txt'
poem_path = '../data/truyen_kieu.txt'

preprocessor = KieuPreprocessor(stopwords_file=stopwords_path)
verses = preprocessor.load_poem(poem_path)
tokenized_verses = preprocessor.preprocess_all_verses(verses)

In [None]:
# Vectorize the tokenized verses into a TF-IDF matrix.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tokenized_verses)

# Demonstration labels: 1 = Nguyễn Du, 0 = Not Nguyễn Du.
# No comparison corpus is loaded in this notebook, so every verse starts out
# labeled as Nguyễn Du and a random 30% are then relabeled as "not Nguyễn Du".
# A real analysis would load actual texts by other authors instead.
n_verses = len(verses)
labels = np.ones(n_verses)

np.random.seed(42)  # fixed seed so the simulated labels are reproducible
n_relabeled = int(n_verses * 0.3)
random_indices = np.random.choice(n_verses, size=n_relabeled, replace=False)
labels[random_indices] = 0

# Build the classifier and derive its feature matrix from the TF-IDF output.
classifier = AuthorshipClassifier()
features = classifier.extract_features(tfidf_matrix, tokenized_verses)

# Hold out 20% of the verses BEFORE training, so the "unknown" verses used
# for evaluation are never seen by the classifier. Training on the full
# feature matrix and then predicting on a subset of it would leak the test
# rows into training and make the confidence scores look better than they are.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42)

# Train on the training partition only.
classifier.train(X_train, y_train)

# Predict on the held-out verses; `confidence` holds one score per test row.
predictions, confidence = classifier.predict(X_test)

In [None]:
# Histogram of the classifier's confidence on the held-out verses,
# with a dashed red marker at the 0.5 decision threshold.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(confidence, bins=20)
ax.axvline(0.5, color='red', linestyle='--')
ax.set_title('Distribution of Confidence Scores')
ax.set_xlabel('Confidence Score')
ax.set_ylabel('Frequency')
plt.show()

# Display sample predictions alongside the verse each one actually refers to.
#
# train_test_split shuffles rows, so position i in the test set is NOT
# verse i (nor any simple rescaling of i). Recover the original verse index
# of every test row by splitting the row positions with the same test_size
# and random_state used for the feature/label split; for the same number of
# rows this reproduces the same partition.
_, test_verse_indices = train_test_split(
    np.arange(len(labels)), test_size=0.2, random_state=42)
assert len(test_verse_indices) == len(y_test), \
    "index split is out of sync with the feature/label split"

# Draw up to 10 distinct test rows (no duplicates, safe for tiny test sets).
n_shown = min(10, len(y_test))
sample_positions = np.random.choice(len(y_test), size=n_shown, replace=False)

print("Sample predictions:")
for idx in sample_positions:
    verse_idx = test_verse_indices[idx]
    auth = "Nguyễn Du" if predictions[idx] == 1 else "Not Nguyễn Du"
    print(f"Verse: {verses[verse_idx]}")
    print(f"Prediction: {auth} (confidence: {confidence[idx]:.4f})")
    print(f"Actual: {'Nguyễn Du' if y_test[idx] == 1 else 'Not Nguyễn Du'}")
    print("---")