# In [None]:
# 4_authorship_attribution.ipynb

import sys
sys.path.append('..')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from src.preprocessor import KieuPreprocessor
from src.vectorizer import TfidfVectorizer
from src.authorship import AuthorshipClassifier
from authorship_analysis import load_author_texts, prepare_dataset

# Load the corpus: a mapping from each author to that author's verses.
authors_data = load_author_texts()

# Report how many verses each author contributes.
for author, verses in authors_data.items():
    print(f"{author}: {len(verses)} verses")

# Build the samples (X) and their author labels (y).
preprocessor = KieuPreprocessor(
    stopwords_file='../data/vietnamese_stopwords.txt',
)
X, y = prepare_dataset(authors_data, preprocessor)

# Run every sample through the preprocessor's per-verse pipeline.
# NOTE(review): prepare_dataset also receives the preprocessor -- confirm that
# X still holds unprocessed verses so preprocess_verse is not applied twice.
tokenized_verses = list(map(preprocessor.preprocess_verse, X))

# Fit the TF-IDF model on the preprocessed verses and vectorise them.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tokenized_verses)

# Build the final feature matrix from the TF-IDF matrix and the preprocessed
# verses (TF-IDF + stylometric features, per the original notebook comment).
classifier = AuthorshipClassifier()
features = classifier.extract_features(tfidf_matrix, tokenized_verses)

# Project the feature matrix onto two dimensions with t-SNE so the samples
# can be compared visually; random_state is fixed at 42.
tsne = TSNE(n_components=2, random_state=42)
features_2d = tsne.fit_transform(features)

# One row per sample: the two embedding coordinates plus the author label.
df = pd.DataFrame(dict(
    x=features_2d[:, 0],
    y=features_2d[:, 1],
    author=y,
))

# Scatter plot coloured by author, with the legend anchored outside the axes
# so it does not cover any points.
plt.figure(figsize=(12, 10))
ax = sns.scatterplot(
    data=df, x='x', y='y', hue='author', palette='viridis', alpha=0.7,
)
ax.set_title('t-SNE Visualization of Author Styles')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Feature importance analysis
from sklearn.model_selection import train_test_split


def _class_weight_vector(model, target_class):
    """Return the linear weights that favour *target_class* over the rest.

    The layout of ``coef_`` depends on the estimator, so indexing it with the
    position of the class in ``classes_`` is only valid in one of three cases:

    * Two classes: ``coef_`` has a single row whose positive direction is
      ``classes_[1]``; the row is negated when the target is ``classes_[0]``.
    * libsvm-based SVMs (recognised by ``dual_coef_``) with more than two
      classes: rows are one-vs-one pairs ordered "0 vs 1", "0 vs 2", ...,
      positive meaning the first class of the pair. The rows involving the
      target are sign-aligned and averaged.
    * Otherwise ``coef_`` is taken to hold one row per class.

    Raises:
        ValueError: if *target_class* is not among ``model.classes_``.
    """
    coef = model.coef_
    if hasattr(coef, 'toarray'):
        # Sparse training data can yield a sparse coefficient matrix.
        coef = coef.toarray()
    coef = np.atleast_2d(np.asarray(coef))
    classes = np.asarray(model.classes_)

    matches = np.flatnonzero(classes == target_class)
    if matches.size == 0:
        raise ValueError(
            f"{target_class!r} is not among the trained classes: "
            f"{list(classes)}")
    target_idx = matches[0]
    n_classes = len(classes)

    if n_classes == 2 and coef.shape[0] == 1:
        return coef[0] if target_idx == 1 else -coef[0]

    if hasattr(model, 'dual_coef_'):
        first, second = np.triu_indices(n_classes, k=1)
        if coef.shape[0] == first.size:
            # +1 where the target is the pair's first class, -1 where it is
            # the second, 0 where the pair does not involve the target.
            signs = (first == target_idx).astype(float) - (second == target_idx)
            involved = signs != 0
            return (signs[involved, np.newaxis] * coef[involved]).mean(axis=0)

    return coef[target_idx]


# Hold out 20% of the samples, stratified by author, and train on the rest.
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42, stratify=y)

classifier.train(X_train, y_train)

# Linear models expose per-feature weights; look at "Nguyễn Du" vs others.
# Models without `coef_` (e.g. non-linear kernels) are skipped.
if hasattr(classifier.classifier, 'coef_'):
    feature_weights = _class_weight_vector(classifier.classifier, 'Nguyễn Du')

    # The 20 weights with the largest magnitude, in ascending order so the
    # strongest feature ends up at the top of the horizontal bar chart.
    top_feature_indices = np.argsort(np.abs(feature_weights))[-20:]
    top_feature_weights = feature_weights[top_feature_indices]

    # Invert the vocabulary once instead of scanning it for every index.
    # setdefault keeps the first word seen should two words share an index.
    n_vocab = len(vectorizer.vocabulary)
    index_to_word = {}
    for word, word_idx in vectorizer.vocabulary.items():
        index_to_word.setdefault(word_idx, word)

    # NOTE(review): assumes the TF-IDF columns come first and the stylometric
    # columns follow in this order -- verify against extract_features and
    # extend the list; unnamed columns are labelled "style_<n>".
    feature_names = ["avg_word_length", "verse_length", "function_word_ratio"]

    top_features = []
    for idx in top_feature_indices:
        idx = int(idx)
        if idx < n_vocab:
            # It's a word feature
            top_features.append(index_to_word.get(idx, f"feature_{idx}"))
        else:
            # It's a stylometric feature
            style_idx = idx - n_vocab
            if style_idx < len(feature_names):
                top_features.append(feature_names[style_idx])
            else:
                top_features.append(f"style_{style_idx}")

    # Plot
    plt.figure(figsize=(12, 8))
    plt.barh(top_features, top_feature_weights)
    plt.title('Top Features for Identifying Nguyễn Du')
    plt.xlabel('Feature Importance')
    plt.tight_layout()
    plt.show()