In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import nltk

# Fetch the POS-tagger model only when it is not already installed, so that a
# "Restart & Run All" does not need network access once the data is cached.
# NOTE(review): newer NLTK releases may look for a language-suffixed resource
# name instead - confirm against the installed NLTK version.
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

# Function to apply POS tagging
def text_to_pos_features(text, lang="eng"):
    """
    Convert text to a Part-of-Speech (POS) tagged representation.

    The text is split on whitespace, tagged with ``nltk.pos_tag`` and every
    token is re-emitted as ``word_TAG``; the tokens are joined by single spaces.

    Args:
        text (str): Input text. Non-string values (for example None, or the
            float NaN pandas yields for an empty CSV cell) are treated as
            empty input instead of failing on ``.split()``.
        lang (str): Language code forwarded to ``nltk.pos_tag``. Defaults to
            "eng", the value ``nltk.pos_tag`` uses when none is given.
            NOTE(review): if the corpus is Russian-language text, "rus" (with
            the matching NLTK tagger resource installed) is probably the
            right choice - confirm.
    Returns:
        str: Space-separated ``word_TAG`` tokens, or "" when there is
        nothing to tag.
    """
    # Missing values read from a CSV are not strings; treat them as empty text.
    if not isinstance(text, str):
        return ""

    tokens = text.split()
    if not tokens:
        # Empty or whitespace-only text: skip the tagger call entirely.
        return ""

    pos_tags = nltk.pos_tag(tokens, lang=lang)  # list of (word, tag) pairs
    return " ".join(f"{word}_{tag}" for word, tag in pos_tags)

# Dataset loader
def load_and_preprocess(data_path):
    """
    Read the dataset CSV into a DataFrame.

    The file is parsed with ``pd.read_csv`` using its default settings and the
    resulting frame is returned as-is; no further transformation is applied.

    Args:
        data_path (str): Path to the dataset CSV file.
    Returns:
        pd.DataFrame: The parsed contents of the CSV file.
    """
    return pd.read_csv(data_path)

# --- Data ---------------------------------------------------------------
# Location of the author-attribution CSV; point this at your own dataset.
data_path = "../data/Russian/author_data.csv"
data = load_and_preprocess(data_path)

# Inputs are the raw texts, targets are the author labels.
X, y = data['text'], data['author']

# Hold out 20% of the rows for testing; the split is stratified on the author
# label and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Features -----------------------------------------------------------
# Run the POS-tagging helper over every document of each split (train first).
X_train_pos, X_test_pos = (
    part.apply(text_to_pos_features) for part in (X_train, X_test)
)

# Count 1- to 3-grams; the vocabulary is fitted on the training split only
# and then reused unchanged for the test split.
vectorizer = CountVectorizer(ngram_range=(1, 3))
X_train_vec = vectorizer.fit_transform(X_train_pos)
X_test_vec = vectorizer.transform(X_test_pos)

# --- Model --------------------------------------------------------------
# Multinomial Naive Bayes with smoothing parameter alpha=1.0.
nb_model = MultinomialNB(alpha=1.0).fit(X_train_vec, y_train)

# --- Evaluation ---------------------------------------------------------
y_pred = nb_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 on the held-out split.
report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Accuracy: 0.82
Classification Report:
              precision    recall  f1-score   support

   bulgakov        0.99      0.68      0.80      2789
    chekhov        0.99      0.45      0.62      2599
dostoevskiy        0.65      0.99      0.79      4362
      gorky        0.94      0.87      0.90      3285
    tolstoy        0.88      0.93      0.90      4253

    accuracy                           0.82     17288
   macro avg       0.89      0.78      0.80     17288
weighted avg       0.87      0.82      0.82     17288



In [4]:
# Save the trained classifier to disk.
import pickle

filename = 'finalized_model.sav'
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving an open handle behind.
with open(filename, 'wb') as model_file:
    pickle.dump(nb_model, model_file)
# NOTE(review): only the classifier is written here; predicting on new text
# also needs the fitted `vectorizer` - confirm it is persisted elsewhere.