In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import nltk

nltk.download('averaged_perceptron_tagger')

# Function to apply POS tagging
def text_to_pos_features(text, lang="eng"):
    """
    Convert text to a Part-of-Speech (POS) tagged representation.

    The text is split on whitespace, every token is tagged with
    ``nltk.pos_tag`` and the pairs are re-joined as ``token_TAG`` items
    separated by single spaces.

    Args:
        text (str): Input text. A missing value (``None`` or a float NaN) is
            treated as empty text instead of raising.
        lang (str): Language code forwarded to ``nltk.pos_tag``. Defaults to
            ``"eng"``, which is NLTK's own default, so existing callers keep
            their current behaviour.
            NOTE(review): the data loaded in this notebook is Russian; NLTK
            documents ``lang="rus"`` for that, but it needs the Russian tagger
            model downloaded first (resource name depends on the installed
            NLTK version) - confirm before switching.
    Returns:
        str: POS-tagged text, or ``""`` when there is nothing to tag.
    """
    # NaN is the only float that is not equal to itself; it and None have no
    # .split(), so bail out before touching them.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    tokens = text.split()
    if not tokens:
        # Blank / whitespace-only text: skip the tagger call altogether.
        return ""

    pos_tags = nltk.pos_tag(tokens, lang=lang)  # list of (token, tag) pairs
    return " ".join(f"{word}_{tag}" for word, tag in pos_tags)

# Load and preprocess data
def load_and_preprocess(data_path):
    """
    Read the dataset from a CSV file.

    The frame is handed back exactly as ``pd.read_csv`` parsed it; no
    cleaning or filtering step is applied here.
    Args:
        data_path (str): Location of the dataset CSV file.
    Returns:
        pd.DataFrame: Contents of the CSV file.
    """
    return pd.read_csv(data_path)

# Read the corpus from disk
data_path = "../data/Russian/author_data.csv"  # Replace with your dataset
data = load_and_preprocess(data_path)

# Inputs are the raw texts, targets are the author labels
X, y = data['text'], data['author']

# Hold out 20% of the rows; stratify=y keeps the author proportions the same
# in both parts, random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# POS-tag each split separately
X_train_pos = X_train.apply(text_to_pos_features)
X_test_pos = X_test.apply(text_to_pos_features)

# Count 1- to 3-grams; the vocabulary is fitted on the training split only
# and then reused unchanged for the test split
vectorizer = CountVectorizer(ngram_range=(1, 3))
X_train_vec = vectorizer.fit_transform(X_train_pos)
X_test_vec = vectorizer.transform(X_test_pos)

# Fit the Naive Bayes model (fit() returns the estimator itself)
nb_model = MultinomialNB(alpha=1.0).fit(X_train_vec, y_train)

# Score the held-out split
y_pred = nb_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-author precision / recall / f1
report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Accuracy: 0.82
Classification Report:
              precision    recall  f1-score   support

   bulgakov        0.99      0.68      0.80      2789
    chekhov        0.99      0.45      0.62      2599
dostoevskiy        0.65      0.99      0.79      4362
      gorky        0.94      0.87      0.90      3285
    tolstoy        0.88      0.93      0.90      4253

    accuracy                           0.82     17288
   macro avg       0.89      0.78      0.80     17288
weighted avg       0.87      0.82      0.82     17288



In [4]:
# Save the trained classifier to disk.
# NOTE: only nb_model is written. Predictions above are made on the output of
# the fitted `vectorizer`, so that object is needed as well to reuse the model.
import pickle
filename = 'finalized_model.sav'
# The with-block closes the file handle even if pickling raises
with open(filename, 'wb') as model_file:
    pickle.dump(nb_model, model_file)

In [5]:
import numpy as np
import random

def generate_text_from_author(nb_model, vectorizer, author, max_len=50, temperature=1.0):
    """
    Generate text in the style of a given author using a trained Naive Bayes classifier.

    The first token is picked uniformly at random from the vocabulary. Every
    following token is drawn independently from the author's per-feature
    distribution (``nb_model.feature_log_prob_``) after temperature scaling;
    no context is carried from one token to the next.

    Args:
        nb_model: Trained Naive Bayes model exposing ``classes_`` and
            ``feature_log_prob_``.
        vectorizer: Fitted CountVectorizer; its ``get_feature_names_out()``
            must line up with the columns of ``feature_log_prob_``.
        author (str): Author whose style to mimic. Must match an entry of
            ``nb_model.classes_`` exactly (including any whitespace).
        max_len (int): Number of vocabulary entries to emit. If the vectorizer
            was fitted with n-grams, one entry can contain several words.
            Values <= 0 produce an empty string.
        temperature (float): Sampling temperature, must be > 0. Values below 1
            sharpen the distribution, values above 1 flatten it.
    Returns:
        str: Generated entries joined by single spaces.
    Raises:
        ValueError: If ``author`` is unknown to the model or ``temperature``
            is not strictly positive.
    """
    if author not in nb_model.classes_:
        raise ValueError(f"Author '{author}' not found in the trained model.")
    if temperature <= 0:
        # 1 / temperature would divide by zero or invert the distribution
        raise ValueError("temperature must be a positive number.")
    if max_len <= 0:
        return ""

    # Row of feature_log_prob_ that belongs to the requested author
    author_index = np.where(nb_model.classes_ == author)[0][0]

    # p ** (1 / T), renormalised, equals softmax(log p / T). Doing it in log
    # space and subtracting the maximum keeps the largest term at exp(0) = 1,
    # so a small temperature can no longer underflow every entry to zero.
    # The division creates a new array, so the model's own table is not modified.
    scaled_log_probs = np.asarray(nb_model.feature_log_prob_[author_index], dtype=float) / temperature
    scaled_log_probs -= np.max(scaled_log_probs)
    adjusted_probs = np.exp(scaled_log_probs)
    adjusted_probs /= np.sum(adjusted_probs)

    # Vocabulary from the vectorizer
    vocab = vectorizer.get_feature_names_out()

    # Start with a random seed word
    seed_word = random.choice(vocab)

    # The distribution does not change between draws, so sample the remaining
    # entries in a single call (size may be 0 when max_len == 1)
    sampled = np.random.choice(vocab, size=max_len - 1, p=adjusted_probs)

    return " ".join([seed_word, *sampled])

In [8]:
# Label must equal an entry of nb_model.classes_ character for character
target_author = "tolstoy "  # Replace with an actual author from your dataset
generated_text = generate_text_from_author(
    nb_model=nb_model,
    vectorizer=vectorizer,
    author=target_author,
    max_len=50,
    temperature=1.0,
)
print(f"Generated text in the style of {target_author}:")
print(generated_text)

Generated text in the style of tolstoy :
плаксиво _nnp влюбленный_nn время_nnp такое_nnp же_nnp весьма_nnp _nnp спину_nnp ход_nnp дел шерсть_jj и_nnp _nnp секретарь_nnp сидел_nnp нетрудно _nnp ибо_nnp рабы_nnp господа_nnp же _nnp закричал_nnp девушки _nnp все_nn _nnp нибудь_jj теперь_nnp онучи_nnp и_nnp лапти едва_nnp отъехал _jj мускулистой_nnp она_nnp прибирала_nnp на_nnp рассказывает_jj не_nnp взволноваться _nnp нам_nnp скучно сыграл_nnp трехактную_nnp рядами_nnp стулья_nnp с_nnp _nnp выронила_nnp на_nnp которой_nnp обратились_nnp дело_nnp сказала_nnp память _nnp помимо_nnp сделаем _nnp унизительным_nnp и_nnp от_vbd 1_cd декабря_nn изба_nnp завалиться_nnp хочет кременчугский_ 8_cd _vbd 2_cd могу_nnp тому _nnp что_nnp кроме_nnp я_nnp тогда_nnp ничего_nnp _nnp памяти_nn кто_nnp вам_nnp измокшее_nnp _nnp да_nnp оттого сделать_nnp _nnp долли_nnp дожидаясь_nnp обычных_nnp не_nnp захочешь_nnp _nnp крепко_nnp растирая_nnp что_nnp остров_nnp жены _nnp хрипя_nnp _nnp хуже
