In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import nltk

nltk.download('averaged_perceptron_tagger')

# Function to apply POS tagging
def text_to_pos_features(text):
    """
    Convert text to Part-of-Speech (POS) tagged representation.
    Args:
        text (str): Input text.
    Returns:
        str: POS-tagged text.
    """
    tokens = text.split()
    pos_tags = nltk.pos_tag(tokens)  # Generate POS tags
    return " ".join([f"{word}_{tag}" for word, tag in pos_tags])

# Load and preprocess data
def load_and_preprocess(data_path):
    """
    Load the dataset and preprocess it.
    Args:
        data_path (str): Path to the dataset CSV file.
    Returns:
        pd.DataFrame: Preprocessed dataset.
    """
    data = pd.read_csv(data_path)
    return data

# Load the data
data_path = "../data/Russian/author_data.csv"  # Replace with your dataset
data = load_and_preprocess(data_path)

# Split data into training and test sets
X = data['text']
y = data['author']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply POS tagging to train and test sets
X_train_pos = X_train.apply(text_to_pos_features)
X_test_pos = X_test.apply(text_to_pos_features)

# Vectorize POS-tagged text
vectorizer = CountVectorizer(ngram_range=(1, 3))
X_train_vec = vectorizer.fit_transform(X_train_pos)
X_test_vec = vectorizer.transform(X_test_pos)

# Train the Naive Bayes model
nb_model = MultinomialNB(alpha=1.0)
nb_model.fit(X_train_vec, y_train)

# Evaluate the model
y_pred = nb_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Detailed classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Accuracy: 0.82
Classification Report:
              precision    recall  f1-score   support

   bulgakov        0.99      0.68      0.80      2789
    chekhov        0.99      0.45      0.62      2599
dostoevskiy        0.65      0.99      0.79      4362
      gorky        0.94      0.87      0.90      3285
    tolstoy        0.88      0.93      0.90      4253

    accuracy                           0.82     17288
   macro avg       0.89      0.78      0.80     17288
weighted avg       0.87      0.82      0.82     17288



In [4]:
# save the model to disk
import pickle
filename = 'finalized_model.sav'
pickle.dump(nb_model, open(filename, 'wb'))

In [5]:
import numpy as np
import random

def generate_text_from_author(nb_model, vectorizer, author, max_len=50, temperature=1.0):
    """
    Generate text in the style of a given author using a trained Naive Bayes classifier.
    Args:
        nb_model: Trained Naive Bayes model.
        vectorizer: Fitted CountVectorizer.
        author (str): Author whose style to mimic.
        max_len (int): Maximum length of the generated text.
        temperature (float): Sampling temperature to control randomness.
    Returns:
        str: Generated text.
    """
    if author not in nb_model.classes_:
        raise ValueError(f"Author '{author}' not found in the trained model.")

    # Get the index of the author
    author_index = np.where(nb_model.classes_ == author)[0][0]

    # Extract the word probabilities for the given author
    word_probs = np.exp(nb_model.feature_log_prob_[author_index])  # Convert log probabilities to probabilities
    word_probs /= np.sum(word_probs)  # Normalize probabilities

    # Vocabulary from the vectorizer
    vocab = vectorizer.get_feature_names_out()

    # Start with a random seed word
    generated_text = [random.choice(vocab)]

    for _ in range(max_len - 1):
        # Adjust probabilities with temperature
        adjusted_probs = word_probs ** (1 / temperature)
        adjusted_probs /= np.sum(adjusted_probs)

        # Sample the next word
        next_word = np.random.choice(vocab, p=adjusted_probs)

        # Append to the generated text
        generated_text.append(next_word)

    return " ".join(generated_text)

In [22]:
target_author = "chekhov "  # Replace with an actual author from your dataset
generated_text = generate_text_from_author(nb_model, vectorizer, target_author, max_len=50, temperature=1.0)
print(f"Generated text in the style of {target_author}:\n{generated_text}")

Generated text in the style of chekhov :
2_cd мес _nnp подать _nnp только_nnp сергеевна_nnp и_nnp _nnp ему_nnp сопутствует_nnp находился_nnp степан_nnp _nnp настроение_nnp пьера был_nnp _nnp чтобы_nnp ждать_nn очевидно _nnp хотел_nnp говорил_nnp больше_nnp всех_nnp служат_nnp как_nnp наилучшей_nnp в_nnp сказал_nnp павлович_nnp карамазов_nnp на_nnp голове _nnp _nnp кашлять доме_nnp у_nnp ростовых здоровенных_nnp служанок_nnp казались_nnp ты_nnp мой_nnp злейший_nnp забора _nnp рабочие_nnp своею_nnp собственною_nnp волей_nnp стояли_nnp нагроможденные_nnp одна_nn прямых_nnp скобках_nnp на_nnp сейчас_nnp совершится_nnp на_nnp не_nnp обидной_nnp с_nn 16 присылали_nnp за_nnp дома_nnp забрались_nnp в_nnp квартиру_nnp петра_nnp университет_nnp и_nnp должен_nnp _nnp восклицает_nnp отчаянный не_nnp потерявший_nnp своей_nnp плодятся_nnp понемногу делалось_nnp не_nnp варвару _nnp поезде_nnp участка_nnp р_nnp запрыгал _nnp слово_nnp отлично_nnp рекомендованным_nnp жильцам постепенного_nnp счастье _n

In [14]:
# Run classification on a new text
new_text = "В тусклом свете заката, среди руин старого города, он стоял, как призрак, обречённый на забвение. Его глаза, полные горя и отчаяния, рассказывали историю о потерянной любви и разрушенных мечтах. В этом мгновении время остановилось, и он, как символ страдания, стал вечным свидетелем человеческой печали."
# Preprocess the new text by lowercasing and applying POS tagging
new_text = new_text.lower()
new_text_pos = text_to_pos_features(new_text)
new_text_vec = vectorizer.transform([new_text_pos])

# Predict the author of the new text
predicted_author = nb_model.predict(new_text_vec)[0]
print(f"Predicted author of the new text: {predicted_author}")

Predicted author of the new text: tolstoy 


In [23]:
def parse_txt_file(file_path):
    """
    Parse the structured text file to extract sentences and their corresponding authors.
    Args:
        file_path (str): Path to the text file.
    Returns:
        list of tuples: List of (sentence, author) pairs.
    """
    data = []
    current_author = None

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()

            # Match author headings
            match = re.match(r"Generating sentences for (.+) in Russian", line)
            if match:
                current_author = match.group(1).strip()  # Strip any whitespace from author name
                continue

            # Add sentences with the current author
            if current_author and line:
                data.append((line, current_author))

    return data


In [24]:
def map_true_labels(true_authors):
    """
    Map true author names to the required format: 'last name + space' in lowercase.
    Args:
        true_authors (list): List of original author names.
    Returns:
        list: Mapped author names.
    """
    # Define a mapping from full names to the required format
    author_mapping = {
        "Leo Tolstoy": "tolstoy ",
        "Maxim Gorky": "gorky ",
        "Fyodor Dostoevskiy": "dostoevskiy ",
        "Anton Chekhov": "chekhov ",
        "Mikhail Bulgakov": "bulgakov ",
    }

    # Map each true author name
    mapped_authors = [author_mapping.get(author.strip(), author.strip()) for author in true_authors]
    return mapped_authors

def evaluate_classifier(nb_model, vectorizer, parsed_data):
    """
    Evaluate the classifier on parsed data.
    Args:
        nb_model: Trained Naive Bayes model.
        vectorizer: Fitted CountVectorizer.
        parsed_data (list): List of (sentence, author) pairs.
    Returns:
        None: Prints evaluation metrics.
    """
    sentences, true_authors = zip(*parsed_data)

    # Preprocess and vectorize the sentences
    sentences = [sentence.lower() for sentence in sentences]
    sentence_vectors = vectorizer.transform(sentences)

    # Predict authors
    predicted_authors = nb_model.predict(sentence_vectors)

    # Map true authors to the required format
    mapped_true_authors = map_true_labels(true_authors)

    # Evaluate performance
    accuracy = accuracy_score(mapped_true_authors, predicted_authors)
    print(f"Accuracy: {accuracy:.2f}")

    report = classification_report(mapped_true_authors, predicted_authors)
    print("Classification Report:")
    print(report)

# Example usage
file_path = "../data/Russian/llm.txt"  # Replace with your file path
parsed_data = parse_txt_file(file_path)

# Evaluate the classifier
evaluate_classifier(nb_model, vectorizer, parsed_data)


Accuracy: 0.22
Classification Report:
              precision    recall  f1-score   support

   bulgakov        0.50      0.03      0.06       100
    chekhov        0.00      0.00      0.00       100
dostoevskiy        0.31      0.52      0.39       100
      gorky        0.16      0.39      0.22       122
    tolstoy        0.28      0.11      0.16       100

    accuracy                           0.22       522
   macro avg       0.25      0.21      0.16       522
weighted avg       0.24      0.22      0.17       522



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
