In [1]:
import pandas as pd
import numpy as np
from hmmlearn import hmm
from collections import defaultdict

def load_and_preprocess(data_path):
    """
    Load the dataset and preprocess the text and labels.
    Args:
        data_path (str): Path to the dataset CSV file.
    Returns:
        list: Preprocessed sentences as lists of tokens.
        list: Corresponding author labels.
        dict: Word-to-index mapping.
        dict: Index-to-word mapping.
    """
    # Load the dataset
    data = pd.read_csv(data_path)

    # Convert text to lowercase
    data['text'] = data['text'].str.lower()

    # Tokenize text into words
    tokenized_sentences = [sentence.split() for sentence in data['text']]

    # Create vocabulary and mappings
    vocab = set(word for sentence in tokenized_sentences for word in sentence)
    word_to_index = {word: i for i, word in enumerate(vocab)}
    index_to_word = {i: word for word, i in word_to_index.items()}

    # Map sentences to sequences of indices
    tokenized_sentences = [[word_to_index[word] for word in sentence] for sentence in tokenized_sentences]

    return tokenized_sentences, data['author'].tolist(), word_to_index, index_to_word

# Load data
data_path = "../data/Russian/author_data.csv"  # Replace with your file path
sentences, authors, word_to_index, index_to_word = load_and_preprocess(data_path)
print(f"Vocabulary size: {len(word_to_index)}")


Vocabulary size: 363992


In [2]:
def group_by_author(sentences, authors):
    """
    Group sentences by author for separate HMM training.
    Args:
        sentences (list): List of tokenized sentences.
        authors (list): List of corresponding authors.
    Returns:
        dict: Dictionary where keys are authors and values are lists of tokenized sentences.
    """
    author_data = defaultdict(list)
    for sentence, author in zip(sentences, authors):
        author_data[author].append(sentence)
    return author_data

# Group data by author
author_data = group_by_author(sentences, authors)
print(f"Authors: {list(author_data.keys())}")


Authors: ['bulgakov ', 'chekhov ', 'dostoevskiy ', 'gorky ', 'tolstoy ']


In [16]:
def train_hmm_for_author(sentences, n_states=5):
    """
    Train an HMM on tokenized sentences for a single author.
    Args:
        sentences (list): List of tokenized sentences (as sequences of word indices).
        n_states (int): Number of hidden states in the HMM.
    Returns:
        model: Trained HMM model.
    """
    # Flatten all sentences into a single observation sequence
    observations = np.concatenate(sentences).reshape(-1, 1)

    # Initialize and train the HMM
    model = hmm.MultinomialHMM(n_components=n_states, random_state=42, n_iter=100)
    model.fit(observations)

    return model

# Train HMMs for each author
hmm_models = {}
for author, sentences in author_data.items():
    print(f"Training HMM for {author}...")
    hmm_models[author] = train_hmm_for_author(sentences, n_states=20)
print("HMM training completed for all authors.")

MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


Training HMM for bulgakov ...


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


Training HMM for chekhov ...


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


Training HMM for dostoevskiy ...


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


Training HMM for gorky ...


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


Training HMM for tolstoy ...
HMM training completed for all authors.


In [4]:
def generate_text(model, index_to_word, max_len=50):
    """
    Generate text using a trained HMM.
    Args:
        model: Trained HMM model.
        index_to_word (dict): Index-to-word mapping.
        max_len (int): Maximum length of the generated text.
    Returns:
        str: Generated text.
    """
    model.n_trials = 1  # Ensure n_trials is set for sampling
    random_state = np.random.RandomState()  # Reinitialize random state for variability
    observations, _ = model.sample(n_samples=max_len, random_state=random_state)
    generated_words = [index_to_word[idx[0]] for idx in observations]
    return " ".join(generated_words)


# Example usage: Generate text for an author
target_author = "tolstoy "  # Replace with an author in your dataset
if target_author in hmm_models:
    generated_text = generate_text(hmm_models[target_author], index_to_word, max_len=50)
    print(f"Generated text for {target_author}:\n{generated_text}")
else:
    print(f"No HMM model found for author: {target_author}")


Generated text for tolstoy :
купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую купеческую


In [13]:
def generate_text_with_temperature(model, index_to_word, max_len=50, temperature=1.0):
    """
    Generate text using a trained HMM with temperature scaling for diversification.
    Args:
        model: Trained HMM model.
        index_to_word (dict): Index-to-word mapping.
        max_len (int): Maximum length of the generated text.
        temperature (float): Controls randomness; higher values increase diversity.
    Returns:
        str: Generated text.
    """
    model.n_trials = 1  # Ensure n_trials is set for sampling
    random_state = np.random.RandomState()  # Add randomness for variability

    # Generate the initial state sequence
    states = [random_state.choice(model.startprob_.size, p=model.startprob_)]
    words = []

    for _ in range(max_len):
        # Scale the emission probabilities with temperature
        emission_probs = model.emissionprob_[states[-1]]
        scaled_probs = emission_probs ** (1 / temperature)
        scaled_probs /= scaled_probs.sum()  # Normalize

        # Sample the next word and state
        next_word_idx = random_state.choice(len(scaled_probs), p=scaled_probs)
        next_state = random_state.choice(model.transmat_.shape[1], p=model.transmat_[states[-1]])
        words.append(index_to_word[next_word_idx])
        states.append(next_state)

    return " ".join(words)


In [14]:
# Example usage: Generate text with temperature scaling
target_author = "chekhov "  # Replace with an author in your dataset
if target_author in hmm_models:
    generated_text = generate_text_with_temperature(hmm_models[target_author], index_to_word, max_len=50, temperature=0.5)
    print(f"Generated text for {target_author} with temperature scaling:\n{generated_text}")
else:
    print(f"No HMM model found for author: {target_author}")

Generated text for chekhov  with temperature scaling:
трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке,


In [9]:
def generate_text_with_penalty(model, index_to_word, max_len=50, penalty=0.5):
    """
    Generate text using a trained HMM with a penalty for repeated words.
    Args:
        model: Trained HMM model.
        index_to_word (dict): Index-to-word mapping.
        max_len (int): Maximum length of the generated text.
        penalty (float): Probability multiplier for already-generated words.
    Returns:
        str: Generated text.
    """
    model.n_trials = 1  # Ensure n_trials is set for sampling
    random_state = np.random.RandomState()  # Add randomness for variability

    states = [random_state.choice(model.startprob_.size, p=model.startprob_)]
    words = []
    word_counts = defaultdict(int)

    for _ in range(max_len):
        emission_probs = model.emissionprob_[states[-1]]

        # Apply penalty to already-generated words
        for idx, prob in enumerate(emission_probs):
            if index_to_word[idx] in words:
                emission_probs[idx] *= penalty

        emission_probs /= emission_probs.sum()  # Normalize

        # Sample the next word and state
        next_word_idx = random_state.choice(len(emission_probs), p=emission_probs)
        next_state = random_state.choice(model.transmat_.shape[1], p=model.transmat_[states[-1]])

        generated_word = index_to_word[next_word_idx]
        words.append(generated_word)
        word_counts[generated_word] += 1
        states.append(next_state)

    return " ".join(words)


In [10]:
# Example usage: Generate text with penalty for repeated words
target_author = "tolstoy "  # Replace with an author in your
if target_author in hmm_models:
    generated_text = generate_text_with_penalty(hmm_models[target_author], index_to_word, max_len=50, penalty=0.5)
    print(f"Generated text for {target_author} with penalty for repeated words:\n{generated_text}")
else:
    print(f"No HMM model found for author: {target_author}")

Generated text for tolstoy  with penalty for repeated words:
трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке,


In [17]:
def generate_text_with_randomness(model, index_to_word, max_len=50, randomness=0.5):
    """
    Generate text using a trained HMM with enhanced randomness for variability.
    Args:
        model: Trained HMM model.
        index_to_word (dict): Index-to-word mapping.
        max_len (int): Maximum length of the generated text.
        randomness (float): Adds noise to probabilities for more diversity (higher = more random).
    Returns:
        str: Generated text.
    """
    model.n_trials = 1
    random_state = np.random.RandomState()

    states = [random_state.choice(model.startprob_.size, p=model.startprob_)]
    words = []

    for _ in range(max_len):
        # Get emission probabilities
        emission_probs = model.emissionprob_[states[-1]]

        # Add randomness by perturbing probabilities
        noisy_probs = emission_probs + randomness * random_state.rand(len(emission_probs))
        noisy_probs /= noisy_probs.sum()

        # Sample the next word
        next_word_idx = random_state.choice(len(noisy_probs), p=noisy_probs)

        # Sample the next state
        trans_probs = model.transmat_[states[-1]]
        next_state = random_state.choice(model.transmat_.shape[1], p=trans_probs)

        words.append(index_to_word[next_word_idx])
        states.append(next_state)

    return " ".join(words)


In [24]:
# Example usage: Generate text with enhanced randomness
target_author = "dostoevskiy "  # Replace with an author in your
if target_author in hmm_models:
    generated_text = generate_text_with_randomness(hmm_models[target_author], index_to_word, max_len=50, randomness=0.9)
    print(f"Generated text for {target_author} with enhanced randomness:\n{generated_text}")

Generated text for dostoevskiy  with enhanced randomness:
трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке, трейчке,


In [25]:
# Observe the transition matrix for an author
target_author = "tolstoy "  # Replace with an author in your
if target_author in hmm_models:
    print(f"Transition matrix for {target_author}:\n{hmm_models[target_author].transmat_}")

Transition matrix for tolstoy :
[[5.81401480e-19 5.79198957e-30 1.96413502e-12 8.12086841e-11
  6.16172878e-06 6.50649432e-01 3.10801917e-01 3.69086827e-05
  9.39046886e-22 1.38857356e-27 6.70410949e-09 2.52870227e-02
  1.01545247e-11 1.03340578e-17 3.04498906e-23 6.16108630e-03
  7.52783445e-46 1.05000961e-03 6.00745421e-03 1.32895600e-09]
 [8.12634909e-02 3.82403768e-10 1.10497562e-10 2.82103663e-03
  1.40991873e-01 5.53151843e-19 6.51445437e-03 8.49877784e-03
  3.58289589e-06 1.95817363e-32 1.36552999e-30 1.37111292e-10
  2.21925677e-01 2.83415943e-08 2.38610431e-13 2.68627898e-11
  3.59313349e-01 1.66821246e-04 1.95081574e-02 1.58992750e-01]
 [6.16571079e-03 4.97065466e-11 6.36799651e-14 7.99832669e-03
  3.09125356e-44 1.14565575e-08 1.66590675e-19 1.37144496e-01
  8.86377268e-07 7.26040583e-10 2.14854587e-01 3.79529915e-07
  5.49301637e-12 2.22929345e-05 7.59165292e-27 6.48688693e-02
  7.39327044e-18 5.68787073e-01 1.57366965e-04 1.46580899e-13]
 [4.62398085e-09 2.44635573e-04 2.9