# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import re
import string
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Seed the NumPy and TensorFlow random generators with the same fixed value
# so that repeated runs of this file draw the same random numbers.
for _seed_fn in (np.random.seed, tf.random.set_seed):
    _seed_fn(42)

# Startup banner
print("TensorFlow version:", tf.__version__)
print("Advanced LSTM Text Generation initialized!")

# Plot appearance: matplotlib style sheet first, then the seaborn palette
# (the palette is applied last so the style sheet does not override it).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# 1. Advanced Text Preprocessor
class AdvancedTextPreprocessor:
    """
    Text preprocessing for generation tasks.

    Builds a character-level or word-level vocabulary from a list of texts,
    converts text to integer id sequences and back, and slices encoded texts
    into overlapping (input, target) training windows.

    Ids 0 and 1 are reserved for '<UNK>' (token not in the vocabulary) and
    '<PAD>' respectively; real tokens start at id 2, most frequent first.
    """

    UNK_TOKEN = '<UNK>'
    PAD_TOKEN = '<PAD>'
    _LEVELS = ('char', 'word')
    # Word-level tokenizer: runs of word characters, or any single character
    # that is neither a word character nor whitespace (i.e. punctuation).
    _WORD_PATTERN = re.compile(r'\b\w+\b|[^\w\s]')

    def __init__(self, level='char', max_vocab_size=10000):
        """
        Args:
            level: 'char' or 'word' - the tokenization granularity.
            max_vocab_size: upper bound on the vocabulary size, including the
                two reserved special tokens.

        Raises:
            ValueError: if level is not 'char' or 'word'.
        """
        if level not in self._LEVELS:
            # Fail early: an unknown level would otherwise make encode_text /
            # decode_sequence silently return None.
            raise ValueError(
                f"level must be one of {self._LEVELS}, got {level!r}"
            )
        self.level = level  # 'char' or 'word'
        self.max_vocab_size = max_vocab_size
        self.vocab = {}          # token -> id
        self.reverse_vocab = {}  # id -> token
        self.vocab_size = 0

    def create_sample_texts(self):
        """
        Create diverse sample texts for training.

        Returns:
            A list of eight short English passages, one per writing style.
        """
        texts = [
            # Literary style
            "The morning sun cast golden rays through the ancient oak trees, illuminating the dewdrops that sparkled like diamonds on the emerald grass. A gentle breeze whispered secrets of forgotten times.",

            # Technical style
            "Neural networks utilize backpropagation algorithms to optimize weight parameters through gradient descent. The loss function measures prediction accuracy and guides the learning process.",

            # Conversational style
            "Hey there! How's your day going? I just discovered this amazing coffee shop downtown. You should definitely check it out when you have time. The atmosphere is incredible!",

            # Narrative style
            "Sarah walked through the empty hallway, her footsteps echoing in the silence. The old building held memories of countless students who had passed through these same corridors decades ago.",

            # Scientific style
            "The experimental results demonstrate a significant correlation between temperature variations and molecular movement patterns. Statistical analysis reveals confidence intervals within acceptable parameters.",

            # Poetic style
            "Moonbeams dance on silver streams, while midnight owls share ancient dreams. The world sleeps beneath starlit skies, as time itself gently flies.",

            # Dialogue style
            # Single-quoted literal so the dialogue's double quotes are part
            # of the text instead of terminating the string.
            '"I can\'t believe you said that!" exclaimed Maria. "What did you expect me to do?" replied James, shrugging his shoulders. "Sometimes honesty is the best policy."',

            # Descriptive style
            "The vintage bookstore contained thousands of leather-bound volumes, their pages yellowed with age. Dust motes floated lazily in the afternoon sunlight streaming through tall windows.",
        ]

        return texts

    def _tokenize(self, text):
        """Split text into tokens: characters as-is, or lowercased words and punctuation."""
        if self.level == 'char':
            return list(text)
        return self._WORD_PATTERN.findall(text.lower())

    def build_vocabulary(self, texts):
        """
        Build vocabulary from texts.

        Counts every token across all texts and keeps the most frequent
        ones, leaving room for the two special tokens. Populates self.vocab,
        self.reverse_vocab and self.vocab_size, and prints the final size.

        Args:
            texts: iterable of strings.
        """
        token_counts = Counter()
        for text in texts:
            token_counts.update(self._tokenize(text))

        # -2 for the special tokens; never ask for a negative number of entries
        keep = max(self.max_vocab_size - 2, 0)

        self.vocab = {self.UNK_TOKEN: 0, self.PAD_TOKEN: 1}
        for token_id, (token, _) in enumerate(token_counts.most_common(keep), start=2):
            self.vocab[token] = token_id

        # Create reverse mapping
        self.reverse_vocab = {v: k for k, v in self.vocab.items()}
        self.vocab_size = len(self.vocab)

        print(f"Vocabulary size ({self.level}-level): {self.vocab_size}")

    def encode_text(self, text):
        """
        Encode text to a list of integer ids.

        Tokens missing from the vocabulary map to 0 ('<UNK>').
        """
        unk_id = 0
        return [self.vocab.get(token, unk_id) for token in self._tokenize(text)]

    def decode_sequence(self, sequence):
        """
        Decode a sequence of integer ids back to text.

        Character-level tokens are concatenated directly; word-level tokens
        are joined with single spaces. Unknown ids decode to '<UNK>'.
        """
        tokens = [self.reverse_vocab.get(idx, self.UNK_TOKEN) for idx in sequence]
        separator = '' if self.level == 'char' else ' '
        return separator.join(tokens)

    def create_training_sequences(self, texts, sequence_length=50):
        """
        Create training sequences from texts.

        Each text is encoded separately and cut into overlapping windows of
        sequence_length tokens with stride 1; the target of a window is the
        same window shifted one token to the right. Texts with no more than
        sequence_length tokens contribute nothing.

        Args:
            texts: iterable of strings.
            sequence_length: number of tokens per window (>= 1).

        Returns:
            (sequences, targets): two integer arrays of shape
            (num_windows, sequence_length). Both are (0, sequence_length)
            when no text is long enough.

        Raises:
            ValueError: if sequence_length is smaller than 1.
        """
        if sequence_length < 1:
            raise ValueError("sequence_length must be at least 1")

        sequences = []
        targets = []

        for text in texts:
            encoded = self.encode_text(text)

            # Create overlapping sequences
            for start in range(len(encoded) - sequence_length):
                stop = start + sequence_length
                sequences.append(encoded[start:stop])
                targets.append(encoded[start + 1:stop + 1])

        # reshape keeps a well-defined 2-D shape even when nothing was produced
        shape = (-1, sequence_length)
        return (
            np.array(sequences, dtype=int).reshape(shape),
            np.array(targets, dtype=int).reshape(shape),
        )

# 2. Advanced Sampling Strategies
class AdvancedSampler:
    """
    Sampling strategies for text generation.

    Every method takes a 1-D array-like of unnormalized scores (logits) and
    returns the index of the chosen token. Random draws use NumPy's global
    generator (np.random), so np.random.seed controls reproducibility.

    A temperature of exactly 0 means greedy decoding in all three strategies.
    """

    def __init__(self):
        pass

    @staticmethod
    def _as_scores(logits):
        """Return logits as a non-empty 1-D float64 array."""
        scores = np.asarray(logits, dtype=np.float64).ravel()
        if scores.size == 0:
            raise ValueError("logits must contain at least one value")
        return scores

    @staticmethod
    def _softmax(scores):
        """Softmax with the maximum subtracted first for numerical stability."""
        exp_scores = np.exp(scores - np.max(scores))
        return exp_scores / np.sum(exp_scores)

    def temperature_sampling(self, logits, temperature=1.0):
        """
        Sample using temperature scaling.

        Args:
            logits: 1-D array-like of scores.
            temperature: divisor applied to the scores before the softmax;
                0 returns the argmax without sampling.

        Returns:
            Index of the sampled token.

        Raises:
            ValueError: if logits is empty.
        """
        scores = self._as_scores(logits)
        if temperature == 0:
            return np.argmax(scores)

        probabilities = self._softmax(scores / temperature)

        # Sample from distribution
        return np.random.choice(len(probabilities), p=probabilities)

    def top_k_sampling(self, logits, k=40, temperature=1.0):
        """
        Sample from the k highest-scoring tokens.

        Args:
            logits: 1-D array-like of scores.
            k: number of candidates to keep; clamped to [1, len(logits)] so a
                k larger than the vocabulary keeps every token.
            temperature: divisor applied to the kept scores before the
                softmax; 0 returns the argmax without sampling.

        Returns:
            Index (into logits) of the sampled token.

        Raises:
            ValueError: if logits is empty.
        """
        scores = self._as_scores(logits)
        # argpartition rejects a kth outside the array, so clamp k first
        k = int(min(max(k, 1), scores.size))

        # Get top-k indices (unordered among themselves)
        top_k_indices = np.argpartition(scores, -k)[-k:]
        top_k_logits = scores[top_k_indices]

        if temperature == 0:
            return top_k_indices[np.argmax(top_k_logits)]

        probabilities = self._softmax(top_k_logits / temperature)

        # Sample and map back to original indices
        sampled_idx = np.random.choice(len(probabilities), p=probabilities)
        return top_k_indices[sampled_idx]

    def nucleus_sampling(self, logits, p=0.9, temperature=1.0):
        """
        Nucleus (top-p) sampling.

        Keeps the smallest set of highest-probability tokens whose cumulative
        probability reaches p, renormalizes, and samples from that set.

        Args:
            logits: 1-D array-like of scores.
            p: cumulative probability threshold.
            temperature: divisor applied to the scores before the softmax;
                0 returns the argmax without sampling.

        Returns:
            Index (into logits) of the sampled token.

        Raises:
            ValueError: if logits is empty.
        """
        scores = self._as_scores(logits)
        if temperature == 0:
            return np.argmax(scores)

        probabilities = self._softmax(scores / temperature)

        # Sort probabilities in descending order
        sorted_indices = np.argsort(probabilities)[::-1]
        sorted_probs = probabilities[sorted_indices]

        # First position where the cumulative probability reaches p; +1 so
        # that token is included. Clamped because rounding can leave the
        # final cumulative sum marginally below p when p is close to 1.
        cumulative_probs = np.cumsum(sorted_probs)
        nucleus_size = min(int(np.searchsorted(cumulative_probs, p)) + 1, scores.size)

        # Sample from nucleus
        nucleus_indices = sorted_indices[:nucleus_size]
        nucleus_probs = sorted_probs[:nucleus_size]
        nucleus_probs = nucleus_probs / np.sum(nucleus_probs)  # Renormalize

        sampled_idx = np.random.choice(len(nucleus_probs), p=nucleus_probs)
        return nucleus_indices[sampled_idx]

# 3. Advanced LSTM Text Generator
class AdvancedLSTMGenerator:
    """
    Multi-layer LSTM next-token model with sampling-based text generation.

    The model built by build_model reads a fixed-length window of token ids
    and outputs one softmax distribution over the vocabulary (the token that
    follows the window). generate_text feeds the model its own output one
    token at a time.
    """

    def __init__(self, vocab_size, embedding_dim=256, lstm_units=512, num_layers=2):
        """
        Args:
            vocab_size: number of distinct token ids (size of the softmax).
            embedding_dim: width of the token embedding.
            lstm_units: hidden size of every LSTM layer.
            num_layers: number of stacked LSTM layers.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.num_layers = num_layers
        self.model = None
        self.sampler = AdvancedSampler()

    def build_model(self, sequence_length):
        """
        Build the multi-layer LSTM model and store it in self.model.

        Args:
            sequence_length: number of token ids per input window.

        Returns:
            The (uncompiled) keras.Sequential model.
        """
        # An explicit Input layer fixes the input shape at build time, so
        # model.input_shape is available even before training. It replaces
        # Embedding's input_length argument, which newer Keras deprecates.
        model = keras.Sequential([
            keras.Input(shape=(sequence_length,)),
            layers.Embedding(self.vocab_size, self.embedding_dim),
            layers.Dropout(0.2)
        ])

        # Add multiple LSTM layers
        last_layer = self.num_layers - 1
        for i in range(self.num_layers):
            return_sequences = (i < last_layer)  # All but last layer return sequences

            model.add(layers.LSTM(
                self.lstm_units,
                return_sequences=return_sequences,
                dropout=0.3,
                recurrent_dropout=0.3
            ))

            if return_sequences:
                model.add(layers.Dropout(0.3))

        # Output layer: one distribution over the vocabulary per input window
        model.add(layers.Dense(self.vocab_size, activation='softmax'))

        self.model = model
        return model

    @staticmethod
    def _next_token_targets(y):
        """
        Reduce per-timestep targets of shape (N, seq_len) to the single
        next-token target per window, shape (N,).

        When the target window is the input window shifted right by one, its
        last column is the token that follows the input window. Targets that
        are not 2-D are returned unchanged (as an array).
        """
        y = np.asarray(y)
        if y.ndim == 2:
            return y[:, -1]
        return y

    def train_model(self, X, y, epochs=50, batch_size=64):
        """
        Compile and train the text generation model.

        Args:
            X: integer array of input windows, shape (N, sequence_length).
            y: integer targets, either one id per window, shape (N,), or
                shifted windows of shape (N, sequence_length); in the latter
                case only the final column is used when the model emits a
                single distribution per window.
            epochs: maximum number of epochs.
            batch_size: mini-batch size.

        Returns:
            The Keras History object returned by fit().

        Raises:
            ValueError: if build_model() has not been called.
        """
        if self.model is None:
            raise ValueError("Model not built. Call build_model() first.")

        # The model predicts one token per window (output rank 2), so
        # per-timestep targets would not match the output shape.
        if len(self.model.output_shape) == 2:
            y = self._next_token_targets(y)

        # Compile model
        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=0.001),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        # Callbacks monitor the training loss; fit() is given no validation data
        callbacks = [
            keras.callbacks.ReduceLROnPlateau(
                monitor='loss', factor=0.5, patience=5, min_lr=1e-7, verbose=1
            ),
            keras.callbacks.EarlyStopping(
                monitor='loss', patience=10, restore_best_weights=True
            )
        ]

        # Train model
        history = self.model.fit(
            X, y,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1
        )

        return history

    def generate_text(self, seed_text, preprocessor, length=200, 
                     sampling_strategy='temperature', **sampling_kwargs):
        """
        Generate text using various sampling strategies.

        Args:
            seed_text: text used to prime the model.
            preprocessor: object providing encode_text() and decode_sequence().
            length: number of tokens to generate.
            sampling_strategy: 'temperature', 'top_k' or 'nucleus'; any other
                value selects greedy (argmax) decoding.
            **sampling_kwargs: 'temperature' (default 1.0), 'k' (default 40)
                and 'p' (default 0.9), read as needed by the chosen strategy.

        Returns:
            The decoded generated tokens, without the seed.

        Raises:
            ValueError: if no model has been built.
        """
        if self.model is None:
            raise ValueError("Model not trained. Train model first.")

        # Encode seed text
        encoded_seed = preprocessor.encode_text(seed_text)
        sequence_length = self.model.input_shape[1]

        # Pad or truncate seed to sequence length
        if len(encoded_seed) < sequence_length:
            encoded_seed = [1] * (sequence_length - len(encoded_seed)) + encoded_seed  # Pad with PAD token
        else:
            encoded_seed = encoded_seed[-sequence_length:]

        generated = list(encoded_seed)

        temperature = sampling_kwargs.get('temperature', 1.0)

        # Generate text
        for _ in range(length):
            # Prepare input: the most recent sequence_length tokens
            input_seq = np.array([generated[-sequence_length:]])

            # Predict next token probabilities (softmax output of the model)
            predictions = np.asarray(
                self.model.predict(input_seq, verbose=0)[0], dtype=np.float64
            )

            # The samplers apply their own softmax, so hand them
            # log-probabilities: softmax(log(p) / T) is the temperature-scaled
            # distribution, whereas feeding p directly would apply softmax
            # twice and flatten it. The clip avoids log(0).
            logits = np.log(np.clip(predictions, 1e-12, None))

            # Sample next token based on strategy
            if sampling_strategy == 'temperature':
                next_token = self.sampler.temperature_sampling(logits, temperature)
            elif sampling_strategy == 'top_k':
                k = sampling_kwargs.get('k', 40)
                next_token = self.sampler.top_k_sampling(logits, k, temperature)
            elif sampling_strategy == 'nucleus':
                p = sampling_kwargs.get('p', 0.9)
                next_token = self.sampler.nucleus_sampling(logits, p, temperature)
            else:  # Greedy
                next_token = np.argmax(predictions)

            generated.append(int(next_token))

        # Decode generated sequence (excluding seed)
        generated_tokens = generated[len(encoded_seed):]
        return preprocessor.decode_sequence(generated_tokens)

# Initialize components
print("Initializing advanced text generation components...")

# Create preprocessors for both character and word level
char_preprocessor = AdvancedTextPreprocessor(level='char', max_vocab_size=100)
word_preprocessor = AdvancedTextPreprocessor(level='word', max_vocab_size=2000)

# Generate sample texts
sample_texts = char_preprocessor.create_sample_texts()
print(f"Created {len(sample_texts)} sample texts covering different styles")

# Build vocabularies
print("\nBuilding vocabularies...")
char_preprocessor.build_vocabulary(sample_texts)
word_preprocessor.build_vocabulary(sample_texts)

# Create training sequences
sequence_length = 50
char_X, char_y = char_preprocessor.create_training_sequences(sample_texts, sequence_length)
word_X, word_y = word_preprocessor.create_training_sequences(sample_texts, sequence_length//2)  # Shorter for words

print(f"\nCharacter-level sequences: {char_X.shape}")
print(f"Word-level sequences: {word_X.shape}")

# Visualize sample generation strategies
print("\nDemonstrating sampling strategies...")

# Create a simple test case
test_logits = np.array([1.0, 2.0, 0.5, 3.0, 1.5, 0.8, 2.5])  # Example logits
sampler = AdvancedSampler()
token_positions = np.arange(len(test_logits))
num_draws = 100  # samples drawn per setting


def _plot_grouped_sample_counts(sample_sets, labels, title):
    """
    Draw, on the current axes, how often each token index was sampled.

    Each entry of sample_sets becomes one bar series. The series are placed
    side by side within every token's slot so that no series hides another.
    """
    bar_width = 0.8 / len(sample_sets)
    for slot, (label, drawn) in enumerate(zip(labels, sample_sets)):
        counts = np.bincount(drawn, minlength=len(test_logits))
        # Center the group of bars on the integer token position
        offset = (slot - (len(sample_sets) - 1) / 2) * bar_width
        plt.bar(token_positions + offset, counts, width=bar_width,
                label=label, alpha=0.7)
    plt.title(title)
    plt.xlabel('Token Index')
    plt.ylabel('Sample Count')
    plt.legend()


plt.figure(figsize=(15, 10))

# Temperature sampling comparison
temperatures = [0.5, 1.0, 1.5, 2.0]
temp_samples = []

for temp in temperatures:
    samples = [sampler.temperature_sampling(test_logits, temp) for _ in range(num_draws)]
    temp_samples.append(samples)

plt.subplot(2, 3, 1)
_plot_grouped_sample_counts(
    temp_samples, [f'T={temp}' for temp in temperatures],
    'Temperature Sampling Comparison'
)

# Top-k sampling
plt.subplot(2, 3, 2)
k_values = [1, 2, 3, 5]
topk_samples = [
    [sampler.top_k_sampling(test_logits, k) for _ in range(num_draws)]
    for k in k_values
]
_plot_grouped_sample_counts(
    topk_samples, [f'k={k}' for k in k_values],
    'Top-k Sampling (k variations)'
)

# Nucleus sampling
plt.subplot(2, 3, 3)
p_values = [0.5, 0.7, 0.9, 0.95]
nucleus_samples = [
    [sampler.nucleus_sampling(test_logits, p) for _ in range(num_draws)]
    for p in p_values
]
_plot_grouped_sample_counts(
    nucleus_samples, [f'p={p}' for p in p_values],
    'Nucleus Sampling (p variations)'
)

# Logits distribution
plt.subplot(2, 3, 4)
plt.bar(token_positions, test_logits, alpha=0.7)
plt.title('Original Logits')
plt.xlabel('Token Index')
plt.ylabel('Logit Value')

# Probability distribution (softmax with the maximum subtracted for stability)
plt.subplot(2, 3, 5)
exp_logits = np.exp(test_logits - np.max(test_logits))
softmax_probs = exp_logits / np.sum(exp_logits)
plt.bar(token_positions, softmax_probs, alpha=0.7, color='orange')
plt.title('Softmax Probabilities')
plt.xlabel('Token Index')
plt.ylabel('Probability')

# Sampling strategy comparison: a single draw from each strategy
plt.subplot(2, 3, 6)
greedy_choice = np.argmax(test_logits)
temp_choice = sampler.temperature_sampling(test_logits, 1.0)
topk_choice = sampler.top_k_sampling(test_logits, 3)
nucleus_choice = sampler.nucleus_sampling(test_logits, 0.9)

strategies = ['Greedy', 'Temperature', 'Top-k', 'Nucleus']
choices = [greedy_choice, temp_choice, topk_choice, nucleus_choice]

plt.bar(strategies, choices, alpha=0.7, color=['red', 'blue', 'green', 'purple'])
plt.title('Sample Choices by Strategy')
plt.ylabel('Selected Token Index')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

print("\nSampling Strategy Analysis:")
print(f"Greedy choice: {greedy_choice}")
print(f"Temperature choice: {temp_choice}")
print(f"Top-k choice: {topk_choice}")
print(f"Nucleus choice: {nucleus_choice}")

print("\nAdvanced LSTM Text Generation Setup Complete!")
print("Ready to build and train sophisticated generation models!")
print(f"Character vocab size: {char_preprocessor.vocab_size}")
print(f"Word vocab size: {word_preprocessor.vocab_size}")