In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import seaborn as sns
import string
import re
from collections import Counter
import os
import requests
import warnings
warnings.filterwarnings('ignore')

# Seed every RNG this notebook draws from (NumPy and TensorFlow) with one
# shared constant so that Restart Kernel -> Run All reproduces the same output.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("TensorFlow version:", tf.__version__)
print("Libraries imported successfully!")

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only know 'seaborn'. Use whichever is installed instead of
# raising OSError on an older Matplotlib, and keep the default style if
# neither is present.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")


In [None]:
# Create sample text data for demonstration.
# The corpus is defined inline: an excerpt from Hamlet's "To be or not to be"
# soliloquy followed by a handful of modern English sentences. It contains
# mixed case, line breaks, apostrophes and em-dashes, so the preprocessing
# step has real punctuation and whitespace to deal with.
sample_text = """
To be or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
And by opposing end them. To die—to sleep,
No more; and by a sleep to say we end
The heart-ache and the thousand natural shocks
That flesh is heir to: 'tis a consummation
Devoutly to be wish'd. To die, to sleep;
To sleep, perchance to dream—ay, there's the rub:
For in that sleep of death what dreams may come,
When we have shuffled off this mortal coil,
Must give us pause. There's the respect
That makes calamity of so long life.

The quick brown fox jumps over the lazy dog.
Machine learning is revolutionizing the way we process data.
Deep neural networks can learn complex patterns from large datasets.
Artificial intelligence will transform various industries in the coming years.
Natural language processing enables computers to understand human language.
The future of technology lies in the intersection of AI and human creativity.
"""

# Clean and prepare the text
def preprocess_text(text):
    """
    Basic text preprocessing for character-level generation.

    Steps, in order:
      1. Lowercase the text.
      2. Replace every character that is not a word character, whitespace,
         or one of . , ; : ! ? - ' " with a single space.
      3. Collapse every run of whitespace (spaces, tabs, newlines) into one
         space and strip leading/trailing whitespace.

    Parameters:
    text: input text string

    Returns:
    The cleaned, lowercase, single-line string.
    """
    # Convert to lowercase
    text = text.lower()

    # Replace disallowed characters with a space rather than deleting them.
    # Deleting would fuse words that are separated only by such a character
    # (the em-dash in "die—to" would yield "dieto").
    text = re.sub(r'[^\w\s\.\,\;\:\!\?\-\'\"]', ' ', text)

    # Collapse whitespace AFTER filtering so that a removed standalone symbol
    # ("a & b") cannot leave a double space behind. Newlines count as
    # whitespace here, so line breaks are flattened into single spaces.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Run the cleaning step on the demo corpus
processed_text = preprocess_text(sample_text)

print("Original text length:", len(sample_text))
print("Processed text length:", len(processed_text))
print("\nFirst 200 characters of processed text:")
print(repr(processed_text[:200]))

# Word-level summary of the cleaned text
words = processed_text.split()
word_lengths = [len(word) for word in words]
print("\nText Statistics:")
print(f"Total characters: {len(processed_text)}")
print(f"Total words: {len(words)}")
print(f"Average word length: {np.mean(word_lengths):.2f}")

# Occurrence count for every distinct character
char_counts = Counter(processed_text)
print(f"\nUnique characters: {len(char_counts)}")
print(f"Most common characters: {char_counts.most_common(10)}")

# One figure, two side-by-side panels (explicit Axes instead of pyplot state)
fig, (ax_top, ax_dist) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: the 20 most frequent characters
chars, counts = zip(*char_counts.most_common(20))
bar_positions = list(range(len(chars)))
ax_top.bar(bar_positions, counts)
ax_top.set_xticks(bar_positions)
ax_top.set_xticklabels(chars, rotation=45)
ax_top.set_title('Top 20 Character Frequencies')
ax_top.set_xlabel('Characters')
ax_top.set_ylabel('Frequency')
ax_top.grid(True, alpha=0.3)

# Right panel: how many characters share each frequency level
frequencies = list(char_counts.values())
ax_dist.hist(frequencies, bins=20, alpha=0.7, edgecolor='black')
ax_dist.set_title('Distribution of Character Frequencies')
ax_dist.set_xlabel('Frequency')
ax_dist.set_ylabel('Number of Characters')
ax_dist.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()


In [None]:
# Create character vocabulary and mappings
class CharacterVocabulary:
    """
    A class to handle character-level vocabulary for text generation.

    Builds a bidirectional mapping between the distinct characters of a text
    and consecutive integer indices.

    Attributes:
    chars: vocabulary entries in index order (special tokens first, if enabled)
    char_to_idx: dict mapping entry -> index
    idx_to_char: dict mapping index -> entry
    vocab_size: number of entries in the vocabulary
    """

    def __init__(self, text, add_special_tokens=True):
        """
        Initialize vocabulary from text

        Parameters:
        text: input text string
        add_special_tokens: whether to prepend '<START>', '<END>' and '<UNK>'
                            (they then occupy indices 0, 1 and 2)
        """
        # Distinct characters, sorted so indices are stable across runs.
        # sorted() accepts any iterable, so no intermediate list is needed.
        self.chars = sorted(set(text))

        # Add special tokens if requested
        if add_special_tokens:
            special_tokens = ['<START>', '<END>', '<UNK>']
            self.chars = special_tokens + self.chars

        # Create mappings
        self.char_to_idx = {char: idx for idx, char in enumerate(self.chars)}
        self.idx_to_char = dict(enumerate(self.chars))

        # Store vocabulary size
        self.vocab_size = len(self.chars)

        print(f"Vocabulary created with {self.vocab_size} characters")
        print(f"Characters: {self.chars}")

    def encode(self, text):
        """
        Convert text to sequence of indices.

        Characters missing from the vocabulary map to the '<UNK>' index, or
        to index 0 when the vocabulary was built without special tokens.
        """
        # Resolve the fallback once instead of once per character.
        unknown_idx = self.char_to_idx.get('<UNK>', 0)
        return [self.char_to_idx.get(char, unknown_idx) for char in text]

    def decode(self, indices):
        """
        Convert sequence of indices back to text.

        Indices that are not in the vocabulary are rendered as '<UNK>'.
        """
        return ''.join(self.idx_to_char.get(idx, '<UNK>') for idx in indices)

    def get_char_info(self):
        """
        Get detailed character information.

        Returns:
        list of (index, printable representation, raw entry) tuples in
        index order.
        """
        # repr() is used for every entry: it is what makes whitespace and
        # control characters visible (e.g. a newline shows as '\\n'). The
        # previous code only applied repr() to characters that were already
        # printable and emitted non-printable ones raw.
        return [(idx, repr(char), char) for idx, char in enumerate(self.chars)]

# Create vocabulary from our processed text
vocab = CharacterVocabulary(processed_text, add_special_tokens=True)

# Display character mappings
print("\nCharacter to Index Mappings:")
char_info = vocab.get_char_info()
for idx, char_repr, char in char_info[:20]:  # Show first 20
    print(f"Index {idx:2d}: {char_repr:8s} -> '{char}'")

if len(char_info) > 20:
    print(f"... and {len(char_info) - 20} more characters")

# Test encoding and decoding (characters missing from the vocabulary come
# back as the unknown token, so the round trip is not necessarily lossless)
test_string = "hello world"
encoded = vocab.encode(test_string)
decoded = vocab.decode(encoded)

print(f"\nEncoding/Decoding Test:")
print(f"Original: '{test_string}'")
print(f"Encoded:  {encoded}")
print(f"Decoded:  '{decoded}'")

# Visualize the vocabulary
plt.figure(figsize=(12, 8))

# Character index mapping
plt.subplot(2, 2, 1)
indices = list(range(min(20, vocab.vocab_size)))
chars_subset = [vocab.idx_to_char[i] for i in indices]
plt.bar(indices, [1] * len(indices))
plt.xticks(indices, [repr(c) for c in chars_subset], rotation=45)
plt.title('Character Index Mapping (First 20)')
plt.xlabel('Character Index')
plt.ylabel('Presence')

# Vocabulary composition by character type
plt.subplot(2, 2, 2)
# Special tokens are the only multi-character entries. Detecting them by
# length (instead of startswith('<')) keeps a literal '<' character from being
# counted as a special token and as punctuation at the same time.
single_chars = [c for c in vocab.chars if len(c) == 1]
type_counts = {
    'Letters': sum(1 for c in single_chars if c.isalpha()),
    'Digits': sum(1 for c in single_chars if c.isdigit()),
    'Punctuation': sum(1 for c in single_chars if c in string.punctuation),
    'Whitespace': sum(1 for c in single_chars if c.isspace()),
    'Special': len(vocab.chars) - len(single_chars),
}
# Leave out empty categories: zero-width wedges only add overlapping
# "0.0%" labels to the pie.
categories = [name for name, count in type_counts.items() if count > 0]
counts = [type_counts[name] for name in categories]
plt.pie(counts, labels=categories, autopct='%1.1f%%')
plt.title('Character Type Distribution')

# Encoding example
plt.subplot(2, 2, 3)
sample_chars = processed_text[:20]
sample_encoded = vocab.encode(sample_chars)
plt.plot(sample_encoded, 'o-', linewidth=2, markersize=8)
plt.title('Sample Text Encoding')
plt.xlabel('Character Position')
plt.ylabel('Character Index')
plt.grid(True, alpha=0.3)

# Vocabulary coverage: share of the text's distinct characters that the
# vocabulary knows about
plt.subplot(2, 2, 4)
text_chars = set(processed_text)
vocab_chars = set(vocab.chars)
# Guard the ratio so an empty text reports 0% instead of dividing by zero.
coverage = (
    len(text_chars.intersection(vocab_chars)) / len(text_chars) * 100
    if text_chars else 0.0
)
plt.bar(['Coverage'], [coverage], color='green', alpha=0.7)
plt.ylim(0, 100)
plt.title(f'Vocabulary Coverage: {coverage:.1f}%')
plt.ylabel('Percentage')

plt.tight_layout()
plt.show()


In [None]:
# Create training sequences and labels
class SequenceGenerator:
    """
    A class to generate training sequences for character-level text generation.

    The text is encoded once with the supplied vocabulary; fixed-length
    windows are then cut from the encoded text with a sliding window.
    """

    def __init__(self, text, vocab, sequence_length=40, step=1):
        """
        Initialize sequence generator

        Parameters:
        text: input text string
        vocab: CharacterVocabulary object (must provide encode(), decode()
               and idx_to_char)
        sequence_length: length of input sequences (must be >= 1)
        step: step size for sliding window (must be >= 1)

        Raises:
        ValueError: if sequence_length or step is smaller than 1
        """
        # Fail early with a clear message instead of silently producing
        # empty or meaningless windows later on.
        if sequence_length < 1:
            raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")
        if step < 1:
            raise ValueError(f"step must be >= 1, got {step}")

        self.text = text
        self.vocab = vocab
        self.sequence_length = sequence_length
        self.step = step

        # Encode the entire text
        self.encoded_text = vocab.encode(text)

        print(f"Text length: {len(text)} characters")
        print(f"Encoded length: {len(self.encoded_text)} indices")
        print(f"Sequence length: {sequence_length}")
        print(f"Step size: {step}")

    def _window_starts(self):
        """Start offsets of every window that still has a following character."""
        # The upper bound excludes the last full-length window on purpose:
        # position start + sequence_length must exist to serve as the target.
        return range(0, len(self.encoded_text) - self.sequence_length, self.step)

    def create_sequences(self):
        """
        Create input sequences and corresponding targets

        Returns:
        (sequences, targets): integer arrays of shape
        (num_sequences, sequence_length) and (num_sequences,). When the text
        is too short for a single window the arrays are empty but keep these
        shapes and an integer dtype.
        """
        encoded = self.encoded_text
        length = self.sequence_length
        starts = self._window_starts()

        # Input sequence (context) and the character that follows it
        sequences = [encoded[i:i + length] for i in starts]
        targets = [encoded[i + length] for i in starts]

        print(f"Created {len(sequences)} training sequences")
        # np.array([]) would give a float array of shape (0,); pinning dtype
        # and shape keeps empty results usable as index arrays downstream.
        return (
            np.array(sequences, dtype=int).reshape(-1, length),
            np.array(targets, dtype=int),
        )

    def create_sequence_to_sequence_data(self):
        """
        Create sequence-to-sequence data where input and output are sequences

        Returns:
        (inputs, targets): two integer arrays of shape
        (num_sequences, sequence_length); each target row is its input row
        shifted one position to the right in the text.
        """
        encoded = self.encoded_text
        length = self.sequence_length
        starts = self._window_starts()

        input_sequences = [encoded[i:i + length] for i in starts]
        # Target sequence (shifted by one)
        target_sequences = [encoded[i + 1:i + length + 1] for i in starts]

        print(f"Created {len(input_sequences)} sequence-to-sequence pairs")
        return (
            np.array(input_sequences, dtype=int).reshape(-1, length),
            np.array(target_sequences, dtype=int).reshape(-1, length),
        )

    def visualize_sequences(self, sequences, targets, num_examples=3):
        """
        Visualize some example sequences

        Parameters:
        sequences: input windows as produced by create_sequences()
        targets: matching next-character indices
        num_examples: maximum number of examples to print
        """
        print("Example Training Sequences:")
        print("=" * 80)

        for i in range(min(num_examples, len(sequences))):
            input_text = self.vocab.decode(sequences[i])
            target_char = self.vocab.idx_to_char[targets[i]]

            print(f"Example {i+1}:")
            print(f"Input:  '{input_text}'")
            print(f"Target: '{target_char}'")
            print(f"Indices: {sequences[i][:10]}... -> {targets[i]}")
            print("-" * 40)

# Create sequence generator
sequence_length = 25  # Length of context to use for prediction
step_size = 3         # Step size for sliding window (smaller = more overlap)

seq_gen = SequenceGenerator(processed_text, vocab, sequence_length, step_size)

# Create training sequences
X_sequences, y_targets = seq_gen.create_sequences()

print(f"\nTraining data shapes:")
print(f"Input sequences: {X_sequences.shape}")
print(f"Target characters: {y_targets.shape}")

# Visualize some examples
seq_gen.visualize_sequences(X_sequences, y_targets, num_examples=5)

# Create sequence-to-sequence data as well (for comparison)
X_seq2seq, y_seq2seq = seq_gen.create_sequence_to_sequence_data()

print(f"\nSequence-to-sequence data shapes:")
print(f"Input sequences: {X_seq2seq.shape}")
print(f"Target sequences: {y_seq2seq.shape}")

# Analyze sequence statistics
# Coverage = share of the text's characters that appear in at least one
# training example. Every window touches sequence_length inputs plus one
# target; consecutive windows start step_size apart, so each window after the
# first adds min(step_size, window_span) new characters. (The previous value,
# number of sequences / number of characters, was not a coverage figure.)
window_span = sequence_length + 1
num_windows = len(X_sequences)
if num_windows > 0:
    covered_chars = (num_windows - 1) * min(step_size, window_span) + window_span
else:
    covered_chars = 0

print(f"\nSequence Statistics:")
print(f"Total training examples: {len(X_sequences)}")
print(f"Sequence length: {sequence_length}")
print(f"Vocabulary size: {vocab.vocab_size}")
print(f"Coverage: {covered_chars / len(processed_text) * 100:.1f}% of original text")

# Visualize sequence creation process
plt.figure(figsize=(15, 10))

# Distribution of target characters
plt.subplot(2, 3, 1)
target_counts = Counter(y_targets.tolist())
# Place every bar at its real vocabulary index so the 'Character Index' axis
# is truthful; previously bars were drawn in first-seen order at 0..n-1.
target_indices = sorted(target_counts)
counts = [target_counts[idx] for idx in target_indices]
plt.bar(target_indices, counts)
plt.title('Target Character Distribution')
plt.xlabel('Character Index')
plt.ylabel('Frequency')

# Sequence length distribution
plt.subplot(2, 3, 2)
seq_lengths = [len(seq) for seq in X_sequences]
plt.hist(seq_lengths, bins=20, alpha=0.7, edgecolor='black')
plt.title('Sequence Length Distribution')
plt.xlabel('Length')
plt.ylabel('Frequency')
plt.axvline(sequence_length, color='red', linestyle='--', label=f'Target: {sequence_length}')
plt.legend()

# First few sequences visualization
plt.subplot(2, 3, 3)
first_seqs = X_sequences[:10]
for i, seq in enumerate(first_seqs):
    # Only the first three lines get a legend entry (label=None keeps the
    # remaining lines out of the legend).
    plt.plot(seq, alpha=0.7, label=f'Seq {i+1}' if i < 3 else None)
plt.title('First 10 Sequences')
plt.xlabel('Position in Sequence')
plt.ylabel('Character Index')
# The old condition (len(first_seqs) <= 3) hid the legend whenever more than
# three sequences were plotted, i.e. in the normal case.
if len(first_seqs) > 0:
    plt.legend()

# Character transition matrix (for first 10 most common chars)
plt.subplot(2, 3, 4)
common_chars = [idx for idx, _ in target_counts.most_common(10)]
# Row/column position of each common character, for O(1) lookups in the loop
matrix_pos = {idx: pos for pos, idx in enumerate(common_chars)}
transition_matrix = np.zeros((len(common_chars), len(common_chars)))

for seq, target in zip(X_sequences, y_targets):
    if len(seq) > 0 and seq[-1] in matrix_pos and target in matrix_pos:
        transition_matrix[matrix_pos[seq[-1]], matrix_pos[target]] += 1

plt.imshow(transition_matrix, cmap='Blues')
plt.title('Character Transition Heatmap\n(Top 10 chars)')
plt.xlabel('Target Character')
plt.ylabel('Previous Character')
plt.colorbar()

# Sequence overlap visualization
plt.subplot(2, 3, 5)
# Window i+1 starts step_size characters after window i, so the shared text is
# the tail of window i versus the head of window i+1. Comparing the two
# windows position by position (as before) only counted coincidental matches.
expected_overlap = max(sequence_length - step_size, 0)
overlaps = [
    int(np.sum(X_sequences[i][step_size:] == X_sequences[i + 1][:expected_overlap]))
    for i in range(len(X_sequences) - 1)
]

plt.hist(overlaps, bins=20, alpha=0.7, edgecolor='black')
plt.title('Sequence Overlap Distribution')
plt.xlabel('Number of Overlapping Characters')
plt.ylabel('Frequency')
plt.axvline(expected_overlap, color='red', linestyle='--',
           label=f'Expected: {expected_overlap}')
plt.legend()

# Data creation efficiency
plt.subplot(2, 3, 6)
total_chars = len(processed_text)
used_chars = len(X_sequences) * sequence_length
efficiency = used_chars / total_chars * 100

plt.bar(['Efficiency'], [efficiency], color='green', alpha=0.7)
# Overlapping windows push this well above 100% (roughly
# sequence_length / step_size * 100), so scale the axis to the value instead
# of clipping the bar at a fixed limit.
plt.ylim(0, max(100, efficiency * 1.1))
plt.title(f'Data Utilization: {efficiency:.1f}%')
plt.ylabel('Percentage')

plt.tight_layout()
plt.show()


In [None]:
# One-Hot Encoding Implementation
class OneHotEncoder:
    """
    A class to handle one-hot encoding for character sequences.

    Index arrays are expanded along a new last axis of length vocab_size that
    holds a single 1 at the index position and 0 everywhere else.
    """

    def __init__(self, vocab_size):
        """
        Initialize one-hot encoder

        Parameters:
        vocab_size: size of the vocabulary
        """
        self.vocab_size = vocab_size
        print(f"One-hot encoder initialized for vocabulary size: {vocab_size}")

    def _check_indices(self, indices):
        """Raise IndexError unless every index lies in [0, vocab_size)."""
        # NumPy would silently wrap negative indices around to the end of the
        # vocabulary axis, producing a valid-looking but wrong encoding.
        if indices.size and (indices.min() < 0 or indices.max() >= self.vocab_size):
            raise IndexError(
                f"character indices must be in [0, {self.vocab_size}), "
                f"got values from {indices.min()} to {indices.max()}"
            )

    def encode_sequences(self, sequences):
        """
        Convert sequences of indices to one-hot encoded arrays

        Parameters:
        sequences: integer array of shape (num_sequences, sequence_length)

        Returns:
        one_hot: array of shape (num_sequences, sequence_length, vocab_size)

        Raises:
        ValueError: if sequences is not 2-dimensional
        IndexError: if an index is outside [0, vocab_size)
        """
        sequences = np.asarray(sequences)
        if sequences.ndim != 2:
            raise ValueError(
                "sequences must have shape (num_sequences, sequence_length), "
                f"got shape {sequences.shape}"
            )
        num_sequences, sequence_length = sequences.shape

        # Initialize one-hot array
        one_hot = np.zeros((num_sequences, sequence_length, self.vocab_size))
        if sequences.size == 0:
            return one_hot

        self._check_indices(sequences)

        # Vectorized fill: for every (row, col) pair set the cell selected by
        # sequences[row, col]. The (n, 1) and (L,) index arrays broadcast
        # against the (n, L) array of character indices.
        rows = np.arange(num_sequences)[:, None]
        cols = np.arange(sequence_length)
        one_hot[rows, cols, sequences] = 1

        return one_hot

    def encode_targets(self, targets):
        """
        Convert target indices to one-hot encoded arrays

        Parameters:
        targets: integer array (or list) of shape (num_targets,)

        Returns:
        one_hot: array of shape (num_targets, vocab_size)

        Raises:
        ValueError: if targets is not 1-dimensional
        IndexError: if an index is outside [0, vocab_size)
        """
        targets = np.asarray(targets)
        if targets.ndim != 1:
            raise ValueError(
                f"targets must have shape (num_targets,), got shape {targets.shape}"
            )

        one_hot = np.zeros((len(targets), self.vocab_size))
        if targets.size == 0:
            return one_hot

        self._check_indices(targets)
        one_hot[np.arange(len(targets)), targets] = 1

        return one_hot

    def decode_predictions(self, predictions):
        """
        Convert one-hot predictions back to character indices

        Parameters:
        predictions: array of shape (num_predictions, vocab_size)

        Returns:
        indices: array of predicted character indices (position of the
                 largest value along the last axis)
        """
        return np.argmax(predictions, axis=-1)

# Create one-hot encoder
encoder = OneHotEncoder(vocab.vocab_size)

# Demonstrate one-hot encoding on a small sample
sample_size = 5
sample_sequences = X_sequences[:sample_size]
sample_targets = y_targets[:sample_size]

print("Original sequences (indices):")
print(sample_sequences)
print(f"Shape: {sample_sequences.shape}")

# Encode sequences
encoded_sequences = encoder.encode_sequences(sample_sequences)
encoded_targets = encoder.encode_targets(sample_targets)

print(f"\nOne-hot encoded sequences shape: {encoded_sequences.shape}")
print(f"One-hot encoded targets shape: {encoded_targets.shape}")

# Show example of one-hot encoding
print(f"\nExample: Character index {sample_targets[0]} becomes:")
print(encoded_targets[0])
print(f"Character: '{vocab.idx_to_char[sample_targets[0]]}'")

# Memory footprint of the SAME sample before and after one-hot encoding.
# The comparison used to divide the one-hot size of the 5-row sample by the
# index size of the full dataset, which made the ratio meaningless.
original_size = sample_sequences.nbytes / 1024  # KB
onehot_size = encoded_sequences.nbytes / 1024 if encoded_sequences.size > 0 else 0
compression_ratio = onehot_size / original_size if original_size > 0 else 0

# Visualize one-hot encoding
plt.figure(figsize=(15, 12))

# Visualize first sequence's one-hot encoding
plt.subplot(3, 2, 1)
first_seq_onehot = encoded_sequences[0]
plt.imshow(first_seq_onehot.T, cmap='Blues', aspect='auto')
plt.title('One-Hot Encoding of First Sequence')
plt.xlabel('Position in Sequence')
plt.ylabel('Character Index')
plt.colorbar()

# Show target one-hot encodings
plt.subplot(3, 2, 2)
plt.imshow(encoded_targets[:sample_size].T, cmap='Reds', aspect='auto')
plt.title('One-Hot Encoding of Targets')
plt.xlabel('Example Number')
plt.ylabel('Character Index')
plt.colorbar()

# Memory usage comparison
plt.subplot(3, 2, 3)
sizes = [original_size, onehot_size]
labels = ['Original\n(indices)', f'One-Hot\n(ratio: {compression_ratio:.1f}x)']
plt.bar(labels, sizes, color=['lightblue', 'lightcoral'])
plt.title('Memory Usage Comparison')
plt.ylabel('Size (KB)')

# Sparsity analysis
plt.subplot(3, 2, 4)
sparsity = 1 - np.count_nonzero(encoded_sequences) / encoded_sequences.size
plt.bar(['Sparsity'], [sparsity * 100], color='green', alpha=0.7)
plt.title(f'One-Hot Sparsity: {sparsity*100:.1f}%')
plt.ylabel('Percentage of Zeros')
plt.ylim(0, 100)

# Character distribution in one-hot format
plt.subplot(3, 2, 5)
char_sums = np.sum(encoded_sequences, axis=(0, 1))
plt.bar(range(len(char_sums)), char_sums)
plt.title('Character Frequency in One-Hot Data')
plt.xlabel('Character Index')
plt.ylabel('Total Occurrences')

# Encoding/decoding demonstration
plt.subplot(3, 2, 6)
# Create some dummy predictions to demonstrate decoding
dummy_predictions = np.random.rand(5, vocab.vocab_size)
dummy_predictions = dummy_predictions / dummy_predictions.sum(axis=1, keepdims=True)  # Normalize
decoded_indices = encoder.decode_predictions(dummy_predictions)

x_pos = range(len(decoded_indices))
plt.bar(x_pos, decoded_indices, alpha=0.7)
plt.title('Decoded Predictions Example')
plt.xlabel('Example Number')
plt.ylabel('Predicted Character Index')

# Annotate each bar with the character its index stands for
for i, idx in enumerate(decoded_indices):
    char = vocab.idx_to_char.get(idx, '?')
    plt.text(i, idx + 0.5, f"'{char}'", ha='center')

plt.tight_layout()
plt.show()

# Alternative: Using TensorFlow's built-in one-hot encoding
print("\nUsing TensorFlow's one-hot encoding:")
tf_onehot_sequences = tf.one_hot(sample_sequences, vocab.vocab_size)
tf_onehot_targets = tf.one_hot(sample_targets, vocab.vocab_size)

print(f"TensorFlow one-hot sequences shape: {tf_onehot_sequences.shape}")
print(f"TensorFlow one-hot targets shape: {tf_onehot_targets.shape}")

# Verify they're the same
sequences_match = np.allclose(encoded_sequences, tf_onehot_sequences.numpy())
targets_match = np.allclose(encoded_targets, tf_onehot_targets.numpy())

print(f"Manual vs TensorFlow encoding match - Sequences: {sequences_match}, Targets: {targets_match}")

# Memory efficiency analysis (same sample on both sides; the factor reuses the
# guarded ratio computed above instead of an unguarded second division)
print(f"\nMemory Efficiency Analysis:")
print(f"Original data size (sample of {len(sample_sequences)}): {original_size:.2f} KB")
print(f"One-hot data size (same sample): {onehot_size:.2f} KB")
print(f"Expansion factor: {compression_ratio:.1f}x")
print(f"Efficiency: Each index expanded to {vocab.vocab_size} floats")

# Summary statistics
print(f"\nSummary Statistics:")
print(f"- Vocabulary size: {vocab.vocab_size}")
print(f"- Sequence length: {sequence_length}")
print(f"- Number of training examples: {len(X_sequences)}")
print(f"- Input shape after one-hot (sample): {encoded_sequences.shape}")
print(f"- Target shape after one-hot (sample): {encoded_targets.shape}")
print(f"- Data sparsity: {sparsity*100:.1f}% zeros")

print("\nData preparation complete! Ready for model training.")
