# Week 6, Day 5: Language Models and Text Generation

## Learning Objectives
- Understand language model concepts
- Learn text generation techniques
- Master sequence modeling
- Practice implementing language models

## Topics Covered
1. Language Model Fundamentals
2. Text Generation Methods
3. Sequence-to-Sequence Models
4. Model Evaluation

In [None]:
# Import required libraries
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.lm import MLE
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

## 1. N-gram Language Models

In [None]:
def ngram_model_example():
    """Count and plot the bigrams and trigrams of a small sample text.

    Tokenizes a built-in four-sentence corpus with NLTK, counts every
    bigram and trigram, prints the five most frequent of each, and shows
    the two rank/frequency distributions as side-by-side bar charts.

    Returns:
        None. Results are printed and displayed with matplotlib.
    """
    # Sample text
    text = """
    Natural language processing is a field of artificial intelligence.
    Language models help computers understand and generate human text.
    Modern language models use deep learning techniques.
    These models can generate coherent and contextually relevant text.
    """

    # Tokenize text. The tokenizer models are not bundled with NLTK, so a
    # fresh environment raises LookupError; fetch them once and retry.
    try:
        tokens = word_tokenize(text.lower())
    except LookupError:
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)
        tokens = word_tokenize(text.lower())

    # Count n-grams. These are raw occurrence counts, not normalized
    # probabilities, so they are named accordingly.
    bigram_counts = Counter(ngrams(tokens, 2))
    trigram_counts = Counter(ngrams(tokens, 3))

    # Print most common n-grams (ties keep first-seen order)
    print("Most Common Bigrams:")
    for bigram, freq in bigram_counts.most_common(5):
        print(f"{bigram}: {freq}")

    print("\nMost Common Trigrams:")
    for trigram, freq in trigram_counts.most_common(5):
        print(f"{trigram}: {freq}")

    # Visualize n-gram distributions: one panel per n-gram order, with the
    # counts sorted from most to least frequent.
    plt.figure(figsize=(12, 5))

    panels = (
        (121, bigram_counts, 'Bigram'),
        (122, trigram_counts, 'Trigram'),
    )
    for position, counts, label in panels:
        plt.subplot(position)
        plt.bar(range(len(counts)), sorted(counts.values(), reverse=True))
        plt.title(f'{label} Distribution')
        plt.xlabel(f'{label} Rank')
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

ngram_model_example()

## 2. Neural Language Models

In [None]:
def neural_lm_example():
    """Train a tiny word-level LSTM language model and generate text with it.

    Builds next-word training pairs from five sample sentences, fits an
    Embedding -> LSTM -> softmax model for 100 epochs, plots the training
    loss and accuracy, and prints two greedy continuations.

    Returns:
        None. Results are printed and displayed with matplotlib.
    """
    # Sample sentences
    sentences = [
        "The cat sat on the mat",
        "The dog ran in the park",
        "A bird flew over the tree",
        "The sun shines in the sky",
        "A fish swims in the sea"
    ]

    # Tokenize text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    # +1 because word indices start at 1; index 0 is left for padding
    total_words = len(tokenizer.word_index) + 1

    # Create input sequences: every prefix of length >= 2 of each sentence,
    # so the last token of a prefix is the word to predict from the rest.
    input_sequences = []
    for sentence in sentences:
        token_list = tokenizer.texts_to_sequences([sentence])[0]
        for i in range(1, len(token_list)):
            input_sequences.append(token_list[:i+1])

    # Pad sequences on the left so the target stays in the last column
    max_sequence_len = max(len(seq) for seq in input_sequences)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # Create predictors and target
    X = input_sequences[:, :-1]
    y = input_sequences[:, -1]
    y = tf.keras.utils.to_categorical(y, num_classes=total_words)

    # Create model
    model = Sequential([
        Embedding(total_words, 16, input_length=max_sequence_len-1),
        LSTM(32),
        Dense(total_words, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train model
    history = model.fit(X, y, epochs=100, verbose=0)

    # Plot training history
    plt.figure(figsize=(12, 4))

    plt.subplot(121)
    plt.plot(history.history['loss'])
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')

    plt.subplot(122)
    plt.plot(history.history['accuracy'])
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')

    plt.tight_layout()
    plt.show()

    # Generate text
    def generate_text(seed_text, next_words=5):
        """Greedily extend seed_text by up to next_words predicted words.

        Args:
            seed_text: Text to start from.
            next_words: Maximum number of words to append.

        Returns:
            The seed text followed by the predicted words.
        """
        for _ in range(next_words):
            token_list = tokenizer.texts_to_sequences([seed_text])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
            probabilities = model.predict(token_list, verbose=0)
            # predict() returns one row per input sequence; reduce the
            # one-element argmax array to a plain int before the lookup.
            predicted_index = int(np.argmax(probabilities, axis=-1)[0])
            output_word = tokenizer.index_word.get(predicted_index)
            if output_word is None:
                # The padding index has no word. Appending nothing would
                # leave the input unchanged, so further steps are pointless.
                break
            seed_text += " " + output_word
        return seed_text

    # Test text generation
    print("\nGenerated Text Examples:")
    print(generate_text("The cat"))
    print(generate_text("A bird"))

neural_lm_example()

## 3. Text Generation

In [None]:
def text_generation_example():
    """Train a character-level stacked-LSTM model and generate text with it.

    Builds next-character training pairs from four sample sentences, fits an
    Embedding -> LSTM -> LSTM -> softmax model for 100 epochs, prints a
    greedy continuation of "The quick", and plots the training loss and
    accuracy.

    Returns:
        None. Results are printed and displayed with matplotlib.
    """
    # Sample text corpus
    text = """
    The quick brown fox jumps over the lazy dog.
    A quick brown dog jumps over the lazy fox.
    The lazy brown fox sleeps under the quick dog.
    A lazy dog sleeps under the quick brown fox.
    """

    # The literal is indented to match the code, so strip each line and
    # drop the blank ones; otherwise the indentation spaces become
    # training data at the start of every sequence.
    lines = [line.strip() for line in text.splitlines() if line.strip()]

    # Tokenize text
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(lines)
    # +1 because character indices start at 1; index 0 is left for padding
    total_chars = len(tokenizer.word_index) + 1

    # Create character sequences: every prefix of length >= 2 of each line,
    # so the last character of a prefix is the one to predict from the rest.
    input_sequences = []
    for line in lines:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            input_sequences.append(token_list[:i+1])

    # Pad sequences on the left so the target stays in the last column
    max_sequence_len = max(len(seq) for seq in input_sequences)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # Create predictors and target
    X = input_sequences[:, :-1]
    y = input_sequences[:, -1]
    y = tf.keras.utils.to_categorical(y, num_classes=total_chars)

    # Create model
    model = Sequential([
        Embedding(total_chars, 16, input_length=max_sequence_len-1),
        LSTM(32, return_sequences=True),
        LSTM(32),
        Dense(total_chars, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train model
    history = model.fit(X, y, epochs=100, verbose=0)

    # Generate text
    def generate_text(seed_text, next_chars=50):
        """Greedily extend seed_text by up to next_chars predicted characters.

        Args:
            seed_text: Text to start from.
            next_chars: Maximum number of characters to append.

        Returns:
            The seed text followed by the predicted characters.
        """
        for _ in range(next_chars):
            token_list = tokenizer.texts_to_sequences([seed_text])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
            probabilities = model.predict(token_list, verbose=0)
            # predict() returns one row per input sequence; reduce the
            # one-element argmax array to a plain int before the lookup.
            predicted_index = int(np.argmax(probabilities, axis=-1)[0])
            output_char = tokenizer.index_word.get(predicted_index)
            if output_char is None:
                # The padding index has no character. Appending nothing
                # would leave the input unchanged, so every remaining step
                # would repeat the same prediction.
                break
            seed_text += output_char
        return seed_text

    # Test text generation
    print("Generated Text:")
    print(generate_text("The quick"))

    # Plot training history
    plt.figure(figsize=(12, 4))

    plt.subplot(121)
    plt.plot(history.history['loss'])
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')

    plt.subplot(122)
    plt.plot(history.history['accuracy'])
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')

    plt.tight_layout()
    plt.show()

text_generation_example()

## Practical Exercises

In [None]:
# Exercise 1: Custom Language Model

def language_model_exercise():
    """Print the task list for the custom language model exercise.

    The sample corpus is starter data for the learner's own solution; the
    scaffold itself only prints the instructions.
    """
    # Sample text corpus
    corpus = [
        "Machine learning is a subset of artificial intelligence.",
        "Deep learning models can process complex data.",
        "Neural networks are inspired by biological systems.",
        "Data science combines statistics and programming."
    ]

    # Task heading followed by the numbered steps, one per output line
    instructions = (
        "Task: Build a custom language model",
        "1. Implement text preprocessing",
        "2. Create model architecture",
        "3. Train the model",
        "4. Generate text",
    )
    for instruction in instructions:
        print(instruction)

    # Your code here

language_model_exercise()

In [None]:
# Exercise 2: Text Style Transfer

def style_transfer_exercise():
    """Print the task list for the text style transfer exercise.

    The formal and informal sample texts are starter data for the learner's
    own solution; the scaffold itself only prints the instructions.
    """
    # Sample texts in different styles
    formal_texts = [
        "The meeting is scheduled for tomorrow afternoon.",
        "Please find attached the requested documents.",
        "We look forward to your response."
    ]

    informal_texts = [
        "Hey, let's meet up tomorrow!",
        "Here are the docs you wanted.",
        "Can't wait to hear back from you!"
    ]

    # Task heading followed by the numbered steps, one per output line
    instructions = (
        "Task: Implement text style transfer",
        "1. Create style embeddings",
        "2. Build transfer model",
        "3. Transform text",
        "4. Evaluate results",
    )
    for instruction in instructions:
        print(instruction)

    # Your code here

style_transfer_exercise()

## MCQ Quiz

1. What is a language model?
   - a) Translation system
   - b) Probability distribution over text
   - c) Text editor
   - d) Speech recognition

2. What is perplexity?
   - a) Model size
   - b) Model performance measure
   - c) Text length
   - d) Training time

3. What are n-grams?
   - a) Neural networks
   - b) Sequence of n items
   - c) Text classification
   - d) Model parameters

4. What is beam search?
   - a) Data preprocessing
   - b) Text generation method
   - c) Model architecture
   - d) Training algorithm

5. What is temperature in text generation?
   - a) Hardware metric
   - b) Randomness parameter
   - c) Model size
   - d) Training time

6. What is the purpose of tokenization?
   - a) Text generation
   - b) Splitting text into discrete units
   - c) Model training
   - d) Evaluation

7. What is a sequence-to-sequence model?
   - a) Classification model
   - b) Sequence transformation
   - c) Clustering algorithm
   - d) Visualization tool

8. What is an attention mechanism?
   - a) Training method
   - b) Focus on relevant parts
   - c) Data preprocessing
   - d) Model evaluation

9. What is teacher forcing?
   - a) Model architecture
   - b) Training technique
   - c) Evaluation metric
   - d) Data augmentation

10. What is the purpose of sampling strategies?
    - a) Data collection
    - b) Text generation control
    - c) Model training
    - d) Error analysis

Answers: 1-b, 2-b, 3-b, 4-b, 5-b, 6-b, 7-b, 8-b, 9-b, 10-b