In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Task 1: Dataset Setup**

In this task, we will:
1. Download and upload the CUB-200-2011 dataset to the Kaggle notebook.
2. Extract the dataset and explore its folder structure to understand its contents.
3. Load and display metadata, including:
   - Class labels (`classes.txt`)
   - File paths (`images.txt`)
   - Train/test split (`train_test_split.txt`)

In [None]:
# Import necessary libraries
import pandas as pd
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import layers, Model
from tensorflow.keras.utils import plot_model
from transformers import BertTokenizer
from PIL import Image
from tqdm import tqdm

# Locations of the three CUB-200-2011 metadata files used below
classes_file = "/kaggle/input/assignment-3-cub200-2011/CUB_200_2011/classes.txt"
images_file = "/kaggle/input/assignment-3-cub200-2011/CUB_200_2011/images.txt"
train_test_split_file = "/kaggle/input/assignment-3-cub200-2011/CUB_200_2011/train_test_split.txt"

# Step 1: Walk the dataset directory and report what each folder contains
print("Dataset folder structure:")
for root, dirs, files in os.walk("/kaggle/input/assignment-3-cub200-2011/"):
    # Only the first five file NAMES of each directory are printed to keep the listing short.
    folder_report = (f"Root: {root}", f"Dirs: {dirs}", f"Files: {files[:5]}", "\n")
    for report_line in folder_report:
        print(report_line)

# Step 2: Read the space-separated, header-less metadata files into DataFrames
metadata_format = dict(sep=" ", header=None)
classes_df = pd.read_csv(classes_file, names=["Class_ID", "Class_Name"], **metadata_format)
images_df = pd.read_csv(images_file, names=["Image_ID", "Image_Path"], **metadata_format)
train_test_df = pd.read_csv(train_test_split_file, names=["Image_ID", "Is_Training"], **metadata_format)

# Step 3: Show the first five rows of every metadata table
metadata_previews = (
    ("Classes Metadata (first 5 rows):", classes_df),
    ("Images Metadata (first 5 rows):", images_df),
    ("Train/Test Split Metadata (first 5 rows):", train_test_df),
)
for heading, frame in metadata_previews:
    print(heading)
    print(frame.head(), "\n")

## **Task 2: Data Preprocessing**

In this task, we will:
1. Preprocess the textual descriptions using a BERT tokenizer to create input embeddings for all images.
2. Resize all bird images to 64 × 64 pixels and normalize their pixel values.
3. Create a TensorFlow dataset pipeline that combines the image data and BERT embeddings for the entire dataset.

### Deliverables:
- Python scripts to preprocess the text and images.
- Output displaying the shape of batches (images and embeddings).

In [None]:
# File paths
image_root = "/kaggle/input/assignment-3-cub200-2011/CUB_200_2011/images"

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Mock textual descriptions for simplicity (replace with actual descriptions if available).
# One placeholder is generated per row of images.txt (images_df from Task 1) instead of a
# hard-coded 11788, so the number of descriptions follows the dataset that is actually mounted.
text_descriptions = [f"Description for image {i+1}" for i in range(len(images_df))]

# Step 1: Tokenize textual descriptions
def tokenize_texts(texts, tokenizer, max_length=128):
    """Tokenize every text individually and stack the per-text rows.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Callable invoked once per text with ``max_length``,
            ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="np"``. Its result must provide ``'input_ids'``
            and ``'attention_mask'`` entries whose element ``[0]`` is the row
            for that text.
        max_length: Sequence length forwarded to the tokenizer.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of NumPy arrays with one row
        per input text.
    """
    print("Tokenizing textual descriptions...")

    def _encode(text):
        # Same keyword arguments for every text, so all rows share one length.
        return tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="np",
        )

    encodings = [_encode(text) for text in tqdm(texts, desc="Tokenizing text")]

    # Each encoding carries a leading batch axis of size one; keep only the row.
    id_rows = [encoding['input_ids'][0] for encoding in encodings]
    mask_rows = [encoding['attention_mask'][0] for encoding in encodings]

    return np.array(id_rows), np.array(mask_rows)

# Tokenize all descriptions once; tokenize_texts returns the stacked 'input_ids' rows and the
# matching 'attention_mask' rows as two NumPy arrays.
input_ids, attention_masks = tokenize_texts(text_descriptions, tokenizer)

# Step 2: Process images
def process_image(image_path):
    """Load one image as a 64x64 RGB array scaled to the generator's output range.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        ``np.ndarray`` of shape ``(64, 64, 3)`` and dtype ``float32`` with
        values in ``[-1, 1]``.
    """
    # The context manager closes the file handle right away. ``convert`` and
    # ``resize`` return new in-memory images, so ``img`` stays usable after
    # the source has been closed.
    with Image.open(image_path) as source:
        img = source.convert('RGB').resize((64, 64))  # Resize to 64x64

    # Scale 0..255 to [-1, 1]: the generator ends in ``tanh`` and the
    # visualisation code maps [-1, 1] back to [0, 1], so real images must use
    # the same range as generated ones. float32 also halves memory compared
    # with the float64 produced by dividing a uint8 array by a Python float.
    return np.asarray(img, dtype=np.float32) / 127.5 - 1.0

def load_images(image_root):
    """Preprocess every ``.jpg`` file found below ``image_root``.

    Args:
        image_root: Directory that is walked recursively.

    Returns:
        ``np.ndarray`` stacking the ``process_image`` result of each file,
        ordered by the sorted full file path.
    """
    print("Processing images...")

    # Collect the matching files first so the order is deterministic (sorted by path).
    jpg_paths = []
    for dir_path, _subdirs, file_names in os.walk(image_root):
        for file_name in file_names:
            if file_name.endswith('.jpg'):
                jpg_paths.append(os.path.join(dir_path, file_name))
    jpg_paths.sort()

    # Iterating through tqdm advances the progress bar once per processed image.
    processed = [process_image(path) for path in tqdm(jpg_paths, desc="Processing images")]
    return np.array(processed)

# Eagerly preprocess every .jpg found under image_root into a single in-memory NumPy array.
image_data = load_images(image_root)

# Step 3: Combine images and text embeddings into a TensorFlow dataset
def create_dataset(image_data, input_ids, attention_masks, batch_size=32):
    """Bundle images and tokenizer outputs into a batched ``tf.data`` pipeline.

    Args:
        image_data: Array of images; sliced along its first axis.
        input_ids: Array of token-ID rows aligned with ``image_data``.
        attention_masks: Array of attention-mask rows aligned with ``image_data``.
        batch_size: Number of consecutive samples per batch.

    Returns:
        A dataset whose elements are ``(images, (input_ids, attention_masks))``
        batches, in the original sample order, with prefetching enabled.
    """
    text_inputs = (input_ids, attention_masks)
    samples = tf.data.Dataset.from_tensor_slices((image_data, text_inputs))
    batched = samples.batch(batch_size)
    # Let tf.data pick the prefetch buffer size.
    return batched.prefetch(tf.data.experimental.AUTOTUNE)

# Batch the preprocessed arrays into the dataset consumed by the later cells
batch_size = 32
dataset = create_dataset(image_data, input_ids, attention_masks, batch_size=batch_size)

# Peek at a single batch and report the shape of each of its components
for first_batch in dataset.take(1):
    images, (ids, masks) = first_batch
    image_shape, ids_shape, masks_shape = images.shape, ids.shape, masks.shape
    print(f"Image batch shape: {image_shape}")
    print(f"Text embeddings shape: {ids_shape}, Attention masks shape: {masks_shape}")

## **Task 3: GAN Implementation**

### Objectives:
1. **Implement the Generator Model**:
   - A neural network conditioned on BERT embeddings, capable of generating bird images of size `(64, 64, 3)`.
2. **Implement the Discriminator Model**:
   - A neural network that distinguishes real bird images from fake images produced by the generator.
3. **Combine the Models into a cGAN**:
   - Integrate the generator and discriminator into a Conditional GAN (cGAN) architecture.
4. **Visualize the Architectures**:
   - Save visualizations of the following model architectures:
     - Generator
     - Discriminator
     - Combined cGAN

### Deliverables:
- Python scripts defining the GAN, cGAN, and their components.
- Saved visualizations of the model architectures:
  - `generator.png`
  - `discriminator.png`
  - `cgan.png`


In [None]:
# Generator Model
def build_generator(input_dim, embedding_dim=128, image_size=(64, 64, 3)):
    """Build the generator network.

    Args:
        input_dim: Width of the conditioning vector fed to the network.
        embedding_dim: Accepted but not read by this function.
        image_size: Accepted but not read by this function; the output size is
            fixed by the 8x8 seed and the three stride-2 upsampling layers.

    Returns:
        Keras ``Model`` named ``"Generator"`` mapping a ``(input_dim,)`` vector
        to a ``(64, 64, 3)`` image with ``tanh`` activation.
    """
    conditioning = layers.Input(shape=(input_dim,), name="BERT_Embedding")

    # Project the vector to an 8x8 feature map with 256 channels.
    projected = layers.Dense(8 * 8 * 256, activation="relu")(conditioning)
    feature_map = layers.Reshape((8, 8, 256))(projected)

    # Two upsampling stages (8 -> 16 -> 32), each followed by batch normalisation.
    for filters in (128, 64):
        feature_map = layers.Conv2DTranspose(
            filters, (4, 4), strides=(2, 2), padding="same", activation="relu"
        )(feature_map)
        feature_map = layers.BatchNormalization()(feature_map)

    # Final upsampling (32 -> 64) to three channels squashed into [-1, 1].
    image = layers.Conv2DTranspose(
        3, (4, 4), strides=(2, 2), padding="same", activation="tanh"
    )(feature_map)

    return Model(conditioning, image, name="Generator")

# Discriminator Model
def build_discriminator(image_size=(64, 64, 3), embedding_dim=128):
    """Build the conditional discriminator.

    Args:
        image_size: Shape of the image input.
        embedding_dim: Width of the conditioning-vector input.

    Returns:
        Keras ``Model`` named ``"Discriminator"`` taking ``[image, embedding]``
        and returning one sigmoid-activated value per sample.
    """
    image_input = layers.Input(shape=image_size, name="Image_Input")
    embedding_input = layers.Input(shape=(embedding_dim,), name="BERT_Embedding")

    # Image branch: two stride-2 convolutions, then flatten.
    image_features = image_input
    for filters in (64, 128):
        image_features = layers.Conv2D(
            filters, (4, 4), strides=(2, 2), padding="same", activation="leaky_relu"
        )(image_features)
    image_features = layers.Flatten()(image_features)

    # Conditioning branch: dense projection, reshaped and flattened again.
    text_features = layers.Dense(8 * 8 * 128, activation="relu")(embedding_input)
    text_features = layers.Reshape((8, 8, 128))(text_features)
    text_features = layers.Flatten()(text_features)

    # Joint head over both branches.
    merged = layers.Concatenate()([image_features, text_features])
    hidden = layers.Dense(256, activation="leaky_relu")(merged)
    validity = layers.Dense(1, activation="sigmoid")(hidden)

    return Model([image_input, embedding_input], validity, name="Discriminator")

# cGAN Model
def build_cgan(generator, discriminator):
    """Chain generator and discriminator into the combined cGAN model.

    Args:
        generator: Single-input Keras model mapping a conditioning vector to an image.
        discriminator: Keras model taking ``[image, conditioning_vector]``.

    Returns:
        Keras ``Model`` named ``"cGAN"`` mapping a conditioning vector to the
        discriminator's verdict on the image generated from it.

    Side effects:
        Sets ``discriminator.trainable = False`` on the object passed in.
    """
    discriminator.trainable = False

    # Take the input width from the generator instead of hard-coding 128, so the
    # combined model also works for generators built with another ``input_dim``.
    embedding_shape = tuple(generator.input_shape[1:])

    input_embedding = layers.Input(shape=embedding_shape, name="BERT_Embedding")
    generated_image = generator(input_embedding)
    validity = discriminator([generated_image, input_embedding])

    cgan = Model(input_embedding, validity, name="cGAN")
    return cgan

# Build Models
embedding_dim = 128
image_size = (64, 64, 3)
generator = build_generator(input_dim=embedding_dim, image_size=image_size)
discriminator = build_discriminator(image_size=image_size, embedding_dim=embedding_dim)

# Compile Discriminator
# This has to happen BEFORE build_cgan(): build_cgan sets discriminator.trainable = False, and
# compiling afterwards would compile the standalone discriminator in its frozen state.
optimizer = tf.keras.optimizers.Adam(0.0002, 0.5)
discriminator.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

# Build and compile cGAN
# The combined model gets its own Adam instance: an optimizer keeps per-variable state, so one
# instance should not be shared by two models that update different sets of variables.
cgan = build_cgan(generator, discriminator)
cgan_optimizer = tf.keras.optimizers.Adam(0.0002, 0.5)
cgan.compile(loss="binary_crossentropy", optimizer=cgan_optimizer)

# Save Model Visualizations
plot_model(generator, to_file="generator.png", show_shapes=True, show_layer_names=True)
plot_model(discriminator, to_file="discriminator.png", show_shapes=True, show_layer_names=True)
plot_model(cgan, to_file="cgan.png", show_shapes=True, show_layer_names=True)

# Summary Outputs
print("Generator Summary:")
generator.summary()
print("Discriminator Summary:")
discriminator.summary()
print("cGAN Summary:")
cgan.summary()

## **Task 4: Training the cGAN**

In this task, we will:
1. Implement the training loop for the cGAN, which includes:
   - Updating the generator to improve the quality of generated images.
   - Updating the discriminator to distinguish between real and fake images.
2. Track and visualize the loss values for both the generator and the discriminator during training.

### Deliverables:
- A Python script that implements the cGAN training loop.
- Loss plots for the generator and discriminator to observe the progression of training.

In [None]:
# Constants
# Latent size for the training cell; 128 equals the embedding_dim the models were built with.
latent_dim = 128

# Define loss functions
def generator_loss(fake_output):
    """Binary cross-entropy that rewards the generator for fooling the discriminator.

    Args:
        fake_output: Discriminator outputs for generated images. The
            discriminator's last layer uses a sigmoid, so these are
            probabilities in ``[0, 1]``.

    Returns:
        Scalar loss comparing ``fake_output`` against an all-ones target.
    """
    # from_logits=False: the inputs are already sigmoid probabilities. Treating
    # them as logits would apply a second sigmoid inside the loss and distort
    # both the loss value and its gradients.
    bce = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    return bce(tf.ones_like(fake_output), fake_output)

def discriminator_loss(real_output, fake_output):
    """Binary cross-entropy for the discriminator on one real and one fake batch.

    Args:
        real_output: Discriminator outputs for real images (sigmoid probabilities).
        fake_output: Discriminator outputs for generated images (sigmoid probabilities).

    Returns:
        Sum of the loss on ``real_output`` against ones and the loss on
        ``fake_output`` against zeros.
    """
    # from_logits=False because the discriminator's final layer already applies
    # a sigmoid; the values received here are probabilities, not raw logits.
    bce = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    real_loss = bce(tf.ones_like(real_output), real_output)
    fake_loss = bce(tf.zeros_like(fake_output), fake_output)
    return real_loss + fake_loss

# Optimizers
# Two independent Adam instances (learning rate 1e-4): train_step applies the generator's
# gradients with the first and the discriminator's gradients with the second.
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

# Training step
@tf.function
def train_step(generator, discriminator, images, bert_embeddings):
    """Run one simultaneous generator/discriminator update on a single batch.

    Args:
        generator: Model mapping the conditioning batch to images.
        discriminator: Model taking ``[images, conditioning]``.
        images: Batch of real images.
        bert_embeddings: Batch of conditioning vectors aligned with ``images``.

    Returns:
        Tuple ``(gen_loss, disc_loss)`` of scalar tensors for this batch.
    """
    # No noise tensor is sampled here: the generator takes only the conditioning
    # vector, so the previously sampled noise was never used, and building it
    # required a static batch dimension on ``images``.

    # Ensure discriminator is trainable (build_cgan sets it to False). This is a
    # Python-side assignment, so it takes effect while the function is traced.
    discriminator.trainable = True

    # Make the dtypes explicit rather than relying on implicit casting in the models.
    # NOTE(review): bert_embeddings is presumably the integer token-ID array from the tokenizer - confirm.
    images = tf.cast(images, tf.float32)
    bert_embeddings = tf.cast(bert_embeddings, tf.float32)

    # Two tapes, because each network's gradients are taken from its own loss.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator(bert_embeddings, training=True)

        real_output = discriminator([images, bert_embeddings], training=True)
        fake_output = discriminator([generated_images, bert_embeddings], training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

    return gen_loss, disc_loss

# Training loop
def train(generator, discriminator, dataset, epochs):
    """Train both networks for ``epochs`` passes over ``dataset``.

    Args:
        generator: Generator model forwarded to ``train_step``.
        discriminator: Discriminator model forwarded to ``train_step``.
        dataset: Re-iterable of ``(images, (bert_embeddings, attention_masks))``
            batches. It does not need to support ``len()``.
        epochs: Number of passes over ``dataset``.

    Returns:
        Tuple ``(gen_losses, disc_losses)``: two lists of Python floats holding
        the mean per-batch loss of each epoch.

    Raises:
        ValueError: If ``dataset`` yields no batches.
    """
    gen_losses = []
    disc_losses = []

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        epoch_gen_loss = 0.0
        epoch_disc_loss = 0.0
        # Count batches while iterating instead of calling len(dataset), which
        # fails for datasets whose length is not known in advance.
        num_batches = 0

        for images, (bert_embeddings, _) in tqdm(dataset, desc=f"Epoch {epoch + 1}", leave=False):
            gen_loss, disc_loss = train_step(generator, discriminator, images, bert_embeddings)
            # Store plain floats so the history holds numbers rather than tensors.
            epoch_gen_loss += float(gen_loss)
            epoch_disc_loss += float(disc_loss)
            num_batches += 1

        if num_batches == 0:
            raise ValueError("dataset yielded no batches; cannot compute epoch losses")

        gen_losses.append(epoch_gen_loss / num_batches)
        disc_losses.append(epoch_disc_loss / num_batches)

        print(f"Generator Loss: {gen_losses[-1]:.4f}, Discriminator Loss: {disc_losses[-1]:.4f}")

    return gen_losses, disc_losses

# Plotting function
def plot_losses(gen_losses, disc_losses):
    """Draw both per-epoch loss curves on one figure and display it.

    Args:
        gen_losses: Sequence of generator losses, one value per epoch.
        disc_losses: Sequence of discriminator losses, one value per epoch.
    """
    plt.figure(figsize=(10, 5))

    # Generator first, then discriminator, so the legend order stays fixed.
    curves = ((gen_losses, "Generator Loss"), (disc_losses, "Discriminator Loss"))
    for values, curve_label in curves:
        plt.plot(values, label=curve_label)

    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.title("Generator and Discriminator Loss")
    plt.show()

# Training execution
# EPOCHS is the number of full passes over `dataset`; train() returns one averaged
# generator loss and one averaged discriminator loss per epoch.
EPOCHS = 50
print("Starting training...")
gen_losses, disc_losses = train(generator, discriminator, dataset, EPOCHS)

# Plot losses (both curves on a single figure)
plot_losses(gen_losses, disc_losses)

## **Task 5: Evaluation and Reflection**

In this task, we will:
1. Generate synthetic bird images using the trained cGAN:
   - Use the generator to create a batch of images conditioned on BERT embeddings.
   - Visualize the generated images to assess their quality.
2. Reflect on the quality of the generated images:
   - Evaluate the realism and diversity of the synthetic images.
   - Identify any noticeable artifacts or issues in the generated outputs.
3. Suggest potential improvements to the model or training process based on the observations.

### Deliverables:
- A Python script to generate and visualize synthetic bird images.
- A written reflection (3-5 sentences) discussing the quality of the generated images and proposing potential improvements.

In [None]:
# Reuse the tokenizer output from Task 2 as the generator's conditioning input.
# NOTE(review): input_ids holds the tokenizer's 'input_ids' rows (see tokenize_texts), i.e. token
# IDs rather than vectors produced by a BERT model - confirm this is the intended conditioning.
# At notebook top level locals() is the module namespace, so this checks that the Task 2 cell ran.
if 'input_ids' in locals():
    bert_embeddings = input_ids  # plain alias of the same array; replace if named differently
else:
    raise ValueError("BERT embeddings (input_ids) not found. Ensure Task 2 has been completed.")

# Function to generate and visualize synthetic bird images
def generate_synthetic_images(generator, embeddings, num_images=10):
    """Generate images for the first few conditioning vectors and show them in a row.

    Args:
        generator: Object whose ``predict(batch, verbose=0)`` returns one image
            per conditioning vector, with values expected in ``[-1, 1]``.
        embeddings: Sliceable collection of conditioning vectors.
        num_images: Maximum number of images to draw; capped at ``len(embeddings)``.
    """
    # Never ask for more images than there are conditioning vectors.
    count = min(num_images, len(embeddings))

    # The first ``count`` vectors are used, in order.
    conditioning = embeddings[:count]

    print("Generating synthetic bird images...")
    raw_images = generator.predict(conditioning, verbose=0)

    # Map [-1, 1] to [0, 1] so the images can be displayed.
    display_images = (raw_images + 1) / 2.0

    # One subplot per image, laid out in a single row.
    plt.figure(figsize=(15, 5))
    for slot in range(1, count + 1):
        plt.subplot(1, count, slot)
        plt.imshow(display_images[slot - 1])
        plt.axis('off')
    plt.suptitle("Generated Synthetic Bird Images", fontsize=16)
    plt.show()

# Number of images to generate (generate_synthetic_images caps this at len(embeddings))
num_images_to_generate = 10

# Call the function to generate and visualize images for the first conditioning vectors
generate_synthetic_images(generator, bert_embeddings, num_images=num_images_to_generate)

## **Knowledge Questions**

### 1. GANs often face mode collapse, where the generator produces limited variations of data. What techniques can be introduced to mitigate mode collapse, and how would you evaluate their effectiveness in this assignment’s context?

**Answer**:  
Mode collapse occurs when the generator produces limited variations of outputs. Techniques to mitigate this include:  
- **Mini-batch discrimination**: Introduce a layer in the discriminator to compare samples within a batch to encourage diversity.  
- **Feature matching**: Modify the generator’s objective to match the discriminator’s intermediate layer features instead of focusing solely on fooling the discriminator.  
- **Spectral normalization**: Apply this to the discriminator to stabilize training and improve gradient flows.  
- **Noise regularization**: Add random noise to the input or latent space to encourage variability.  

In this assignment’s context, evaluating effectiveness could involve analyzing the diversity of generated bird images using metrics like the Fréchet Inception Distance (FID) or qualitative assessment of variations in key attributes such as color, size, or pose.

---

### 2. Compare and contrast Wasserstein GANs (WGANs) with standard GANs in terms of training stability and convergence. Would WGAN principles be beneficial for the cGAN architecture used here? Why or why not?

**Answer**:  
WGANs differ from standard GANs in the following ways:
- **Training Stability**: WGANs use the Earth Mover's Distance (Wasserstein distance) as a loss function, which provides smoother gradients and avoids vanishing gradient issues common in standard GANs.
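- **Convergence**: WGANs tend to converge more reliably because the critic is kept approximately 1-Lipschitz — via weight clipping in the original WGAN, or a gradient penalty in WGAN-GP — and is typically trained for several steps per generator update.  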

For the cGAN in this assignment, WGAN principles could be beneficial as they may stabilize training, especially given the adversarial dynamics between the generator and discriminator. However, integrating WGAN principles would require modifications like replacing the binary cross-entropy loss with Wasserstein loss and potentially rebalancing the discriminator's training frequency.

---

### 3. How would you assess whether BERT embeddings capture enough semantic information relevant to the bird species in the dataset? Propose a method for evaluating their effectiveness.

**Answer**:  
To assess the effectiveness of BERT embeddings in capturing semantic information:
1. **Embedding Visualization**: Use dimensionality reduction techniques like t-SNE or PCA to visualize the embeddings and check for clustering corresponding to bird species.
2. **Classification Task**: Train a simple classifier (e.g., logistic regression) on the embeddings to predict bird species and evaluate its accuracy.  
3. **Semantic Correlation**: Calculate the cosine similarity between embeddings of similar species descriptions and compare them with dissimilar species. High intra-species similarity and low inter-species similarity indicate effectiveness.

---

### 4. GAN training is inherently unstable due to the adversarial dynamics between the generator and discriminator. Propose a method to detect instability early in the training process and adjust hyperparameters dynamically to stabilize training.

**Answer**:  
To detect instability early:
- **Monitor Loss Trends**: Plot the generator and discriminator losses. Oscillating or diverging losses indicate instability.
- **Gradient Norms**: Track gradient norms for exploding or vanishing gradients.  
- **Diversity Metrics**: Measure the diversity of generated images to detect mode collapse.  

Dynamic adjustments:
- **Learning Rate Adjustment**: Reduce the learning rate if instability is detected.
- **Training Frequency**: Adjust the training frequency of the generator and discriminator to balance their updates.
- **Regularization**: Introduce techniques like weight decay or gradient penalty to stabilize gradients.

---

### 5. Discuss the ethical implications of using cGANs in sensitive applications such as healthcare or media generation. What guidelines would you propose for responsible use?

**Answer**:  
Ethical implications of cGANs include:
- **Misinformation**: cGANs could generate fake content, leading to the spread of misinformation.
- **Bias Propagation**: Training on biased datasets may lead to biased outputs.  
- **Privacy Concerns**: Generated data resembling real individuals or entities could breach privacy.  

Guidelines for responsible use:
- **Dataset Transparency**: Ensure datasets are unbiased and sourced ethically.  
- **Content Labeling**: Clearly label synthetic content to prevent misuse.  
- **Restricted Access**: Use cGANs in controlled environments and for positive applications like medical imaging or artistic creativity.

---

### 6. Text-to-image generation tasks raise concerns about intellectual property rights, especially when trained on publicly available datasets. Analyze how such concerns apply to this assignment and suggest ways to mitigate potential legal risks.

**Answer**:  
Concerns:
- **Copyright Issues**: Training on datasets with copyrighted images could infringe intellectual property rights.  
- **Derivative Works**: Generated images may resemble original copyrighted works, raising legal issues.

Mitigation:
- **Dataset Licensing**: Use datasets with clear licensing terms allowing derivative works.  
- **Attribution**: Acknowledge the sources of datasets.  
- **Content Screening**: Regularly audit generated images to ensure they do not closely replicate copyrighted material.

---
