# Multimodal Prototypes in TensorFlow
This notebook implements:
- Visual Question Answering (VQA) using CNN + LSTM fusion.
- Cross-Modal Retrieval (CLIP-style) with dual encoders.

The prototypes are lightweight, Colab-ready, and intended for undergraduate learning.

In [None]:
!pip install tensorflow

## Imports

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.datasets import cifar10
from sklearn.model_selection import train_test_split

# Show the TensorFlow version in use so a run can be tied to a specific release.
print(tf.__version__)

## Dummy Text Data (Questions/Labels)
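For simplicity, we load CIFAR-10 and generate one toy yes/no question per image from its class label, e.g. `is this a airplane?`. By default every question asks about the image's true class, so every answer is "yes" (label `1`); the data only exercises the pipeline and is not a meaningful VQA benchmark.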

In [None]:
# Load CIFAR-10 and rescale pixel values by 1/255.
# Cast to float32 *before* dividing: dividing the integer image arrays by the
# Python float 255.0 would promote them to float64, doubling memory for no
# benefit (Keras layers compute in float32 by default).
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = x_train.astype("float32") / 255.0
x_test = x_test.astype("float32") / 255.0

# Human-readable class names; the list position is used as the integer label id
# when questions are generated (classes[label]).
classes = ["airplane","automobile","bird","cat","deer","dog","frog","horse","ship","truck"]

# create simple yes/no questions
def make_questions(images, labels, negative_fraction=0.0, seed=None, class_names=None):
    """Build one toy yes/no question per label.

    Args:
        images: Unused; kept so existing calls ``make_questions(x, y)`` still work.
        labels: Array-like of integer class ids of any shape (e.g. ``(N, 1)`` or
            ``(N,)``, NumPy array or plain list). It is flattened before use.
        negative_fraction: Probability in ``[0, 1]`` that a question asks about a
            *different* class than the true one (answer ``0``). The default
            ``0.0`` reproduces the original behaviour: every question matches
            its image and every answer is ``1``. Keep the default when the
            questions are reused as matching captions for the images.
        seed: Seed for the random generator used to draw negatives; only
            relevant when ``negative_fraction > 0``.
        class_names: Sequence of class names indexed by label id. Defaults to
            the module-level ``classes`` list.

    Returns:
        ``(questions, answers)`` where ``questions`` is a list of strings of the
        form ``"is this a <name>?"`` and ``answers`` is an integer NumPy array
        of the same length (``1`` = yes, ``0`` = no).

    Raises:
        ValueError: If ``negative_fraction`` is outside ``[0, 1]``, or negatives
            are requested with fewer than two class names.
    """
    names = classes if class_names is None else class_names
    if not 0.0 <= negative_fraction <= 1.0:
        raise ValueError("negative_fraction must be in [0, 1]")
    use_negatives = negative_fraction > 0.0
    if use_negatives and len(names) < 2:
        raise ValueError("negative questions need at least two class names")

    # np.asarray(...).ravel() accepts lists as well as arrays of any shape.
    flat_labels = np.asarray(labels).ravel()
    rng = np.random.default_rng(seed) if use_negatives else None

    questions, answers = [], []
    for raw in flat_labels:
        true_id = int(raw)
        if use_negatives and rng.random() < negative_fraction:
            # An offset in [1, K-1] modulo K always lands on a different class.
            asked_id = (true_id + int(rng.integers(1, len(names)))) % len(names)
            answer = 0
        else:
            asked_id, answer = true_id, 1
        questions.append(f"is this a {names[asked_id]}?")
        answers.append(answer)

    # An explicit dtype keeps the result integer even when there are no labels.
    return questions, np.array(answers, dtype=int)

# Generate the question/answer pairs for the train split, then the test split.
(train_q, train_a), (test_q, test_a) = (
    make_questions(split_images, split_labels)
    for split_images, split_labels in ((x_train, y_train), (x_test, y_test))
)

## Tokenizer for Questions

In [None]:
# The vocabulary is learned from the training questions only.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_q)

# Every question is converted to word ids and padded/truncated to this length.
max_len = 6
train_seq, test_seq = (
    tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(split_questions), maxlen=max_len
    )
    for split_questions in (train_q, test_q)
)

## Model 1: VQA Prototype

In [None]:
# --- Image branch ---
# ResNet50 is built with weights=None (randomly initialised, no pretrained
# download) and without its classification head; pooling="avg" requests a
# pooled feature output from the backbone instead of a spatial map.
img_in = layers.Input(shape=(32,32,3))
base_cnn = ResNet50(weights=None, include_top=False, input_shape=(32,32,3), pooling="avg")
img_feat = base_cnn(img_in)

# --- Text branch ---
# Question word ids (length max_len) -> 16-d embeddings -> 32-unit LSTM output.
# The embedding table has len(word_index) + 1 rows.
txt_in = layers.Input(shape=(max_len,))
emb = layers.Embedding(len(tokenizer.word_index)+1, 16)(txt_in)
txt_feat = layers.LSTM(32)(emb)

# --- Fusion and answer head ---
# Concatenate both feature vectors and classify into 2 classes with softmax.
fusion = layers.Concatenate()([img_feat, txt_feat])
out = layers.Dense(2, activation="softmax")(fusion)

# The model takes [image, question] as inputs. The sparse cross-entropy loss
# expects integer class ids as targets (not one-hot vectors).
vqa_model = models.Model([img_in, txt_in], out)
vqa_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
vqa_model.summary()

### Train VQA

In [None]:
# One epoch over the training split; the test split is used for validation.
hist = vqa_model.fit(
    x=[x_train, train_seq],
    y=train_a,
    batch_size=64,
    epochs=1,
    validation_data=([x_test, test_seq], test_a),
)

## Model 2: Cross-Modal Retrieval Prototype

In [None]:
def build_image_encoder():
    """Return a Keras model mapping 32x32x3 images to 128-d embeddings.

    A ResNet50 backbone (weights=None, no classification head,
    pooling="avg") is followed by a linear Dense projection to 128 units.
    """
    pixels = layers.Input(shape=(32, 32, 3))
    backbone = ResNet50(
        weights=None,
        include_top=False,
        input_shape=(32, 32, 3),
        pooling="avg",
    )
    pooled = backbone(pixels)
    # No activation: the projection is a plain linear map into embedding space.
    embedding = layers.Dense(128)(pooled)
    return models.Model(pixels, embedding)

def build_text_encoder():
    """Return a Keras model mapping padded word-id sequences to 128-d embeddings.

    Pipeline: Embedding (32-d) -> LSTM (64 units) -> linear Dense (128 units).
    The input length is the module-level ``max_len`` and the vocabulary size
    comes from the module-level ``tokenizer``.
    """
    vocab_rows = len(tokenizer.word_index) + 1
    token_ids = layers.Input(shape=(max_len,))
    stack = (
        layers.Embedding(vocab_rows, 32),
        layers.LSTM(64),
        layers.Dense(128),
    )
    hidden = token_ids
    for layer in stack:
        hidden = layer(hidden)
    return models.Model(token_ids, hidden)

# Build one encoder per modality.
img_enc, txt_enc = build_image_encoder(), build_text_encoder()

# simple forward pass example: embed the first 10 images and their questions
img_emb, txt_emb = img_enc(x_train[:10]), txt_enc(train_seq[:10])
print(f"Embeddings shape: {img_emb.shape} {txt_emb.shape}")

## Contrastive Loss (InfoNCE)

In [None]:
def contrastive_loss(img_emb, txt_emb, temperature=0.07):
    """Symmetric contrastive loss over a batch of paired embeddings.

    Both inputs are L2-normalised along the last axis, so their matrix product
    holds cosine similarities; entry ``[i, j]`` compares image ``i`` with text
    ``j``. Row ``i`` of each input is treated as a matching pair (the target
    for row ``i`` is index ``i``).

    Args:
        img_emb: Image embeddings, assumed to be 2-D ``(batch, dim)``.
        txt_emb: Text embeddings with the same shape as ``img_emb``.
        temperature: Divisor applied to the similarities before the softmax.

    Returns:
        Scalar tensor: the batch mean of the *sum* of the image->text and
        text->image cross-entropy terms (the two directions are not averaged).
    """
    unit_img = tf.math.l2_normalize(img_emb, axis=-1)
    unit_txt = tf.math.l2_normalize(txt_emb, axis=-1)
    similarity = tf.matmul(unit_img, unit_txt, transpose_b=True) / temperature
    targets = tf.range(tf.shape(similarity)[0])

    def _directional_xent(scores):
        # Per-row cross-entropy against the diagonal (matching-pair) targets.
        return tf.keras.losses.sparse_categorical_crossentropy(
            targets, scores, from_logits=True
        )

    image_to_text = _directional_xent(similarity)
    text_to_image = _directional_xent(tf.transpose(similarity))
    return tf.reduce_mean(image_to_text + text_to_image)

# demo loss on the embeddings from the forward-pass example
demo_loss = contrastive_loss(img_emb, txt_emb)
print("Loss demo:", demo_loss.numpy())