# Implementation: Cross Attention (Conceptual)

**Goal**: Mix vision and text by letting image features (queries) attend to text embeddings (keys and values).

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossAttention(nn.Module):
    """Single-head cross-attention in which image features attend to text.

    Queries are projected from the image features, while keys and values
    are projected from the text embeddings. Each image position therefore
    receives a weighted mix of the text value vectors.

    Args:
        d_model: Size of the last dimension of both inputs and of the output.
    """

    def __init__(self, d_model):
        super().__init__()
        self.query = nn.Linear(d_model, d_model)  # For Image Features
        self.key = nn.Linear(d_model, d_model)    # For Text Embeddings
        self.value = nn.Linear(d_model, d_model)  # For Text Embeddings
        # Scaled dot-product attention divides the logits by sqrt(d_k).
        # Without it the dot products grow with d_model and the softmax
        # saturates towards a one-hot distribution with tiny gradients.
        self.scale = d_model ** -0.5

    def forward(self, img_features, text_embeddings):
        """Return image features enriched with text information.

        Args:
            img_features: Tensor of shape [Batch, Pixels, D].
            text_embeddings: Tensor of shape [Batch, Tokens, D].

        Returns:
            Tensor of shape [Batch, Pixels, D].
        """
        Q = self.query(img_features)
        K = self.key(text_embeddings)
        V = self.value(text_embeddings)

        # Attention = Softmax(Q @ K.T / sqrt(D)) @ V
        # scores: [Batch, Pixels, Tokens]
        scores = torch.bmm(Q, K.transpose(1, 2)) * self.scale
        # Normalise over the token axis so each pixel's weights sum to 1.
        attn_map = F.softmax(scores, dim=-1)

        output = torch.bmm(attn_map, V)
        return output

# 1. Mock Data
# A single sample: 10 image positions and 5 text tokens, 64 features each.
img_feat = torch.randn((1, 10, 64))
txt_emb = torch.randn((1, 5, 64))

# 2. Forward
# The layer width is taken from the feature dimension of the mock data (64).
layer = CrossAttention(d_model=img_feat.size(-1))
out = layer(img_features=img_feat, text_embeddings=txt_emb)

# The output keeps the image layout; only its content is mixed with text.
print(f"Output Shape: {out.shape} (Same as Image Features)")
print("But now the pixels contain information about the words.")

## Conclusion
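This mechanism lets Stable Diffusion tie the words of a prompt to specific image regions: positions whose queries produce a high attention score for a token pull in that token's information, so 'A dog' is painted in the region of the image that looks like a dog shape.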