In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LoRALayer(nn.Module):
    def __init__(self, in_features, out_features, rank=4):
        super().__init__()
        self.down = nn.Linear(in_features, rank, bias=False)
        self.up = nn.Linear(rank, out_features, bias=False)

    def forward(self, x):
        return self.up(self.down(x))

class ImageTextToVideoGenerator(nn.Module):
    def __init__(self, image_size=32, text_size=512, latent_dim=128, rank=4):
        super().__init__()
        
        self.image_size = image_size
        self.text_size = text_size
        self.latent_dim = latent_dim
        self.input_size = image_size * image_size * 3 + text_size

        # Two LSTMs for forward and backward motion
        self.lstm_next = nn.LSTM(input_size=self.input_size, hidden_size=self.latent_dim, batch_first=True)
        self.lstm_prev = nn.LSTM(input_size=self.input_size, hidden_size=self.latent_dim, batch_first=True)

        # LoRA layers for frame generation
        self.lora_fc1 = LoRALayer(self.latent_dim, 512, rank=rank)
        self.lora_fc2 = LoRALayer(512, 256, rank=rank)
        self.lora_fc3 = LoRALayer(256, 128, rank=rank)

        # Output layers for next/previous frames
        self.fc_out = nn.Linear(128, 3 * image_size * image_size)

    def forward(self, image_features, text_features, hidden_next=None, hidden_prev=None):
        batch_size = image_features.size(0)
        image_features = image_features.view(batch_size, -1)  
        text_features = text_features.view(batch_size, -1)  
        
        combined_features = torch.cat([image_features, text_features], dim=1).unsqueeze(1)
        noise = torch.randn_like(combined_features) * 0.1
        combined_features = combined_features + noise
        # Generate next frame
        lstm_out_next, hidden_next = self.lstm_next(combined_features, hidden_next)
        lstm_out_next = lstm_out_next.squeeze(1)
        x_next = F.relu(self.lora_fc1(lstm_out_next))
        x_next = F.relu(self.lora_fc2(x_next))
        x_next = F.relu(self.lora_fc3(x_next))
        output_frame_next = self.fc_out(x_next).view(batch_size, 3, self.image_size, self.image_size)

        # Generate previous frame
        lstm_out_prev, hidden_prev = self.lstm_prev(combined_features, hidden_prev)
        lstm_out_prev = lstm_out_prev.squeeze(1)
        x_prev = F.relu(self.lora_fc1(lstm_out_prev))
        x_prev = F.relu(self.lora_fc2(x_prev))
        x_prev = F.relu(self.lora_fc3(x_prev))
        output_frame_prev = self.fc_out(x_prev).view(batch_size, 3, self.image_size, self.image_size)

        return output_frame_prev, output_frame_next, hidden_prev, hidden_next


In [2]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import clip
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np

transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
])

# CIFAR-10 Dataset
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Step 2: Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Step 3: Random text feature generator
text_descriptions = [
    "A cat jumping over a fence.",
    "A dog playing with a ball in the park.",
    "A person sitting at a desk working on a laptop.",
    "A car driving down a road during sunset.",
    "A group of people walking down the street.",
    "A child playing with a toy in the living room.",
    "A beautiful mountain landscape during sunrise.",
    "A person cooking food in the kitchen.",
    "A group of birds flying in the sky."
]

def generate_random_text_feature(batch_size):
    random_descriptions = np.random.choice(text_descriptions, size=batch_size)
    text_input = clip.tokenize(random_descriptions).to(device)
    with torch.no_grad():
        text_features = clip_model.encode_text(text_input)
    return text_features, random_descriptions

In [7]:
def train(generator, dataloader, num_epochs=50, device="cuda"):
    generator.to(device)
    optimizer = optim.Adam(generator.parameters(), lr=0.0002)

    for epoch in range(num_epochs):
        for images, _ in dataloader:
            images = images.to(device)
            
            # Generate random text features for each batch
            text_features, _ = generate_random_text_feature(images.size(0))

            optimizer.zero_grad()

            # Initialize hidden states
            hidden_next = None
            hidden_prev = None

            # Forward pass through the generator
            prev_frame, next_frame, hidden_prev, hidden_next = generator(images, text_features, hidden_next, hidden_prev)
            recon_frame, _, _,_ = generator(prev_frame, text_features, None, hidden_prev)
            _,recon_frame2, _, _ = generator(next_frame, text_features, hidden_next, None)
            # Loss calculation using cycle consistency
            loss = F.mse_loss(recon_frame, images)  # Enforce cycle consistency
            loss += F.mse_loss(recon_frame2, images)
            # Backpropagation
            loss.backward()
            optimizer.step()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")
        if (epoch+1) % 10 == 0:
            torch.save(generator.state_dict(), f"image_text_to_video_epoch{epoch+1}.pth")
            print(f"Model saved! Epoch: {epoch+1}")
    # Save the trained model
    torch.save(generator.state_dict(), "image_text_to_video.pth")
    print("Model saved!")

In [None]:
generator = ImageTextToVideoGenerator()  # Use your model here
train(generator, train_dataloader, num_epochs=50, device=device)

Epoch [1/50], Loss: 0.06837911158800125
Epoch [2/50], Loss: 0.063211590051651
Epoch [3/50], Loss: 0.06870967894792557
Epoch [4/50], Loss: 0.06174229457974434
Epoch [5/50], Loss: 0.05803699791431427
Epoch [6/50], Loss: 0.07680699229240417
Epoch [7/50], Loss: 0.07444904744625092
Epoch [8/50], Loss: 0.06532055884599686
Epoch [9/50], Loss: 0.07597209513187408
Epoch [10/50], Loss: 0.05918559432029724
Model saved! Epoch: 10
Epoch [11/50], Loss: 0.06262199580669403
Epoch [12/50], Loss: 0.06586286425590515
Epoch [13/50], Loss: 0.051472101360559464
Epoch [14/50], Loss: 0.06625089049339294
Epoch [15/50], Loss: 0.060234081000089645
Epoch [16/50], Loss: 0.06675697863101959
Epoch [17/50], Loss: 0.06832742691040039
Epoch [18/50], Loss: 0.06786681711673737
Epoch [19/50], Loss: 0.068850576877594
Epoch [20/50], Loss: 0.07954797893762589
Model saved! Epoch: 20
Epoch [21/50], Loss: 0.06120537221431732
Epoch [22/50], Loss: 0.06305098533630371
Epoch [23/50], Loss: 0.0741385966539383
Epoch [24/50], Loss: 0.

In [20]:
import torch
import cv2
import numpy as np
from PIL import Image
import torchvision.transforms as transforms
import clip
# Load the trained model
device = "cuda" if torch.cuda.is_available() else "cpu"
generator = ImageTextToVideoGenerator()  # Your model class
generator.load_state_dict(torch.load("image_text_to_video_epoch20.pth"))
generator.to(device)

def load_and_preprocess_image(image_path):
    image = Image.open(image_path)
    preprocess_transform = transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
    ])
    image_tensor = preprocess_transform(image).unsqueeze(0)  # Add batch dimension
    return image_tensor.to(device)
# Function to generate a video from the model
def generate_video(generator, initial_image, text_embedding, num_frames=20, filename="outputs/output.avi"):
    generator.eval()  # Set model to evaluation mode
    initial_image = initial_image.to(device).unsqueeze(0)  # Add batch dimension
    text_embedding = text_embedding.to(device).unsqueeze(0)

    # Initialize hidden states for previous and next frames
    hidden_next = None
    hidden_prev = None
    
    frames = []

    # Start with the initial image and generate frames
    current_image = initial_image
    original_image = initial_image.squeeze(0).squeeze(0)  # Remove batch dimension
    original_image_np = original_image.permute(1, 2, 0).cpu().numpy()  # Shape (3, 32, 32) -> (32, 32, 3)
    original_image_np = (original_image_np * 255).astype(np.uint8)  # Convert to [0, 255] range
    frames.append(original_image_np)
    for _ in range(num_frames):
        # Generate previous and next frames from the current image and text embedding
        prev_frame, next_frame, hidden_prev, hidden_next = generator(current_image, text_embedding, hidden_next, hidden_prev)

        # Use the next frame as the new input for the next iteration
        current_image = next_frame
        
        # Convert the generated frame to a numpy array and append to the frames list
        frame_np = next_frame.squeeze(0).permute(1, 2, 0).detach().cpu().numpy()  # Shape (3, 32, 32) -> (32, 32, 3)
        frame_np = (frame_np * 255).astype(np.uint8)  # Convert to [0, 255] range
        frames.append(frame_np)

    # Save the frames as a video using OpenCV
    height, width, _ = frames[0].shape
    fourcc = cv2.VideoWriter_fourcc(*"XVID")  # Codec for video writing
    out = cv2.VideoWriter(filename, fourcc, 10, (width, height))  # 10 FPS

    for frame in frames:
        out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))  # Convert RGB to BGR

    out.release()  # Finalize the video file
    print(f"Video saved as {filename}!")

# Example to generate a video
sample_image_path = "images/test.jpg"  # Path to your image
sample_image = load_and_preprocess_image(sample_image_path)
sample_text, real_text = generate_random_text_feature(1)  # Generate random text feature
print(real_text)
generate_video(generator, sample_image, sample_text)


['A group of birds flying in the sky.']
Video saved as outputs/output.avi!


: 