In [6]:
import os
import numpy as np
import cv2
import pandas as pd
import glob
# Parameters
base_dir = '/content/ucf101'
selected_classes = ["Bowling", "Diving", "Biking", "Fencing", "Skiing"]
videos_per_class = 3  # Limit videos per class
frame_size = (64, 64)
color_mode = 'grayscale'
seq_length = 20
pred_length = 5
batch_size = 500

def preprocess_ucf101(base_dir, selected_classes, videos_per_class=3):
    data = []
    labels_list = []
    batch_count = 0

    for class_name in selected_classes:
        class_dir = os.path.join(base_dir, 'train', class_name)
        if not os.path.exists(class_dir):
            print(f"Directory not found: {class_dir}")
            continue

        # Get only specified number of videos
        videos = [v for v in os.listdir(class_dir) if v.endswith('.avi')][:videos_per_class]
        print(f"Processing {len(videos)} videos from {class_name}")

        for video in videos:
            video_path = os.path.join(class_dir, video)
            cap = cv2.VideoCapture(video_path)

            video_frames = []
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                frame = cv2.resize(frame, frame_size)
                if color_mode == 'grayscale':
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                frame = frame / 255.0
                video_frames.append(frame)

            cap.release()

            if len(video_frames) >= seq_length + pred_length:
                for i in range(len(video_frames) - seq_length - pred_length + 1):
                    input_sequence = np.array(video_frames[i:i+seq_length])
                    target_sequence = np.array(video_frames[i+seq_length:i+seq_length+pred_length])

                    data.append(input_sequence)
                    labels_list.append(target_sequence)

                    if len(data) >= batch_size:
                        np.save(f"data_batch_{batch_count}.npy", np.array(data))
                        np.save(f"labels_batch_{batch_count}.npy", np.array(labels_list))
                        print(f"Saved batch {batch_count}")
                        batch_count += 1
                        data = []
                        labels_list = []

    # Save remaining data
    if len(data) > 0:
        np.save(f"data_batch_{batch_count}.npy", np.array(data))
        np.save(f"labels_batch_{batch_count}.npy", np.array(labels_list))
        print(f"Saved final batch {batch_count}")

if __name__ == "__main__":
    # Download dataset using kaggle API
    if not os.path.exists('/content/ucf101'):
        from google.colab import files
        uploaded = files.upload()  # Upload kaggle.json

        !mkdir -p ~/.kaggle
        !cp kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
        !kaggle datasets download -d matthewjansen/ucf101-action-recognition
        !unzip ucf101-action-recognition.zip -d /content/ucf101

    # Run preprocessing
    preprocess_ucf101(base_dir, selected_classes, videos_per_class)

    # Verify files
    data_files = sorted(glob.glob('data_batch_*.npy'))
    print(f"\nGenerated {len(data_files)} data batch files")

    if data_files:
        data = np.load(data_files[0])
        labels = np.load(data_files[0].replace('data', 'labels'))
        print(f'Sample batch shapes - Data: {data.shape}, Labels: {labels.shape}')

Processing 3 videos from Bowling
Processing 3 videos from Diving
Saved batch 0
Processing 3 videos from Biking
Saved batch 1
Saved batch 2
Processing 3 videos from Fencing
Processing 3 videos from Skiing
Saved batch 3
Saved final batch 4

Generated 28 data batch files
Sample batch shapes - Data: (500, 20, 64, 64), Labels: (500, 5, 64, 64)


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import glob
import os

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class VideoTransformer(nn.Module):
    def __init__(self, frame_size=64, patch_size=8, num_frames=10, d_model=256):
        super().__init__()
        self.patch_size = patch_size
        self.num_patches = (frame_size // patch_size) ** 2
        self.patch_dim = patch_size * patch_size
        self.d_model = d_model

        self.patch_embed = nn.Linear(self.patch_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=8,
            dim_feedforward=1024,
            dropout=0.1,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=8)

        self.decoder = nn.Sequential(
            nn.Linear(d_model, 512),
            nn.ReLU(),
            nn.Linear(512, self.patch_dim),
            nn.Sigmoid()
        )

    def to_patches(self, x):
        B, T, C, H, W = x.shape
        x = x.view(B, T, C, H//self.patch_size, self.patch_size, W//self.patch_size, self.patch_size)
        x = x.permute(0, 1, 3, 5, 2, 4, 6).contiguous()
        x = x.view(B, T, self.num_patches, -1)
        return x

    def from_patches(self, x, H, W):
        B, T, L, D = x.shape
        h = H // self.patch_size
        w = W // self.patch_size
        x = x.view(B, T, h, w, self.patch_size, self.patch_size)
        x = x.permute(0, 1, 4, 2, 5, 3).contiguous()
        x = x.view(B, T, H, W)
        return x.unsqueeze(2)

    def forward(self, x, future_frames=5):
        B, T, C, H, W = x.shape
        x = self.to_patches(x)
        x = self.patch_embed(x)
        x = x.view(B * T, self.num_patches, self.d_model)
        x = self.pos_encoder(x)
        x = x.view(B, T * self.num_patches, self.d_model)

        memory = self.transformer_encoder(x)

        outputs = []
        for _ in range(future_frames):
            next_frame = self.decoder(memory[:, -self.num_patches:])
            next_frame = next_frame.view(B, 1, self.num_patches, self.patch_dim)
            outputs.append(self.from_patches(next_frame, H, W))

            next_embedded = self.patch_embed(next_frame)
            next_embedded = next_embedded.view(B, self.num_patches, self.d_model)
            next_embedded = self.pos_encoder(next_embedded)
            memory = torch.cat([memory[:, self.num_patches:], next_embedded], dim=1)

        return torch.cat(outputs, dim=1)

class VideoDataset(Dataset):
    def __init__(self, data_files, device):
        self.sequences = []
        self.targets = []
        self.device = device

        print(f"Loading {len(data_files)} data files...")
        for file in data_files:
            data = np.load(file)
            label_file = file.replace('data', 'labels')
            labels = np.load(label_file)

            data = data[:, :10]
            labels = labels[:, :5]

            if len(data.shape) == 4:
                data = data[:, :, None, :, :]
                labels = labels[:, :, None, :, :]

            self.sequences.append(torch.FloatTensor(data).to(device))
            self.targets.append(torch.FloatTensor(labels).to(device))

        self.sequences = torch.cat(self.sequences, dim=0)
        self.targets = torch.cat(self.targets, dim=0)

        print(f"Dataset loaded. Total sequences: {len(self.sequences)}")
        print(f"Sequence shape: {self.sequences.shape}")
        print(f"Target shape: {self.targets.shape}")

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]

def train_model(model, train_loader, val_loader, device, num_epochs=15):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

    best_val_loss = float('inf')
    train_losses = []
    val_losses = []

    print("Starting training...")

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        batch_count = 0

        for batch_idx, (inputs, targets) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            epoch_loss += loss.item()
            batch_count += 1

            if batch_idx % 10 == 0:
                print(f'Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.6f}')

        avg_train_loss = epoch_loss / batch_count
        train_losses.append(avg_train_loss)

        # Validation
        model.eval()
        val_loss = 0
        val_batch_count = 0

        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                val_loss += criterion(outputs, targets).item()
                val_batch_count += 1

        avg_val_loss = val_loss / val_batch_count
        val_losses.append(avg_val_loss)

        scheduler.step(avg_val_loss)

        print(f'Epoch {epoch}: Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}')

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': best_val_loss,
            }, 'best_model.pth')

        # Regular checkpoint
        if epoch % 5 == 0:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_losses': train_losses,
                'val_losses': val_losses,
            }, f'transformer_checkpoint_epoch_{epoch}.pth')

    return train_losses, val_losses

# Main execution
if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    data_files = sorted(glob.glob('data_batch_*.npy'))
    train_files, val_files = train_test_split(data_files, test_size=0.2, random_state=42)

    train_dataset = VideoDataset(train_files, device)
    val_dataset = VideoDataset(val_files, device)

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

    model = VideoTransformer().to(device)
    train_losses, val_losses = train_model(model, train_loader, val_loader, device)

    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss (MSE)')
    plt.title('Training and Validation Losses')
    plt.legend()
    plt.savefig('transformer_training_metrics.png')
    plt.close()

Using device: cuda
Loading 22 data files...
Dataset loaded. Total sequences: 10197
Sequence shape: torch.Size([10197, 10, 1, 64, 64])
Target shape: torch.Size([10197, 5, 1, 64, 64])
Loading 6 data files...
Dataset loaded. Total sequences: 3000
Sequence shape: torch.Size([3000, 10, 1, 64, 64])
Target shape: torch.Size([3000, 5, 1, 64, 64])
Starting training...
Epoch 0, Batch 0, Loss: 0.076774
Epoch 0, Batch 10, Loss: 0.095175
Epoch 0, Batch 20, Loss: 0.090566
Epoch 0, Batch 30, Loss: 0.091743
Epoch 0, Batch 40, Loss: 0.065457
Epoch 0, Batch 50, Loss: 0.053068
Epoch 0, Batch 60, Loss: 0.040627
Epoch 0, Batch 70, Loss: 0.053482
Epoch 0, Batch 80, Loss: 0.061655
Epoch 0, Batch 90, Loss: 0.043614
Epoch 0, Batch 100, Loss: 0.099519
Epoch 0, Batch 110, Loss: 0.063384
Epoch 0, Batch 120, Loss: 0.045464
Epoch 0, Batch 130, Loss: 0.062861
Epoch 0, Batch 140, Loss: 0.057597
Epoch 0, Batch 150, Loss: 0.067772
Epoch 0, Batch 160, Loss: 0.087222
Epoch 0, Batch 170, Loss: 0.062527
Epoch 0, Batch 180,

In [13]:
import torch
import cv2
import numpy as np
from moviepy.editor import ImageSequenceClip
import glob

def generate_prediction_video(model, input_sequence, output_path='predicted_video.mp4', fps=30):
    model.eval()
    device = next(model.parameters()).device

    # Ensure input has correct shape (B,T,C,H,W)
    if len(input_sequence.shape) == 4:
        input_sequence = input_sequence.unsqueeze(2)  # Add channel dimension
    if len(input_sequence.shape) == 3:
        input_sequence = input_sequence.unsqueeze(0).unsqueeze(2)  # Add batch and channel
    input_sequence = input_sequence.to(device)

    # Generate predictions
    with torch.no_grad():
        predicted_frames = model(input_sequence)

    # Convert to numpy
    input_frames = (input_sequence[0, :, 0].cpu().numpy() * 255).astype(np.uint8)
    pred_frames = (predicted_frames[0, :, 0].cpu().numpy() * 255).astype(np.uint8)
    all_frames = np.concatenate([input_frames, pred_frames], axis=0)

    # Save video
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_path, fourcc, fps, (64, 64))

    for frame in all_frames:
        frame_bgr = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
        video.write(frame_bgr)
    video.release()

def test_video_generation():
    # Load model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = VideoTransformer().to(device)
    checkpoint = torch.load('best_model.pth', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    # Load test sequence
    data_files = sorted(glob.glob('data_batch_*.npy'))
    sample_data = np.load(data_files[0])
    input_seq = torch.FloatTensor(sample_data[0:1, :10])

    # Generate video
    generate_prediction_video(model, input_seq, 'prediction.mp4')
    print("Video generated successfully!")

if __name__ == "__main__":
    test_video_generation()

  checkpoint = torch.load('best_model.pth', map_location=device)



Video generated successfully!
