In [1]:
import os

# Set dataset path (modify this according to your folder location)
dataset_path = r"C:\Users\hp\Downloads\archive"
# Check if dataset folders exist
print("Train folder exists:", os.path.exists(os.path.join(dataset_path, "train")))
print("Validation folder exists:", os.path.exists(os.path.join(dataset_path, "validation")))
print("Test folder exists:", os.path.exists(os.path.join(dataset_path, "testing")))


Train folder exists: True
Validation folder exists: True
Test folder exists: True


In [2]:
!pip install opencv-python





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Define paths for train, val, and test
train_path = os.path.join(dataset_path, "train")
val_path = os.path.join(dataset_path, "validation")
test_path = os.path.join(dataset_path, "testing")

# Get all video frame folders
train_videos = sorted(os.listdir(train_path))
val_videos = sorted(os.listdir(val_path))
test_videos = sorted(os.listdir(test_path))

print(f"Train videos: {len(train_videos)}")
print(f"Validation videos: {len(val_videos)}")
print(f"Test videos: {len(test_videos)}")
print("Sample Train Video Folder:", train_videos[:5])  # Show first 5 video IDs


Train videos: 1200
Validation videos: 100
Test videos: 670
Sample Train Video Folder: ['-4wsuPCjDBc_5_15', '-7KMZQEsJW4_205_208', '-8y1Q0rA3n8_108_115', '-8y1Q0rA3n8_95_102', '-9CUm-2cui8_39_44']


In [4]:
!pip install torch torchvision torchaudio




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [5]:
import os
import cv2
import numpy as np

def load_video_frames(video_folder, max_frames=16):
    """
    Loads a limited number of frames from a given video folder.
    Converts frames into a NumPy array.
    """
    frame_paths = sorted(os.listdir(video_folder))[:max_frames]  # Select first `max_frames` frames
    frames = []

    for frame_file in frame_paths:
        frame_path = os.path.join(video_folder, frame_file)
        frame = cv2.imread(frame_path)
        if frame is not None:
            frame = cv2.resize(frame, (224, 224))  # Resize for CNN input
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert to RGB
            frames.append(frame)

    return np.array(frames)  # Shape: (num_frames, 224, 224, 3)

# Example usage
video_frames = load_video_frames(r"C:\Users\hp\Downloads\archive\train\bQJQGoJF7_k_145_153")
print("Frames shape:", video_frames.shape)  # Expected Output: (16, 224, 224, 3)


Frames shape: (16, 224, 224, 3)


In [6]:
# !pip uninstall torch torchvision torchaudio -y
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


In [7]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# Load Pre-trained CNN Model (VGG16)
cnn_model = models.vgg16(pretrained=True)
cnn_model = torch.nn.Sequential(*list(cnn_model.children())[:-1])  # Remove last FC layer
cnn_model.eval()

# Define Image Transform
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def extract_cnn_features(video_folder, max_frames=16):
    frames = load_video_frames(video_folder, max_frames)
    frame_tensors = torch.stack([transform(frame) for frame in frames])  # Convert frames to tensors

    with torch.no_grad():
        features = cnn_model(frame_tensors)

    return features.squeeze().numpy()  # Shape: (16, 512, 7, 7)

# Example usage
video_features = extract_cnn_features(r"C:\Users\hp\Downloads\archive\train\bQJQGoJF7_k_145_153")
print("Feature shape:", video_features.shape)  # Expected Output: (16, 512, 7, 7)




Feature shape: (16, 512, 7, 7)


In [7]:
import pandas as pd

# Load the Excel file
file_path = r"C:\Users\hp\Downloads\archive\captions.csv"  # Update with the correct path if needed
df = pd.read_csv(file_path)

# Display the first few rows
print(df.head())



       Alphanumeric                                     Sentence
0  -4wsuPCjDBc_5_15  a squirrel is eating a peanut in it s shell
1  -4wsuPCjDBc_5_15                         a chipmunk is eating
2  -4wsuPCjDBc_5_15                a chipmunk is eating a peanut
3  -4wsuPCjDBc_5_15                   a chipmunk is eating a nut
4  -4wsuPCjDBc_5_15                   a squirrel is eating a nut


In [9]:
# Group captions by video_id
grouped_df = df.groupby("Alphanumeric")["Sentence"].apply(list).reset_index()

# Display first few rows
print(grouped_df.head())


          Alphanumeric                                           Sentence
0     -4wsuPCjDBc_5_15  [a squirrel is eating a peanut in it s shell, ...
1  -7KMZQEsJW4_205_208  [a man demonstrating how to clean a flower, a ...
2  -8y1Q0rA3n8_108_115  [a man slices through a two liter plastic bott...
3   -8y1Q0rA3n8_95_102  [a man with a sword runs ands stabs a cardboar...
4    -9CUm-2cui8_39_44  [a woman puts four okra in a pan of boiling wa...


In [11]:
import os

# Set dataset path
dataset_path = r"C:\Users\hp\Downloads\archive"

train_path = os.path.join(dataset_path, "train")

# List all video frame folders
train_videos = sorted(os.listdir(train_path))

# Check if all video IDs exist
missing_videos = [vid for vid in grouped_df["Alphanumeric"] if vid not in train_videos]
print(f"Missing Videos: {missing_videos}")


Missing Videos: ['bQJQGoJF7_k_162_169', 'bSIjZ75a50s_286_304', 'bSrpvMSuhPM_17_31', 'bXsKw3TOQXs_30_55', 'b_BuSVZwq6M_1_9', 'bb6V0Grtub4_174_185', 'bkazguPsusc_74_85', 'bmOy6p87TWI_26_35', 'bmvD4HlPFxg_20_27', 'bmxIurBrW5s_51_70', 'bnN_o0Hkn3M_73_80', 'bqMmyY1ImkI_0_14', 'bruzcOyIGeg_4_12', 'btuxO-C2IzE_64_72', 'btxCxlO1Euc_1_20', 'buJ5HDCinrM_150_166', 'bxDlC7YV5is_0_12', 'bxjFqtfJlMs_18_27', 'c2MwqFYVE7A_40_45', 'c2a0GcoJAjw_107_129', 'c2a0GcoJAjw_52_73', 'c51L6ZxZGjQ_137_154', 'c53HKs39i28_26_35', 'c75SIlAjfjg_6_14', 'c76tShLfQb0_74_81', 'cCmnN96zIeQ_14_24', 'cFzSEIGrEfA_0_24', 'cJOZp2ZftCw_1_12', 'cLNrpO6wBk4_37_47', 'cM55xNJ_pfU_1_20', 'cR2yi-JnGcQ_8_16', 'cSDkshD2ME0_11_14', 'cSDkshD2ME0_12_15', 'cUW_bXll6YM_390_395', 'cUW_bXll6YM_462_469', 'cWOPC2kt_IA_8_16', 'c_-eFL7Sfw4_19_35', 'c_XV7nPoRg8_2_12', 'ceOXCFUmxzA_100_110', 'clKtfGBVI1I_15_28', 'clpgffj3sUw_1_12', 'cmQ3SiIaWy4_4_20', 'cnsjm3fNEec_4_10', 'crfrKqFp0Zg_15_25', 'cs33MNhpRNw_16_21', 'cwkjJrGpoaU_30_41', 'd-2C_N5anww_1_

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

nltk.download("punkt")

# Tokenize captions
grouped_df["tokenized_captions"] = grouped_df["caption"].apply(lambda captions: [word_tokenize(c.lower()) for c in captions])

# Build Vocabulary
word_counts = Counter()
for caption_list in grouped_df["tokenized_captions"]:
    for caption in caption_list:
        word_counts.update(caption)

# Create a mapping of words to indices
word_to_index = {word: i+1 for i, (word, _) in enumerate(word_counts.most_common(5000))}
word_to_index["<start>"] = 5001
word_to_index["<end>"] = 5002
word_to_index["<pad>"] = 0

# Convert captions to sequences
def caption_to_sequence(caption):
    return [word_to_index.get(word, 0) for word in caption]

grouped_df["caption_sequences"] = grouped_df["tokenized_captions"].apply(lambda caption_list: [caption_to_sequence(c) for c in caption_list])

print(grouped_df.head())


In [None]:
import numpy as np
import pickle

# Load extracted CNN features
with open("train_features.pkl", "rb") as f:
    train_features = pickle.load(f)

# Ensure features match captions
for video_id in grouped_df["video_id"]:
    if video_id not in train_features:
        print(f"Missing features for {video_id}")


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class CaptioningModel(nn.Module):
    def __init__(self, feature_dim, hidden_dim, vocab_size, embedding_dim):
        super(CaptioningModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim + feature_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, video_features, captions):
        embeddings = self.embedding(captions)
        inputs = torch.cat((video_features.unsqueeze(1), embeddings), dim=1)
        lstm_out, _ = self.lstm(inputs)
        output = self.fc(lstm_out)
        return output

# Initialize Model
model = CaptioningModel(feature_dim=512*7*7, hidden_dim=512, vocab_size=len(word_to_index), embedding_dim=300)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop (simplified)
for epoch in range(5):
    optimizer.zero_grad()
    outputs = model(torch.randn(1, 512*7*7), torch.randint(0, 5000, (1, 10)))  
    loss = criterion(outputs.view(-1, len(word_to_index)), torch.randint(0, 5000, (10,)))  
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
