In [13]:
# MSVD Video Captioning Pipeline (Starter Code)
# Phase 1: Load Annotations and Preprocess Captions

import pandas as pd
import os
import re
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load English annotations
ANNOTATION_PATH = "/kaggle/input/msvd-dataset-corpus/annotations.txt"
annotations = {}
with open(ANNOTATION_PATH, "r") as file:
    for line in file:
        video_id, caption = line.strip().split(' ', 1)
        annotations.setdefault(video_id, []).append(caption.lower())

print(f"Loaded captions for {len(annotations)} video segments")

# Add special tokens and clean captions
def preprocess_caption(caption):
    caption = re.sub(r"[^a-zA-Z0-9 ]", "", caption)
    return f"startseq {caption.strip()} endseq"

# Apply preprocessing
dataset = []
for video_id, caps in annotations.items():
    for c in caps:
        dataset.append((video_id, preprocess_caption(c)))

print(f"Total caption pairs: {len(dataset)}")

# Create tokenizer
all_captions = [cap for _, cap in dataset]
tokenizer = Tokenizer(oov_token='<unk>')
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")

# Create a dictionary of video_id -> processed captions
descriptions = {}
for vid, cap in dataset:
    descriptions.setdefault(vid, []).append(cap)

# Save tokenizer for later
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

Loaded captions for 1970 video segments
Total caption pairs: 80827
Vocabulary size: 12596


In [14]:
# Phase 2: Frame Extraction + CNN Feature Extraction
import cv2
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import img_to_array

# Load InceptionV3 model without top layer
base_model = InceptionV3(weights='imagenet')
cnn_model = Model(base_model.input, base_model.layers[-2].output)

VIDEO_DIR = "/kaggle/input/msvd-clips/YouTubeClips"

# Function to extract frames from a video
def extract_frames(video_path, num_frames=5):
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_ids = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    frames = []

    for fid in frame_ids:
        cap.set(cv2.CAP_PROP_POS_FRAMES, fid)
        success, frame = cap.read()
        if success:
            frame = cv2.resize(frame, (299, 299))
            frame = img_to_array(frame)
            frame = preprocess_input(frame)
            frames.append(frame)
    cap.release()
    return np.array(frames)

# Function to extract features for a video
def extract_video_features(video_id):
    video_path = os.path.join(VIDEO_DIR, video_id + ".avi")
    if not os.path.exists(video_path):
        return None
    frames = extract_frames(video_path)
    if len(frames) == 0:
        return None
    features = cnn_model.predict(frames, verbose=0)
    return np.mean(features, axis=0)

# Example: Extract and save features for first 10 videos
video_features = {}
for i, video_id in enumerate(list(descriptions.keys())[:10]):
    feats = extract_video_features(video_id)
    if feats is not None:
        video_features[video_id] = feats
    print(f"[{i+1}/10] Processed: {video_id}")

# Save extracted features
with open("video_features.pkl", "wb") as f:
    pickle.dump(video_features, f)

print("Saved video features for 10 videos.")

[1/10] Processed: -4wsuPCjDBc_5_15
[2/10] Processed: -7KMZQEsJW4_205_208
[3/10] Processed: -8y1Q0rA3n8_108_115
[4/10] Processed: -8y1Q0rA3n8_95_102
[5/10] Processed: -9CUm-2cui8_39_44
[6/10] Processed: -AwoiGR6c8M_10_14
[7/10] Processed: -Cv5LsqKUXc_17_25
[8/10] Processed: -Cv5LsqKUXc_71_76
[9/10] Processed: -DKuLXYoY3g_14_20
[10/10] Processed: -DRy7rBg0IQ_31_37
Saved video features for 10 videos.


In [15]:

# Phase 3: Caption Generation Model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

# Parameters
max_length = max(len(c.split()) for c in all_captions)
embedding_dim = 256

# Define the captioning model
def define_model(vocab_size, max_length):
    # Feature extractor (from video)
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(embedding_dim, activation='relu')(fe1)

    # Sequence processor (captions)
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    # se3 = LSTM(256)(se2)
    se3 = LSTM(256, return_sequences=False, recurrent_activation='sigmoid')(se2)
    # Decoder (combine both)
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Create the model
model = define_model(vocab_size, max_length)
model.summary()

In [24]:
# Phase 4: Training + Caption Generation + Evaluation 

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from nltk.translate.bleu_score import corpus_bleu

# Step 1: Create training sequences (with right-padding fix)
def create_sequences(tokenizer, max_length, descriptions, video_features, vocab_size):
    X1, X2, y = [], [], []
    for video_id, caps in descriptions.items():
        feature = video_features.get(video_id)
        if feature is None:
            continue
        for cap in caps:
            seq = tokenizer.texts_to_sequences([cap])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                # ✅ Fix: ensure right-padding for cuDNN compatibility
                in_seq = pad_sequences([in_seq], maxlen=max_length, padding='post')[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(feature)
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

# Prepare sequences
X1train, X2train, ytrain = create_sequences(tokenizer, max_length, descriptions, video_features, vocab_size)
print(f"Training data shapes -> Video: {X1train.shape}, Captions: {X2train.shape}, Labels: {ytrain.shape}")

# Step 2: Train the model
history = model.fit([X1train, X2train], ytrain, epochs=10, batch_size=64, verbose=1)

# Save the trained model
# model.save("video_caption_model.h5")

model.export("video_caption_model")

print("Model saved!")


# Step 3: Generate caption (inference, fixed version)
def generate_caption(model, tokenizer, video_feat, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # ✅ Fix: pad with right-padding (post), for cuDNN compliance
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        yhat = model.predict([np.array([video_feat]), sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text.replace('startseq', '').replace('endseq', '').strip()

# Example caption generation
test_video_id = list(video_features.keys())[0]
test_feat = video_features[test_video_id]
caption = generate_caption(model, tokenizer, test_feat, max_length)
print(f"Generated Caption for video {test_video_id}:\n{caption}")

# Step 4: BLEU score evaluation (safe against future key errors)
actual, predicted = [], []
for vid_id in list(descriptions.keys())[:10]:  # Evaluate on 10 samples
    y_pred = generate_caption(model, tokenizer, video_features[vid_id], max_length)
    references = [d.split() for d in descriptions[vid_id]]
    actual.append(references)
    predicted.append(y_pred.split())

# BLEU scores
print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))


Training data shapes -> Video: (2950, 2048), Captions: (2950, 47), Labels: (2950, 12596)
Epoch 1/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.7820
Epoch 2/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7856 
Epoch 3/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7227
Epoch 4/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7297
Epoch 5/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7059
Epoch 6/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.7120
Epoch 7/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.6856
Epoch 8/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.6376
Epoch 9/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.6565
Epoch 10/10
[1m47

In [26]:
# ✅ Phase 5 (Part 1): Sentiment Analysis on Generated Captions

# Step 1: Install Transformers library
!pip install transformers

# Step 2: Load Sentiment Model
from transformers import pipeline

# Load HuggingFace Sentiment Classifier
sentiment_analyzer = pipeline("sentiment-analysis")

# Step 3: Generate Caption (Ensure padding is right-padded)
def generate_caption(model, tokenizer, video_feat, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        yhat = model.predict([np.array([video_feat]), sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text.replace('startseq', '').replace('endseq', '').strip()

# Step 4: Analyze Sentiment Function
def analyze_caption_sentiment(caption):
    result = sentiment_analyzer(caption)[0]
    label = result['label']
    score = round(result['score'], 3)
    return label, score

# Step 5: Run on Example Video
test_video_id = list(video_features.keys())[0]
test_feat = video_features[test_video_id]

caption = generate_caption(model, tokenizer, test_feat, max_length)
label, score = analyze_caption_sentiment(caption)

print(f"Generated Caption: {caption}")
print(f"Sentiment: {label} (Confidence: {score})")

# Step 6: Batch Process Multiple Videos (Optional)
for vid_id in list(video_features.keys())[:10]:
    test_feat = video_features[vid_id]
    caption = generate_caption(model, tokenizer, test_feat, max_length)
    label, score = analyze_caption_sentiment(caption)
    print(f"Video: {vid_id}")
    print(f"  Caption: {caption}")
    print(f"  Sentiment: {label} (Confidence: {score})\n")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


Generated Caption: a squirrel is eating a peanut
Sentiment: POSITIVE (Confidence: 0.582)
Video: -4wsuPCjDBc_5_15
  Caption: a squirrel is eating a peanut
  Sentiment: POSITIVE (Confidence: 0.582)

Video: -7KMZQEsJW4_205_208
  Caption: a man is holding dead sunflowers
  Sentiment: NEGATIVE (Confidence: 0.974)

Video: -8y1Q0rA3n8_108_115
  Caption: a man is cutting a bottle of water with a sword
  Sentiment: NEGATIVE (Confidence: 0.875)

Video: -8y1Q0rA3n8_95_102
  Caption: a man is stabbing a cardboard cutout with a sword
  Sentiment: NEGATIVE (Confidence: 0.998)

Video: -9CUm-2cui8_39_44
  Caption: a woman is boiling finger in a pot
  Sentiment: NEGATIVE (Confidence: 0.952)

Video: -AwoiGR6c8M_10_14
  Caption: a boy is playing a piano
  Sentiment: POSITIVE (Confidence: 0.994)

Video: -Cv5LsqKUXc_17_25
  Caption: a woman is removing a stem of strawberry
  Sentiment: NEGATIVE (Confidence: 0.966)

Video: -Cv5LsqKUXc_71_76
  Caption: a woman is filtering some food powder
  Sentiment: NEGAT

In [27]:
# ✅ Streamlit App for Video Caption + Sentiment (Fixed for .keras models)

import streamlit as st
import numpy as np
import cv2
import tempfile
from transformers import pipeline
import tensorflow as tf
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ✅ Load tokenizer and model (.keras)
tokenizer = pickle.load(open("tokenizer.pkl", "rb"))
model = tf.keras.models.load_model("video_caption_model.keras")

# Load HuggingFace sentiment pipeline
sentiment_analyzer = pipeline("sentiment-analysis")

# ✅ Caption generation function (fixed)
def generate_caption(model, tokenizer, video_feat, max_length=47):  # 47 is your dataset max_length
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        yhat = model.predict([np.array([video_feat]), sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text.replace('startseq', '').replace('endseq', '').strip()

# ✅ Streamlit UI
st.title("🎥 Video Caption + Sentiment Analyzer")

uploaded_file = st.file_uploader("Upload a Video (.avi)", type=["avi"])

if uploaded_file is not None:
    # Save to temp file
    tfile = tempfile.NamedTemporaryFile(delete=False)
    tfile.write(uploaded_file.read())
    video_path = tfile.name

    # Extract frames (5 frames)
    cap = cv2.VideoCapture(video_path)
    frames = []
    for i in range(5):
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, (299, 299))
        frame = tf.keras.applications.inception_v3.preprocess_input(frame)
        frames.append(frame)
    cap.release()
    frames = np.array(frames)

    # Load InceptionV3 model
    base_model = tf.keras.applications.InceptionV3(weights='imagenet', include_top=False, pooling='avg')
    features = base_model.predict(frames)
    video_feat = np.mean(features, axis=0)

    # ✅ Generate caption
    caption = generate_caption(model, tokenizer, video_feat)

    # ✅ Analyze sentiment
    sentiment = sentiment_analyzer(caption)[0]

    st.success(f"📜 Caption: {caption}")
    st.info(f"💬 Sentiment: {sentiment['label']} (Confidence: {round(sentiment['score'], 2)})")


ValueError: File not found: filepath=video_caption_model.keras. Please ensure the file is an accessible `.keras` zip file.

In [31]:
# Phase 6: Sentiment and Topic Classification on Generated Captions
!pip install transformers torch --quiet

from transformers import pipeline
from collections import Counter

# Initialize sentiment and topic pipelines
sentiment_analyzer = pipeline('sentiment-analysis')
classifier = pipeline("zero-shot-classification", model="microsoft/deberta-large-mnli")
labels = [
  "education", "lifestyle", "health", "travel", "politics",
  "finance", "technology", "sports", "entertainment", "music",
  "nature", "food", "animals", "vehicles", "shopping", "weather",
  "fitness", "art", "culture", "history", "science", "family",
  "transportation", "war", "crime", "disaster", "fashion", 
  "space", "architecture", "religion", "military", "news", 
  "environment", "economy", "real estate", "gaming", "internet", "meditation"
]

# Analyze generated captions
caption_outputs = []
topics = []
sentiments = []

for vid in video_features:
    gen_caption = generate_caption(model, tokenizer, video_features[vid], max_length)
    caption_outputs.append((vid, gen_caption))

    # sentiment_result = sentiment_analyzer(gen_caption)[0]
    # sentiments.append(sentiment_result['label'])

    topic_result = classifier(gen_caption, labels)
    top_label = topic_result['labels'][0]
    topics.append(top_label)

# Display results
print("\nGenerated Captions with Sentiment and Topic:")
for i, (vid, cap) in enumerate(caption_outputs):
    print(f"Video: {vid}\nCaption: {cap}\nTopic: {topics[i]}\n")

# Summary statistics
topic_counts = Counter(topics)
top_3 = topic_counts.most_common(3)
print("Top 3 Topics:")
for topic, count in top_3:
    print(f"{topic}: {count}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0
Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Asking t


Generated Captions with Sentiment and Topic:
Video: -4wsuPCjDBc_5_15
Caption: a squirrel is eating a peanut
Topic: food

Video: -7KMZQEsJW4_205_208
Caption: a man is holding dead sunflowers
Topic: environment

Video: -8y1Q0rA3n8_108_115
Caption: a man is cutting a bottle of water with a sword
Topic: crime

Video: -8y1Q0rA3n8_95_102
Caption: a man is stabbing a cardboard cutout with a sword
Topic: crime

Video: -9CUm-2cui8_39_44
Caption: a woman is boiling finger in a pot
Topic: food

Video: -AwoiGR6c8M_10_14
Caption: a boy is playing a piano
Topic: music

Video: -Cv5LsqKUXc_17_25
Caption: a woman is removing a stem of strawberry
Topic: food

Video: -Cv5LsqKUXc_71_76
Caption: a woman is filtering some food powder
Topic: food

Video: -DKuLXYoY3g_14_20
Caption: a young boy is bouncing a basketball
Topic: sports

Video: -DRy7rBg0IQ_31_37
Caption: a woman is swimming underwater
Topic: environment

Top 3 Topics:
food: 4
environment: 2
crime: 2
