In [1]:
import tensorflow as tf
import numpy as np
import cv2
from transformers import ViTFeatureExtractor, TFAutoModel

## Avoid any OOM error for future

In [2]:
# Avoid OOM errors by setting GPU Memory Consumption Growth
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
print(gpus)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# Load pre-trained Vision Transformer and feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
vit_model = TFAutoModel.from_pretrained('google/vit-base-patch16-224')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFViTModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing TFViTModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFViTModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFViTModel were not initialized from the PyTorch model and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def extract_features(video_path):
    cap = cv2.VideoCapture(video_path)
    features = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, (224, 224))
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        inputs = feature_extractor(images=frame, return_tensors="tf")
        outputs = vit_model(inputs['pixel_values'])
        # Remove unnecessary dimensions
        features.append(outputs.last_hidden_state.numpy().squeeze())
    cap.release()
    return np.array(features)

In [5]:
video_features = extract_features('dataset/parkour.mp4')

In [7]:
from tensorflow.keras.layers import Input, GlobalAveragePooling1D, Dense, LayerNormalization, MultiHeadAttention, Dropout
from tensorflow.keras.models import Model

# class TransformerBlock(tf.keras.layers.Layer):
#     def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
#         super(TransformerBlock, self).__init__()
#         self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
#         self.ffn = tf.keras.Sequential(
#             [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
#         )
#         self.layernorm1 = LayerNormalization(epsilon=1e-6)
#         self.layernorm2 = LayerNormalization(epsilon=1e-6)
#         self.dropout1 = Dropout(rate)
#         self.dropout2 = Dropout(rate)

#     def call(self, inputs, training):
#         attn_output = self.att(inputs, inputs)
#         attn_output = self.dropout1(attn_output, training=training)
#         out1 = self.layernorm1(inputs + attn_output)
#         ffn_output = self.ffn(out1)
#         ffn_output = self.dropout2(ffn_output, training=training)
#         return self.layernorm2(out1 + ffn_output)

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"), 
            Dense(embed_dim), 
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

In [8]:
# def build_transformer_model(input_shape, embed_dim, num_heads, ff_dim):
#     inputs = Input(shape=input_shape)
#     transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
#     x = transformer_block(inputs)
#     x = GlobalAveragePooling1D()(x)
#     outputs = Dense(1, activation='sigmoid')(x)
#     return Model(inputs, outputs)

def build_transformer_model(input_shape, embed_dim, num_heads, ff_dim, num_layers):
    inputs = Input(shape=input_shape)
    x = inputs
    for _ in range(num_layers):
        x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
    x = GlobalAveragePooling1D()(x)
    outputs = Dense(1, activation='sigmoid')(x)
    return Model(inputs, outputs)

In [9]:
input_shape = video_features.shape[1:]
embed_dim = 768  # Embedding dimension for ViT
num_heads = 8
ff_dim = 2048  # Feed forward dimension

In [10]:
video_features.shape

(5631, 197, 768)

In [11]:
input_shape

(197, 768)

In [12]:
# transformer_model = build_transformer_model(input_shape, embed_dim, num_heads, ff_dim)
# transformer_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

num_layers = 2  # Increase the number of transformer layers
transformer_model = build_transformer_model(input_shape, embed_dim, num_heads, ff_dim, num_layers)
transformer_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
transformer_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 197, 768)]        0         
                                                                 
 transformer_block (Transfor  (None, 197, 768)         22045184  
 merBlock)                                                       
                                                                 
 transformer_block_1 (Transf  (None, 197, 768)         22045184  
 ormerBlock)                                                     
                                                                 
 global_average_pooling1d (G  (None, 768)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_4 (Dense)             (None, 1)                 769       
                                                             

## Generating Random Labels

In [14]:
def generate_continuous_labels(num_frames, segment_duration_sec, fps):
    segment_length = int(segment_duration_sec * fps)  # Number of frames in each segment
    num_segments = num_frames // segment_length
    labels = np.zeros(num_frames, dtype=int)
    
    for _ in range(num_segments):
        start_frame = np.random.randint(0, num_frames - segment_length)
        labels[start_frame:start_frame + segment_length] = 1
    
    return labels

In [15]:
num_frames = video_features.shape[0]
segment_duration_sec = 5  # Length of each segment in seconds
fps = 30  # Frames per second (adjust based on your video)

labels = generate_continuous_labels(num_frames, segment_duration_sec, fps)

In [30]:
# for i in labels:
#     print(i)

In [31]:
# # Assume you have a total number of frames
# num_frames = video_features.shape[0]

# # Generate random binary labels for each frame
# labels = np.random.randint(2, size=num_frames)

In [18]:
# Get unique elements, indices, and counts
unique_elements, indices, counts = np.unique(labels, return_index=True, return_counts=True)

print("Unique elements in the array:", unique_elements)
# print("Indices of unique elements in the original array:", indices)
print("Counts of each unique element:", counts)

Unique elements in the array: [0 1]
Counts of each unique element: [476 756]


## Train the Transformer

In [19]:
transformer_model.fit(video_features, labels, epochs=10, batch_size=4)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2498da30100>

In [16]:
def predict_in_batches(model, data, batch_size):
    predictions = []
    num_batches = int(np.ceil(data.shape[0] / batch_size))
    for i in range(num_batches):
        batch_data = data[i * batch_size: (i + 1) * batch_size]
        batch_predictions = model.predict(batch_data)
        predictions.append(batch_predictions)
    return np.concatenate(predictions, axis=0)

In [22]:
batch_size = 4  # Adjust based on your memory capacity
predictions = predict_in_batches(transformer_model, video_features, batch_size)
summary_frames = np.where(predictions > 0.5)[0]  # Threshold to select important frames



## Generate Summary

In [23]:
len(summary_frames)

798

In [24]:
def generate_summary(video_path, summary_frames):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_list = []
    for frame_no in summary_frames:
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
        ret, frame = cap.read()
        if ret:
            frame_list.append(frame)
    cap.release()
    return frame_list

In [25]:
summary_frames = generate_summary('ok.mp4', summary_frames)

In [33]:
len(summary_frames)

798

In [27]:
def save_summary_video(frames, output_path, fps):
    height, width, _ = frames[0].shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # or cv2.VideoWriter_fourcc(*'H264')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    for frame in frames:
        out.write(frame)
    out.release()

In [28]:
# Save the summary video in MP4 format
fps = 60  # Adjust based on your video's FPS
save_summary_video(summary_frames, 'final.mp4', fps)

### Get Video FPS

In [59]:
from moviepy.editor import VideoFileClip
def get_fps(video_path):
    video_clip = VideoFileClip(video_path)
    fps = video_clip.fps
    video_clip.close()
    return fps

fps = get_fps('ok.mp4')
fps

60.0

# TRAINING WITHOUT LABELS

In [14]:
video_features = (video_features - np.mean(video_features, axis=0)) / np.std(video_features, axis=0)

In [15]:
def diversity_reward(selected_frames, all_frames):
    # Calculate pairwise distances between selected frames
    distances = np.linalg.norm(selected_frames[:, np.newaxis] - selected_frames[np.newaxis, :], axis=-1)
    diversity_score = np.sum(distances) / 2  # Summing over upper triangle
    return diversity_score

def representativeness_reward(selected_frames, all_frames):
    # Calculate distances between selected frames and all frames
    distances = np.linalg.norm(selected_frames[:, np.newaxis] - all_frames[np.newaxis, :], axis=-1)
    representativeness_score = np.mean(np.min(distances, axis=0))  # Mean minimum distance
    return representativeness_score

def compute_reward(selected_frames, all_frames, alpha=0.5):
    if selected_frames.shape[0] == 0:
        return 0.0  # Return a default reward if no frames are selected

    diversity = diversity_reward(selected_frames, all_frames)
    representativeness = representativeness_reward(selected_frames, all_frames)
    reward = alpha * diversity + (1 - alpha) * representativeness
    # return reward
    # Normalize reward to a reasonable scale
    normalized_reward = (reward - np.mean(reward)) / (np.std(reward) + 1e-8)
    return normalized_reward

In [16]:
import tensorflow_probability as tfp

def reinforce_loss(logits, actions, rewards):
    neg_log_prob = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=actions)
    loss = tf.reduce_mean(neg_log_prob * rewards)
    return loss

def train_step(model, optimizer, features, batch_size):
    with tf.GradientTape() as tape:
        logits = model(features, training=True)
        probs = tf.nn.sigmoid(logits)
        actions = tf.cast(tf.random.uniform(tf.shape(probs)) < probs, tf.float32)
        
        selected_frames_indices = np.where(actions.numpy().flatten() > 0)[0]
        if selected_frames_indices.size == 0:
            reward = 0
            loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=actions))
        else:
            selected_frames = features[selected_frames_indices]
            reward = compute_reward(selected_frames, features)
            loss = reinforce_loss(logits, actions, reward)
    
    grads = tape.gradient(loss, model.trainable_variables)
    clipped_grads = [tf.clip_by_value(grad, -1.0, 1.0) for grad in grads]  # Gradient clipping
    optimizer.apply_gradients(zip(clipped_grads, model.trainable_variables))
    return loss, reward

def train_model(model, video_features, epochs, batch_size):
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    num_batches = int(np.ceil(video_features.shape[0] / batch_size))
    
    for epoch in range(epochs):
        epoch_loss = 0
        epoch_reward = 0
        for batch_idx in range(num_batches):
            batch_features = video_features[batch_idx * batch_size: (batch_idx + 1) * batch_size]
            loss, reward = train_step(model, optimizer, batch_features, batch_size)
            epoch_loss += loss
            epoch_reward += reward
        
        print(f'Epoch {epoch + 1}, Loss: {epoch_loss / num_batches}, Reward: {epoch_reward / num_batches}')

In [17]:
# Train the model
batch_size = 4  # Adjust based on your memory capacity
epochs = 10
train_model(transformer_model, video_features, epochs, batch_size)

Epoch 1, Loss: 0.04304554685950279, Reward: 0.0
Epoch 2, Loss: 0.04578312486410141, Reward: 0.0
Epoch 3, Loss: 0.05316748842597008, Reward: 0.0
Epoch 4, Loss: 0.03987563028931618, Reward: 0.0
Epoch 5, Loss: 0.0511983260512352, Reward: 0.0
Epoch 6, Loss: 0.04873687028884888, Reward: 0.0
Epoch 7, Loss: 0.041844796389341354, Reward: 0.0
Epoch 8, Loss: 0.03593730553984642, Reward: 0.0
Epoch 9, Loss: 0.03938334062695503, Reward: 0.0
Epoch 10, Loss: 0.04332166910171509, Reward: 0.0


In [18]:
def predict_in_batches(model, data, batch_size):
    predictions = []
    num_batches = int(np.ceil(data.shape[0] / batch_size))
    for i in range(num_batches):
        batch_data = data[i * batch_size: (i + 1) * batch_size]
        batch_predictions = model.predict(batch_data)
        predictions.append(batch_predictions)
    return np.concatenate(predictions, axis=0)

In [19]:
def get_top_percentile_indices(arr, percentile=35):
    # Flatten the array if it is multi-dimensional
    arr_flat = arr.flatten()
    
    # Calculate the threshold value for the given percentile
    threshold = np.percentile(arr_flat, 100 - percentile)
    
    # Get the indices of the values that are greater than or equal to the threshold
    top_indices = np.where(arr_flat >= threshold)[0]
    
    return top_indices

In [20]:
# get_top_percentile_indices(predictions, percentile=35)

In [21]:
# Predict important frames in batches
predictions = predict_in_batches(transformer_model, video_features, batch_size)
# summary_frames_indices = np.where(predictions >= np.median(predictions))[0]  # Threshold to select important frames



In [22]:
predictions

array([[6.2074121e-16],
       [6.1728659e-16],
       [6.1445088e-16],
       ...,
       [6.2437001e-16],
       [6.2975926e-16],
       [6.3787584e-16]], dtype=float32)

In [23]:
def generate_summary(video_path, summary_frames):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_list = []
    for frame_no in summary_frames:
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
        ret, frame = cap.read()
        if ret:
            frame_list.append(frame)
    cap.release()
    return frame_list

In [24]:
summary_frames_indices = get_top_percentile_indices(predictions, percentile=35)

In [25]:
len(summary_frames_indices)

1971

In [27]:
# Generate summary frames
summary_frames = generate_summary('dataset/parkour.mp4', summary_frames_indices)

In [28]:
len(summary_frames)

1971

In [29]:
def save_summary_video(frames, output_path, fps):
    height, width, _ = frames[0].shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # or cv2.VideoWriter_fourcc(*'H264')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    for frame in frames:
        out.write(frame)
    out.release()

In [30]:
from moviepy.editor import VideoFileClip
def get_fps(video_path):
    video_clip = VideoFileClip(video_path)
    fps = video_clip.fps
    video_clip.close()
    return fps

In [31]:
# Save the summary video in MP4 format
fps = get_fps('dataset/parkour.mp4')  # Adjust based on your video's FPS
save_summary_video(summary_frames, 'summaries/parkour_summary.mp4', fps)

### Save The Model

In [32]:
# Save the model
transformer_model.save('model/temporal_transformer_big.h5')
print("Model saved successfully.")

Model saved successfully.
