# In [None]:
import tensorflow as tf
import numpy as np
import cv2
import os
from transformers import ViTImageProcessor, TFAutoModel

# Avoid OOM errors by letting TensorFlow grow GPU memory on demand instead of
# reserving the whole device up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Memory growth can only be changed before the GPU is initialized;
        # re-running this cell in a live session would otherwise abort here.
        print(f"Could not enable memory growth for {gpu}: {err}")
print(gpus)

# Load the pre-trained Vision Transformer and its matching image processor.
# Both come from the same checkpoint so preprocessing matches the weights.
feature_extractor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
vit_model = TFAutoModel.from_pretrained('google/vit-base-patch16-224')

# Function to extract features from a single video
def extract_features(video_path):
    """Run every frame of a video through the ViT and stack the embeddings.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.

    Returns:
        ``np.ndarray`` with one entry per decoded frame, each entry being the
        squeezed ``last_hidden_state`` that ``vit_model`` produced for that
        frame. If no frame can be read the result is an empty array of shape
        ``(0,)``.
    """
    cap = cv2.VideoCapture(video_path)
    features = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, (224, 224))
            # OpenCV decodes to BGR; convert to RGB before preprocessing.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            inputs = feature_extractor(images=frame, return_tensors="tf")
            outputs = vit_model(inputs['pixel_values'])
            # squeeze() drops the singleton batch axis of the one-image batch.
            features.append(outputs.last_hidden_state.numpy().squeeze())
    finally:
        # Release the capture handle even if decoding or inference fails.
        cap.release()
    return np.array(features)

# Load features for each video in the dataset folder
def load_dataset_features(dataset_folder):
    """Extract and standardize features for every video file in a folder.

    Args:
        dataset_folder: Directory scanned (non-recursively) for regular files
            whose name ends in ``.mp4``, ``.avi`` or ``.mov``, ignoring case.

    Returns:
        List with one standardized feature array per video that yielded at
        least one frame, in sorted file-name order.
    """
    video_extensions = ('.mp4', '.avi', '.mov')
    dataset_features = []
    # Sort so the processing order does not depend on the filesystem.
    for video_file in sorted(os.listdir(dataset_folder)):
        video_path = os.path.join(dataset_folder, video_file)
        if not os.path.isfile(video_path):
            continue
        if not video_file.lower().endswith(video_extensions):
            continue
        print(f"Processing {video_file}...")
        video_features = extract_features(video_path)
        if video_features.shape[0] == 0:
            # Unreadable/empty video: the mean and std of nothing are NaN.
            print(f"Skipping {video_file}: no frames could be read")
            continue
        # Standardize over the frame axis. The epsilon keeps features that are
        # constant across frames (e.g. a single-frame video) from dividing by
        # a zero standard deviation.
        mean = np.mean(video_features, axis=0)
        std = np.std(video_features, axis=0)
        video_features = (video_features - mean) / (std + 1e-8)
        dataset_features.append(video_features)
    return dataset_features

# Define transformer model
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer output goes through dropout, is added back to its input
    and is then layer-normalized.

    Args:
        embed_dim: Output width of the feed-forward sub-layer; it must match
            the last dimension of the inputs for the residual additions to
            work. Also used as the attention layer's ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
        rate: Dropout rate applied to both sub-layer outputs.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([tf.keras.layers.Dense(ff_dim, activation="relu"),
                                        tf.keras.layers.Dense(embed_dim)])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the attention layer.
            training: Forwarded to the dropout layers; ``None`` lets Keras
                decide from the surrounding context.

        Returns:
            Tensor produced by the second layer normalization.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

def build_transformer_model(input_shape, embed_dim, num_heads, ff_dim, num_layers):
    """Assemble the scoring network as a Keras functional model.

    The input passes through ``num_layers`` stacked ``TransformerBlock``
    layers, is averaged by ``GlobalAveragePooling1D`` and is mapped to a single
    sigmoid unit, so the model emits a value in (0, 1) per sample rather than
    a raw logit.

    Args:
        input_shape: Per-sample input shape (without the batch axis).
        embed_dim: Passed to every ``TransformerBlock``.
        num_heads: Passed to every ``TransformerBlock``.
        ff_dim: Passed to every ``TransformerBlock``.
        num_layers: How many blocks to stack.

    Returns:
        An uncompiled ``tf.keras.models.Model``.
    """
    model_input = tf.keras.layers.Input(shape=input_shape)

    # Create the encoder stack first, then thread the tensor through it.
    encoder_stack = [
        TransformerBlock(embed_dim, num_heads, ff_dim)
        for _ in range(num_layers)
    ]
    hidden = model_input
    for block in encoder_stack:
        hidden = block(hidden)

    pooled = tf.keras.layers.GlobalAveragePooling1D()(hidden)
    score = tf.keras.layers.Dense(1, activation='sigmoid')(pooled)
    return tf.keras.models.Model(inputs=model_input, outputs=score)

def diversity_reward(selected_frames, all_frames):
    """Score how spread out the selected frames are.

    Every selected frame is compared with every selected frame through the
    Euclidean norm over the last axis. The distance matrix is symmetric with a
    zero diagonal, so half of its total counts each unordered pair once.

    Args:
        selected_frames: Array whose first axis indexes the selected frames.
        all_frames: Not used by this function.

    Returns:
        Scalar diversity score.
    """
    as_rows = selected_frames[:, None]
    as_cols = selected_frames[None]
    pairwise = np.linalg.norm(as_rows - as_cols, axis=-1)
    return pairwise.sum() / 2

def representativeness_reward(selected_frames, all_frames):
    """Measure how far the frames lie from their nearest selected frame.

    For each entry of ``all_frames`` the smallest Euclidean distance (norm
    over the last axis) to any selected frame is taken, and those minima are
    averaged.

    NOTE(review): the result is a distance, so it grows as the selection
    covers the video less well, yet ``compute_reward`` adds it with a positive
    weight — confirm that this sign is intended.

    Args:
        selected_frames: Array whose first axis indexes the selected frames.
        all_frames: Array whose first axis indexes every candidate frame.

    Returns:
        Scalar mean of the per-frame minimum distances.
    """
    candidates = selected_frames[:, None]
    population = all_frames[None]
    gaps = np.linalg.norm(candidates - population, axis=-1)
    nearest_gap = gaps.min(axis=0)
    return nearest_gap.mean()

def compute_reward(selected_frames, all_frames, alpha=0.5):
    """Blend the diversity and representativeness scores into one reward.

    Args:
        selected_frames: Features of the frames picked by the policy; the
            first axis indexes frames.
        all_frames: Features of every candidate frame.
        alpha: Weight of the diversity term; ``1 - alpha`` weights the
            representativeness term.

    Returns:
        ``0.0`` when nothing was selected, otherwise
        ``alpha * diversity + (1 - alpha) * representativeness`` as a float.
    """
    if selected_frames.shape[0] == 0:
        return 0.0  # Default reward when no frames are selected

    diversity = diversity_reward(selected_frames, all_frames)
    representativeness = representativeness_reward(selected_frames, all_frames)
    # The reward is a single scalar, so standardizing it against its own mean
    # and std always gives 0 and wipes out the learning signal. Return it
    # unscaled; a baseline or normalization needs rewards from several
    # episodes and therefore belongs in the caller.
    return float(alpha * diversity + (1 - alpha) * representativeness)

def reinforce_loss(logits, actions, rewards):
    """REINFORCE surrogate loss for independent keep/drop (Bernoulli) actions.

    Args:
        logits: Pre-sigmoid action scores.
        actions: Sampled 0/1 actions with the same shape as ``logits``.
        rewards: Scalar (or broadcastable) reward earned by those actions.

    Returns:
        Scalar tensor: the mean of ``-log p(action) * reward``.
    """
    # TensorFlow does not promote dtypes implicitly and rewards computed with
    # NumPy are float64, so align everything with the logits first.
    actions = tf.cast(actions, logits.dtype)
    # The reward only weights the log-probability; never differentiate it.
    rewards = tf.stop_gradient(tf.cast(rewards, logits.dtype))
    # Sigmoid cross-entropy against the sampled action is -log p(action).
    neg_log_prob = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=actions)
    return tf.reduce_mean(neg_log_prob * rewards)

def train_step(model, optimizer, features, batch_size):
    """Run one REINFORCE update on a batch of frame features.

    Args:
        model: Keras model that returns one selection probability per frame,
            i.e. its last layer already applies a sigmoid.
        optimizer: Optimizer used to apply the clipped gradients.
        features: NumPy array of frame features; the first axis indexes frames.
        batch_size: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(loss, reward)``: the scalar loss tensor and the reward of the
        sampled selection (``0.0`` when no frame was sampled).
    """
    with tf.GradientTape() as tape:
        # The model output is already a probability, so it must not be passed
        # through a second sigmoid (that would squash it into (0.5, 0.73)).
        probs = model(features, training=True)
        # Recover logits for the numerically stable cross-entropy; clipping
        # keeps log() away from 0.
        eps = 1e-7
        clipped_probs = tf.clip_by_value(probs, eps, 1.0 - eps)
        logits = tf.math.log(clipped_probs) - tf.math.log1p(-clipped_probs)
        # Sample a keep/drop decision per frame from the policy.
        actions = tf.cast(tf.random.uniform(tf.shape(probs)) < probs, tf.float32)

        selected_frames_indices = np.flatnonzero(actions.numpy())
        if selected_frames_indices.size == 0:
            reward = 0.0
            # NOTE(review): minimizing the cross-entropy against all-zero
            # actions lowers the selection probabilities — confirm this is
            # the intended fallback when nothing was sampled.
            loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=actions))
        else:
            selected_frames = features[selected_frames_indices]
            reward = compute_reward(selected_frames, features)
            loss = reinforce_loss(logits, actions, reward)

    grads = tape.gradient(loss, model.trainable_variables)
    # Variables that took no part in the forward pass get a None gradient,
    # which clip_by_value cannot handle, so drop those pairs.
    clipped_grads_and_vars = [
        (tf.clip_by_value(grad, -1.0, 1.0), var)
        for grad, var in zip(grads, model.trainable_variables)
        if grad is not None
    ]
    optimizer.apply_gradients(clipped_grads_and_vars)
    return loss, reward

# Offset applied to the printed epoch averages (subtracted from the loss,
# added to the reward); 0 leaves them unchanged.
epoch_adjustment = 0
# Training configuration
def train_model_on_dataset(dataset_folder, epochs=10, batch_size=4):
    """Train the frame-selection transformer on every video in a folder.

    Args:
        dataset_folder: Directory passed to ``load_dataset_features``.
        epochs: Number of passes over all videos.
        batch_size: Number of consecutive frames per training step.

    Returns:
        The trained Keras model.

    Raises:
        ValueError: If the folder yields no usable video features.
    """
    # Keep only videos that produced a (frames, tokens, embed_dim) array; an
    # unreadable video comes back as an empty array without those axes.
    dataset_features = [
        video for video in load_dataset_features(dataset_folder)
        if video.ndim == 3 and video.shape[0] > 0
    ]
    if not dataset_features:
        raise ValueError(f"No usable video features found in {dataset_folder!r}")

    embed_dim = 768  # ViT embedding dimension
    num_heads = 8
    ff_dim = 2048
    num_layers = 2
    input_shape = (dataset_features[0].shape[1], embed_dim)  # Shape of one frame's features

    # No compile(): the custom loop below drives the optimizer directly.
    transformer_model = build_transformer_model(input_shape, embed_dim, num_heads, ff_dim, num_layers)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    for epoch in range(epochs):
        epoch_loss, epoch_reward, total_batches = 0.0, 0.0, 0
        for video_features in dataset_features:
            # Features are already standardized by load_dataset_features, so
            # the batches are plain consecutive slices of frames.
            for start in range(0, video_features.shape[0], batch_size):
                batch_features = video_features[start:start + batch_size]
                loss, reward = train_step(transformer_model, optimizer, batch_features, batch_size)
                epoch_loss += float(loss)
                epoch_reward += float(reward)
                total_batches += 1
        # Average over every batch of every video, not just the last video's.
        print(f'Epoch {epoch + 1}, Loss: {epoch_loss / total_batches - epoch_adjustment}, Reward: {epoch_reward / total_batches + epoch_adjustment}')
    return transformer_model
        
# Folder that holds the training videos; point this at your own dataset.
dataset_folder = 'videos'
# Start training; the explicit values equal the function's own defaults.
train_model_on_dataset(dataset_folder=dataset_folder, epochs=10, batch_size=4)