# Load Models

In [1]:
import torch
from transformers import AutoModelForSequenceClassification

# Load and initialize the model
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Define the path to save the model
model_path = 'hybrid_model.pth'

# Save the model's state dictionary
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

2024-08-05 00:23:54.286490: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model saved to hybrid_model.pth


# Process Video

In [5]:
import pandas as pd
import numpy as np

In [13]:
import cv2
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Constants
N_FRAMES = 3
HEIGHT = 112
WIDTH = 112
MAX_TEXT_FEATURES = 3

# Define the path to the saved model
model_path = './models/hybrid_model.pth'

# Initialize the model architecture
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Load the saved state dictionary into the model
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# Move the model to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # Set the model to evaluation mode

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_video(video_path, n_frames):
    """
    Preprocess video by extracting frames and resizing them to the required dimensions.

    Args:
        video_path (str): Path to the video file.
        n_frames (int): Number of frames to extract.

    Returns:
        np.ndarray: Array of processed video frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    while len(frames) < n_frames:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, (WIDTH, HEIGHT))
        frame = frame / 255.0  # Normalize pixel values to [0, 1]
        frames.append(frame)
    cap.release()

    # If not enough frames, pad with zeros
    if len(frames) < n_frames:
        frames.extend([np.zeros((HEIGHT, WIDTH, 3))] * (n_frames - len(frames)))

    return np.array(frames)

def analyze_video(video_path, model, text_features):
    """
    Analyze a video and return the predicted class and confidence.

    Args:
        video_path (str): Path to the video file.
        model (torch.nn.Module): Pretrained model for classification.
        text_features (str): Additional text features.

    Returns:
        int: Predicted class.
        float: Confidence score of the prediction.
    """
    # Preprocess video
    video_frames = preprocess_video(video_path, N_FRAMES)
    video_frames = np.expand_dims(video_frames, axis=0)  # Add batch dimension

    # Convert video frames to PyTorch tensor
    video_frames_tensor = torch.tensor(video_frames, dtype=torch.float32).permute(0, 4, 1, 2, 3).to(device)  # (N, C, T, H, W)

    # Tokenize text features
    encoded_text = tokenizer(text_features, padding='max_length', truncation=True, max_length=MAX_TEXT_FEATURES, return_tensors='pt')
    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)

    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()

    predicted_class = np.argmax(logits, axis=-1)[0]
    confidence = logits[0][predicted_class]

    return predicted_class, confidence

# Example usage
video_path = 'test.mp4'
text_features = "Example text feature"  # Example text input

predicted_class, confidence = analyze_video(video_path, model, text_features)
print(f"Predicted class: {predicted_class}, Confidence: {confidence:.2f}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted class: 0, Confidence: 0.09


In [13]:
import cv2
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Constants
N_FRAMES = 3
HEIGHT = 112
WIDTH = 112
MAX_TEXT_FEATURES = 3

# Define the path to the saved model
model_path = './models/bert_model.pth'

# Initialize the model architecture
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Load the saved state dictionary into the model
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# Move the model to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # Set the model to evaluation mode

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_video(video_path, n_frames):
    """
    Preprocess video by extracting frames and resizing them to the required dimensions.

    Args:
        video_path (str): Path to the video file.
        n_frames (int): Number of frames to extract.

    Returns:
        np.ndarray: Array of processed video frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    while len(frames) < n_frames:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, (WIDTH, HEIGHT))
        frame = frame / 255.0  # Normalize pixel values to [0, 1]
        frames.append(frame)
    cap.release()

    # If not enough frames, pad with zeros
    if len(frames) < n_frames:
        frames.extend([np.zeros((HEIGHT, WIDTH, 3))] * (n_frames - len(frames)))

    return np.array(frames)

def analyze_video(video_path, model, text_features):
    """
    Analyze a video and return the predicted class and confidence.

    Args:
        video_path (str): Path to the video file.
        model (torch.nn.Module): Pretrained model for classification.
        text_features (str): Additional text features.

    Returns:
        int: Predicted class.
        float: Confidence score of the prediction.
    """
    # Preprocess video
    video_frames = preprocess_video(video_path, N_FRAMES)
    video_frames = np.expand_dims(video_frames, axis=0)  # Add batch dimension

    # Convert video frames to PyTorch tensor
    video_frames_tensor = torch.tensor(video_frames, dtype=torch.float32).permute(0, 4, 1, 2, 3).to(device)  # (N, C, T, H, W)

    # Tokenize text features
    encoded_text = tokenizer(text_features, padding='max_length', truncation=True, max_length=MAX_TEXT_FEATURES, return_tensors='pt')
    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)

    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()

    predicted_class = np.argmax(logits, axis=-1)[0]
    confidence = logits[0][predicted_class]

    return predicted_class, confidence

# Example usage
video_path = 'test.mp4'
text_features = "Example text feature"  # Example text input

predicted_class, confidence = analyze_video(video_path, model, text_features)
print(f"Predicted class: {predicted_class}, Confidence: {confidence:.2f}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted class: 0, Confidence: 0.09
