
# **Download and extract the Kaggle dataset**.

In [None]:
# Install required libraries
!pip install kaggle opencv-python scikit-learn numpy scipy joblib



In [None]:
# Download the dataset archive (movies-fight-detection-dataset.zip) from Kaggle.
# When a more recent local copy already exists the CLI skips the download
# (pass --force to download again).
# NOTE(review): presumably requires Kaggle API credentials to be configured beforehand - confirm.
!kaggle datasets download -d naveenk903/movies-fight-detection-dataset

Dataset URL: https://www.kaggle.com/datasets/naveenk903/movies-fight-detection-dataset
License(s): unknown
movies-fight-detection-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# Unzip the downloaded dataset
!unzip -q movies-fight-detection-dataset.zip


# **Import the required libraries**

In [None]:
import shutil
import os
import random
import cv2
import numpy as np
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from joblib import dump, load


# **Move the dataset to the specified Google Drive folder.**

In [None]:
# Move the dataset to the desired folder on Google Drive.
# NOTE(review): assumes Google Drive is already mounted under /content/drive -
# no mount call is visible in this notebook, confirm before running.
source_folder = "/content/Peliculas"
destination_folder = "/content/drive/MyDrive/CV_LAB/Lab-10"

# Paths to the folders containing the data
fights_folder_path = os.path.join(destination_folder, 'fights')
nofights_folder_path = os.path.join(destination_folder, 'noFights')

# shutil.move(src, dst) places src *inside* dst when dst already exists and
# raises once src is gone, so a single move of the whole folder breaks on
# re-runs. Moving the entries one by one, and only while the class folders are
# not in place yet, keeps this cell idempotent.
dataset_in_place = os.path.isdir(fights_folder_path) and os.path.isdir(nofights_folder_path)
if not dataset_in_place:
    os.makedirs(destination_folder, exist_ok=True)
    for entry in os.listdir(source_folder):
        target_path = os.path.join(destination_folder, entry)
        if not os.path.exists(target_path):
            shutil.move(os.path.join(source_folder, entry), target_path)
    # Drop the source folder once everything has been moved out of it.
    if not os.listdir(source_folder):
        os.rmdir(source_folder)

# Fraction of data to be used for testing
test_size = 0.2

In [None]:
# List all files in the fights and nofights folders.
# os.listdir returns entries in arbitrary order, so sort them first to get a
# stable starting point for the shuffle.
fights_files = sorted(os.listdir(fights_folder_path))
nofights_files = sorted(os.listdir(nofights_folder_path))

# Shuffle with a dedicated, seeded generator so the train/test split is the
# same on every run (42 matches the random_state used for KMeans and the SVM).
split_rng = random.Random(42)
split_rng.shuffle(fights_files)
split_rng.shuffle(nofights_files)

# Calculate the number of files for testing
num_fights_test_files = int(len(fights_files) * test_size)
num_nofights_test_files = int(len(nofights_files) * test_size)


# **Split the dataset into training and testing sets.**

In [None]:
# The first `num_*_test_files` entries of each shuffled list are held out for
# testing; everything after them is used for training.
fights_test_files, fights_train_files = (
    fights_files[:num_fights_test_files],
    fights_files[num_fights_test_files:],
)
nofights_test_files, nofights_train_files = (
    nofights_files[:num_nofights_test_files],
    nofights_files[num_nofights_test_files:],
)

# Report the size of every partition
print(f"Fights Training files: {len(fights_train_files)}")
print(f"Fights Testing files: {len(fights_test_files)}")
print(f"NoFights Training files: {len(nofights_train_files)}")
print(f"NoFights Testing files: {len(nofights_test_files)}")

Fights Training files: 80
Fights Testing files: 20
NoFights Training files: 81
NoFights Testing files: 20


# **Define functions to extract SIFT features and frames from video files.**

In [None]:
# Function to extract SIFT features from a frame
def extract_sift_features(frame):
    """Run SIFT on a single frame and return its descriptors.

    Args:
        frame: Image passed to the SIFT detector (the callers in this notebook
            pass grayscale frames).

    Returns:
        The descriptors produced by ``detectAndCompute``; the keypoints are
        discarded. Callers in this notebook check the result for ``None``.
    """
    detector = cv2.SIFT_create()
    _, frame_descriptors = detector.detectAndCompute(frame, None)
    return frame_descriptors

# **Load and extract features from the dataset.**

In [None]:
# Function to extract frames from a video file
def extract_frames_from_video(video_path, interval=30):
    """Return every ``interval``-th frame of a video as a grayscale image.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        interval: Sampling step; frames with index 0, interval, 2*interval, ...
            are kept. Must be at least 1.

    Returns:
        List of grayscale frames. The list is empty when the capture cannot be
        opened or yields no frames.

    Raises:
        ValueError: If ``interval`` is smaller than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError from the
    # modulo below.
    if interval < 1:
        raise ValueError(f"interval must be a positive integer, got {interval!r}")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % interval == 0:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
            frame_count += 1
    finally:
        # Release the capture even if reading or colour conversion raises.
        cap.release()
    return frames

# Load dataset and extract SIFT features
def load_and_extract_features(files, folder_path, interval=30, label=None):
    """Extract SIFT descriptors for the sampled frames of every video in ``files``.

    Args:
        files: File names of the videos, relative to ``folder_path``.
        folder_path: Folder that contains the videos.
        interval: Frame sampling step forwarded to ``extract_frames_from_video``.
        label: Class label attached to every frame. When ``None`` it is derived
            from the last component of ``folder_path``: 0 for a folder named
            ``fights`` (case-insensitive), 1 for anything else.

    Returns:
        Tuple ``(features, labels)``: one descriptor array and one label per
        sampled frame for which SIFT returned descriptors.
    """
    if label is None:
        # Look only at the final folder name. A substring test on the whole
        # path would also match "nofights" or any parent folder containing
        # "fights", and would mislabel the whole class.
        folder_name = os.path.basename(os.path.normpath(folder_path))
        label = 0 if folder_name.lower() == 'fights' else 1

    features = []
    labels = []

    for file in files:
        video_path = os.path.join(folder_path, file)
        for frame in extract_frames_from_video(video_path, interval):
            descriptors = extract_sift_features(frame)
            # Frames without descriptors are skipped
            if descriptors is not None:
                features.append(descriptors)
                labels.append(label)

    return features, labels

In [None]:
# Per-class SIFT features for the training split (one frame every 30)
train_features_fights, train_labels_fights = load_and_extract_features(
    fights_train_files, fights_folder_path, 30
)
train_features_nofights, train_labels_nofights = load_and_extract_features(
    nofights_train_files, nofights_folder_path, 30
)

# Per-class SIFT features for the testing split
test_features_fights, test_labels_fights = load_and_extract_features(
    fights_test_files, fights_folder_path, 30
)
test_features_nofights, test_labels_nofights = load_and_extract_features(
    nofights_test_files, nofights_folder_path, 30
)

# Merge both classes, fights first, keeping features and labels aligned
train_features = [*train_features_fights, *train_features_nofights]
train_labels = [*train_labels_fights, *train_labels_nofights]

test_features = [*test_features_fights, *test_features_nofights]
test_labels = [*test_labels_fights, *test_labels_nofights]

# Number of frames per label (index 0 = fight, index 1 = no fight)
print(f"Training labels distribution: {np.bincount(train_labels)}")
print(f"Testing labels distribution: {np.bincount(test_labels)}")

Training labels distribution: [160 162]
Testing labels distribution: [40 40]



# **Cluster the features using KMeans.**

In [None]:
# Size of the visual vocabulary (number of KMeans clusters)
num_clusters = 100

# Stack the per-frame descriptor arrays of the training set into one matrix
all_descriptors = np.vstack(train_features)

# Fit the codebook on all training descriptors and persist it to disk
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(all_descriptors)
dump(kmeans, 'kmeans_codebook.joblib')



['kmeans_codebook.joblib']

# **Create histograms of codewords for each sampled frame**

In [None]:
# Helper function to create a histogram of codewords for each descriptor set
def create_histogram(features, kmeans, num_clusters):
    """Turn each descriptor array into a normalised histogram of codewords.

    Args:
        features: Iterable of descriptor arrays; an entry may be ``None``.
        kmeans: Fitted model whose ``predict`` maps descriptors to codeword ids.
        num_clusters: Number of codewords, i.e. the length of each histogram.

    Returns:
        Array with one row per entry of ``features``. ``None`` entries produce
        an all-zero row.
    """

    def _codeword_histogram(descriptors):
        # No descriptors: return an all-zero histogram
        if descriptors is None:
            return np.zeros(num_clusters)
        codewords = kmeans.predict(descriptors)
        bin_edges = np.arange(num_clusters + 1)
        densities, _ = np.histogram(codewords, bins=bin_edges, density=True)
        return densities

    return np.array([_codeword_histogram(entry) for entry in features])

# Encode the training and testing descriptor sets with the fitted codebook
train_histograms = create_histogram(
    features=train_features, kmeans=kmeans, num_clusters=num_clusters
)
test_histograms = create_histogram(
    features=test_features, kmeans=kmeans, num_clusters=num_clusters
)

# **Train an SVM classifier.**

In [None]:
# Fit a linear SVM on the training histograms, then persist it to disk
svm = SVC(kernel='linear', random_state=42).fit(train_histograms, train_labels)
dump(svm, 'svm_classifier.joblib')

['svm_classifier.joblib']

# **Evaluate the classifier and print the performance metrics.**

In [None]:
# Classify the held-out histograms with the trained SVM
labels_pred = svm.predict(test_histograms)

# Precision, recall and F1 use scikit-learn's default positive class (label 1),
# which this notebook assigns to "no fight" frames.
accuracy = accuracy_score(y_true=test_labels, y_pred=labels_pred)
precision = precision_score(y_true=test_labels, y_pred=labels_pred)
recall = recall_score(y_true=test_labels, y_pred=labels_pred)
f1 = f1_score(y_true=test_labels, y_pred=labels_pred)

# Report the metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.7
Precision: 1.0
Recall: 0.4
F1 Score: 0.5714285714285715
