In [None]:
# Mount Google Drive at /content/drive; later cells read the keyframe images
# from it and copy the generated datasets back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Clone the video dataset repository into the current working directory
# (the recorded output reports a download of about 1 GiB).
!git clone https://github.com/airtlab/A-Dataset-for-Automatic-Violence-Detection-in-Videos.git

Cloning into 'A-Dataset-for-Automatic-Violence-Detection-in-Videos'...
remote: Enumerating objects: 376, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 376 (delta 3), reused 11 (delta 3), pack-reused 364 (from 1)[K
Receiving objects: 100% (376/376), 1.02 GiB | 23.39 MiB/s, done.
Resolving deltas: 100% (3/3), done.
Updating files: 100% (355/355), done.


In [None]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.199-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.17-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.3.199-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.17-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.199 ultralytics-thop-2.0.17


# Keypoint Extraction

In [None]:
import numpy as np
import pandas as pd
import os
from ultralytics import YOLO
import cv2

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [None]:
def get_data_files(folder_path):
    """Recursively list the .mp4 and .jpg files found under ``folder_path``.

    Every match is reported together with the names of the last two
    components of the directory that contains it. The callers in this
    notebook unpack these as (label, camera, filename) for the video dataset
    and as (set_split, label, filename) for the keyframe dataset.

    Args:
        folder_path: Directory to scan. May be absolute or relative.

    Returns:
        A list of ``(grandparent_dir, parent_dir, filename)`` tuples in the
        order produced by ``os.walk``. The extension test is case-insensitive.
        ``grandparent_dir`` is ``''`` when the containing directory path has
        only one component.
    """
    media_extensions = ('.mp4', '.jpg')
    video_files = []

    for root, _dirs, files in os.walk(folder_path):
        matches = [name for name in files if name.lower().endswith(media_extensions)]
        if not matches:
            # Nothing to report for this directory, so its path is never inspected.
            continue

        subfolder_list = root.split(os.path.sep)
        # A root such as the relative path "data" has a single component and
        # therefore no grandparent; report '' instead of raising IndexError.
        v_nv = subfolder_list[-2] if len(subfolder_list) > 1 else ''
        c1_c2 = subfolder_list[-1]

        video_files.extend((v_nv, c1_c2, name) for name in matches)

    return video_files

## Full Videos

In [None]:
# Root of the cloned video dataset. The loop below unpacks each entry of
# video_files_list as (label, camera, filename).
folder_path = '/content/A-Dataset-for-Automatic-Violence-Detection-in-Videos/violence-detection-dataset'

video_files_list = get_data_files(folder_path)

In [None]:
# Run the pose model on every frame of every clip and collect, per clip, the
# list of keypoint arrays returned for its frames.
columns = ('filename','keypoints', 'label', 'camera')

data = []

model = YOLO("yolov8n-pose.pt")

for label, cam, video_file in video_files_list:
  cap = cv2.VideoCapture(os.path.join(folder_path, label, cam, video_file))

  results  = []

  # Release the capture even when the loop is interrupted (the recorded run of
  # this cell ended with a KeyboardInterrupt), so no video handle is leaked.
  try:
    while cap.isOpened():
      ret, frame = cap.read()
      if not ret:
        break

      # verbose=False suppresses the per-frame log lines that otherwise flood
      # the cell output; the predictions themselves are unaffected.
      estimation_result = model(frame, verbose=False)

      for r in estimation_result:
        # One array per result; presumably shaped (persons, keypoints, 2) — TODO confirm.
        results.append(r.keypoints.xy.cpu().numpy())
  finally:
    cap.release()

  data.append((video_file, results, label, cam))

# The frame is persisted with to_pickle in the following cell.
df = pd.DataFrame(data, columns=columns)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Speed: 3.4ms preprocess, 12.4ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 12.1ms
Speed: 5.6ms preprocess, 12.1ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 10.3ms
Speed: 3.7ms preprocess, 10.3ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 11.4ms
Speed: 3.3ms preprocess, 11.4ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 10.1ms
Speed: 2.6ms preprocess, 10.1ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 10.4ms
Speed: 2.6ms preprocess, 10.4ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 12.7ms
Speed: 2.4ms preprocess, 12.7ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 13.2ms
Speed: 2.6ms preprocess, 13.2ms inference, 2.0

KeyboardInterrupt: 

In [None]:
# Persist the extracted keypoints; the "Feature Vectors" section reloads this
# file with pd.read_pickle.
df.to_pickle('extracted_keypoints.pkl')

## Selected keyframes

In [None]:
# Drive is mounted at /content/drive, so the keyframe folder must be addressed
# through that prefix; without it os.walk finds nothing and the list is empty.
keyframes_dataset_path = '/content/drive/MyDrive/AIRTLab/'

image_files_list = get_data_files(keyframes_dataset_path)

In [None]:
# Peek at the first entry; the loops below unpack it as (set_split, label, filename).
image_files_list[0]

('test', 'non-violent', 'v37f2.jpg')

In [None]:
# Smoke test: run the pose model on one keyframe before processing the whole set.
model = YOLO("yolov8n-pose.pt")

r = model(keyframes_dataset_path + 'test/violent/v104f10.jpg')


image 1/1 /content/drive/MyDrive/AIRTLab/test/violent/v104f10.jpg: 384x640 4 persons, 17.1ms
Speed: 3.3ms preprocess, 17.1ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)


In [None]:
# Run the pose model on every selected keyframe and collect one row per image.
columns = ('filename','keypoints', 'label', 'set_split')

data = []

model = YOLO("yolov8n-pose.pt")

for set_split, label, image_file in image_files_list:

  results  = []

  # os.path.join avoids the doubled separator that string formatting produces
  # when keyframes_dataset_path already ends with '/'.
  file_path = os.path.join(keyframes_dataset_path, set_split, label, image_file)

  # verbose=False suppresses the per-image log lines; predictions are unaffected.
  estimation_result = model(file_path, verbose=False)

  for r in estimation_result:
    results.append(r.keypoints.xy.cpu().numpy())

  data.append((image_file, results, label, set_split))


df = pd.DataFrame(data, columns=columns)

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n-pose.pt to 'yolov8n-pose.pt': 100% ━━━━━━━━━━━━ 6.5MB 94.2MB/s 0.1s


# Feature Vectors

In [None]:
from sklearn.model_selection import train_test_split
from scipy.ndimage import uniform_filter1d

In [None]:
def keypoint_statistics_features(keypoints_sequence):
    """Summarise a keypoint array with eight statistics taken along axis 0.

    The array is first smoothed along axis 0 with a width-3 moving average
    whose edges are padded by repeating the nearest value. Minimum, maximum,
    mean, standard deviation, median, range (max - min), 25th percentile and
    75th percentile are then computed along axis 0.

    Args:
        keypoints_sequence: numpy array whose first axis is the one to
            summarise.

    Returns:
        A 1-D array holding the eight statistics, each flattened, concatenated
        in the order listed above; ``None`` when the first axis is empty.
    """
    if keypoints_sequence.shape[0] < 1:
        return None

    smoothed = uniform_filter1d(keypoints_sequence, size=3, axis=0, mode='nearest')

    lowest = np.min(smoothed, axis=0)
    highest = np.max(smoothed, axis=0)

    summary = (
        lowest,
        highest,
        np.mean(smoothed, axis=0),
        np.std(smoothed, axis=0),
        np.median(smoothed, axis=0),
        highest - lowest,
        np.percentile(smoothed, 25, axis=0),
        np.percentile(smoothed, 75, axis=0),
    )

    return np.concatenate([part.flatten() for part in summary])

def aggregate_video_features2(video_person_skeletons, pooling='mean', empty_feature_dim=272):
    """Pool the statistics vectors of several keypoint arrays into one vector.

    Every element of ``video_person_skeletons`` is converted to a numpy array
    and summarised with ``keypoint_statistics_features``; elements for which
    that function returns ``None`` (empty first axis) are skipped. The
    remaining vectors are combined element-wise.

    Args:
        video_person_skeletons: Iterable of keypoint arrays (or nested lists).
        pooling: ``'mean'`` or ``'max'``; how the per-element vectors are
            combined.
        empty_feature_dim: Length of the all-zero vector returned when no
            element yields features. The default, 272, is the length produced
            by the eight statistics of ``keypoint_statistics_features`` for
            17 two-dimensional keypoints (8 * 17 * 2); the previous value of
            204 did not match and would break ``np.vstack`` on the results.

    Returns:
        A 1-D feature vector.

    Raises:
        ValueError: If features exist and ``pooling`` is not ``'mean'`` or ``'max'``.
    """
    person_features = []

    for keypoints_sequence in video_person_skeletons:
        feats = keypoint_statistics_features(np.array(keypoints_sequence))
        if feats is not None:
            person_features.append(feats)

    if not person_features:
        # Keep the fallback the same length as real feature vectors so that
        # callers can stack all results into one matrix.
        return np.zeros(empty_feature_dim)

    person_features = np.array(person_features)

    if pooling == 'mean':
        return np.mean(person_features, axis=0)
    elif pooling == 'max':
        return np.max(person_features, axis=0)
    else:
        raise ValueError("Pooling must be 'mean' or 'max'")

In [None]:
# Reload the keypoints written by the extraction section. Pickle files can
# execute code when loaded, so only read files produced by this notebook.
df = pd.read_pickle('extracted_keypoints.pkl')
df.head()

Unnamed: 0,filename,keypoints,label,camera
0,49.mp4,"[[[[ 1300.7 665.07], [ 1312.5 ...",violent,cam2
1,104.mp4,"[[[[ 955.61 420], [ 954.25 ...",violent,cam2
2,84.mp4,"[[[[ 582.17 242.16], [ 583.53 ...",violent,cam2
3,11.mp4,"[[[[ 1313.6 158.94], [ 1324.4 ...",violent,cam2
4,109.mp4,"[[[[ 1025.4 610.7], [ 1014.9 ...",violent,cam2


In [None]:
import re

def extract_info(filename):
    """Parse a keyframe file name of the form ``v<video>f<frame>.jpg``.

    The extension is matched case-insensitively because ``get_data_files``
    lowercases names before testing the extension and therefore also lists
    files such as ``v1f2.JPG``; a case-sensitive match would turn those into
    ``(None, None)`` and they would be dropped by a later group-by.

    Args:
        filename: Bare file name (no directory part).

    Returns:
        ``(video_number, frame_number)`` as ints, or ``(None, None)`` when the
        name does not start with the expected pattern.
    """
    # (?i:...) limits case-insensitivity to the extension; the v/f markers stay
    # lowercase exactly as before.
    match = re.match(r'v(\d+)f(\d+)\.(?i:jpg)', filename)
    if match:
        return int(match.group(1)), int(match.group(2))
    return None, None
def create_grouped_keypoints_dataframe(df):
    """Collect keyframe rows into one row per (video, set_split) pair.

    Note that ``df`` is modified in place: the columns ``video`` and ``frame``
    parsed from ``filename`` by ``extract_info`` are added to it.

    Args:
        df: DataFrame with at least the columns ``filename``, ``keypoints``
            and ``set_split``.

    Returns:
        A new DataFrame with the columns ``videofile`` (``'v'`` followed by
        the video number), ``keypoints`` (the first element of each row's
        ``keypoints`` entry, ordered by frame number) and ``set_split``.
    """
    # Split every file name into its video and frame numbers.
    parsed = df['filename'].apply(lambda name: pd.Series(extract_info(name)))
    df[['video', 'frame']] = parsed

    # One record per (video, split); frames are ordered before their first
    # keypoint element is taken.
    records = [
        {
            'videofile': f'v{video_id}',
            'keypoints': [
                entry[0]
                for entry in frames.sort_values(by='frame')['keypoints']
            ],
            'set_split': split_name,
        }
        for (video_id, split_name), frames in df.groupby(['video', 'set_split'])
    ]

    return pd.DataFrame(records)

In [None]:
keypoints_list = np.array(df['keypoints'].copy())
video_features = []
for video in keypoints_list:
    # `video` is the list of keypoint arrays stored for one row of df; each
    # array is summarised on its own and the summaries are max-pooled.
    # aggregate_video_features2 is the only aggregation function defined in
    # this notebook (the unsuffixed name raised NameError on a fresh kernel).
    fvec = aggregate_video_features2(video, pooling='max')
    video_features.append(fvec)

In [None]:
# Build one max-pooled feature vector per row of df. Each entry of
# keypoints_list is the list of keypoint arrays stored for that row.
keypoints_list = np.array(df['keypoints'].copy())
video_features = [
    aggregate_video_features2(video, pooling='max')
    for video in keypoints_list
]

In [None]:
# Attach the pooled vectors as a new column (this mutates df in place) and
# display the resulting frame.
df['feature_vectors'] = video_features
df

Unnamed: 0,filename,keypoints,label,camera,feature_vectors
0,49.mp4,"[[[[ 1300.7 665.07], [ 1312.5 ...",violent,cam2,"[1129.1244, 480.76425, 1134.6652, 469.11688, 1..."
1,104.mp4,"[[[[ 955.61 420], [ 954.25 ...",violent,cam2,"[1118.5825, 495.6267, 1115.4192, 488.03247, 11..."
2,84.mp4,"[[[[ 582.17 242.16], [ 583.53 ...",violent,cam2,"[905.5891, 345.35834, 914.1955, 334.72183, 894..."
3,11.mp4,"[[[[ 1313.6 158.94], [ 1324.4 ...",violent,cam2,"[1515.2991, 356.51144, 1520.8922, 346.67654, 1..."
4,109.mp4,"[[[[ 1025.4 610.7], [ 1014.9 ...",violent,cam2,"[924.60443, 550.16144, 925.4054, 541.961, 919...."
...,...,...,...,...,...
345,33.mp4,"[[[[ 1079.7 181.36], [ 1093.4 ...",non-violent,cam1,"[1019.59924, 229.0736, 1030.3308, 219.59088, 1..."
346,39.mp4,"[[[[ 831.11 282.21], [ 833.77 ...",non-violent,cam1,"[1153.6123, 432.62808, 1157.3484, 418.45932, 1..."
347,60.mp4,"[[[[ 1034.3 164.69], [ 1046.1 ...",non-violent,cam1,"[887.33716, 269.94238, 889.05133, 258.94016, 8..."
348,20.mp4,"[[[[ 1041 570.82], [ 1036.3 ...",non-violent,cam1,"[949.01996, 149.87105, 953.23376, 139.26009, 9..."


# Classification

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Drop the columns the classifiers do not use; the label and the feature
# vectors are read from the remaining frame below.
df_classifier = df.drop(columns=['filename', 'keypoints', 'camera'])

In [None]:
# Stack the per-row vectors into one 2-D matrix; np.vstack requires every
# vector to have the same length.
X = np.vstack(df_classifier['feature_vectors'])
y = df_classifier['label']

# Hold out 33% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): rows are split individually. If cam1 and cam2 hold recordings
# of the same scene, both sides of the split may see it — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
def classify(clf):
  """Fit ``clf`` on the training split and print its test-set metrics.

  Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
  Accuracy plus macro-averaged precision, recall and F1 are printed with four
  decimals; nothing is returned.

  Args:
      clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
  """
  clf.fit(X_train, y_train)
  predicted = clf.predict(X_test)

  accuracy = accuracy_score(y_test, predicted)
  # Macro averaging weights every class equally regardless of its size.
  macro = {'average': 'macro'}
  precision, recall, f1 = (
      scorer(y_test, predicted, **macro)
      for scorer in (precision_score, recall_score, f1_score)
  )

  print(
      f"Accuracy:  {accuracy:.4f}",
      f"Precision: {precision:.4f}",
      f"Recall:    {recall:.4f}",
      f"F1 Score:  {f1:.4f}",
      sep="\n",
  )

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported metrics are reproducible across runs; the
# value matches the random_state used for the train/test split.
classify(RandomForestClassifier(random_state=42))

Accuracy:  0.8276
Precision: 0.8116
Recall:    0.7942
F1 Score:  0.8016


In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with its default settings.
classify(GaussianNB())

Accuracy:  0.7414
Precision: 0.7335
Recall:    0.7609
F1 Score:  0.7317


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k=5; note that the feature vectors are passed in unscaled.
classify(KNeighborsClassifier(n_neighbors=5))

Accuracy:  0.7931
Precision: 0.7700
Recall:    0.7872
F1 Score:  0.7761


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Standardise the features before the SVC; the scaler is fitted inside the
# pipeline, i.e. on the training split only.
classify(make_pipeline(StandardScaler(), SVC(gamma='auto')))

Accuracy:  0.8621
Precision: 0.8580
Recall:    0.8265
F1 Score:  0.8389


# CNNs

## Full videos

In [None]:
import numpy as np
import pandas as pd
import os
from ultralytics import YOLO
import cv2

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [None]:
def get_data_files(folder_path):
    """Recursively list the .mp4 and .jpg files found under ``folder_path``.

    Every match is reported together with the names of the last two
    components of the directory that contains it. The callers in this
    notebook unpack these as (label, camera, filename) for the video dataset
    and as (set_split, label, filename) for the keyframe dataset.

    Args:
        folder_path: Directory to scan. May be absolute or relative.

    Returns:
        A list of ``(grandparent_dir, parent_dir, filename)`` tuples in the
        order produced by ``os.walk``. The extension test is case-insensitive.
        ``grandparent_dir`` is ``''`` when the containing directory path has
        only one component.
    """
    media_extensions = ('.mp4', '.jpg')
    video_files = []

    for root, _dirs, files in os.walk(folder_path):
        matches = [name for name in files if name.lower().endswith(media_extensions)]
        if not matches:
            # Nothing to report for this directory, so its path is never inspected.
            continue

        subfolder_list = root.split(os.path.sep)
        # A root such as the relative path "data" has a single component and
        # therefore no grandparent; report '' instead of raising IndexError.
        v_nv = subfolder_list[-2] if len(subfolder_list) > 1 else ''
        c1_c2 = subfolder_list[-1]

        video_files.extend((v_nv, c1_c2, name) for name in matches)

    return video_files

In [None]:
# Root of the cloned video dataset; each entry of video_files_list is a
# (label, camera, filename) tuple.
folder_path = '/content/A-Dataset-for-Automatic-Violence-Detection-in-Videos/violence-detection-dataset'

video_files_list = get_data_files(folder_path)

In [None]:
# Peek at the first (label, camera, filename) entry.
video_files_list[0]

('non-violent', 'cam2', '56.mp4')

In [None]:
# Re-encode every clip with the detected skeletons drawn on top of each frame.
model = YOLO("yolov8n-pose.pt")

new_dataset_path = "skeletons_drawn_dataset"

fourcc = cv2.VideoWriter_fourcc(*'mp4v')

# Pairs of keypoint indices that are joined by a line when drawing a skeleton.
skeleton_connections = [
    (5, 7), (7, 9),     # Left arm
    (6, 8), (8, 10),    # Right arm
    (5, 6),             # Shoulders
    (11, 12),           # Hips
    (5, 11), (6, 12),   # Torso
    (11, 13), (13, 15), # Left leg
    (12, 14), (14, 16)  # Right leg
]


for label, cam, video_file in video_files_list:
  cap = cv2.VideoCapture(os.path.join(folder_path, label, cam, video_file))

  # Plain Python ints for the frame size handed to VideoWriter.
  cap_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
  cap_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
  cap_fps = cap.get(cv2.CAP_PROP_FPS)

  # VideoWriter does not create missing folders and fails without raising, so
  # make sure the label/camera folder exists before opening the writer.
  out_dir = os.path.join(new_dataset_path, label, cam)
  os.makedirs(out_dir, exist_ok=True)
  out = cv2.VideoWriter(os.path.join(out_dir, "new-" + video_file), fourcc, cap_fps, (cap_width, cap_height))

  # Release both handles even if the loop is interrupted, so the partially
  # written file is finalised and the capture is not leaked.
  try:
    while cap.isOpened():
      ret, frame = cap.read()
      if not ret:
        break

      # verbose=False suppresses the per-frame log lines; predictions are unaffected.
      results = model(frame, verbose=False)

      for result in results:
        kpts = result.keypoints
        if kpts is None:
          continue

        keypoints = kpts.xy.cpu().numpy()

        for person in keypoints:
          # Draw keypoints, using the same "> 0" validity test as the lines
          # below so that no dot is painted at the image origin.
          for x, y in person:
            if x > 0 and y > 0:
              cv2.circle(frame, (int(x), int(y)), 3, (0, 255, 0), -1)

          # Draw connections (skeleton)
          for idx1, idx2 in skeleton_connections:
            x1, y1 = person[idx1]
            x2, y2 = person[idx2]
            if x1 > 0 and y1 > 0 and x2 > 0 and y2 > 0:  # valid points
              cv2.line(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 255), 2)

      out.write(frame)
  finally:
    cap.release()
    out.release()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0: 384x640 3 persons, 11.3ms
Speed: 3.4ms preprocess, 11.3ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 15.1ms
Speed: 3.3ms preprocess, 15.1ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 12.3ms
Speed: 3.4ms preprocess, 12.3ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 11.8ms
Speed: 3.2ms preprocess, 11.8ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 11.7ms
Speed: 3.2ms preprocess, 11.7ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 9.4ms
Speed: 7.4ms preprocess, 9.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 11.6ms
Speed: 3.2ms preprocess, 11.6ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 7.5ms
Speed: 3.2ms preproc

In [None]:
# Copy the generated videos to Google Drive so they outlive the Colab runtime.
!cp -r /content/skeletons_drawn_dataset/ /content/drive/MyDrive/

## Selected Keyframes

In [None]:
# Keyframe images stored on the mounted Drive; each entry of image_files_list
# is unpacked below as (set_split, label, filename).
keyframes_dataset_path = '/content/drive/MyDrive/AIRTLab/'

image_files_list = get_data_files(keyframes_dataset_path)

In [None]:
# Trial run of the drawing code on a single keyframe. Relies on
# skeleton_connections defined in the video-drawing cell above.
model = YOLO("yolov8n-pose.pt")

set_split, label, image_file = image_files_list[0]

file_path = os.path.join(keyframes_dataset_path, set_split, label, image_file)

# Detect on the same file that is drawn on below; previously the poses came
# from a different, hard-coded image and did not match the picture.
estimation_result = model(file_path)

image = cv2.imread(file_path)
if image is None:
  # cv2.imread returns None instead of raising when the file cannot be read.
  raise FileNotFoundError(f'Could not read image: {file_path}')

for result in estimation_result:
  kpts = result.keypoints
  if kpts is None:
    continue

  keypoints = kpts.xy.cpu().numpy()

  for person in keypoints:
    # Draw keypoints, using the same "> 0" validity test as the lines below
    # so that no dot is painted at the image origin.
    for x, y in person:
      if x > 0 and y > 0:
        cv2.circle(image, (int(x), int(y)), 3, (0, 255, 0), -1)

    # Draw connections (skeleton)
    for idx1, idx2 in skeleton_connections:
      x1, y1 = person[idx1]
      x2, y2 = person[idx2]
      if x1 > 0 and y1 > 0 and x2 > 0 and y2 > 0:  # valid points
        cv2.line(image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 255), 2)



image 1/1 /content/drive/MyDrive/AIRTLab/test/violent/v104f10.jpg: 384x640 4 persons, 9.0ms
Speed: 2.2ms preprocess, 9.0ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)


In [None]:
# Write a copy of every keyframe with the detected skeletons drawn on it.
model = YOLO("yolov8n-pose.pt")

new_keyframes_dataset_path = "skeletons_drawn_keyframes"

# Pairs of keypoint indices that are joined by a line when drawing a skeleton.
skeleton_connections = [
    (5, 7), (7, 9),     # Left arm
    (6, 8), (8, 10),    # Right arm
    (5, 6),             # Shoulders
    (11, 12),           # Hips
    (5, 11), (6, 12),   # Torso
    (11, 13), (13, 15), # Left leg
    (12, 14), (14, 16)  # Right leg
]

for set_split, label, image_file in image_files_list:

  file_path = os.path.join(keyframes_dataset_path, set_split, label, image_file)

  # verbose=False suppresses the per-image log lines; predictions are unaffected.
  estimation_result = model(file_path, verbose=False)

  image = cv2.imread(file_path)
  if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError(f'Could not read image: {file_path}')

  for result in estimation_result:
    kpts = result.keypoints
    if kpts is None:
      continue

    keypoints = kpts.xy.cpu().numpy()

    for person in keypoints:
      # Draw keypoints, using the same "> 0" validity test as the lines below
      # so that no dot is painted at the image origin.
      for x, y in person:
        if x > 0 and y > 0:
          cv2.circle(image, (int(x), int(y)), 3, (0, 255, 0), -1)

      # Draw connections (skeleton)
      for idx1, idx2 in skeleton_connections:
        x1, y1 = person[idx1]
        x2, y2 = person[idx2]
        if x1 > 0 and y1 > 0 and x2 > 0 and y2 > 0:  # valid points
          cv2.line(image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 255), 2)

  # Write once per image, after all results have been drawn, so that an image
  # is saved even when a result carries no keypoints. cv2.imwrite does not
  # create missing folders, hence the makedirs call.
  out_dir = os.path.join(new_keyframes_dataset_path, set_split, label)
  os.makedirs(out_dir, exist_ok=True)
  cv2.imwrite(os.path.join(out_dir, image_file), image)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
image 1/1 /content/drive/MyDrive/AIRTLab/train/violent/v61f112.jpg: 384x640 4 persons, 11.8ms
Speed: 2.4ms preprocess, 11.8ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /content/drive/MyDrive/AIRTLab/train/violent/v61f109.jpg: 384x640 4 persons, 8.2ms
Speed: 2.3ms preprocess, 8.2ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /content/drive/MyDrive/AIRTLab/train/violent/v61f114.jpg: 384x640 4 persons, 10.9ms
Speed: 2.3ms preprocess, 10.9ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /content/drive/MyDrive/AIRTLab/train/violent/v61f105.jpg: 384x640 3 persons, 8.4ms
Speed: 2.3ms preprocess, 8.4ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /content/drive/MyDrive/AIRTLab/train/violent/v61f83.jpg: 384x640 4 persons, 8.1ms
Speed: 2.3ms preprocess, 8.1ms inference, 1.7ms postprocess per image at shape 

In [None]:
# Copy the generated keyframes to Google Drive so they outlive the Colab runtime.
!cp -r /content/skeletons_drawn_keyframes/ /content/drive/MyDrive/