In [1]:
from ultralytics import YOLO
import cv2
import torch
import os
import numpy as np
import math
from collections import deque

Importing dependecies and paths

In [2]:
DATA_path = r'D:\Datasets\fight'
actions = ['fighting','not_fighting']
fighting_path = r'D:\Datasets\fight\fighting'
not_fighting_path = r'D:\Datasets\fight\not_fighting'

Main functions to process and extract keypoints

In [3]:

def calc_distances(hands_dict,body_kp,head_kp):

    # creating a dictionary of distances between each keypoint (except of the same object) in the keypoint_dict
    dist_dict = {}
    keyhead = head_kp.keys()
    keysh = hands_dict.keys()
    keysb = body_kp.keys()

    # calculating distances between keypoints on each hand: left to left, right to left, left to right and right to right
    for i,keyi in enumerate(keysh,start =1):
        for j,keyj in enumerate(keysh,start =1):
            if j>=i:
                break
            distll = calc_euclid_dist(hands_dict[keyi][0],hands_dict[keyj][0])
            distlr = calc_euclid_dist(hands_dict[keyi][0],hands_dict[keyj][1])
            distrl = calc_euclid_dist(hands_dict[keyi][1],hands_dict[keyj][0])
            distrr = calc_euclid_dist(hands_dict[keyi][1],hands_dict[keyj][1])
            dist_dict[f'{keyi}'+f'{keyj}'] = list([distll,distlr,distrl,distrr])
 
    # calculating distances between hands and bodies
    for i,keyi in enumerate(keysh,start =1):
        for j,keyj in enumerate(keysb,start =1):
            if j>=i:
                break

            distlb = calc_euclid_dist(hands_dict[keyi][0],body_kp[keyj])
            
            distrb = calc_euclid_dist(body_kp[keyj],hands_dict[keyi][1])

            dist_dict[f'{keyi}'+f'{keyj}'].append(distlb)
            dist_dict[f'{keyi}'+f'{keyj}'].append(distrb)
                
    for i,keyi in enumerate(keysb,start =1):
        for j,keyj in enumerate(keysh,start =1):
            if j>=i:
                break

            distlb = calc_euclid_dist(hands_dict[keyj][0],body_kp[keyi])
            
            distrb = calc_euclid_dist(body_kp[keyi],hands_dict[keyj][1])
            

            dist_dict[f'{keyi}'+f'{keyj}'].append(distlb)
            dist_dict[f'{keyi}'+f'{keyj}'].append(distrb)


    # calculating distances between hands and heads
    for i,keyi in enumerate(keysh,start =1):
        for j,keyj in enumerate(keyhead,start =1):
            if j>=i:
                break

            distlh = calc_euclid_dist(hands_dict[keyi][0],head_kp[keyj])
            
            distrh = calc_euclid_dist(head_kp[keyj],hands_dict[keyi][1])
            
            dist_dict[f'{keyi}'+f'{keyj}'].append(distlh)
            dist_dict[f'{keyi}'+f'{keyj}'].append(distrh)



    for i,keyi in enumerate(keyhead,start =1):
        for j,keyj in enumerate(keysh,start =1):
            if j>=i:
                break

            distlh = calc_euclid_dist(hands_dict[keyj][0],head_kp[keyi])
            
            distrh = calc_euclid_dist(head_kp[keyi],hands_dict[keyj][1])
            
 
            dist_dict[f'{keyi}'+f'{keyj}'].append(distlh)
            dist_dict[f'{keyi}'+f'{keyj}'].append(distrh)



    # calculating distances between bodies
    for i,keyi in enumerate(keysb,start =1):
        for j,keyj in enumerate(keysb,start =1):
            if j>=i:
                break

            distbb = calc_euclid_dist(body_kp[keyi],body_kp[keyj])

            dist_dict[f'{keyi}'+f'{keyj}'].append(distbb)


    # calculating distances between heads
    for i,keyi in enumerate(keyhead,start =1):
        for j,keyj in enumerate(keyhead,start =1):
            if j>=i:
                break

            disthh = calc_euclid_dist(head_kp[keyi],head_kp[keyj])

            dist_dict[f'{keyi}'+f'{keyj}'].append(disthh)
    
    return dist_dict


def extract_hands_keypoints(results, threshold_class, threshold_keypoint):
    # creating a dictionary to collect keypoints to each object id as dictionary key
    existing_kp = {}
    for result,i_d in zip(results[0],results[0].boxes.id):
        # There results for bounding boxes, and confidence scores for general detect
        x1, y1, x2, y2,_, conf_for_detect, class_id_detected = (result.boxes.data.tolist())[0]
        # If the confidence score for general detect is lower than threshold, skip
        if conf_for_detect < threshold_class:
            continue
        # keypoints
        keys = (result.keypoints.data.tolist())[0]
        xl_key, yl_key, confl = keys[9]
        if confl > threshold_keypoint:
           l = [int(xl_key),int(yl_key)]
        else:
            l = []
        xr_key, yr_key, confr = keys[10]
        if confr > threshold_keypoint:
           r = [int(xr_key),int(yr_key)]
        else:
            r = []
        hands_coords = list([l,r])
        # Adding existing hand keypoints of an object in a frame to the dictionary   
        existing_kp[int(i_d)] = hands_coords
    return existing_kp

def extract_body_keypoints(results,threshold_class, threshold_keypoint):
    # creating a dictionary to collect keypoints to each object id as dictionary key
    existing_kp = {}
    for result,i_d in zip(results[0],results[0].boxes.id):
        # There results for bounding boxes, and confidence scores for general detect
        x1, y1, x2, y2,_, conf_for_detect, class_id_detected = (result.boxes.data.tolist())[0]
        # If the confidence score for general detect is lower than threshold, skip
        if conf_for_detect < threshold_class:
            continue
        # keypoints
        keys = (result.keypoints.data.tolist())[0]
        xl_key, yl_key, confl = keys[5]
        xr_key, yr_key, confr = keys[6]
        if (confl>threshold_keypoint) and (confr>threshold_keypoint):
            # Adding existing hand keypoints of an object in a frame to the dictionary   
            mid_point  = list([int((xr_key+xl_key)/2),int((yl_key+yr_key)/2)])
            
        else:
            mid_point = []

        existing_kp[int(i_d)] = mid_point

    return existing_kp

def extract_head_keypoints(results,threshold_class, threshold_keypoint):
    # creating a dictionary to collect keypoints to each object id as dictionary key
    existing_kp = {}
    for result,i_d in zip(results[0],results[0].boxes.id):
        # There results for bounding boxes, and confidence scores for general detect
        x1, y1, x2, y2,_, conf_for_detect, class_id_detected = (result.boxes.data.tolist())[0]
        # If the confidence score for general detect is lower than threshold, skip
        if conf_for_detect < threshold_class:
            continue
        # keypoints
        keys = (result.keypoints.data.tolist())[0]
        xh_key, yh_key, confh = keys[0]
        if confh>threshold_keypoint:
            # Adding existing hand keypoints of an object in a frame to the dictionary   
            mid_point  = list([int(xh_key),int(yh_key)])
        else:
            mid_point = []
        existing_kp[int(i_d)] = mid_point
    return existing_kp


def extract_keypoints(results, threshold_class):
    existing_kp = {}
    for result,i_d in zip(results[0],results[0].boxes.id):
        # There results for bounding boxes, and confidence scores for general detect
        x1, y1, x2, y2,_, conf_for_detect, class_id_detected = (result.boxes.data.tolist())[0]
        # If the confidence score for general detect is lower than threshold, skip
        if conf_for_detect < threshold_class:
            continue
        # keypoints
        keys = (result.keypoints.data.tolist())[0]
        keyp_arr = list()
        for key in keys:
            keyp_arr.append(key)
        # Adding existing hand keypoints of an object in a frame to the dictionary   
        existing_kp[int(i_d)] = keyp_arr
    return existing_kp

def calc_kp_to_kp_dist(keypoints_dict):
    # creating a dictionary of distances between each keypoint (except of the same object) in the keypoint_dict
    dist_dict = {}
    keys = keypoints_dict.keys()
    # calculating distances between keypoints 
    for l,keyi in enumerate(keys,start =1):
        for m,keyj in enumerate(keys,start =1):
            if m>=l:
                break  
            for i,p1 in enumerate(keypoints_dict[keyi]):
                for j,p2 in enumerate(keypoints_dict[keyj]):
                    dist = calc_euclid_dist(p1,p2)
                    dist_dict[f'{keyi}'+f'{keyj}'+f'{i}'+f'{j}'] = dist
    return dist_dict

def calc_euclid_dist(p1,p2):
    if (len(p1)>0) and (len(p2)>0):
        dist = int(math.sqrt((p1[0]-p2[0])*(p1[0]-p2[0]) + (p1[1]-p2[1])*(p1[1]-p2[1])))
        return dist
    else: 
        return np.nan
    
def calc_grad(dist_dict):
    return

Initializing dictionaries and parametres

In [4]:
text2 = "No suspicious activity"
text1 = "Suspicious activity"
text3 = "No people in sight"
color2 = (100, 200, 0)
color1 = (100, 0, 200)
color3 = (100, 100, 100)
font_scale = 1.6
thickness = 2

winsize = 40
all_keypoints = {}
distance_dict = {}
average_dist = {}
grad_dict = {}
outputs = [0,1]
nums_sequences = 0

Initializing YOLOv8 pose model and caption from file

In [5]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
modely = YOLO('yolov8l-pose.pt')  # load a pretrained YOLOv8n classification model
modely.to(device)
video_path = r"D:\videos\fight4.mp4"
vid_name = 'v4'
cap = cv2.VideoCapture(video_path)
# Get video properties
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fps = cap.get(cv2.CAP_PROP_FPS) # or number
# Create a VideoWriter object to save the output video
output_video_path = r"D:\videos_processed\fight4_processed.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

Data coleccting cycle. Each (winsize) frames you will have to press 'f' key if fight was on those frames and any other key if not

In [6]:
while cap.isOpened():
# Read a frame from the video
    success, frame = cap.read()
    if success:

        results = modely.track(frame, persist=True, retina_masks=True, boxes=True, show_conf=False, line_width=1,  conf=0.8, iou=0.5,  classes=0, show_labels=False, device=device,verbose = False,tracker="bytetrack.yaml")
        if results[0].boxes.id is not None:
            boxes = results[0].boxes.xyxy.cpu().numpy().astype(int)
            ids = results[0].boxes.id.cpu().numpy().astype(int)
            for box, i_d in zip(boxes, ids):
                x1, y1, x2, y2 = box[0], box[1], box[2], box[3]



                # Draw bounding box
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                # Draw customer id on the frame above the bounding box
                text = f"{i_d}"
                font = cv2.FONT_HERSHEY_SIMPLEX
                font_scale = 0.6
                font_thickness = 1
                text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]

                # Calculate the position to align the label with the top of the bounding box
                text_x = x1 + (x2 - x1 - text_size[0]) // 2
                text_y = y1 - 10  # Adjust this value for the desired vertical offset

                # Make sure the text_y position is within the frame's bounds
                if text_y < 0:
                    text_y = 0

                # Draw the label background rectangle
                cv2.rectangle(frame, (text_x - 5, text_y - text_size[1] - 5), (text_x + text_size[0] + 5, text_y + 5), (0, 0, 0), -1)

                #Draw the customer id text
                cv2.putText(
                    frame,
                    text,
                    (text_x, text_y),
                    font,
                    font_scale,
                    (255, 255, 255),  # White color
                    font_thickness,
                    lineType=cv2.LINE_AA
                )
    
            #extracting keypoints
            body_kp = extract_body_keypoints(results = results,threshold_class=0.4,threshold_keypoint=0.4)
            hands_kp = extract_hands_keypoints(results = results,threshold_class=0.4,threshold_keypoint=0.4)
            head_kp = extract_head_keypoints(results = results,threshold_class=0.4,threshold_keypoint=0.4)
            #calculating distances between keypoints

            dd = calc_distances(hands_kp,body_kp,head_kp)
            #appending distances dictionary and evaluating average distance and classification based on it
            for key in dd.keys():

                if key not in distance_dict.keys():
                    distance_dict[key] = deque(maxlen=40)

                distance_dict[key].append(dd[key])
                
                if len(distance_dict[key]) == winsize:
                    nums_sequences = nums_sequences + 1
                    print(f'Processing pair {key}.')
                    keypoints = np.array(distance_dict[key])
                    if cv2.waitKey(-1) & 0xFF == ord('f'):
                        if cv2.waitKey(-1) & 0xFF == ord('f'):
                            save_path = fighting_path   + f'\{vid_name}' +  f'{nums_sequences}'
                        else:
                            save_path = not_fighting_path  + f'\{vid_name}' +  f'{nums_sequences}'
                    else:
                        distance_dict[key].clear()
                        continue
                    np.save(save_path,keypoints)
                    distance_dict[key].clear()
            

        annotated_frame_show = cv2.resize(frame, (1080, 720))
        cv2.imshow("YOLOv8 Inference", annotated_frame_show)
        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
            
        
    else:
        # Break the loop if the end of the video is reached
        break

cap.release()
cv2.destroyAllWindows()



Processing pair 42.
Processing pair 41.
Processing pair 12.
Processing pair 92.
Processing pair 21.
Processing pair 12.
Processing pair 12.
Processing pair 228.
Processing pair 3528.
Processing pair 3628.
Processing pair 3928.
Processing pair 3936.
Processing pair 3628.
Processing pair 3628.
Processing pair 3628.
Processing pair 4436.
Processing pair 4536.
Processing pair 2836.
Processing pair 4636.
Processing pair 4836.
Processing pair 4936.
Processing pair 4948.
Processing pair 4836.
Processing pair 4936.
Processing pair 4948.
Processing pair 5036.
Processing pair 5048.
Processing pair 5049.
Processing pair 4836.
Processing pair 4936.
Processing pair 4948.
Processing pair 5036.
Processing pair 5048.
Processing pair 5049.
Processing pair 4836.
Processing pair 4936.
Processing pair 4948.
Processing pair 4836.
Processing pair 5036.
Processing pair 5048.
Processing pair 4836.
Processing pair 4836.
Processing pair 5848.
Processing pair 5948.
Processing pair 5958.
Processing pair 5948.
Pro

Preprocessing data

In [6]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import torch
from math import sqrt

In [7]:
def normalize_sequence(seq):
    mean = torch.mean(seq)
    std_dev = torch.std(seq) + 0.0001
    standardized_data = (seq - mean) / std_dev
    return standardized_data

#data = torch.Tensor([ np.nan,  np.nan,  np.nan,  np.nan,  np.nan,  np.nan,  np.nan,  np.nan,  np.nan,  
  #                   np.nan])
def approximate_linear(series):
    i_0 = 0
    length = series.shape[0]
    left_value = 0
    right_value = 0
    for i,num in enumerate(series):
        if torch.isnan(num) == True:
            i_0 = i
            for j in range(i_0,length):
                if torch.isnan(series[j]) == False:
                    right_value  = series[j]
                    if (i_0) == 0:
                        left_value = right_value 
                    else:
                        left_value = series[i_0 - 1]
                    
                    tg = (right_value - left_value)/(j - i_0 + 1)
                    for k,ind  in enumerate(range(i_0,j)):
                        series[ind] = left_value + (k+1)*tg
                    break
                    
                else:
                    if ((j+1) == length)&(i_0 == 0):
                        series = torch.nan_to_num(series,0)
                        return series
                    if (j+1) == length:
                        for k,ind  in enumerate(range(i_0,j+1)):
                            series[ind] = left_value

        else:
            continue
    
    return normalize_sequence(series)


def approx_gaps(sequences):
    seq_data = torch.transpose(torch.Tensor(sequences),1,2)
    data_shape = seq_data.shape
    output_data = torch.zeros((data_shape)) 
    for i,seq in enumerate(seq_data):
        for j,series in enumerate(seq):
            output_data[i][j] = approximate_linear(series)

    return output_data




In [8]:
label_map = {label: num for num, label in enumerate(actions)}
sequences,labels = [],[]
for action in actions:
    for file in os.listdir(os.path.join(DATA_path,action)):
        sequences.append(np.load(os.path.join(DATA_path,action,file),allow_pickle=True))
        labels.append(label_map[action])

seq_data = approx_gaps(sequences=np.array(sequences))


seq_labels = to_categorical(labels).astype(int)
X_train, X_test, y_train, y_test = train_test_split(seq_data,seq_labels,test_size=0.1) 


num0class = np.sum(np.argmax(y_train,axis = 1) ==0 )
num1class = np.sum(np.argmax(y_train,axis = 1) ==1 )

num01class = len(y_train)
print(X_train.shape)
loss_weights = torch.Tensor([1 - num0class/num01class,1 - num1class/num01class])
loss_weights

torch.Size([494, 14, 40])


tensor([0.7976, 0.2024])

Transfering data to pytorch compatible type

In [9]:
class Keypoint_sequence_dataset(torch.utils.data.Dataset):

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx, :], self.y[idx]
        
    


Building neural network and train

In [10]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.metrics import accuracy_score
import wandb
from filelock import FileLock
from ray import train, tune
from ray.train import Checkpoint
from ray.tune.schedulers import ASHAScheduler

#wandb.login(key = '66d1d2eaf7cd3f83b644fc151071bbf5d7f0c237')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [11]:
class Seq_conv_net(nn.Module):
    def __init__(self,kernel_size = 3,dropout_conv = 0.3,dropout_linear = 0.3,attention_size = 8,attention_heads = 4,attention_dropout = 0.3):
        super(Seq_conv_net,self).__init__()
        

        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channels=14, out_channels=32, kernel_size =kernel_size,padding= 'same') ,
            nn.BatchNorm1d(32),
            nn.GELU(),
            nn.Dropout1d(dropout_conv)
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size = kernel_size,padding= 'same') ,
            nn.BatchNorm1d(64),
            nn.GELU(),
            nn.Dropout1d(dropout_conv)
        )
        self.conv3 = nn.Sequential(
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size = kernel_size,padding= 'same') ,
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Dropout1d(dropout_conv)
        )
        self.conv4 = nn.Sequential(
            nn.Conv1d(in_channels=128, out_channels=attention_size, kernel_size = kernel_size,padding= 'same') ,
            nn.BatchNorm1d(attention_size),
            nn.GELU(),
            nn.Dropout1d(dropout_conv)
        )
        self.maxpool1 = nn.AdaptiveMaxPool1d(20)



        

       #self.lstm1 = nn.LSTM(64,64,LSTM_size,batch_first=True,dropout= LSTM_dropout)
        self.attention = nn.MultiheadAttention(attention_size,attention_heads,batch_first=True,dropout= attention_dropout)

        self.flatten = nn.Flatten()
        

        self.linear1 = nn.Sequential(
            nn.Linear(attention_size*20, attention_size*5) ,
            nn.BatchNorm1d(attention_size*5),
            nn.GELU(),
            nn.Dropout1d(dropout_linear)
        )
        self.linear2 = nn.Sequential(
            nn.Linear(attention_size*5, attention_size) ,
            nn.BatchNorm1d(attention_size),
            nn.GELU(),
            nn.Dropout1d(dropout_linear)
        )
        self.linear3 = nn.Sequential(
            nn.Linear(attention_size, 128) ,
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Dropout1d(dropout_linear)
        )
        self.logits = nn.Sequential(
            nn.Linear(128, 2) ,
            nn.Softmax(1)
        )

        
    def forward(self, x):
        # forward pass сети
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.maxpool1(x)
        
        x = torch.transpose(x,1,2)
        #x,_ = self.lstm1(x)
        x,_ = self.attention(x,x,x)

        x = self.flatten(x)
        x = self.linear1(x)
        x = self.linear2(x)
        x = self.linear3(x)
        x = self.logits(x)
        
        return x



In [12]:
def evaluate_model(model, dataloader, loss_fn):
    
    losses = []

    num_elements = len(dataloader.dataset)
    num_correct = 0
    num0_true = 0
    num0_false = 0
    num1_true = 0
    num1_false = 0
    num0_ = 0
    num1_ = 0
    for i, batch in enumerate(dataloader):
        
        # так получаем текущий батч
        X_batch, y_batch = batch
        
        with torch.no_grad():
            logits = model(X_batch.to(device,dtype = torch.float))
            
            loss = loss_fn(logits, y_batch.to(device,dtype = torch.float))
            
            losses.append(loss.item())
            
            y_pred = torch.argmax(logits, dim=1).cpu()
            
            y_answers = torch.argmax(y_batch, dim=1).cpu()

            num_correct += torch.sum(y_answers == y_pred)
            num0_true += torch.sum((y_answers == 0)&(y_pred == 0))
            num0_false += torch.sum((y_answers == 0)&(y_pred == 1))
            num1_true += torch.sum((y_answers == 1)&(y_pred == 1))
            num1_false += torch.sum((y_answers == 1)&(y_pred == 0))
            num0_ +=torch.sum(y_answers == 0)
            num1_ +=torch.sum(y_answers == 1)
    accuracy = num_correct / num_elements   

    f1 = F1_score(num0_true,num0_false,num1_false)

    conf_matrix = np.array([[num1_true,num0_false],[num1_false,num0_true]])
    
    return float(accuracy), float(f1),float(np.mean(losses)), conf_matrix

def F1_score(tp,fp,fn):
    return (2*tp/(2*tp + fp + fn))

def train_model(model, loss_fn, optimizer,train_loader,val_loader, n_epoch=3,raytune_mode = False,wandb_mode = False):

    num_iter = 0
    # цикл обучения сети
    for epoch in range(n_epoch):

        

        model.train(True)
        for i, batch in enumerate(train_loader):
            # так получаем текущий батч
            X_batch, y_batch = batch 
            
            # forward pass (получение ответов на батч картинок)
            logits = model(X_batch.to(device,dtype = torch.float)) 
            
            # вычисление лосса от выданных сетью ответов и правильных ответов на батч
            loss = loss_fn(logits, y_batch.to(device,dtype = torch.float)) 
            
            
            loss.backward() # backpropagation (вычисление градиентов)
            optimizer.step() # обновление весов сети
            optimizer.zero_grad() # обнуляем веса

            num_iter += 1


        # после каждой эпохи получаем метрику качества на валидационной выборке
        model.train(False)
        
        val_accuracy,f1_v, val_loss,_ = evaluate_model(model, val_loader, loss_fn=loss_fn)
        train_accuracy,f1_t, train_loss,_ = evaluate_model(model, train_loader, loss_fn=loss_fn)
        
        if wandb_mode == True:
            wandb.log({"Val/accuracy": val_accuracy,"val/f1_metric": f1_v ,
                   "Val/loss": val_loss,"train/accuracy": train_accuracy,
                   "train/loss": train_loss,'train/f1_metric':f1_t})
        
        if raytune_mode == True:
            os.makedirs("checkpoint_models", exist_ok=True)
            torch.save(
                        (model.state_dict(), optimizer.state_dict()), "checkpoint_models/checkpoint.pt")
            checkpoint = Checkpoint.from_directory("checkpoint_models")
            train.report({"loss": val_loss, "accuracy": val_accuracy,"f1": f1_v}, checkpoint=checkpoint)
        #if epoch%10 ==0:
        #    print("Epoch:", epoch)
        #    
        #    #print('Loss/train', train_loss.item(), epoch)
        #    print(f'Accuracy/train{train_accuracy.item():0.6f}')
        #    #print('Loss/val', val_loss.item(), epoch)
        #    print(f'Accuracy/val{val_accuracy.item():0.6f}')
        
    return model

In [18]:
wandb.login(key = '66d1d2eaf7cd3f83b644fc151071bbf5d7f0c237')



train_data = Keypoint_sequence_dataset(X_train,y_train)
test_data = Keypoint_sequence_dataset(X_test,y_test)

train_size = int(len(train_data) * 1)

val_size = len(train_data) - train_size

train_data, val_data = torch.utils.data.random_split(train_data, [train_size, val_size])

train_loader = torch.utils.data.DataLoader(train_data, batch_size=16, shuffle=True)
#val_loader = torch.utils.data.DataLoader(val_data, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=16, shuffle=True)


kernel_size = 3
dropout_conv = 0.1
dropout_linear = 0.1
attention_size = 128
attention_dropout = 0.1
batch_size = 16
attention_heads = 8
conv_net = Seq_conv_net(kernel_size = kernel_size,dropout_conv = dropout_conv ,dropout_linear = dropout_linear,attention_size = attention_size,attention_heads=attention_heads,attention_dropout=attention_dropout)

conv_net = conv_net.to(device)



loss_fn = torch.nn.CrossEntropyLoss(weight=loss_weights.to(device))

# выбираем алгоритм оптимизации и learning_rate
learning_rate = 1e-4
optimizer = torch.optim.Adam(conv_net.parameters(), lr=learning_rate)

run = wandb.init(
        # Set the project where this run will be logged
        project="fight_detection_module",
        # Track hyperparameters and run metadata
        config={
            "learning_rate": learning_rate,
            'kernel_size':  kernel_size,
            'dropout_conv': dropout_conv ,
            'dropout_linear': dropout_linear ,
            'Attention_size': attention_size ,
            'Attention_dropout': attention_dropout,
            'attention_heads': attention_heads,
            'batch_size': batch_size    })


conv_net = train_model(model=conv_net,loss_fn=loss_fn,optimizer=optimizer,train_loader=train_loader,val_loader=test_loader,n_epoch=300,raytune_mode= False)






Ray Tuning (AutoML analog in torch)

In [None]:
config_hp = {
    "learning_rate": tune.grid_search([0.01,0.001,0.0001]),
    "batch_size": tune.grid_search([16, 32]),
    "kernel_size":tune.grid_search([3,5,7]),
    "dropout_conv": tune.grid_search([0.,0.1,0.2]),
    "dropout_linear": tune.grid_search([0.,0.1,0.2]),
    "attention_size": tune.grid_search([128,256]),
    "attention_dropout": tune.grid_search([0.,0.1,0.2]),
    "attention_heads": tune.grid_search([4,8,16]),
    }

In [25]:
train_data = Keypoint_sequence_dataset(X_train,y_train)
test_data = Keypoint_sequence_dataset(X_test,y_test)

def ray_train_model(config):

    batch_size = config['batch_size']
    kernel_size = config['kernel_size']
    dropout_conv = config['dropout_conv']
    dropout_linear = config['dropout_linear']
    attention_size = config['attention_size']
    attention_dropout = config['attention_dropout']
    attention_heads = config['attention_heads']
    learning_rate = config['learning_rate']

    


    train_loader = torch.utils.data.DataLoader(train_data, batch_size=int(batch_size), shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=int(batch_size), shuffle=True)

    conv_net = Seq_conv_net(kernel_size = kernel_size,dropout_conv = dropout_conv ,dropout_linear = dropout_linear,attention_size = attention_size,attention_heads=attention_heads,attention_dropout=attention_dropout)

    conv_net = conv_net.to(device)



    loss_fn = torch.nn.CrossEntropyLoss(weight=loss_weights.to(device))

    
    optimizer = torch.optim.Adam(conv_net.parameters(), lr=learning_rate)

    loaded_checkpoint = train.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
           model_state, optimizer_state = torch.load(os.path.join(loaded_checkpoint_dir, "checkpoint.pt"))
        conv_net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)



   
    conv_net = train_model(model=conv_net,loss_fn=loss_fn,optimizer=optimizer,train_loader=train_loader,val_loader=test_loader,n_epoch=100,raytune_mode=True,wandb_mode = False)
   
def evaluate_best_model(net, dataset, loss_fn,best_result,config):

    model = net(kernel_size = config['kernel_size'],dropout_conv = config['dropout_conv'] ,dropout_linear = config['dropout_linear'],
                attention_size = config['attention_size'],attention_heads=config['attention_heads'],attention_dropout=config['attention_dropout'])
    model.to(device)
    checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")
    model_state, optimizer_state = torch.load(checkpoint_path)
    model.load_state_dict(model_state)
    loss_fn = torch.nn.CrossEntropyLoss(weight=loss_weights.to(device))
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=int(config["batch_size"]), shuffle=True)

    return evaluate_model(model,dataloader,loss_fn)

def main(num_samples=1, max_num_epochs=100, gpus_per_trial=1):
    config_hp = {
    "learning_rate": tune.grid_search([0.0001]),
    "batch_size": tune.grid_search([16]),
    "kernel_size":tune.grid_search([5]),
    "dropout_conv": tune.grid_search([0.,0.05]),
    "dropout_linear": tune.grid_search([0.,0.05]),
    "attention_size": tune.grid_search([128,160]),
    "attention_dropout": tune.grid_search([0.,0.05]),
    "attention_heads": tune.grid_search([16,32]),
    }
    scheduler = ASHAScheduler(
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    
    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(ray_train_model),
            resources={"cpu": 0, "gpu": gpus_per_trial}
        ),
        run_config=train.RunConfig(
        name="fight-exp",
        local_dir=r"D:\ray_temp",
        checkpoint_config=train.CheckpointConfig(
            checkpoint_score_attribute="accuracy",
            checkpoint_score_order="max",
            num_to_keep=5,
        ),
        ),
        tune_config=tune.TuneConfig(
            metric="accuracy",
            mode="max",
            scheduler=scheduler,
            num_samples=num_samples,
        ),
        param_space=config_hp,
    )

    results = tuner.fit()

    return results

#results = main(num_samples=1, max_num_epochs=50, gpus_per_trial=1)
    

In [30]:
best_result = results.get_best_result("loss", "min")

print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(
        best_result.metrics["loss"]))
print("Best trial final validation accuracy: {}".format(
        best_result.metrics["accuracy"]))
print("Best trial f1 metric: {}".format(
        best_result.metrics["f1"]))
    
a,b,e,conf_matrix = evaluate_best_model(Seq_conv_net,test_data,torch.nn.CrossEntropyLoss(weight=loss_weights.to(device)),best_result,best_result.config)
print(conf_matrix)

Best trial config: {'learning_rate': 0.0001, 'batch_size': 16, 'kernel_size': 5, 'dropout_conv': 0.1, 'dropout_linear': 0.1, 'attention_size': 112, 'attention_dropout': 0.1, 'attention_heads': 16}
Best trial final validation loss: 0.1770070381462574
Best trial final validation accuracy: 0.8181818127632141
Best trial f1 metric: 0.4444444477558136
[[36  4]
 [11  4]]


In [None]:
from ray import tune
tuner = tune.Tuner.restore(
    r"D:\ray_temp",
    trainable=my_trainable,
    resume_errored=True
)
tuner.fit()

In [None]:
a,b,e,conf_matrix = evaluate_model(conv_net, train_loader, loss_fn)
print(conf_matrix)
a,b,e,conf_matrix = evaluate_model(conv_net, test_loader, loss_fn)
print(conf_matrix)

[[378   5]
 [ 19  92]]
[[39  3]
 [ 5  8]]


In [108]:
model_scripted = torch.jit.script(conv_net)
model_scripted.save('fight_detection_v2.pt')

Testing on a real video

In [123]:
def preprocess_keypoints(keypoints):
    seq_data = torch.transpose(torch.Tensor(keypoints),0,1)
    data_shape = seq_data.shape
    output_data = np.zeros((data_shape)) 
    for i,seq in enumerate(seq_data):
        output_data[i] = approximate_linear(seq)

    return torch.Tensor([output_data])




In [127]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
modely = YOLO('yolov8l-pose.pt')  # load a pretrained YOLOv8n classification model
modely.to(device)
video_path = r"D:\videos\hands3.mp4"
cap = cv2.VideoCapture(video_path)
# Get video properties
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fps = cap.get(cv2.CAP_PROP_FPS) # or number
# Create a VideoWriter object to save the output video
output_video_path = r"D:\videos_processed\fight1_processed.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))


actions = ['fighting','not_fighting']

text2 = "No suspicious activity"
text1 = "Suspicious activity"
text3 = "No people in sight"
color_map = {'fighting': (200,100,0),'not_fighting': (0,100,200)}
font_scale = 1.6
thickness = 2

winsize = 40

distance_dict = {}


label_map = {num: label for num, label in enumerate(actions)}


In [128]:
ans = 'not_fighting'
while cap.isOpened():
# Read a frame from the video
    success, frame = cap.read()
    if success:

        results = modely.track(frame, persist=True, retina_masks=True, boxes=True, show_conf=False, line_width=1,  conf=0.6, iou=0.5,  classes=0, show_labels=False, device=device,verbose = False,tracker="bytetrack.yaml")




        if results[0].boxes.id is not None:
            
            #extracting keypoints
            body_kp = extract_body_keypoints(results = results,threshold_class=0.4,threshold_keypoint=0.4)
            hands_kp = extract_hands_keypoints(results = results,threshold_class=0.4,threshold_keypoint=0.4)
            head_kp = extract_head_keypoints(results = results,threshold_class=0.4,threshold_keypoint=0.4)
            #calculating distances between keypoints

            dd = calc_distances(hands_kp,body_kp,head_kp)
            #appending distances dictionary and evaluating average distance and classification based on it
            for key in dd.keys():

                if key not in distance_dict.keys():
                    distance_dict[key] = deque(maxlen=40)

                distance_dict[key].append(dd[key])
                
                if len(distance_dict[key]) == winsize:
                    nums_sequences = nums_sequences + 1
                    keypoints = preprocess_keypoints(distance_dict[key])
                    logits = conv_net(keypoints.to(device,dtype = torch.float))
                    prediction = int(torch.argmax(logits, dim=1).cpu())
                    print(logits)
                    ans = label_map[prediction]
                    distance_dict[key].clear()
                    if ans == 'fighting':
                        break
                    

            text_size, _ = cv2.getTextSize(ans, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)
            text_position = (frame_width - text_size[0] - 10, text_size[1] + 10)
            cv2.rectangle(frame, (text_position[0] - 5, text_position[1] - text_size[1] - 5),
                                    (text_position[0] + text_size[0] + 5, text_position[1] + 5), color=(0, 0, 0),
                                    thickness=cv2.FILLED)
            cv2.putText(frame, ans, text_position, cv2.FONT_HERSHEY_SIMPLEX, font_scale, color_map[ans], thickness, cv2.LINE_AA)


                    
            

        annotated_frame_show = cv2.resize(frame, (1080, 720))
        out.write(frame)
        cv2.imshow("YOLOv8 Inference", annotated_frame_show)
        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
            
        
    else:
        # Break the loop if the end of the video is reached
        break
out.release()
cap.release()
cv2.destroyAllWindows()



tensor([[0.1317, 0.8683]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0314, 0.9686]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0830, 0.9170]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.6340, 0.3660]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0755, 0.9245]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0766, 0.9234]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.6932, 0.3068]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0782, 0.9218]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.3580, 0.6420]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.2321, 0.7679]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.4379, 0.5621]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.1167, 0.8833]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.8522, 0.1478]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.6053, 0.3947]], device='cuda:0', grad_fn=<SoftmaxBack