In [None]:
# Frame by Frame
import os

import cv2

# Dump every frame of the input video to input/frames/frame<N>.jpg,
# numbered from 0 in read order.

# cv2.imwrite only reports failure through its return value, so make sure
# the target folder exists before the first write.
os.makedirs("input/frames", exist_ok=True)

vidcap = cv2.VideoCapture("input/YDXJ0098.mp4")
try:
    success, image = vidcap.read()
    count = 0
    while success:
        # save frame as JPEG file
        if not cv2.imwrite("input/frames/frame%d.jpg" % count, image):
            print("Could not write frame %d" % count)
        success, image = vidcap.read()
        print('Read a new frame: ', success)
        count += 1
        print (count)
finally:
    # Always hand the video file/decoder back, even if a write raised.
    vidcap.release()

In [None]:
# Frame Recognition
import os
import cv2
import time
import torch
import argparse
import numpy as np

from Detection.Utils import ResizePadding
from CameraLoader import CamLoader, CamLoader_Q
from DetectorLoader import TinyYOLOv3_onecls

from PoseEstimateLoader import SPPE_FastPose
from fn import draw_single

from Track.Tracker import Detection, Tracker
from ActionsEstLoader import TSSTG

def preproc(image):
    """Prepare one frame for the detector.

    The frame is passed through the module-level ``resize_fn`` and the
    result is converted from BGR to RGB channel order. Written as the
    preprocess hook for CameraLoader; in this cell it is applied directly
    to frames loaded with ``cv2.imread``.
    """
    return cv2.cvtColor(resize_fn(image), cv2.COLOR_BGR2RGB)


def kpt2bbox(kpt, ex=20):
    """Return the box that encloses every keypoint, grown by `ex` per side.

    kpt: array of shape `(N, 2)` holding (x, y) per row,
    ex: (int) margin added around the tight box.

    The result is a 1-D array ``[x_min - ex, y_min - ex, x_max + ex, y_max + ex]``.
    """
    xs = kpt[:, 0]
    ys = kpt[:, 1]
    left, top = xs.min() - ex, ys.min() - ex
    right, bottom = xs.max() + ex, ys.max() + ex
    return np.array((left, top, right, bottom))


# ---- Run configuration ----
source = './input/2YDXJ0101.mp4'
camera = source
detection_input_size = 1920
pose_input_size = '224x160'   # two integers separated by 'x'
pose_backbone = 'resnet50'
show_detected = True
show_skeleton = True
save_out = "./output/out271220212YDXJ0101.avi"
device = 'cuda'

# ---- Detector: built from the detection input size and the device ----
inp_dets = detection_input_size
detect_model = TinyYOLOv3_onecls(inp_dets, device=device)

# ---- Pose estimator: parse '<a>x<b>' into a pair of ints first ----
inp_pose = tuple(int(part) for part in pose_input_size.split('x')[:2])
pose_model = SPPE_FastPose(pose_backbone, *inp_pose, device=device)

# ---- Tracker ----
# NOTE(review): max_age / n_init semantics live in Track.Tracker - confirm there.
max_age = 30
tracker = Tracker(max_age=max_age, n_init=3)

# ---- Action classifier (constructed with its default arguments) ----
action_model = TSSTG()

# Resizer used by preproc(); both of its arguments are the detection size.
resize_fn = ResizePadding(inp_dets, inp_dets)

# cam_source = camera
# if type(cam_source) is str and os.path.isfile(cam_source):
    # Use loader thread with Q for video file.
    # cam = CamLoader_Q(cam_source, queue_size=100000, preprocess=preproc).start()
    # print ("Maybe camera")
# else:
    # Use normal thread loader for webcam.
    # cam = CamLoader(int(cam_source) if cam_source.isdigit() else cam_source,
                    # preprocess=preproc).start()
    # print ("Maybe video")

#frame_size = cam.frame_size
#scf = torch.min(inp_size / torch.FloatTensor([frame_size]), 1)[0]

# Video output switch (set to False to mark video output as disabled).
# NOTE(review): the writer.write()/writer.release() calls further down in this
# cell are commented out, so this writer is opened but never written to or closed.
outvid = True
codec = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
writer = cv2.VideoWriter(save_out, codec, 30, (2 * inp_dets, 2 * inp_dets))

# Only read by the commented-out FPS overlay at the end of the frame loop.
fps_time = f = 0
# Run detection -> pose estimation -> tracking -> action recognition over the
# extracted frames, saving an annotated copy of every frame that got at least
# one action prediction.

# cv2.imwrite only reports failure through its return value, so make sure the
# output folder exists up front.
os.makedirs("./input/framesRec", exist_ok=True)

for x in range(1998):
    path = "./input/frames/frame" + str(x) + ".jpg"
    print(path)
    frame = cv2.imread(path)
    if frame is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or unreadable; skip the frame rather than crash inside preproc().
        print("Could not read " + path + ", skipping")
        continue
    frame = preproc(frame)
    print (frame.shape)
    # frame = cv2.resize(frame, (384, 384))
    # print (frame.shape)
    # image = frame.copy()

    # Detect humans bbox in the frame with detector model.
    detected = detect_model.detect(frame, need_resize=False, expand_bb=10)

    # Predict each tracks bbox of current frame from previous frames information with Kalman filter.
    tracker.predict()
    # Merge two source of predicted bbox together: every existing track adds
    # one extra row (its box plus three constant fields) to the detector output.
    # NOTE(review): the meaning of the appended [0.5, 1.0, 0.0] is not visible
    # here - presumably score/class fields; confirm against the detector output.
    for track in tracker.tracks:
        det = torch.tensor([track.to_tlbr().tolist() + [0.5, 1.0, 0.0]], dtype=torch.float32)
        detected = torch.cat([detected, det], dim=0) if detected is not None else det

    detections = []  # List of Detections object for tracking.
    if detected is not None:
        #detected = non_max_suppression(detected[None, :], 0.45, 0.2)[0]
        # Predict skeleton pose of each bboxs (columns 0-3 are the box, column 4 the score).
        poses = pose_model.predict(frame, detected[:, 0:4], detected[:, 4])

        # Create Detections object: box from the keypoints, keypoints joined
        # with their scores, and the mean keypoint score.
        detections = [Detection(kpt2bbox(ps['keypoints'].numpy()),
                                np.concatenate((ps['keypoints'].numpy(),
                                                ps['kp_score'].numpy()), axis=1),
                                ps['kp_score'].mean().numpy()) for ps in poses]

        # VISUALIZE.
        # if show_detected:
            # for bb in detected[:, 0:5]:
                # frame = cv2.rectangle(frame, (bb[0], bb[1]), (bb[2], bb[3]), (0, 0, 255), 1)

    # Update tracks by matching each track information of current and previous frame or
    # create a new track if no matched.
    tracker.update(detections)

    # Annotated copy of the current frame. It is loaded on the first label and
    # shared by all tracks, so several labels end up in the same saved image
    # instead of each track overwriting the previous one's file.
    img = None
    labels_drawn = 0

    # Predict Actions of each track.
    for track in tracker.tracks:
        if not track.is_confirmed():
            continue

        # These five values are only consumed by the commented-out
        # visualisation code that follows this loop body.
        track_id = track.track_id
        bbox = track.to_tlbr().astype(int)
        center = track.get_center().astype(int)

        action = 'pending..'
        clr = (0, 255, 0)
        # Use 30 frames time-steps to prediction.
        if len(track.keypoints_list) == 30:
            pts = np.array(track.keypoints_list, dtype=np.float32)
            out = action_model.predict(pts, frame.shape[:2])
            # out[0] is indexed in parallel with class_names; take the best one.
            action_name = action_model.class_names[out[0].argmax()]

            if img is None:
                img = cv2.imread(path)
            name = str(action_name) + " : " + str(out[0].max() * 100)
            font = cv2.FONT_HERSHEY_DUPLEX
            # One text row per labelled track, 20 px apart, first row at y=15.
            cv2.putText(img, name, (15, 15 + 20 * labels_drawn), font, 0.5, (255, 255, 255), 1)
            labels_drawn += 1

            # NOTE(review): the name is "frameRec2" + index (frame 1 -> frameRec21.jpg),
            # while the "Frames Building" cell reads "frameRec" + index from the
            # same folder - confirm the two are meant to line up.
            out_path = "./input/framesRec/frameRec2" + str(x) + ".jpg"
            if cv2.imwrite(out_path, img):
                print(out_path)
                print("is stored")
            else:
                print("Could not store " + out_path)
            
            
            # action = '{}: {:.2f}%'.format(action_name, out[0].max() * 100)
            # if action_name == 'Fall Down':
                # clr = (0, 255, 0)
            # elif action_name == 'Lying Down':
                # clr = (0, 255, 0)

        # VISUALIZE.
        # if track.time_since_update == 0:
            # if show_skeleton:
                # frame = draw_single(frame, track.keypoints_list[-1])
            # frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 1)
            # frame = cv2.putText(frame, str(track_id), (center[0], center[1]), cv2.FONT_HERSHEY_COMPLEX,
                                # 0.4, (255, 0, 0), 2)
            # frame = cv2.putText(frame, action, (bbox[0] + 15, bbox[1] + 15), cv2.FONT_HERSHEY_COMPLEX,
                                # 0.4, clr, 1)
            # frame = cv2.putText(frame, action, (0, 95), cv2.FONT_HERSHEY_COMPLEX,
                                # 0.4, clr, 1)

    # Show Frame.
    # frame = cv2.resize(frame, (0, 0), fx=2., fy=2.)
    # frame = cv2.putText(frame, '%d, FPS: %f' % (f, 1.0 / (time.time() - fps_time)),
                        # (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
    # frame = frame[:, :, ::-1]
    # fps_time = time.time()

    # if outvid:
        # writer.write(frame)

    # cv2.imshow('frame', frame)
    # if cv2.waitKey(1) & 0xFF == ord('q'):
        # break

# Clear resource.
# cam.stop()
# if outvid:
    # writer.release()
# cv2.destroyAllWindows()

In [None]:
# Frames Building
import cv2

# Frames that could be loaded from disk, in index order; the loop below
# appends to it and its contents are then written to the output video.
img_array = []

def toCheck(image):
    """Placeholder check for a loaded image.

    Performs no work: the result is ``None`` whether or not `image` is
    ``None``.
    """
    return None

def openn(x):
    """Load recognized frame number `x` from ./input/framesRec.

    Returns whatever ``cv2.imread`` yields for that path; the caller treats
    a ``None`` result as "frame not available".
    """
    return cv2.imread("./input/framesRec/frameRec" + str(x) + ".jpg")
    
# Collect every recognized frame that exists on disk, then encode them into
# a single AVI.
size = None
for x in range(1998):
    f = openn(x)
    if f is None:
        # Missing or unreadable frame: report its 1-based position and move on.
        print (x + 1)
        continue
    height, width = f.shape[:2]
    size = (width, height)  # VideoWriter expects (width, height)
    img_array.append(f)

if img_array:
    # Open the writer exactly once, sized from the last frame read; all
    # frames are assumed to share that size.
    out = cv2.VideoWriter('./output/RecognizedFrames.avi', cv2.VideoWriter_fourcc(*'DIVX'), 15, size)
    try:
        for img in img_array:
            out.write(img)
    finally:
        # Release even if a write fails so the file is finalized and closed.
        out.release()
else:
    print("No recognized frames found, no video written")

In [None]:
# Frame Recognition
import os
import cv2
import time
import torch
import argparse
import numpy as np

from Detection.Utils import ResizePadding
from CameraLoader import CamLoader, CamLoader_Q
from DetectorLoader import TinyYOLOv3_onecls

from PoseEstimateLoader import SPPE_FastPose
from fn import draw_single

from Track.Tracker import Detection, Tracker
from ActionsEstLoader import TSSTG

def preproc(image):
    """Resize a frame with the module-level ``resize_fn`` and return it in RGB.

    Written as the preprocess hook for CameraLoader; this cell calls it
    directly on frames read with ``cv2.imread`` (BGR order).
    """
    resized = resize_fn(image)
    return cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)


def kpt2bbox(kpt, ex=20):
    """Compute the bbox covering all keypoints (x, y), padded by `ex`.

    kpt: array of shape `(N, 2)`,
    ex: (int) amount added on every side of the tight box.

    Returns a 1-D array ordered (x_min - ex, y_min - ex, x_max + ex, y_max + ex).
    """
    x_lo, x_hi = kpt[:, 0].min(), kpt[:, 0].max()
    y_lo, y_hi = kpt[:, 1].min(), kpt[:, 1].max()
    return np.array((x_lo - ex, y_lo - ex, x_hi + ex, y_hi + ex))


# Settings for this run.
source = './input/2YDXJ0101.mp4'
camera = source
detection_input_size = 1920
pose_input_size = '224x160'   # '<int>x<int>'
pose_backbone = 'resnet50'
show_detected = True
show_skeleton = True
save_out = "./output/out271220212YDXJ0101.avi"
device = 'cuda'

# Detection model, constructed with the detection input size on `device`.
inp_dets = detection_input_size
detect_model = TinyYOLOv3_onecls(inp_dets, device=device)

# Pose model; its two size arguments come from parsing pose_input_size.
inp_pose = tuple(map(int, pose_input_size.split('x')[:2]))
pose_model = SPPE_FastPose(pose_backbone, inp_pose[0], inp_pose[1], device=device)

# Tracker.
# NOTE(review): what max_age and n_init control is defined in Track.Tracker - confirm there.
max_age = 30
tracker = Tracker(max_age=max_age, n_init=3)

# Action model, created with its default arguments.
action_model = TSSTG()

# Resize helper called by preproc(); both arguments equal the detection size.
resize_fn = ResizePadding(inp_dets, inp_dets)

# cam_source = camera
# if type(cam_source) is str and os.path.isfile(cam_source):
    # Use loader thread with Q for video file.
    # cam = CamLoader_Q(cam_source, queue_size=100000, preprocess=preproc).start()
    # print ("Maybe camera")
# else:
    # Use normal thread loader for webcam.
    # cam = CamLoader(int(cam_source) if cam_source.isdigit() else cam_source,
                    # preprocess=preproc).start()
    # print ("Maybe video")

#frame_size = cam.frame_size
#scf = torch.min(inp_size / torch.FloatTensor([frame_size]), 1)[0]

# Video output flag; switch to False to mark video output as disabled.
# NOTE(review): writer.write() and writer.release() are commented out later in
# this cell, so the writer below is opened but never used or closed.
outvid = True
codec = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
writer = cv2.VideoWriter(save_out, codec, 30, (inp_dets + inp_dets, inp_dets + inp_dets))

# Referenced only by the commented-out FPS overlay after the frame loop body.
fps_time, f = 0, 0
# Run detection -> pose estimation -> tracking -> action recognition over the
# extracted frames. Every frame that gets at least one action prediction is
# saved as a 1920x1920 annotated copy.

# cv2.imwrite only reports failure through its return value, so make sure the
# output folder exists up front.
os.makedirs("./input/framesRec2", exist_ok=True)

for x in range(1998):
    path = "./input/frames/frame" + str(x) + ".jpg"
    print(path)
    frame = cv2.imread(path)
    if frame is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or unreadable; skip the frame rather than crash inside preproc().
        print("Could not read " + path + ", skipping")
        continue
    frame = preproc(frame)
    print (frame.shape)
    # frame = cv2.resize(frame, (384, 384))
    # print (frame.shape)
    # image = frame.copy()

    # Detect humans bbox in the frame with detector model.
    detected = detect_model.detect(frame, need_resize=False, expand_bb=10)

    # Predict each tracks bbox of current frame from previous frames information with Kalman filter.
    tracker.predict()
    # Merge two source of predicted bbox together: every existing track adds
    # one extra row (its box plus three constant fields) to the detector output.
    # NOTE(review): the meaning of the appended [0.5, 1.0, 0.0] is not visible
    # here - presumably score/class fields; confirm against the detector output.
    for track in tracker.tracks:
        det = torch.tensor([track.to_tlbr().tolist() + [0.5, 1.0, 0.0]], dtype=torch.float32)
        detected = torch.cat([detected, det], dim=0) if detected is not None else det

    detections = []  # List of Detections object for tracking.
    if detected is not None:
        #detected = non_max_suppression(detected[None, :], 0.45, 0.2)[0]
        # Predict skeleton pose of each bboxs (columns 0-3 are the box, column 4 the score).
        poses = pose_model.predict(frame, detected[:, 0:4], detected[:, 4])

        # Create Detections object: box from the keypoints, keypoints joined
        # with their scores, and the mean keypoint score.
        detections = [Detection(kpt2bbox(ps['keypoints'].numpy()),
                                np.concatenate((ps['keypoints'].numpy(),
                                                ps['kp_score'].numpy()), axis=1),
                                ps['kp_score'].mean().numpy()) for ps in poses]

        # VISUALIZE.
        # if show_detected:
            # for bb in detected[:, 0:5]:
                # frame = cv2.rectangle(frame, (bb[0], bb[1]), (bb[2], bb[3]), (0, 0, 255), 1)

    # Update tracks by matching each track information of current and previous frame or
    # create a new track if no matched.
    tracker.update(detections)

    # Annotated copy of the current frame. It is loaded and resized on the first
    # label and shared by all tracks, so several labels end up in the same saved
    # image instead of each track overwriting the previous one's file.
    img = None
    labels_drawn = 0

    # Predict Actions of each track.
    for track in tracker.tracks:
        if not track.is_confirmed():
            continue

        # These five values are only consumed by the commented-out
        # visualisation code that follows this loop body.
        track_id = track.track_id
        bbox = track.to_tlbr().astype(int)
        center = track.get_center().astype(int)

        action = 'pending..'
        clr = (0, 255, 0)
        # Use 30 frames time-steps to prediction.
        if len(track.keypoints_list) == 30:
            pts = np.array(track.keypoints_list, dtype=np.float32)
            out = action_model.predict(pts, frame.shape[:2])
            # out[0] is indexed in parallel with class_names; take the best one.
            action_name = action_model.class_names[out[0].argmax()]

            if img is None:
                img = cv2.imread(path)
                print(img.shape)
                # Plain resize to 1920x1920 (no padding) before drawing the text.
                img = cv2.resize(img, (1920, 1920))
                print(img.shape)
            # img = draw_single(img, track.keypoints_list[-1])
            # print('1111111111111111')
            name = str(action_name) + " : " + str(out[0].max() * 100)
            font = cv2.FONT_HERSHEY_DUPLEX
            # One text row per labelled track, 20 px apart, first row at y=15.
            cv2.putText(img, name, (15, 15 + 20 * labels_drawn), font, 0.5, (255, 255, 255), 1)
            labels_drawn += 1

            out_path = "./input/framesRec2/frameRec" + str(x) + ".jpg"
            if cv2.imwrite(out_path, img):
                print(out_path)
                print("is stored")
            else:
                print("Could not store " + out_path)
            
            
            # action = '{}: {:.2f}%'.format(action_name, out[0].max() * 100)
            # if action_name == 'Fall Down':
                # clr = (0, 255, 0)
            # elif action_name == 'Lying Down':
                # clr = (0, 255, 0)

        # VISUALIZE.
        # if track.time_since_update == 0:
            # if show_skeleton:
                # frame = draw_single(frame, track.keypoints_list[-1])
                # print('1111111111111111')
            # frame = cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 1)
            # frame = cv2.putText(frame, str(track_id), (center[0], center[1]), cv2.FONT_HERSHEY_COMPLEX,
                                # 0.4, (255, 0, 0), 2)
            # frame = cv2.putText(frame, action, (bbox[0] + 15, bbox[1] + 15), cv2.FONT_HERSHEY_COMPLEX,
                                # 0.4, clr, 1)
            # frame = cv2.putText(frame, action, (0, 95), cv2.FONT_HERSHEY_COMPLEX,
                                # 0.4, clr, 1)

    # Show Frame.
    # frame = cv2.resize(frame, (0, 0), fx=2., fy=2.)
    # frame = cv2.putText(frame, '%d, FPS: %f' % (f, 1.0 / (time.time() - fps_time)),
                        # (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
    # frame = frame[:, :, ::-1]
    # fps_time = time.time()

    # if outvid:
        # writer.write(frame)

    # cv2.imshow('frame', frame)
    # if cv2.waitKey(1) & 0xFF == ord('q'):
        # break

# Clear resource.
# cam.stop()
# if outvid:
    # writer.release()
# cv2.destroyAllWindows()

In [None]:
# Frame Recognition
import os
import cv2
import time
import torch
import argparse
import numpy as np

from Detection.Utils import ResizePadding
from CameraLoader import CamLoader, CamLoader_Q
from DetectorLoader import TinyYOLOv3_onecls

from PoseEstimateLoader import SPPE_FastPose
from fn import draw_single

from Track.Tracker import Detection, Tracker
from ActionsEstLoader import TSSTG

def preproc(image):
    """Apply the module-level ``resize_fn`` to a BGR frame and return it as RGB."""
    rgb = cv2.cvtColor(resize_fn(image), cv2.COLOR_BGR2RGB)
    return rgb


def kpt2bbox(kpt, ex=20):
    """Return a padded bounding box around a set of keypoints.

    kpt: 2-D array; column 0 and column 1 are used (presumably x and y),
    ex: margin subtracted from the minima and added to the maxima.

    The result is a 1-D array
    ``[col0_min - ex, col1_min - ex, col0_max + ex, col1_max + ex]``.
    """
    lower = [kpt[:, axis].min() - ex for axis in (0, 1)]
    upper = [kpt[:, axis].max() + ex for axis in (0, 1)]
    return np.array(lower + upper)


# --- Settings ---
source = './input/YDXJ0098.mp4'
detection_input_size = 1920
pose_input_size = '224x160'   # '<int>x<int>'
pose_backbone = 'resnet50'
save_out = "./output/out28122021YDXJ0098.avi"
device = 'cuda'

# --- Detector, built from the detection input size on `device` ---
inp_dets = detection_input_size
detect_model = TinyYOLOv3_onecls(inp_dets, device=device)

# --- Pose estimator; the size string is parsed into two ints first ---
inp_pose = tuple([int(piece) for piece in pose_input_size.split('x')][:2])
pose_model = SPPE_FastPose(pose_backbone, inp_pose[0], inp_pose[1], device=device)

# --- Tracker ---
# NOTE(review): max_age / n_init are interpreted by Track.Tracker - confirm their meaning there.
max_age = 30
tracker = Tracker(max_age=max_age, n_init=3)

# --- Action classifier, created with its default arguments ---
action_model = TSSTG()

# Resizer used inside preproc(); both arguments are the detection size.
resize_fn = ResizePadding(inp_dets, inp_dets)

# Video output flag (False would mark video output as disabled).
# NOTE(review): nothing in the visible part of this cell writes to `writer` or
# releases it, and `outvid`, `fps_time` and `f` are not read either - confirm
# whether they are still needed.
outvid = True
codec = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
writer = cv2.VideoWriter(save_out, codec, 30, (inp_dets * 2,) * 2)

fps_time = 0
f = fps_time
# Run detection -> pose estimation -> tracking -> action recognition over the
# extracted frames, saving an annotated copy of every frame that got at least
# one action prediction.

# cv2.imwrite only reports failure through its return value, so make sure the
# output folder exists up front.
os.makedirs("./input/framesRec2", exist_ok=True)

for x in range(1998):
    path = "./input/frames/frame" + str(x) + ".jpg"
    # print(path)
    frame = cv2.imread(path)
    if frame is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or unreadable; skip the frame rather than crash inside preproc().
        print("Could not read " + path + ", skipping")
        continue
    frame = preproc(frame)
    print (frame.shape)

    # Detect humans bbox in the frame with detector model.
    detected = detect_model.detect(frame, need_resize=False, expand_bb=10)

    # Predict each tracks bbox of current frame from previous frames information with Kalman filter.
    tracker.predict()
    # Merge two source of predicted bbox together: every existing track adds
    # one extra row (its box plus three constant fields) to the detector output.
    # NOTE(review): the meaning of the appended [0.5, 1.0, 0.0] is not visible
    # here - presumably score/class fields; confirm against the detector output.
    for track in tracker.tracks:
        det = torch.tensor([track.to_tlbr().tolist() + [0.5, 1.0, 0.0]], dtype=torch.float32)
        detected = torch.cat([detected, det], dim=0) if detected is not None else det

    detections = []  # List of Detections object for tracking.
    if detected is not None:
        # Predict skeleton pose of each bboxs (columns 0-3 are the box, column 4 the score).
        poses = pose_model.predict(frame, detected[:, 0:4], detected[:, 4])

        # Create Detections object: box from the keypoints, keypoints joined
        # with their scores, and the mean keypoint score.
        detections = [Detection(kpt2bbox(ps['keypoints'].numpy()),
                                np.concatenate((ps['keypoints'].numpy(),
                                                ps['kp_score'].numpy()), axis=1),
                                ps['kp_score'].mean().numpy()) for ps in poses]
    tracker.update(detections)

    # Annotated copy of the current frame. It is loaded on the first label and
    # shared by all tracks, so several labels end up in the same saved image
    # instead of each track overwriting the previous one's file.
    img = None
    labels_drawn = 0

    # Predict Actions of each track.
    for track in tracker.tracks:
        if not track.is_confirmed():
            continue

        # Use 30 frames time-steps to prediction.
        if len(track.keypoints_list) == 30:
            pts = np.array(track.keypoints_list, dtype=np.float32)
            out = action_model.predict(pts, frame.shape[:2])
            # out[0] is indexed in parallel with class_names; take the best one.
            action_name = action_model.class_names[out[0].argmax()]

            if img is None:
                img = cv2.imread(path)
                print(img.shape)
            # img = cv2.resize(img, (1920, 1920))
            # print(img.shape)
            # img = draw_single(img, track.keypoints_list[-1])
            # print('1111111111111111')
            name = str(action_name) + " : " + str(out[0].max() * 100)
            font = cv2.FONT_HERSHEY_DUPLEX
            # One text row per labelled track, 20 px apart, first row at y=15.
            cv2.putText(img, name, (15, 15 + 20 * labels_drawn), font, 0.5, (255, 255, 255), 1)
            labels_drawn += 1

            out_path = "./input/framesRec2/frameRec" + str(x) + ".jpg"
            if cv2.imwrite(out_path, img):
                print(out_path)
                print("is stored")
            else:
                print("Could not store " + out_path)