In [7]:
import os
import cv2
import math
import time
import numpy as np
import pandas as pd
from pytube import YouTube
from scipy.io import loadmat
from multiprocessing import Pool
from PIL import Image

# Root of the local checkout; every download/output directory hangs off it.
# NOTE: machine-specific absolute path -- edit this when running elsewhere.
repo_dir = "c:/Users/James/git/3dGolfPoseDetection/"
download_path = repo_dir + 'downloaded-data/'
videos_path = download_path + 'videos/'      # one sub-folder per video index
frames_save_dir = download_path + 'frames/'  # cropped frames are written here

# exist_ok=True makes re-running this cell a no-op and avoids the race that a
# separate exists()-then-makedirs() check has.
for directory in (download_path, videos_path, frames_save_dir):
    os.makedirs(directory, exist_ok=True)

# Reusing some code from wmcnally/golfdb -- thank you very much for the labels :)

# Input file: golfDB.mat

In [4]:
# Read the 'golfDB' array out of the .mat file and turn it into a DataFrame
# with one row per golfDB entry.
golfdb_mat = loadmat('golfDB.mat')
golfdb_entries = list(golfdb_mat['golfDB'][0])
entries_by_key = {
    "{:3d}".format(position): list(entry)
    for position, entry in enumerate(golfdb_entries)
}
# The dict is keyed per entry, so transpose to get entries as rows.
df = pd.DataFrame(entries_by_key).T
df.columns = ["id","youtube_id","player", "sex", "club","view","slow","events","bbox","split"]

# Only youtube_id, player, events and bbox are kept.
df = df.drop(columns=['id', 'sex', 'club', 'view', 'slow', 'split'])

# Unwrap each remaining cell by taking its element 0.
for col in ['youtube_id', 'player', 'events', 'bbox']:
    df[col] = df[col].apply(lambda cell: cell[0])

# Group rows on 'youtube_id', going from 1400 to 580 aggregated rows
# (i.e. multiple events per video): keep the first player name and collect
# every events/bbox entry of a video into a list.
df = (
    df.groupby('youtube_id')
      .agg({"player": lambda values: list(values)[0],
            "events": lambda values: list(values),
            "bbox": lambda values: list(values)})
      .sort_values(by=['player'], ascending=True)
      .reset_index()
)
df.head()

Unnamed: 0,youtube_id,player,events,bbox
0,XwnbLuQSLuY,ADAM SCOTT,"[[721, 783, 794, 798, 805, 810, 813, 815, 838,...","[[0.109375, 0.0006944444444444445, 0.54609375,..."
1,jbB8LaUIq1c,ADAM SCOTT,"[[1414, 1725, 1736, 1740, 1748, 1752, 1755, 17...","[[0.07187500000000002, 0.0006944444444444445, ..."
2,t-DfeLMz6Rc,ADAM SCOTT,"[[815, 1251, 1265, 1268, 1277, 1282, 1285, 128...","[[0.000390625, 0.0006944444444444445, 0.598828..."
3,AmJx1lu1fv8,ADAM SCOTT,"[[871, 1222, 1236, 1239, 1248, 1253, 1255, 125...","[[0.50078125, 0.0006944444444444445, 0.4996093..."
4,1yBqlMwG8Lw,ADAM SCOTT,"[[391, 508, 520, 524, 531, 536, 539, 541, 561,...","[[0.5234375, 0.0006944444444444445, 0.384375, ..."


In [6]:
### Download youtube videos ###
youtube_base_url = 'https://www.youtube.com/watch?v='

def download_video(url, video_path):
    """Download the highest-resolution progressive MP4 stream of a video.

    Args:
        url: Full YouTube watch URL.
        video_path: Directory to download into; created if it does not exist.

    Raises:
        RuntimeError: If the video offers no progressive MP4 stream.
        Exception: Anything raised by pytube while resolving or downloading
            the video is propagated unchanged.
    """
    # The directory is created before the download is attempted, so a failed
    # download leaves an empty folder behind.
    os.makedirs(video_path, exist_ok=True)

    streams = YouTube(url).streams.filter(progressive=True, file_extension='mp4')
    best_stream = streams.order_by('resolution').desc().first()
    if best_stream is None:
        # first() yields None for an empty query; fail with a clear message
        # instead of an AttributeError on None.download.
        raise RuntimeError(f"No progressive mp4 stream available for {url}")
    best_stream.download(video_path)

# Running tally of successful downloads, shown in every progress line.
downloaded_videos = 0

t_start = time.time()
# Walk the youtube_id column directly; the DataFrame index doubles as the
# name of the per-video download folder.
for index, youtube_id in df['youtube_id'].items():

    url = youtube_base_url + youtube_id
    target_dir = videos_path + f'{index}/'
    try:
        download_video(url, target_dir)
        downloaded_videos += 1
        print(f"Downloaded video index [{index}/{len(df)}] | {url} | Time taken: {time.time() - t_start:.2f}s | [{downloaded_videos}/{index+1}]]")
    except Exception as e:
        # Best effort: report the failure and carry on with the next video.
        print(f"Skipping video index {index} | {url}", e)

    # For a quick smoke test, break out of the loop here once `index`
    # reaches a small value (e.g. 10).

Skipping video index 0 | https://www.youtube.com/watch?v=XwnbLuQSLuY 'streamingData'
Skipping video index 1 | https://www.youtube.com/watch?v=jbB8LaUIq1c 'streamingData'
Skipping video index 2 | https://www.youtube.com/watch?v=t-DfeLMz6Rc 'streamingData'
Downloaded video index [3/580] | https://www.youtube.com/watch?v=AmJx1lu1fv8 | Time taken: 5.58s | [1/4]]
Downloaded video index [4/580] | https://www.youtube.com/watch?v=1yBqlMwG8Lw | Time taken: 7.59s | [2/5]]
Downloaded video index [5/580] | https://www.youtube.com/watch?v=wnNyaR1S28w | Time taken: 13.95s | [3/6]]
Skipping video index 6 | https://www.youtube.com/watch?v=E8eoOtPXzdA 'streamingData'
Downloaded video index [7/580] | https://www.youtube.com/watch?v=fCcXYimrIek | Time taken: 16.93s | [4/8]]
Downloaded video index [8/580] | https://www.youtube.com/watch?v=LZZlwNBdV3E | Time taken: 23.61s | [5/9]]
Downloaded video index [9/580] | https://www.youtube.com/watch?v=Q6yJx7HFlSg | Time taken: 25.65s | [6/10]]
Skipping video inde

In [8]:
### Convert videos to saved images ###

# Generate roughly 20 images from each video, even if it has multiple segments.
# Approximate because the per-segment count is rounded up
# (e.g. 8 different bboxes give 8 * ceil(20 / 8) = 24 images).
images_per_video = 20

video_folders = os.listdir(videos_path)

# Empty folders are skipped below, so count the non-empty ones up front; this
# is the denominator of the progress line.
non_empty_folder = 0
for folder in video_folders:
    if len(os.listdir(videos_path+folder)) > 0:
        non_empty_folder += 1
print(f"Non empty folders {non_empty_folder}/{len(video_folders)}")

# Running counter across all videos; it prefixes every saved file name.
image_save_num = 1

t_start = time.time()
print("Total video folders:", len(video_folders))
processed_folders = 0
for folder in video_folders:
    video_folder = videos_path + folder
    files_in_folder = os.listdir(video_folder)
    if len(files_in_folder) == 0:
        continue
    processed_folders += 1
    filepath = video_folder + '/' + files_in_folder[0]

    try:
        #### Generating wanted frames from video ####

        # The folder name is the positional index of the video's row in df.
        row = df.iloc[int(folder)]
        images_per_bbox = math.ceil(images_per_video / len(row.events))

        # frame number -> bboxes to crop from that frame, in segment order.
        bboxes_by_frame = {}
        last_frame = 0

        for events, bbox in zip(row.events, row.bbox):
            # Candidate frames: 'images_per_video' evenly spaced frame numbers
            # between the second and the second-to-last event of the segment.
            possible_frames = np.linspace(events[1], events[-2], images_per_video, dtype=int)
            wanted_frames = sorted(np.random.choice(possible_frames, images_per_bbox, replace=False))
            for frame_num in wanted_frames:
                bboxes_by_frame.setdefault(int(frame_num), []).append(bbox)
            last_frame = max(last_frame, events[-1])

        #### Loading video, cropping and saving frames ####

        cap = cv2.VideoCapture(filepath)
        try:
            if not cap.isOpened():
                raise IOError(f"could not open video file {filepath}")

            w_frame = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
            h_frame = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
            success = cap.grab() # get the next frame

            # count is 1-based: the first grabbed frame is count == 1.
            count = 0
            while success:
                count += 1
                if count > last_frame:
                    # Nothing is wanted past the last event of any segment.
                    break

                if count in bboxes_by_frame:
                    # Only decode the frames that are actually wanted.
                    _, image = cap.retrieve()
                    for bbox in bboxes_by_frame[count]:
                        # bbox is (x, y, w, h) as fractions of the frame size.
                        x, y = int(w_frame * bbox[0]), int(h_frame * bbox[1])
                        w, h = int(w_frame * bbox[2]), int(h_frame * bbox[3])
                        crop_img = image[y:y + h, x:x + w]
                        # Reverse the channel axis (OpenCV BGR -> RGB for PIL).
                        img_to_save = Image.fromarray(crop_img[:, :, ::-1])
                        img_to_save.save(frames_save_dir + f'{image_save_num}_video_{folder}.jpg')
                        image_save_num += 1
                success = cap.grab()
        finally:
            # Always free the capture handle, even when a frame fails.
            cap.release()
    except Exception as e:
        # Best effort: report the failing video and continue with the next.
        print(f"Error processing video {folder} | {e}")

    if processed_folders % 5 == 0:
        # non_empty_folder is already a count (int), so it must not go through len().
        print(f"Processed [{processed_folders}/{non_empty_folder}] folders | Time taken: {time.time() - t_start:.2f} seconds")

Total video folders: 580
Processed [5/580] folders | Time taken: 11.15 seconds
Processed [10/580] folders | Time taken: 19.98 seconds
Processed [15/580] folders | Time taken: 29.74 seconds
Processed [20/580] folders | Time taken: 43.43 seconds
Processed [25/580] folders | Time taken: 64.29 seconds
Processed [30/580] folders | Time taken: 84.62 seconds
Processed [35/580] folders | Time taken: 100.76 seconds
Processed [40/580] folders | Time taken: 117.41 seconds
Processed [45/580] folders | Time taken: 137.83 seconds
Processed [50/580] folders | Time taken: 154.46 seconds
Processed [55/580] folders | Time taken: 166.49 seconds
Processed [60/580] folders | Time taken: 182.73 seconds
Processed [65/580] folders | Time taken: 203.09 seconds
Processed [70/580] folders | Time taken: 218.62 seconds
Processed [75/580] folders | Time taken: 230.83 seconds
Processed [80/580] folders | Time taken: 268.67 seconds
Processed [85/580] folders | Time taken: 284.19 seconds
Processed [90/580] folders | T