# This notebook is responsible for downloading and preparing datasets from various source. 

---> 5 videos from google drive (which will be split into frames)  
---> dataset from roboflow


## Define path of the project and the paths where the train/valid/test sets will be stored

In [1]:
from pathlib import Path
import os

# Path of the project
PROJECT_PATH = Path(os.getcwd()).resolve().parent

# Make the main data dir
DATA_PATH = PROJECT_PATH / 'data'
os.makedirs(DATA_PATH, exist_ok=True)

# Make the dir where train/valid/test set will be saved
IMAGES_PATH = DATA_PATH / 'images'
os.makedirs(IMAGES_PATH, exist_ok=True)

# Make the train dir
TRAIN_PATH = IMAGES_PATH / 'train'
os.makedirs(TRAIN_PATH, exist_ok=True)

# Make the valid dir
VALID_PATH = IMAGES_PATH / 'valid'
os.makedirs(VALID_PATH, exist_ok=True)

# Make the test dir
TEST_PATH = IMAGES_PATH / 'test'
os.makedirs(TEST_PATH, exist_ok=True)

# Make the dir where the original videos will be stored
ORIGIN_VIDEOS_PATH = DATA_PATH / 'origin_videos'
os.makedirs(ORIGIN_VIDEOS_PATH, exist_ok=True)

print(f'Project path -> {PROJECT_PATH}')
print(f'|')
print(f'|--data -> {DATA_PATH}')
print(f'|  |--images -> {TRAIN_PATH}')
print(f'|  |   |--train -> {TRAIN_PATH}')
print(f'|  |   |--valid -> {VALID_PATH}')
print(f'|  |   |--test -> {TEST_PATH}')
print(f'|  |   ')
print(f'|  |--origin_videos -> {ORIGIN_VIDEOS_PATH}')
print(f'|')

Project path -> C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI
|
|--data -> C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data
|  |--images -> C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\images\train
|  |   |--train -> C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\images\train
|  |   |--valid -> C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\images\valid
|  |   |--test -> C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\images\test
|  |   
|  |--origin_videos -> C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\origin_videos
|


----

## Preparing videos from Google Drive

### Download videos from google drive

In [2]:
!gdown -O "{ORIGIN_VIDEOS_PATH}/0bfacc_0.mp4" "https://drive.google.com/uc?id=12TqauVZ9tLAv8kWxTTBFWtgt2hNQ4_ZF"
!gdown -O "{ORIGIN_VIDEOS_PATH}/2e57b9_0.mp4" "https://drive.google.com/uc?id=19PGw55V8aA6GZu5-Aac5_9mCy3fNxmEf"
!gdown -O "{ORIGIN_VIDEOS_PATH}/08fd33_0.mp4" "https://drive.google.com/uc?id=1OG8K6wqUw9t7lp9ms1M48DxRhwTYciK-"
!gdown -O "{ORIGIN_VIDEOS_PATH}/573e61_0.mp4" "https://drive.google.com/uc?id=1yYPKuXbHsCxqjA9G-S6aeR2Kcnos8RPU"
!gdown -O "{ORIGIN_VIDEOS_PATH}/121364_0.mp4" "https://drive.google.com/uc?id=1vVwjW1dE1drIdd4ZSILfbCGPD4weoNiu"

Downloading...
From: https://drive.google.com/uc?id=12TqauVZ9tLAv8kWxTTBFWtgt2hNQ4_ZF
To: C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\origin_videos\0bfacc_0.mp4

  0%|          | 0.00/19.9M [00:00<?, ?B/s]
  3%|▎         | 524k/19.9M [00:00<00:03, 5.03MB/s]
 13%|█▎        | 2.62M/19.9M [00:00<00:01, 13.8MB/s]
 37%|███▋      | 7.34M/19.9M [00:00<00:00, 28.6MB/s]
 53%|█████▎    | 10.5M/19.9M [00:00<00:00, 23.9MB/s]
 74%|███████▍  | 14.7M/19.9M [00:00<00:00, 26.0MB/s]
100%|██████████| 19.9M/19.9M [00:00<00:00, 30.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=19PGw55V8aA6GZu5-Aac5_9mCy3fNxmEf
To: C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\origin_videos\2e57b9_0.mp4

  0%|          | 0.00/21.1M [00:00<?, ?B/s]
  2%|▏         | 524k/21.1M [00:00<00:04, 4.79MB/s]
 12%|█▏        | 2.62M/21.1M [00:00<00:01, 13.2MB/s]
 37%|███▋      | 7.86M/21.1M [00:00<00:00, 28.7MB/s]
 52%|█████▏    | 11.0M/21.1M [00:00<00:00, 25.2MB/s]
 70%|██████▉

### Extract frames from videos and move them to appropriate directories

Extracting frames from the videos will involve going through it at a certain frequency of frame extraction. The frame extraction frequency is defineed in the variable STRIDE

Each frame will be saved as .jpg format and will have a unique index, for example: image_0341.jpg

### Extracting frames

In [3]:
from tqdm import tqdm
import supervision as sv
import cv2

# ----------------------------------------------------EXTRACTING FRAMES FROM VIDEOS----------------------------------------------------
def extract_frames(video_path: Path, output_path: Path, start_idx: int=0, stride = 5):
    video_info = sv.VideoInfo.from_video_path(video_path)
    frame_generator = sv.get_video_frames_generator(video_path, stride=stride)

    frame_idx = start_idx
    current_video = str(video_path).split('\\')[-1]
    for frame in tqdm(frame_generator, desc=f'Extracting frames from video -> {current_video}', total=int(video_info.total_frames / 5)):
        output_frame_path = output_path / f'image_{frame_idx:04d}.jpg'
        cv2.imwrite(output_frame_path, frame)

        frame_idx += 1
# ----------------------------------------------------EXTRACTING FRAMES FROM VIDEOS----------------------------------------------------

In [4]:
# Make the dir where the extracted frames will be saved
EXTRACTED_FRAMES_DIR = 'extracted_frames'
EXTRACTED_FRAMES_PATH = DATA_PATH / EXTRACTED_FRAMES_DIR
os.makedirs(EXTRACTED_FRAMES_PATH, exist_ok=True)

STRIDE = 5
start_idx_videos = []
for video_file_name in os.listdir(ORIGIN_VIDEOS_PATH):
    video_path = ORIGIN_VIDEOS_PATH / video_file_name
    start_idx = len(os.listdir(EXTRACTED_FRAMES_PATH))
    start_idx_videos.append(start_idx)

    extract_frames(video_path, EXTRACTED_FRAMES_PATH, start_idx, STRIDE)

print(f'{len(os.listdir(EXTRACTED_FRAMES_PATH))} images from videos were saved.')

Extracting frames from video -> 08fd33_0.mp4: 100%|██████████| 150/150 [00:06<00:00, 23.63it/s]
Extracting frames from video -> 0bfacc_0.mp4: 100%|██████████| 150/150 [00:06<00:00, 24.10it/s]
Extracting frames from video -> 121364_0.mp4: 100%|██████████| 150/150 [00:06<00:00, 24.01it/s]
Extracting frames from video -> 2e57b9_0.mp4: 100%|██████████| 150/150 [00:06<00:00, 24.01it/s]
Extracting frames from video -> 573e61_0.mp4: 100%|██████████| 150/150 [00:06<00:00, 22.89it/s]

750 images from videos were saved.





In [5]:
start_idx_videos

[0, 150, 300, 450, 600]

### Splitting data into train, valid and test folder

In [6]:
import numpy as np

sorted_file_names = sorted(os.listdir(EXTRACTED_FRAMES_PATH))
np_file_names = np.array(sorted_file_names)
image_ids = np.arange(len(np_file_names))
bins = start_idx_videos[1:]

video_mask = np.digitize(image_ids, bins=bins)

In [7]:
video_ids, counts = np.unique(video_mask, return_counts=True)

for n_video, count in zip(video_ids, counts):
    print(f'Video {n_video}: {count} images')

Video 0: 150 images
Video 1: 150 images
Video 2: 150 images
Video 3: 150 images
Video 4: 150 images


In [8]:
from sklearn.model_selection import train_test_split

VALID_SIZE = 0.1
TEST_SIZE = 0.1

train_full_images, valid_images, train_full_mask, valid_mask = train_test_split(np_file_names, video_mask, test_size=VALID_SIZE, stratify=video_mask)

train_images, test_images, train_mask, test_mask = train_test_split(train_full_images, train_full_mask, test_size=TEST_SIZE, stratify=train_full_mask)

In [9]:
print('Train Set')
video_ids, train_counts = np.unique(train_mask, return_counts=True)
for n_video, count in zip(video_ids, train_counts):
    print(f'Video {n_video}: {count} images')
print(f'Total Train Images: {len(train_mask)}')

print('\nValid Set')
video_ids, valid_counts = np.unique(valid_mask, return_counts=True)
for n_video, count in zip(video_ids, valid_counts):
    print(f'Video {n_video}: {count} images')
print(f'Total Valid Images: {len(valid_mask)}')

print('\nTest Set')
video_ids, test_counts = np.unique(test_mask, return_counts=True)
for n_video, count in zip(video_ids, test_counts):
    print(f'Video {n_video}: {count} images')
print(f'Total Test Images: {len(test_mask)}')

Train Set
Video 0: 121 images
Video 1: 122 images
Video 2: 121 images
Video 3: 121 images
Video 4: 122 images
Total Train Images: 607

Valid Set
Video 0: 15 images
Video 1: 15 images
Video 2: 15 images
Video 3: 15 images
Video 4: 15 images
Total Valid Images: 75

Test Set
Video 0: 14 images
Video 1: 13 images
Video 2: 14 images
Video 3: 14 images
Video 4: 13 images
Total Test Images: 68


In [10]:
import shutil

# ----------------------------------------------------MOVING IMAGES TO APPROPRIATE FOLDER----------------------------------------------------
def move_images(image_paths_list: list[Path], target_folder_path: Path):
    for image_path in tqdm(image_paths_list, desc=f'Transferring images to {target_folder_path}', total=len(image_paths_list)):
        if str(image_path).endswith('.jpg'):
            output_image_path = target_folder_path / str(image_path).split('\\')[-1]

            shutil.move(image_path, output_image_path)
# ----------------------------------------------------MOVING IMAGES TO APPROPRIATE FOLDER----------------------------------------------------

In [11]:
image_sets = [
    [EXTRACTED_FRAMES_PATH / train_image for train_image in train_images],
    [EXTRACTED_FRAMES_PATH / valid_image for valid_image in valid_images],
    [EXTRACTED_FRAMES_PATH / test_image for test_image in test_images]
]
target_folder_sets = [TRAIN_PATH, VALID_PATH, TEST_PATH]

for image_set, target_folder_set in zip(image_sets, target_folder_sets):
    move_images(image_paths_list=image_set, target_folder_path=target_folder_set)

Transferring images to C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\images\train: 100%|██████████| 607/607 [00:00<00:00, 1132.37it/s]
Transferring images to C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\images\valid: 100%|██████████| 75/75 [00:00<00:00, 1019.57it/s]
Transferring images to C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\images\test: 100%|██████████| 68/68 [00:00<00:00, 606.02it/s]


In [12]:
shutil.rmtree(EXTRACTED_FRAMES_PATH)

----------------

## Preparing datasets from Roboflow

In [13]:
from dotenv import load_dotenv
from roboflow import Roboflow

load_dotenv()
ROBOFLOW_API_KEY = os.getenv('ROBOFLOW_API_KEY')

HOME = os.getcwd()
os.chdir(DATA_PATH)
rf = Roboflow(api_key=ROBOFLOW_API_KEY)
project = rf.workspace("mikoaj-bu1z8").project("football-ai-vision")
version = project.version(1)
dataset = version.download("coco")
os.chdir(HOME)

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in football-ai-vision-1 to coco:: 100%|██████████| 83381/83381 [00:02<00:00, 29269.10it/s]





Extracting Dataset Version Zip to football-ai-vision-1 in coco:: 100%|██████████| 380/380 [00:00<00:00, 1204.70it/s]


In [14]:
TRAIN_DIR = 'train'
VALID_DIR = 'valid'
TEST_DIR = 'test'

image_sets = [
    [Path(dataset.location) / TRAIN_DIR / train_image for train_image in os.listdir(Path(dataset.location) / TRAIN_DIR)],
    [Path(dataset.location) / VALID_DIR / valid_image for valid_image in os.listdir(Path(dataset.location) / VALID_DIR)],
    [Path(dataset.location) / TEST_DIR / test_image for test_image in os.listdir(Path(dataset.location) / TEST_DIR)]
]
target_folder_sets = [TRAIN_PATH, VALID_PATH, TEST_PATH]

for image_set, target_folder_set in zip(image_sets, target_folder_sets):
    move_images(image_paths_list=image_set, target_folder_path=target_folder_set)

Transferring images to C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\images\train: 100%|██████████| 299/299 [00:00<00:00, 1056.56it/s]
Transferring images to C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\images\valid: 100%|██████████| 50/50 [00:00<00:00, 709.47it/s]
Transferring images to C:\Users\miki0\OneDrive\Pulpit\Projects\my_projects\FootballAI\data\images\test: 100%|██████████| 26/26 [00:00<00:00, 1443.38it/s]


In [15]:
shutil.rmtree(dataset.location)

------------