# This notebook is responsible for downloading and preparing datasets from various source. 

---> 5 videos from Google Drive (which will be split into frames)  
---> dataset from roboflow

## Defininig path of the project and the paths where the train/valid/test sets will be stored

In [87]:
from pathlib import Path
import os

# Path of the project
PROJECT_PATH = Path(os.getcwd()).resolve().parent

# Main data dir
DATA_PATH = PROJECT_PATH / 'data'
os.makedirs(DATA_PATH, exist_ok=True)

# Paths to train, valid and test dir
TRAIN_PATH = DATA_PATH / 'train'
TRAIN_IMAGES_PATH = TRAIN_PATH / 'images'
os.makedirs(TRAIN_PATH, exist_ok=True)
os.makedirs(TRAIN_IMAGES_PATH, exist_ok=True)

VALID_PATH = DATA_PATH / 'valid'
VALID_IMAGES_PATH = VALID_PATH / 'images'
os.makedirs(VALID_PATH, exist_ok=True)
os.makedirs(VALID_IMAGES_PATH, exist_ok=True)

TEST_PATH = DATA_PATH / 'test'
TEST_IMAGES_PATH = TEST_PATH / 'images'
os.makedirs(TEST_PATH, exist_ok=True)
os.makedirs(TEST_IMAGES_PATH, exist_ok=True)

# Path to origin videos dir
ORIGIN_VIDEOS_PATH = DATA_PATH / 'origin_videos'
os.makedirs(ORIGIN_VIDEOS_PATH, exist_ok=True)

## Preparing videos from Google Drive

### Download videos from google drive

In [88]:
!gdown -O "{ORIGIN_VIDEOS_PATH}/0bfacc_0.mp4" "https://drive.google.com/uc?id=12TqauVZ9tLAv8kWxTTBFWtgt2hNQ4_ZF"
!gdown -O "{ORIGIN_VIDEOS_PATH}/2e57b9_0.mp4" "https://drive.google.com/uc?id=19PGw55V8aA6GZu5-Aac5_9mCy3fNxmEf"
!gdown -O "{ORIGIN_VIDEOS_PATH}/08fd33_0.mp4" "https://drive.google.com/uc?id=1OG8K6wqUw9t7lp9ms1M48DxRhwTYciK-"
!gdown -O "{ORIGIN_VIDEOS_PATH}/573e61_0.mp4" "https://drive.google.com/uc?id=1yYPKuXbHsCxqjA9G-S6aeR2Kcnos8RPU"
!gdown -O "{ORIGIN_VIDEOS_PATH}/121364_0.mp4" "https://drive.google.com/uc?id=1vVwjW1dE1drIdd4ZSILfbCGPD4weoNiu"

Downloading...
From: https://drive.google.com/uc?id=12TqauVZ9tLAv8kWxTTBFWtgt2hNQ4_ZF
To: /home/mikolaj/Desktop/Projects/my_projects/football-ai/data/origin_videos/0bfacc_0.mp4
100%|██████████████████████████████████████| 19.9M/19.9M [00:01<00:00, 11.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=19PGw55V8aA6GZu5-Aac5_9mCy3fNxmEf
To: /home/mikolaj/Desktop/Projects/my_projects/football-ai/data/origin_videos/2e57b9_0.mp4
100%|██████████████████████████████████████| 21.1M/21.1M [00:02<00:00, 10.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1OG8K6wqUw9t7lp9ms1M48DxRhwTYciK-
To: /home/mikolaj/Desktop/Projects/my_projects/football-ai/data/origin_videos/08fd33_0.mp4
100%|██████████████████████████████████████| 19.9M/19.9M [00:01<00:00, 11.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1yYPKuXbHsCxqjA9G-S6aeR2Kcnos8RPU
To: /home/mikolaj/Desktop/Projects/my_projects/football-ai/data/origin_videos/573e61_0.mp4
100%|██████████████████████████████████████| 18.9

### Extracting frames

In [89]:
from pathlib import Path
from tqdm import tqdm
import supervision as sv
import cv2

def extract_frames(video_path: Path, output_path: Path, stride=5):
    """
    Extracts frames from the provided video and saves them as .jpg files with unique indexes
    (e.g. <video_name>_<frame_idx>.jpg), starting from a given index and at a specific frequency (stride).

    Args:
        video_path (Path): Path to the video file which frames will be extracted.
        output_path (Path): Path to the output directory where the frames will be saved.
        stride (int): The frequency of frame extraction. For example, if stride=5, every 5th frame will be extracted. Defaults to 5.
    """

    # Video information
    video_info = sv.VideoInfo.from_video_path(video_path)
    # Frames generator
    frame_generator = sv.get_video_frames_generator(video_path, stride=stride)

    current_video = video_path.stem
    frame_idx = 0
    for frame in tqdm(frame_generator, desc=f'Extracting frames from video -> {current_video}', total=int(video_info.total_frames / stride)):
        output_frame_path = output_path / f'{current_video}_{frame_idx}.jpg'
        cv2.imwrite(output_frame_path, frame)

        frame_idx += 1


In [90]:
# Dir where the extracted frames will be saved
EXTRACTED_FRAMES_PATH = DATA_PATH / 'extracted_frames'
os.makedirs(EXTRACTED_FRAMES_PATH, exist_ok=True)

STRIDE = 5

# Process each video in ORIGIN_VIDEOS_PATH directory
for video_file_name in os.listdir(ORIGIN_VIDEOS_PATH):
    video_path = ORIGIN_VIDEOS_PATH / video_file_name
    extract_frames(video_path, EXTRACTED_FRAMES_PATH, stride=STRIDE)

print(f'{len(os.listdir(EXTRACTED_FRAMES_PATH))} images from videos were saved.')

Extracting frames from video -> 121364_0: 100%|██████████| 150/150 [00:02<00:00, 53.18it/s]
Extracting frames from video -> 0bfacc_0: 100%|██████████| 150/150 [00:02<00:00, 54.23it/s]
Extracting frames from video -> 08fd33_0: 100%|██████████| 150/150 [00:02<00:00, 67.65it/s]
Extracting frames from video -> 2e57b9_0: 100%|██████████| 150/150 [00:02<00:00, 51.12it/s]
Extracting frames from video -> 573e61_0: 100%|██████████| 150/150 [00:02<00:00, 67.36it/s]

750 images from videos were saved.





### Splitting data into train, valid and test set

In [91]:
import numpy as np

file_names = sorted(os.listdir(EXTRACTED_FRAMES_PATH))
np_file_names = np.array(file_names)
video_names = np.array(['_'.join(file_name.split('_')[:-1]) for file_name in file_names])
unique, counts = np.unique(video_names, return_counts=True)

for video_name, count in zip(unique, counts):
    print(f'{video_name}: {count} images')

08fd33_0: 150 images
0bfacc_0: 150 images
121364_0: 150 images
2e57b9_0: 150 images
573e61_0: 150 images


In [92]:
from sklearn.model_selection import train_test_split

VALID_SIZE = 0.1
TEST_SIZE = 0.1

train_full_images, valid_images, train_full_mask, valid_mask = train_test_split(np_file_names, video_names, test_size=TEST_SIZE, stratify=video_names)
train_images, test_images, train_mask, test_mask = train_test_split(train_full_images, train_full_mask, test_size=TEST_SIZE, stratify=train_full_mask)


In [93]:
print('Train Set')
video_ids, train_counts = np.unique(train_mask, return_counts=True)
for n_video, count in zip(video_ids, train_counts):
    print(f'Video {n_video}: {count} images')
print(f'Total Train Images: {len(train_mask)}')

print('\nValid Set')
video_ids, valid_counts = np.unique(valid_mask, return_counts=True)
for n_video, count in zip(video_ids, valid_counts):
    print(f'Video {n_video}: {count} images')
print(f'Total Valid Images: {len(valid_mask)}')

print('\nTest Set')
video_ids, test_counts = np.unique(test_mask, return_counts=True)
for n_video, count in zip(video_ids, test_counts):
    print(f'Video {n_video}: {count} images')
print(f'Total Test Images: {len(test_mask)}')

Train Set
Video 08fd33_0: 122 images
Video 0bfacc_0: 121 images
Video 121364_0: 121 images
Video 2e57b9_0: 122 images
Video 573e61_0: 121 images
Total Train Images: 607

Valid Set
Video 08fd33_0: 15 images
Video 0bfacc_0: 15 images
Video 121364_0: 15 images
Video 2e57b9_0: 15 images
Video 573e61_0: 15 images
Total Valid Images: 75

Test Set
Video 08fd33_0: 13 images
Video 0bfacc_0: 14 images
Video 121364_0: 14 images
Video 2e57b9_0: 13 images
Video 573e61_0: 14 images
Total Test Images: 68


### Moving the train, valid and test set to the appropriate folders

In [94]:
import shutil

def move_images(image_paths_list: list[Path], target_folder_path: Path) -> None:
    """
    Moves images from the current folder to the target folder. It checks if the image files 
    have a ".jpg" extension and then moves them to the specified target folder.

    Args:
        image_paths_list (list[Path]): A list of 'Path' objects, where each path refers to an image file 
                                       that should be moved. The images should be in `.jpg` format.
        target_folder_path (Path): The 'Path' object representing the target folder where the images 
                                    will be moved. The target folder should already exist.
    """

    # Iterate through each image path in the list
    for image_path in tqdm(image_paths_list, desc=f'Transferring images to {target_folder_path}', total=len(image_paths_list)):
        # Check if the file is in the appropriate format '.jpg'
        if str(image_path).endswith('.jpg'):
            # Absolute path for the output image
            output_image_path = target_folder_path / image_path.name

            # Move the image from the source to the target foler
            shutil.move(image_path, output_image_path)

In [95]:
# Create image sets (paths to each image)
image_sets = [
    [EXTRACTED_FRAMES_PATH / train_image for train_image in train_images],  # Train set
    [EXTRACTED_FRAMES_PATH / valid_image for valid_image in valid_images],  # Valid set
    [EXTRACTED_FRAMES_PATH / test_image for test_image in test_images]      # Test set
]
target_folder_sets = [TRAIN_IMAGES_PATH, VALID_IMAGES_PATH, TEST_IMAGES_PATH]  # List of paths for the corresponding sets

# Iterate through set of images and corresponding target folder
for image_set, target_folder_set in zip(image_sets, target_folder_sets):
    move_images(image_paths_list=image_set, target_folder_path=target_folder_set)

Transferring images to /home/mikolaj/Desktop/Projects/my_projects/football-ai/data/train/images: 100%|██████████| 607/607 [00:00<00:00, 15961.62it/s]
Transferring images to /home/mikolaj/Desktop/Projects/my_projects/football-ai/data/valid/images: 100%|██████████| 75/75 [00:00<00:00, 18411.14it/s]
Transferring images to /home/mikolaj/Desktop/Projects/my_projects/football-ai/data/test/images: 100%|██████████| 68/68 [00:00<00:00, 14706.23it/s]


In [96]:
shutil.rmtree(EXTRACTED_FRAMES_PATH)

----------------

## Preparing datasets from Roboflow

### Download roboflow dataset

In [97]:
from dotenv import load_dotenv
from roboflow import Roboflow

# Load .env file
dotenv_path = PROJECT_PATH / '.env'
load_dotenv(dotenv_path)

# Get the Roboflow API Key
ROBOFLOW_API_KEY = os.getenv('ROBOFLOW_API_KEY')

# Change the current dir to the data directory
HOME = Path(os.getcwd())
os.chdir(DATA_PATH)

# Download dataset
rf = Roboflow(api_key=ROBOFLOW_API_KEY)
project = rf.workspace("mikoaj-bu1z8").project("football-ai-vision")
version = project.version(1)
dataset = version.download("coco")

# Return to the Home direcotry
os.chdir(HOME)

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in football-ai-vision-1 to coco:: 100%|██████████| 83381/83381 [00:08<00:00, 10071.67it/s]





Extracting Dataset Version Zip to football-ai-vision-1 in coco:: 100%|██████████| 380/380 [00:00<00:00, 1351.44it/s]


In [98]:
# Create image sets (paths to each image)
image_sets = [
    [Path(dataset.location) / 'train' / train_image for train_image in os.listdir(Path(dataset.location) / 'train')],  # Train set
    [Path(dataset.location) / 'valid' / valid_image for valid_image in os.listdir(Path(dataset.location) / 'valid')],  # Valid set
    [Path(dataset.location) / 'test' / test_image for test_image in os.listdir(Path(dataset.location) / 'test')]       # Test set
]
target_folder_sets = [TRAIN_IMAGES_PATH, VALID_IMAGES_PATH, TEST_IMAGES_PATH]  # List of paths for the corresponding sets

# Iterate through set of images and corresponding target folder
for image_set, target_folder_set in zip(image_sets, target_folder_sets):
    move_images(image_paths_list=image_set, target_folder_path=target_folder_set)

Transferring images to /home/mikolaj/Desktop/Projects/my_projects/football-ai/data/train/images: 100%|██████████| 299/299 [00:00<00:00, 18191.66it/s]
Transferring images to /home/mikolaj/Desktop/Projects/my_projects/football-ai/data/valid/images: 100%|██████████| 50/50 [00:00<00:00, 18958.16it/s]
Transferring images to /home/mikolaj/Desktop/Projects/my_projects/football-ai/data/test/images: 100%|██████████| 26/26 [00:00<00:00, 15052.02it/s]


In [105]:
n_train_images = len(os.listdir(TRAIN_IMAGES_PATH))
n_valid_images = len(os.listdir(VALID_IMAGES_PATH))
n_test_images = len(os.listdir(TEST_IMAGES_PATH))
n_videos = len(os.listdir(ORIGIN_VIDEOS_PATH))

print(f'|--data')
print(f'|  |--train')
print(f'|  |   |--images ({n_train_images} images)')
print(f'|  |--valid')
print(f'|  |   |--images ({n_valid_images} images)')
print(f'|  |--test')
print(f'|  |   |--images  ({n_test_images} iamges)')
print(f'|  |   ')
print(f'|  |--origin_videos ({n_videos} videos)')
print(f'|')


|--data
|  |--train
|  |   |--images (905 images)
|  |--valid
|  |   |--images (124 images)
|  |--test
|  |   |--images  (93 iamges)
|  |   
|  |--origin_videos (5 videos)
|
