# Split data

Split the data into the following directory tree:

```
dataset/
├── train/
│   ├── images/
│   └── labels/
├── val/
│   ├── images/
│   └── labels/
└── test/
    ├── images/
    └── labels/
```
    
Then generate a `dataset.yaml` file describing the dataset for YOLO: the paths to the train/val/test images, the number of classes, and the class names.


In [1]:
# Import
import os
from sklearn.model_selection import train_test_split
import shutil
import json
import yaml

In [2]:
# Source locations: the images and their annotation files.
input_images = "../datasets/images/"
input_labels = "../datasets/annotations_yolo/"

# Root folder that receives the train/val/test split.
output_director = "../datasets/dataset/"

# JSON file holding the label dictionary.
paht_label_dict = "label_dict.json"

In [3]:

# Build the output tree: the root folder first, then images/ and labels/
# under each of the train, val and test splits.
os.makedirs(output_director, exist_ok=True)
for split_name in ('train', 'val', 'test'):
    for subdir in ('images', 'labels'):
        os.makedirs(output_director + split_name + '/' + subdir + '/', exist_ok=True)

### Split the data

**Caution:** this split is only for testing with YOLO: it re-splits the files listed in `val.txt` into train/val/test subsets. It must be replaced for the final pipeline.

In [4]:
# Read the list of sample names, one per line. The names carry no file
# extension: the copy step appends '.jpg' and '.txt' to them.
with open('../datasets/splits/val.txt', 'r') as f:
    # Strip surrounding whitespace and skip blank lines, which would otherwise
    # become empty names and make the copy step look for '.jpg' / '.txt'.
    val_files = [line.strip() for line in f if line.strip()]

# Split val_files into 3 datasets: test_size=0.6 keeps about 40% for training,
# then the held-out 60% is halved into validation and test (about 30% each).
# The fixed random_state makes the split reproducible.
X_train, X_holdout = train_test_split(val_files, test_size=0.6, random_state=42)
X_val, X_test = train_test_split(X_holdout, test_size=0.5, random_state=42)

### Copy files to the new directories

In [5]:
def copy_files(files, src, dest, labels_src=None):
    """Copy each sample's image and label file into a split directory.

    Args:
        files: Iterable of sample names without file extension.
        src: Directory holding the ``<name>.jpg`` images.
        dest: Split directory. Images are copied to ``dest/images/`` and
            labels to ``dest/labels/``; both are created if missing.
        labels_src: Directory holding the ``<name>.txt`` label files. When
            omitted, the notebook-level ``input_labels`` is used, which is
            what this function always did before the parameter existed.

    Raises:
        FileNotFoundError: If an image or label file does not exist.
    """
    if labels_src is None:
        labels_src = input_labels

    # os.path.join makes the function independent of trailing slashes in the
    # directory arguments.
    images_dest = os.path.join(dest, 'images')
    labels_dest = os.path.join(dest, 'labels')

    # Make sure the targets are directories; otherwise shutil.copy would
    # either fail or write a plain file named 'images' / 'labels'.
    os.makedirs(images_dest, exist_ok=True)
    os.makedirs(labels_dest, exist_ok=True)

    for name in files:
        shutil.copy(os.path.join(src, name + '.jpg'), images_dest)
        shutil.copy(os.path.join(labels_src, name + '.txt'), labels_dest)

In [6]:
# Populate each split directory with the images and labels of its samples.
for split_name, split_files in (('train', X_train), ('val', X_val), ('test', X_test)):
    copy_files(split_files, input_images, output_director + split_name + '/')

In [7]:
def creat_dataset_yaml(output_director, paht_label_dict,
                       yaml_path='dataset.yaml',
                       dataset_root='../../datasets/dataset/'):
    """Write the dataset description YAML file for YOLO.

    The file contains the image directories of the train/val/test splits,
    the number of classes (``nc``) and the class names (``names``).

    Args:
        output_director: Currently unused; kept so existing calls keep
            working.
        paht_label_dict: Path to the JSON file containing the label
            dictionary. Its keys are used as class names.
        yaml_path: Path of the YAML file to write. Defaults to
            ``dataset.yaml`` in the current working directory.
        dataset_root: Dataset root as it should be written into the YAML
            file. A trailing slash is added if missing.
            NOTE(review): the default is not the same relative path as the
            notebook's output directory; presumably it is resolved from
            wherever training is launched - confirm.
    """
    # JSON is UTF-8 by convention, so do not depend on the locale default.
    with open(paht_label_dict, 'r', encoding='utf-8') as f:
        label_dict = json.load(f)

    root = dataset_root if dataset_root.endswith('/') else dataset_root + '/'

    # NOTE(review): class names follow the key order of the JSON file;
    # presumably that order matches the class indices used in the label
    # files - confirm.
    data = {
        'train': root + 'train/images/',
        'val': root + 'val/images/',
        'test': root + 'test/images/',
        'nc': len(label_dict),
        'names': list(label_dict.keys()),
    }

    with open(yaml_path, 'w') as f:
        yaml.dump(data, f)

In [8]:
# Write dataset.yaml using the label dictionary file.
creat_dataset_yaml(output_director=output_director, paht_label_dict=paht_label_dict)