# Divide your data into train, val and test sets

Select the *.txt* file that lists all the images and their corresponding classes. We then split the images into train, val and test files so that every class is represented in the same proportion in each split (i.e. the split is stratified by class rather than purely random, which matters when the dataset is highly imbalanced).

In [26]:
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

import imgclas
from imgclas import paths, config

from sklearn.model_selection import train_test_split

# Configuration dictionary obtained from imgclas' config module.
# NOTE(review): CONF is not used by any other cell visible in this notebook — confirm it is needed.
CONF = config.get_conf_dict()

# Customize your image folder if needed
# CONF['general']['images_directory'] = '/media/ignacio/Datos/datasets/semillas/datasets'  # absolute path to image_folder

# Directory that holds the split files. The project default (commented out) is replaced
# by a hard-coded absolute path, so this must be edited when running on another machine.
# splits_dir = paths.get_splits_dir()
splits_dir = '/srv/images_classes/Regression/copasVasos_Madrid_LaRioja'

# Load the data
# The file has no header row and uses '*' as separator: first column is the image path,
# second column is the class label.
# NOTE(review): this reads test.txt, which a later cell of this notebook also writes the
# test split to. The source here should presumably be the file listing ALL images —
# confirm the file name before re-running the notebook top to bottom.
data = pd.read_csv(splits_dir+'/test.txt', sep="*", header=None, names=['image', 'clase'])

In [27]:
# Preview the loaded table (columns: image path and class label).
data

Unnamed: 0,image,clase
0,/srv/datos/Laboratorio/La Rioja/ICVV/Copa Char...,75
1,/srv/datos/Laboratorio/La Rioja/ICVV/Vaso Dura...,100
2,/srv/datos/Laboratorio/Madrid/copa Char/Fondo ...,200
3,/srv/datos/Laboratorio/Madrid/copa Char/Fondo ...,200
4,/srv/datos/Laboratorio/La Rioja/ICVV/Copa Borg...,100
...,...,...
2608,/srv/datos/Laboratorio/La Rioja/ICVV/Copa Borg...,150
2609,/srv/datos/Laboratorio/Madrid/vaso Dura/Fondo ...,75
2610,/srv/datos/Laboratorio/Madrid/copa Bor/Fondo a...,225
2611,/srv/datos/Laboratorio/Madrid/vaso Pinta/Fondo...,100


In [28]:
# Reference: https://machinelearningmastery.com/train-test-split-for-evaluating-machine-learning-algorithms/
from collections import Counter

# Number of images per class in the loaded data, to see how balanced the classes are.
class_counts = Counter(data["clase"])
print(class_counts)

Counter({75: 301, 100: 301, 125: 300, 50: 299, 175: 278, 150: 277, 200: 206, 225: 191, 250: 173, 275: 161, 300: 126})


In [52]:
# First stratified split: hold out 21.5% of the images (X_2 / y_2) and keep the
# remainder as the training set. Stratifying on the class keeps the class
# proportions the same in both parts; random_state fixes the shuffle.
X_train, X_2, y_train, y_2 = train_test_split(
    data.image,
    data.clase,
    test_size=0.215,
    random_state=1,
    stratify=data.clase,
)

# Per-class counts of the training part and of the held-out part.
for labels in (y_train, y_2):
    print(Counter(labels))

Counter({125: 2195, 75: 2194, 100: 2194, 50: 2187, 150: 2025, 175: 2025, 200: 1508, 225: 1393, 250: 1261, 275: 1178, 300: 919})
Counter({100: 601, 75: 601, 125: 601, 50: 599, 175: 555, 150: 554, 200: 413, 225: 382, 250: 346, 275: 322, 300: 252})


In [53]:
# Second stratified split: divide the held-out part (X_2 / y_2) into two halves,
# one used as the test set and the other as the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_2,
    y_2,
    test_size=0.5,
    random_state=1,
    stratify=y_2,
)

# Per-class counts of the test and validation sets.
for labels in (y_test, y_val):
    print(Counter(labels))

Counter({75: 301, 100: 301, 125: 300, 50: 299, 175: 278, 150: 277, 200: 206, 225: 191, 250: 173, 275: 161, 300: 126})
Counter({125: 301, 100: 300, 50: 300, 75: 300, 175: 277, 150: 277, 200: 207, 225: 191, 250: 173, 275: 161, 300: 126})


In [60]:
# Persist the training split as '*'-separated "image*clase" rows, without header
# or index column (the same layout the load cell reads).
train = pd.DataFrame(data={'image': X_train, 'clase': y_train})
# Write with mode='w' rather than appending: re-running this cell must replace the
# file, not add a second copy of every row (which would silently duplicate samples).
# The path is derived from splits_dir so the output directory is defined in one place.
train.to_csv(os.path.join(splits_dir, 'train.txt'), sep='*', index=False, header=False, mode='w')

In [61]:
# Persist the validation split as '*'-separated "image*clase" rows, without header
# or index column (the same layout the load cell reads).
val = pd.DataFrame(data={'image': X_val, 'clase': y_val})
# Write with mode='w' rather than appending: re-running this cell must replace the
# file, not add a second copy of every row (which would silently duplicate samples).
# The path is derived from splits_dir so the output directory is defined in one place.
val.to_csv(os.path.join(splits_dir, 'val.txt'), sep='*', index=False, header=False, mode='w')

In [62]:
# Persist the test split as '*'-separated "image*clase" rows, without header
# or index column (the same layout the load cell reads).
test = pd.DataFrame(data={'image': X_test, 'clase': y_test})
# Write with mode='w' rather than appending: re-running this cell must replace the
# file, not add a second copy of every row (which would silently duplicate samples).
# The path is derived from splits_dir so the output directory is defined in one place.
# NOTE(review): the load cell of this notebook currently reads test.txt from this same
# directory as its input. Point the load cell at the full dataset file before running
# this cell, otherwise the input file is replaced by the (smaller) test split.
test.to_csv(os.path.join(splits_dir, 'test.txt'), sep='*', index=False, header=False, mode='w')

In [63]:
# Number of rows (images) in the training split.
len(train)

19079

In [64]:
# Number of rows (images) in the validation split.
len(val)

2613

In [65]:
# Number of rows (images) in the test split.
len(test)

2613