In [1]:
'''
The original dataset contains 10927 images of 11 classes,each of which contains around 1,000 examples.
 We randomly select 875 examples with each class contributing 8% of its images to construct validation set 
 the test set contains 20% of the images stratified by class
The rest of original dataset is our training set,i.e., the training-validation ratiois 9:1. while trainining test ratio is 4:1
'''
import torch 
from glob import glob
import pandas as pd
import numpy as np
from read_dataset import build_df
from utils import CFG
import os
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def split_dataset(train,target,validation=True):
    #70%-20%-10% split, as we're splitting 10% from the already split X_train so we're actually ending up with a 72%-20%-8% split here:
    #80 -20
    # x = img_path
    # y = 'xmin', 'ymin', 'xmax', 'ymax', 'label'

    X_train, X_test, y_train, y_test = train_test_split(
        train, target, train_size=0.8, shuffle=True, stratify=target[:, 4]
    )
    test_data = [X_test, y_test]
    if validation:
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train,
            y_train,
            train_size=0.9,
            shuffle=True,
            stratify=y_train[:, 4],
        )
        train_data = [X_train, y_train]
        validation_data = [X_valid, y_valid]
        
        return train_data, validation_data, test_data

    train_data = [X_train, y_train]
    test_data = [X_test, y_test]

    return train_data, test_data


In [3]:
if __name__ == '__main__':
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device")

    IMG_FILES = glob(CFG.img_path + "/*.jpg")
    XML_FILES = glob(CFG.xml_path + "/*.xml")

    #"id", "label", "xmin", "ymin", "xmax", "ymax", "img_path"
    df, classes = build_df(XML_FILES)
    data = df.to_numpy()
    # f_n = data[100][0] + '.txt'

    # input and target 
    input  = df[['id','img_path']].values
    input =np.squeeze(input)
    # input = input.reset_index()
    target = df[['xmin','ymin', 'xmax', 'ymax','label']].values.astype(np.int64) 

    train_data, validation_data, test_data = split_dataset(input,target,True)

Using cuda device


In [4]:
input.shape

(10927, 2)

In [5]:
# Check the split
df = pd.DataFrame(train_data[1])
df2 = pd.DataFrame(test_data[1])
df3 = pd.DataFrame(validation_data[1])
item_counts = df[4].value_counts()
item_counts2 = df2[4].value_counts()
item_counts3 = df3[4].value_counts()
item_counts,item_counts2,item_counts3

(9     726
 4     723
 6     721
 1     719
 5     719
 8     719
 7     718
 10    717
 2     716
 0     716
 3     672
 Name: 4, dtype: int64,
 9     202
 4     201
 8     200
 1     200
 6     200
 5     200
 10    199
 2     199
 0     199
 7     199
 3     187
 Name: 4, dtype: int64,
 9     81
 0     80
 10    80
 8     80
 5     80
 1     80
 2     80
 4     80
 7     80
 6     80
 3     74
 Name: 4, dtype: int64)

In [6]:
def convert(size, box):
    dw = 1./size[0]
    dh = 1./size[1]
    x = (box[0] + box[1])/2.0
    y = (box[2] + box[3])/2.0
    w = box[1] - box[0]
    h = box[3] - box[2]
    x = x*dw
    w = w*dw
    y = y*dh
    h = h*dh
    return (x,y,w,h)

In [7]:
import cv2
def copier(data,d_type = 'train'):
    img_path = 'yolov7/cell_datatset/images/'
    label_path = 'yolov7/cell_datatset/labels/'
    i=1
    for id, image_p, label in zip(data[0][:,0],data[0][:,1],data[1]):
        

        # Copy and paste image data
        # read image
        image = cv2.imread(image_p)
        h, w, channels = image.shape

        # image path + name for the copy
        image_name = img_path + d_type +'/'+ id + '.jpg'

        # Saving the image
        cv2.imwrite(image_name, image)

        
        # SAVE LABEL DATA
        # convert bbox -> 'xmin', 'ymin', 'xmax', 'ymax' to yolo format  (X, Y, W, H)
        #b = (xmin, xmax, ymin, ymax)
        b = (label[0], label[2], label[1], label[3])
        yolo_bb = convert((w,h), b)
        # data is made of class and bbox --->  class X Y W H
        label_data = str(label[-1]) + " " + str(yolo_bb[0]) + " "+ str(yolo_bb[1]) + " "+ str(yolo_bb[2]) + " "+ str(yolo_bb[3])

        # file_path 
        label_name = label_path + d_type +'/'+ id + '.txt'
        # write to data.txt
        with open(label_name, 'w') as f:
            f.write(label_data)
        i+=1
        print(f"Image: {i} Img:{image_name} ")

    

In [8]:
copier(train_data,d_type = 'train')
copier(validation_data,d_type = 'valid')
copier(test_data,d_type = 'test')

Image: 2 Img:yolov7/cell_datatset/images/train/Capillaria philippinensis_0193.jpg 
Image: 3 Img:yolov7/cell_datatset/images/train/Opisthorchis viverrine_0051.jpg 
Image: 4 Img:yolov7/cell_datatset/images/train/Hymenolepis nana_0825.jpg 
Image: 5 Img:yolov7/cell_datatset/images/train/Hookworm egg_0075.jpg 
Image: 6 Img:yolov7/cell_datatset/images/train/Enterobius vermicularis_0342.jpg 
Image: 7 Img:yolov7/cell_datatset/images/train/Ascaris lumbricoides_0352.jpg 
Image: 8 Img:yolov7/cell_datatset/images/train/Hymenolepis diminuta_0121.jpg 
Image: 9 Img:yolov7/cell_datatset/images/train/Paragonimus spp_0312.jpg 
Image: 10 Img:yolov7/cell_datatset/images/train/Taenia spp. egg_0263.jpg 
Image: 11 Img:yolov7/cell_datatset/images/train/Trichuris trichiura_0979.jpg 
Image: 12 Img:yolov7/cell_datatset/images/train/Fasciolopsis buski_0270.jpg 
Image: 13 Img:yolov7/cell_datatset/images/train/Ascaris lumbricoides_0226.jpg 
Image: 14 Img:yolov7/cell_datatset/images/train/Opisthorchis viverrine_0956