In [1]:
import pandas as pd
from tqdm import tqdm
import os
from shutil import copyfile
from sklearn.model_selection import train_test_split

In [2]:
# Root folder of the source images; the listing cell that follows treats every
# entry inside it as one class sub-folder.
IMAGES_PATH = 'defect_images'

In [5]:
# Build `images`: a list of (image_path, class_index) tuples, one per file
# found in the class sub-folders of IMAGES_PATH.
#
# os.listdir() returns entries in arbitrary order, so the folders (and the
# files inside them) are sorted to keep class indices and list order stable
# between runs and machines. Entries that are not directories (stray files
# such as .DS_Store) are skipped so they neither crash the inner os.listdir()
# nor consume a class index.
class_dirs = sorted(
    entry for entry in os.listdir(IMAGES_PATH)
    if os.path.isdir(os.path.join(IMAGES_PATH, entry))
)

images = []
for i, cls in enumerate(class_dirs):
    path = os.path.join(IMAGES_PATH, cls)
    for image in sorted(os.listdir(path)):
        image_path = os.path.join(path, image)
        images.append((image_path, i))
len(images)

26145

In [7]:
EXCEL_PATH = 'Log files'

# Read every workbook in EXCEL_PATH and stack them into a single frame `df`.
# The frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies every previously loaded row on each iteration.
log_frames = []
for f in tqdm(sorted(os.listdir(EXCEL_PATH))):
    if '~' in f:
        # Skip names containing '~' (presumably Excel lock/temp files such as
        # "~$name.xlsx" -- TODO confirm).
        continue
    log_frames.append(pd.read_excel(os.path.join(EXCEL_PATH, f)))

if log_frames:
    df = pd.concat(log_frames, axis=0, ignore_index=True, sort=False)
else:
    # Nothing to read: keep `df` defined as an empty frame, as before.
    df = pd.DataFrame()

# Distinct material values, with missing values represented as ''.
material_types = df['ns1:Material'].fillna('').unique()

# One row per distinct (start node, finish node, material) combination.
# All three selected columns are grouping keys, so the aggregation only
# de-duplicates them; the final reset_index gives a clean 0..n-1 index.
material_df = (
    df[['ns1:FinishNodeRef', 'ns1:StartNodeRef', 'ns1:Material']]
    .groupby(['ns1:StartNodeRef', 'ns1:FinishNodeRef', 'ns1:Material'], as_index=False)
    .max()
    .reset_index(drop=True)
)

100%|██████████| 20/20 [00:08<00:00,  2.41it/s]


In [8]:
# Material codes accepted by the filtering cell that follows: an image is kept
# only if the 'ns1:Material' value looked up for it is a key of this dict.
# NOTE(review): the integer values look like per-material ids, but only the
# keys are used in the visible cells -- confirm where the values are consumed.
ALLOWED_MATERIALS = {'VC':0, 'AC':1, 'BR':2, 'CI':3, 'CO':4}

In [9]:
# Keep only the images whose (start node, end node) pair maps to a material
# listed in ALLOWED_MATERIALS. The surviving paths and their class indices are
# collected in two parallel lists for the train/validation/test split.

# Index the materials once by (start node, finish node) instead of scanning
# material_df with a boolean mask for every image. setdefault keeps the first
# row per pair, i.e. the same row the per-image lookup picked with [0].
material_by_nodes = {}
for start_ref, finish_ref, node_material in zip(
    material_df['ns1:StartNodeRef'],
    material_df['ns1:FinishNodeRef'],
    material_df['ns1:Material'],
):
    material_by_nodes.setdefault((start_ref, finish_ref), node_material)

images_to_split = []
y_to_split = []
for image in tqdm(images):
    path = image[0]
    # The file name starts with "<start node>_<end node>_"; basename() is
    # used instead of splitting on '/' because the paths were built with
    # os.path.join and may use another separator.
    img_name = os.path.basename(path)
    start_node, end_node, *_ = img_name.split('_')
    # .get() returns None when the node pair has no material record; None is
    # not an allowed material, so such images are skipped instead of raising
    # IndexError.
    material = material_by_nodes.get((start_node, end_node))
    if material not in ALLOWED_MATERIALS:
        continue
    y_to_split.append(image[1])
    images_to_split.append(path)
images_to_split[:5]

100%|██████████| 26145/26145 [00:33<00:00, 782.99it/s]


['defect_images/CL/ST32473101_ST32473104_D_230317_1.mpg_continous_416.jpg',
 'defect_images/CL/ST32473101_ST32473104_D_230317_1.mpg_continous_421.jpg',
 'defect_images/CL/ST32473101_ST32473104_D_230317_1.mpg_continous_426.jpg',
 'defect_images/CL/ST32473101_ST32473104_D_230317_1.mpg_continous_431.jpg',
 'defect_images/CL/ST32473101_ST32473104_D_230317_1.mpg_continous_436.jpg']

In [10]:
# Stratified split of the filtered images: 20 % is held out for testing, then
# 20 % of the remainder for validation (64 % / 16 % / 20 % overall).
# A fixed random_state makes the split identical on every run; without it the
# images land in different folders each time the notebook is executed.
SPLIT_SEED = 42

X_trainval, X_test, y_trainval, y_test = train_test_split(
    images_to_split, y_to_split,
    test_size=.2, stratify=y_to_split, random_state=SPLIT_SEED,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval,
    test_size=.2, stratify=y_trainval, random_state=SPLIT_SEED,
)
X_train[:5]

['defect_images/healthy_images/ST69051401_ST69052401_D_271115_46.mpg_314.0999999999998.jpg',
 'defect_images/healthy_images/ST69051401_ST69052401_D_271115_46.mpg_652.8999999999943.jpg',
 'defect_images/CUW/ST83693501_ST83694506_U_010615_2.mpg_continous_567.jpg',
 'defect_images/DEG/ST03397003_ST03397002_U_290714_4.mpg_continous_622.jpg',
 'defect_images/healthy_images/ST69067301_ST69068301_D_301115_83.mpg_239.90000000000066.jpg']

In [11]:
# Sanity check of the path layout: the second '/'-separated component of the
# first collected image path is displayed as the cell output.
img = images[0][0]
img.split('/', maxsplit=2)[1]

'CL'

In [12]:
import shutil

TRAIN_FOLDER = 'Images/train'
VALIDATION_FOLDER = 'Images/validation'
TEST_FOLDER = 'Images/test'
DEFECT_IMAGES = 'defect_images'
HEALTHY_IMAGES = 'healthy_images'


def copy_split(image_paths, dest_root):
    """Copy every image into ``dest_root/<class folder>/<file name>``.

    The class folder is the name of the directory that directly contains the
    source image, so the per-class layout of the source tree is reproduced
    under ``dest_root``.

    Parameters
    ----------
    image_paths : list of str
        Paths of the source images.
    dest_root : str
        Root folder of the split (train, validation or test).

    Raises
    ------
    OSError
        If a source image cannot be read or a target cannot be written.
        Folder creation no longer relies on catching a failed copy, so a
        genuine copy error surfaces directly instead of being masked by a
        FileExistsError from a second makedirs call.
    """
    for src in tqdm(image_paths):
        class_dir = os.path.basename(os.path.dirname(src))
        target_dir = os.path.join(dest_root, class_dir)
        # exist_ok=True makes the call a no-op once the folder is there.
        os.makedirs(target_dir, exist_ok=True)
        shutil.copy(src, os.path.join(target_dir, os.path.basename(src)))


# Materialise the three splits on disk, one folder tree per split.
for split_paths, split_folder in (
    (X_train, TRAIN_FOLDER),
    (X_val, VALIDATION_FOLDER),
    (X_test, TEST_FOLDER),
):
    copy_split(split_paths, split_folder)

15502it [11:33, 22.36it/s]
3876it [02:29, 25.98it/s]
4845it [03:16, 24.68it/s]
