In [1]:
import os
import warnings

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Colab-specific step: mount Google Drive at /content/drive so that the paths
# used in the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Root folder of the source dataset on the mounted Drive.
data_path = "/content/drive/My Drive/Hands on Deep Learning/mask dataset"
# Sanity check: evaluates to True only if the folder is reachable.
os.path.exists(data_path)

True

In [4]:
# List the entries of the dataset root once and keep them in `mydir`;
# the bare name on the last line displays the stored value instead of
# hitting the filesystem a second time.
mydir = os.listdir(data_path)
mydir

['mask_less', 'mask_improperly', 'mask_properly']

In [5]:
def _pad_to_square(img):
  """Zero-pad the shorter side of an image array so height equals width.

  The padding is split between both sides of the short axis; when the
  difference is odd the extra pixel goes to the right/bottom. The image's
  own dtype is kept, so a uint8 input stays uint8.
  """
  h, w = img.shape[:2]
  if h == w:
    return img
  diff = abs(h - w)
  before, after = diff // 2, diff - diff // 2
  # Pad columns (axis 1) for tall images, rows (axis 0) for wide ones.
  pad_width = [(0, 0)] * img.ndim
  pad_width[1 if h > w else 0] = (before, after)
  return np.pad(img, pad_width, mode="constant", constant_values=0)


def image_preprocessing(img_path, img_size):
  """Load an image, pad it to a square with black borders and resize it.

  Args:
    img_path: Path of the image file, read with cv2.imread.
    img_size: Target size handed to cv2.resize as (width, height), or None
      to keep the padded image at its native resolution.

  Returns:
    The padded (and, if requested, resized) image array with the dtype
    returned by cv2.imread.

  Raises:
    ValueError: If cv2.imread returns None, i.e. the file could not be read
      or decoded as an image.
  """
  img = cv2.imread(img_path)
  if img is None:
    # cv2.imread signals failure by returning None rather than raising.
    raise ValueError(f"Could not read image: {img_path!r}")

  img = _pad_to_square(img)

  if img_size is not None:
    target = tuple(int(v) for v in img_size)
    current = (img.shape[1], img.shape[0])
    if current != target:
      # INTER_AREA avoids aliasing when shrinking; INTER_LINEAR for upscaling.
      shrinking = target[0] < current[0] or target[1] < current[1]
      interpolation = cv2.INTER_AREA if shrinking else cv2.INTER_LINEAR
      img = cv2.resize(img, target, interpolation=interpolation)
  return img

In [6]:
def data_open_save(data_path, new_path, img_size=(224, 224), data_split=True, data_split_rate=[0.7, 0.15, 0.15]):
  """Preprocess the class images under data_path and save them under new_path.

  Every file whose parent folder is named like one of the entries directly
  under data_path is treated as an image of that class. Each image is passed
  through image_preprocessing and written with cv2.imwrite to
  new_path/<split>/<label>/<file name> (or new_path/<label>/<file name> when
  data_split is False).

  Args:
    data_path: Root folder of the source dataset (one sub-folder per class).
    new_path: Root folder for the preprocessed copy; created if needed.
    img_size: Forwarded unchanged to image_preprocessing.
    data_split: If True, split the images into "train", "valid" and "test"
      with a stratified, seeded (random_state=2) split.
    data_split_rate: Split fractions in train/valid/test order. Only the
      last two entries are read; the train share is whatever remains.

  Returns:
    None. Files that cannot be read or written are skipped and reported in a
    single warning once all images have been processed.
  """
  # Derive the valid labels from data_path itself rather than from a
  # notebook-level variable, so the function works for any dataset root.
  class_names = set(os.listdir(data_path)) if os.path.isdir(data_path) else set()

  records = []
  for root, _, files in os.walk(data_path):
    label = os.path.basename(root)
    if label not in class_names:
      continue
    records.extend({"img_path": os.path.join(root, name), "label": label} for name in files)

  data_list = pd.DataFrame(records, columns=["img_path", "label"])
  if data_list.empty:
    warnings.warn(f"No class images found under {data_path!r}; nothing was written.")
    return

  # Integer class ids in order of first appearance, used for stratification.
  class_map = {label: i for i, label in enumerate(data_list["label"].unique())}
  data_list["label_class"] = data_list["label"].map(class_map)

  if data_split:
    holdout_rate = sum(data_split_rate[-2:])
    train_list, holdout_list = train_test_split(data_list,
                                                test_size=holdout_rate,
                                                random_state=2,
                                                stratify=data_list["label_class"])
    # The hold-out part is divided again into validation and test.
    valid_list, test_list = train_test_split(holdout_list,
                                             test_size=data_split_rate[-1] / holdout_rate,
                                             random_state=2,
                                             stratify=holdout_list["label_class"])
    splits = [("train", train_list), ("valid", valid_list), ("test", test_list)]
  else:
    # An empty split name makes os.path.join collapse to new_path/<label>.
    splits = [("", data_list)]

  # Create every <split>/<label> folder up front, only for real class labels.
  for split_name, _ in splits:
    for label in class_map:
      os.makedirs(os.path.join(new_path, split_name, label), exist_ok=True)

  failed = []
  for split_name, frame in splits:
    for img_path, label in zip(frame["img_path"], frame["label"]):
      img_newpath = os.path.join(new_path, split_name, label, os.path.basename(img_path))
      try:
        img = image_preprocessing(img_path, img_size)
        # cv2.imwrite reports failure through its return value, not an exception.
        if not cv2.imwrite(img_newpath, img):
          raise IOError(f"cv2.imwrite failed for {img_newpath!r}")
      except Exception as exc:
        # Best effort: keep going, but remember what was skipped and why.
        failed.append(f"{img_path} ({type(exc).__name__}: {exc})")

  if failed:
    preview = "\n  ".join(failed[:10])
    remaining = len(failed) - min(len(failed), 10)
    suffix = f"\n  ... and {remaining} more" if remaining else ""
    warnings.warn(f"Skipped {len(failed)} file(s) that could not be processed:\n  {preview}{suffix}")

In [7]:
# Destination root for the preprocessed copy of the dataset.
new_path = "/content/drive/My Drive/Hands on Deep Learning/mask_dataset_preprocessing"
# Target image size, defined once and passed to the call below.
img_size = (224, 224)

data_open_save(data_path, new_path, img_size=img_size, data_split=True, data_split_rate=[0.7, 0.15, 0.15])