In [31]:
import os
import torch
import matplotlib.pyplot as plt
import pytorch_lightning as pl
import segmentation_models_pytorch as smp

from pprint import pprint
from torch.utils.data import DataLoader
import cv2
import numpy as np
from torch.utils.data import Dataset
from pathlib import Path
import shutil
from os.path import exists, join
import json
import boto3
from concurrent.futures import ThreadPoolExecutor, as_completed

In [5]:
class BaseDataset(Dataset):
    """Segmentation dataset pairing every image with the mask of the same file name.

    Images are read from ``images_dir`` and masks from ``masks_dir``; the mask
    for an image is looked up under the image's own file name. Masks are loaded
    as grayscale and clipped to ``{0, 1}``, so every non-zero pixel counts as
    foreground.

    Args:
        images_dir: Directory containing the input images.
        masks_dir: Directory containing one mask per image, with identical file names.
        classes: Class names to use; each must appear in ``CLASSES``
            (case-insensitive). ``None`` selects every entry of ``CLASSES``.
        augmentation: Optional callable invoked as ``augmentation(image=..., mask=...)``
            returning a mapping with ``'image'`` and ``'mask'`` keys.
        preprocessing: Optional callable with the same calling convention,
            applied after ``augmentation``.

    Raises:
        ValueError: If a requested class name is not listed in ``CLASSES``.
    """

    CLASSES = ['kozijn']

    def __init__(
            self,
            images_dir,
            masks_dir,
            classes=None,
            augmentation=None,
            preprocessing=None,
    ):
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> file mapping stable so that seeded splits are reproducible.
        self.ids = sorted(
            image_id
            for image_id in os.listdir(images_dir)
            if image_id != '.ipynb_checkpoints'
        )
        self.images_fps = [os.path.join(images_dir, image_id) for image_id in self.ids]
        self.masks_fps = [os.path.join(masks_dir, image_id) for image_id in self.ids]

        # The signature advertises classes=None, so it must not crash on it.
        if classes is None:
            classes = self.CLASSES

        # convert str names to class values on masks
        self.class_values = [self.CLASSES.index(cls.lower()) for cls in classes]

        self.augmentation = augmentation
        self.preprocessing = preprocessing

    def __getitem__(self, i):
        """Load, then optionally augment and preprocess, the ``i``-th (image, mask) pair.

        Raises:
            FileNotFoundError: If the image or the mask cannot be read.
        """
        image_path, mask_path = self.images_fps[i], self.masks_fps[i]

        # cv2.imread reports failure by returning None instead of raising,
        # which would otherwise surface later as an unrelated-looking error.
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        mask = cv2.imread(mask_path, 0)  # flag 0 -> single-channel grayscale
        if mask is None:
            raise FileNotFoundError(f"Could not read mask: {mask_path}")
        # Binarize: every non-zero pixel becomes 1. self.class_values is not
        # consulted here because the mask is treated as a single binary class.
        mask = np.clip(mask, 0, 1)

        # apply augmentations
        if self.augmentation:
            sample = self.augmentation(image=image, mask=mask)
            image, mask = sample['image'], sample['mask']

        # apply preprocessing
        if self.preprocessing:
            sample = self.preprocessing(image=image, mask=mask)
            image, mask = sample['image'], sample['mask']

        return image, mask

    def get_filename(self, i):
        """Return the ``(image_path, mask_path)`` strings of the ``i``-th sample."""
        return self.images_fps[i], self.masks_fps[i]

    def __len__(self):
        """Return the number of image/mask pairs."""
        return len(self.ids)

In [6]:
# Source folders: input images and their masks (BaseDataset looks each mask up
# under the same file name as its image).
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
x_train_dir = '/Users/zed/viscode-github/window-frame-types/notebooks/dataset_for_ml/images/'
y_train_dir = '/Users/zed/viscode-github/window-frame-types/notebooks/dataset_for_ml/labels/'

In [7]:
# Index every image/mask pair in the source folders; 'kozijn' is the only class BaseDataset defines.
dataset = BaseDataset(x_train_dir, y_train_dir, classes=['kozijn'])

In [8]:
# Total number of samples; random_split requires its lengths to add up to exactly this value.
len(dataset)

1081

In [11]:
# Hold out fixed-size validation and test sets; the training set takes whatever
# remains, so the split stays valid when images are added or removed
# (a hard-coded train size makes random_split raise once len(dataset) changes).
VALID_SIZE = 200
TEST_SIZE = 100
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [len(dataset) - VALID_SIZE - TEST_SIZE, VALID_SIZE, TEST_SIZE],
    generator=torch.Generator().manual_seed(42),  # fixed seed -> reproducible split
)

In [12]:
# Keep only the first 20 indices of each split for a small trial run.
# NOTE: from here on the *_dataset names hold plain lists of indices into
# `dataset`, no longer the objects returned by random_split.
train_dataset = train_dataset.indices[:20]
valid_dataset = valid_dataset.indices[:20]
test_dataset = test_dataset.indices[:20]

In [17]:
# Show the file pair behind the first test index. test_dataset may be either the
# object returned by random_split (which has .indices) or, after the slicing
# cell, a plain list of indices; the getattr fallback handles both, whereas
# `.indices` raised AttributeError on the list.
for i in getattr(test_dataset, "indices", test_dataset)[:1]:
    print(dataset.get_filename(i))
          

('/Users/zed/viscode-github/window-frame-types/notebooks/dataset_for_ml/images/5014_43.jpg', '/Users/zed/viscode-github/window-frame-types/notebooks/dataset_for_ml/labels/5014_43.jpg')


In [13]:
def create_train_val_test_folders(output_dir, version=1):
    """
    Generates folders in output_dir path,
    using the S3 database structure for datasets::

        <output_dir>/<split>/<version>/images
        <output_dir>/<split>/<version>/segmentations

    for each split in train, val and test.

    Missing parent directories (including output_dir itself) are created and
    existing directories are left untouched, so the function can be called
    repeatedly, e.g. when the notebook is re-run.

    Args:
        output_dir: Root directory of the dataset tree.
        version: Dataset version, used as the folder name below each split.
    """
    out = Path(output_dir)
    for split in ("train", "val", "test"):
        for leaf in ("images", "segmentations"):
            # parents/exist_ok replace the chain of bare os.mkdir calls, which
            # failed on a missing output_dir and on any second invocation.
            (out / split / f"{version}" / leaf).mkdir(parents=True, exist_ok=True)
    

In [14]:
def move_images_and_masks_into_folders(dataset_dir, dataset, train_indexes, val_indexes, test_indexes, version=1):
    """Move each sample's image and mask into its split folder under ``dataset_dir``.

    Images go to ``<dataset_dir>/<split>/<version>/images`` and masks to
    ``<dataset_dir>/<split>/<version>/segmentations``, keeping their file names.
    Target folders are created when missing.

    NOTE: files are moved, not copied, so the source folders lose them and the
    call cannot be repeated for the same indexes.

    Args:
        dataset_dir: Root directory of the dataset tree.
        dataset: Object whose ``get_filename(index)`` returns ``(image_path, mask_path)``.
        train_indexes: Dataset indexes that belong to the train split.
        val_indexes: Dataset indexes that belong to the val split.
        test_indexes: Dataset indexes that belong to the test split.
        version: Dataset version, used as the folder name below each split.
    """
    splits = (
        ("train", train_indexes),
        ("val", val_indexes),
        ("test", test_indexes),
    )

    for split_name, indexes in splits:
        split_root = Path(dataset_dir) / split_name / f"{version}"
        images_dir = split_root / "images"
        masks_dir = split_root / "segmentations"
        # Moving into a missing folder raises; create the targets up front.
        images_dir.mkdir(parents=True, exist_ok=True)
        masks_dir.mkdir(parents=True, exist_ok=True)

        for index in indexes:
            image_path, mask_path = dataset.get_filename(index)
            # os.path.basename handles the platform's separators, unlike split("/")
            shutil.move(str(image_path), str(images_dir / os.path.basename(image_path)))
            shutil.move(str(mask_path), str(masks_dir / os.path.basename(mask_path)))

In [15]:
def create_jsonlines(samples_list, output_dir: str):
    """Write ``dataset.jsonlines`` into ``output_dir``, one JSON object per sample.

    Each line has the form ``{"image": <name>, "segmentation": <name>}``. A
    sample is written only when its image exists in ``<output_dir>/images`` and
    its mask exists in ``<output_dir>/segmentations``; other samples are
    skipped silently.

    Args:
        samples_list: Mapping of image file name to segmentation file name.
        output_dir: Split folder that holds ``images`` and ``segmentations``.
    """
    root = Path(output_dir)
    image_root = root / "images"
    mask_root = root / "segmentations"

    with open(join(root, "dataset.jsonlines"), "w") as handle:
        handle.writelines(
            json.dumps({"image": image_name, "segmentation": mask_name}) + "\n"
            for image_name, mask_name in samples_list.items()
            if exists(image_root / image_name) and exists(mask_root / mask_name)
        )

In [16]:
def create_metajson_with_classmap(class_map: dict, detection_type: str, output_dir: str):
    """Write ``meta.json`` into ``output_dir``.

    The file holds a single JSON object with two keys: ``classmap`` (the given
    ``class_map``) and ``detection_type``.

    Args:
        class_map: Mapping stored under the ``classmap`` key.
        detection_type: Value stored under the ``detection_type`` key.
        output_dir: Existing directory that receives ``meta.json``.
    """
    payload = {"classmap": class_map, "detection_type": detection_type}
    target = Path(output_dir) / "meta.json"

    with open(target, "w") as handle:
        handle.write(json.dumps(payload))

In [17]:
def create_samples_list(dataset, indexes):
    """Map image file names to mask file names for the given dataset indexes.

    Args:
        dataset: Object whose ``get_filename(index)`` returns
            ``(image_path, mask_path)`` as strings or path-like objects.
        indexes: Iterable of dataset indexes to include.

    Returns:
        dict: ``{image_file_name: mask_file_name}`` with directory parts stripped.
    """
    samples_list = {}
    for index in indexes:
        image_path, mask_path = dataset.get_filename(index)
        # os.path.basename understands the platform's separators and accepts
        # path-like objects, which split("/")[-1] did not.
        samples_list[os.path.basename(image_path)] = os.path.basename(mask_path)
    return samples_list

In [18]:
def create_jsonlines_train_val_test(output_dir, dataset, train_indices, val_indices, test_indices, version=1):
    """Write one ``dataset.jsonlines`` per split under ``<output_dir>/<split>/<version>``.

    For each of train, val and test, the file-name mapping is built with
    ``create_samples_list`` and written with ``create_jsonlines``.

    Args:
        output_dir: Root directory of the dataset tree.
        dataset: Object passed through to ``create_samples_list``.
        train_indices: Dataset indexes of the train split.
        val_indices: Dataset indexes of the val split.
        test_indices: Dataset indexes of the test split.
        version: Dataset version, used as the folder name below each split.
    """
    indices_by_split = (
        ("train", train_indices),
        ("val", val_indices),
        ("test", test_indices),
    )

    for split_name, split_indices in indices_by_split:
        split_samples = create_samples_list(dataset, split_indices)
        create_jsonlines(split_samples, Path(output_dir) / split_name / f"{version}")

In [19]:
def create_metajson_train_val_test(class_map: dict, detection_type: str, output_dir: str, version = 1):
    """Write the same ``meta.json`` into the train, val and test version folders.

    Delegates to ``create_metajson_with_classmap`` once per split, targeting
    ``<output_dir>/<split>/<version>``.
    """
    version_folder = f"{version}"
    for split_name in ("train", "val", "test"):
        split_dir = Path(output_dir) / split_name / version_folder
        create_metajson_with_classmap(class_map, detection_type, split_dir)
    

In [36]:
def _upload_files_concurrently(client, uploads, label, max_workers=10):
    """Run ``client.upload_file(*args)`` for every tuple in ``uploads`` on a thread pool.

    Progress is printed after every 20 finished uploads and once more when the
    last upload finishes. An exception raised by any upload propagates to the caller.
    """
    total = len(uploads)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(client.upload_file, *upload) for upload in uploads]
        for finished, future in enumerate(as_completed(futures), start=1):
            # .result() re-raises whatever the worker thread raised
            future.result()
            if finished % 20 == 0 or finished == total:
                print(f"uploaded {finished}/{total} {label}")


def upload_from_local_to_s3(root_dir_path, bucket, version=1, upload_segmentations=False):
    """Upload the local train/val/test dataset tree to an S3 bucket.

    For every split the function uploads ``meta.json``, ``dataset.jsonlines``
    and all files in ``images`` (entries whose name contains ``.ipynb`` are
    skipped). Object keys mirror the local layout below the last component of
    ``root_dir_path``: ``<root name>/<split>/<version>/...``, always with
    forward slashes.

    Args:
        root_dir_path: Local root of the dataset tree; a trailing slash is tolerated.
        bucket: Name of the target S3 bucket.
        version: Dataset version, i.e. the folder name below each split.
        upload_segmentations: When True, the files in ``segmentations`` are
            uploaded as well. Defaults to False, which matches the previous
            behaviour of uploading images only.
            NOTE(review): ``dataset.jsonlines`` references the segmentation
            files, so consumers probably need them in the bucket — confirm
            whether True should become the default.
    """
    root = Path(root_dir_path)
    # Path.name ignores a trailing slash, whereas os.path.basename("a/b/") is ''
    # and would silently drop the dataset name from every object key.
    key_root = Path(root.name)

    client = boto3.client('s3')

    subfolders = ["images"]
    if upload_segmentations:
        subfolders.append("segmentations")

    for d in ['train', 'val', 'test']:
        local_dir = root / d / f"{version}"
        key_dir = key_root / d / f"{version}"

        meta_file = local_dir / "meta.json"
        # as_posix(): S3 keys use '/', independent of the local OS separator
        meta_object = (key_dir / "meta.json").as_posix()
        print(str(meta_file))
        print(meta_object)
        client.upload_file(str(meta_file), bucket, meta_object)

        jsonlines_file = local_dir / "dataset.jsonlines"
        jsonlines_object = (key_dir / "dataset.jsonlines").as_posix()
        client.upload_file(str(jsonlines_file), bucket, jsonlines_object)

        for subfolder in subfolders:
            uploads = [
                (
                    str(local_dir / subfolder / file_name),
                    bucket,
                    (key_dir / subfolder / file_name).as_posix(),
                )
                for file_name in sorted(os.listdir(local_dir / subfolder))
                if '.ipynb' not in file_name
            ]
            _upload_files_concurrently(client, uploads, subfolder)



In [34]:
# with ThreadPoolExecutor(max_workers=10) as executor:
#         completed = 0

#         futures = []
#         for download_todo in download_list:
#             futures.append(
#                 executor.submit(client.download_file, *download_todo)
#             )

#         for i, done in enumerate(as_completed(futures)):
#             done.result()

#             if (i + 1) % 50 == 0:
#                 completed += 50
#                 logger.info(
#                     f"downloaded {completed}/{len(download_list)} imgs & labls"
#                 )

In [21]:
# Local root of the exported dataset; upload_from_local_to_s3 uses its last
# path component as the prefix of every S3 object key.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
root_dir_path = '/Users/zed/viscode-github/window-frame-types/notebooks/dataset_for_ml/kozijns-segmentations'
# NOTE(review): `version` is not referenced by the visible cells below, which pass 1 literally.
version =1

/Users/zed/viscode-github/window-frame-types/notebooks/dataset_for_ml/kozijns-segmentations/train/1/meta.json
kozijns-segmentations/train/1/meta.json


In [73]:
# NOTE(review): `file_name` is not defined in any visible cell, so this line
# raises NameError on a fresh kernel; it looks like leftover scratch code —
# confirm and remove.
object_name = os.path.basename(file_name)


In [69]:
#create_train_val_test_folders('/Users/zed/viscode-github/window-frame-types/notebooks/dataset_for_ml/kozijns-segmentations')

In [22]:
# Reuse root_dir_path instead of repeating the absolute path literal.
# NOTE: this moves the files out of the source folders, so it can only run
# once per source dataset. The *_dataset arguments are lists of indices here.
move_images_and_masks_into_folders(root_dir_path, dataset, train_dataset, valid_dataset, test_dataset)

In [67]:
# Reuse root_dir_path instead of repeating the absolute path literal.
# Must run after the files were moved: only samples present in the split
# folders are written to dataset.jsonlines.
create_jsonlines_train_val_test(root_dir_path, dataset, train_dataset, valid_dataset, test_dataset)

In [68]:
# Pixel value -> class name mapping stored in meta.json; 0/1 lines up with the
# masks that BaseDataset clips to {0, 1}.
classmap = {"background": 0, "kozijn": 1}

In [69]:
# Reuse root_dir_path instead of repeating the absolute path literal.
create_metajson_train_val_test(classmap, 'segmentation', root_dir_path)

In [37]:
# Upload version 1 of the exported dataset to the "spotr-datasets" bucket.
# Network side effect: the function creates a boto3 S3 client with its default configuration.
upload_from_local_to_s3(root_dir_path, "spotr-datasets" ,1)

/Users/zed/viscode-github/window-frame-types/notebooks/dataset_for_ml/kozijns-segmentations/train/1/meta.json
kozijns-segmentations/train/1/meta.json
uploaded 20/20 images
/Users/zed/viscode-github/window-frame-types/notebooks/dataset_for_ml/kozijns-segmentations/val/1/meta.json
kozijns-segmentations/val/1/meta.json
uploaded 20/20 images
/Users/zed/viscode-github/window-frame-types/notebooks/dataset_for_ml/kozijns-segmentations/test/1/meta.json
kozijns-segmentations/test/1/meta.json
uploaded 20/20 images
