# Training pipeline

Dataset: https://www.kaggle.com/datasets/paultimothymooney/breast-histopathology-images

## Import modules

In [1]:
import shutil
from pathlib import Path

from MAIN.create_dataset import split_and_move_images
from augmentation import Augmentator
from dataset_preprocessing import ArchivePreprocessor
from extender import Extender
from images_preprocessing import ImagePreprocessor
from info import DatasetInformer
from training_preper import prepare_training

## Flatten dataset

In [5]:
# Run the archive preprocessor on the raw Kaggle download, writing into ../data/flatten.
# The recorded progress bars iterate over patients, then classes, then images.
# NOTE(review): the meaning of the "D0" argument is not visible here — presumably a
# dataset tag; confirm against ArchivePreprocessor.
archive_preprocessor = ArchivePreprocessor(
    Path("../data/backup/archive/IDC_regular_ps50_idx5"),
    Path("../data/flatten"),
    "D0",
)
archive_preprocessor.preprocess_dataset()

  0%|          | 0/279 [00:00<?, ?patient/s]
  0%|          | 0/2 [00:00<?, ?class/s][A

  0%|          | 0/479 [00:00<?, ?image/s][A[A

 11%|█         | 51/479 [00:00<00:01, 302.53image/s][A[A

 17%|█▋        | 82/479 [00:00<00:01, 211.68image/s][A[A

 22%|██▏       | 104/479 [00:00<00:02, 144.27image/s][A[A

 25%|██▌       | 120/479 [00:00<00:03, 105.94image/s][A[A

 28%|██▊       | 132/479 [00:02<00:10, 31.86image/s] [A[A

 29%|██▉       | 140/479 [00:02<00:09, 35.26image/s][A[A

 40%|████      | 192/479 [00:02<00:03, 81.04image/s][A[A

 45%|████▍     | 214/479 [00:02<00:03, 86.29image/s][A[A

 48%|████▊     | 232/479 [00:03<00:03, 72.60image/s][A[A

 51%|█████▏    | 246/479 [00:03<00:03, 71.68image/s][A[A

 54%|█████▍    | 258/479 [00:03<00:04, 47.71image/s][A[A

 56%|█████▌    | 267/479 [00:04<00:05, 37.25image/s][A[A

 57%|█████▋    | 274/479 [00:04<00:05, 36.73image/s][A[A

 59%|█████▉    | 283/479 [00:04<00:04, 41.46image/s][A[A

 61%|██████    | 

## Get information about dataset

In [None]:
# Inspect the flattened dataset and print its summary.
di = DatasetInformer(Path("../data/flatten"))
print(di)

# Class imbalance: abundance of class "0" divided by abundance of class "1".
class_share = di.percentage_abundance
ratio = class_share["0"] / class_share["1"]
print("RATIO 0 / 1:", ratio)

## Preprocess images 

In [6]:
# Apply RGB histogram equalization to every flattened image and store the result
# in ../data/preprocessed.
# (A disabled variant of this cell ran the same steps from ../data/large_2 into ../data/l_2_p.)
flatten_dir = Path("../data/flatten")
preprocessed_dir = Path("../data/preprocessed")

ip = ImagePreprocessor(flatten_dir, preprocessed_dir)
ip.add_function(ip.equalize_rgb_histogram)
# NOTE(review): the meaning of the positional False flag is not visible here — confirm
# against ImagePreprocessor.preprocess_images.
ip.preprocess_images(False)



Preprocessing images: 100%|██████████| 277524/277524 [14:14<00:00, 324.79 image/s]


## Balance Data

In [7]:
def _mirror_augmentator(key, mirror_type):
    """Build an Augmentator for stack `key` holding one mirror_image(type_=mirror_type) step."""
    augmentator = Augmentator([key])
    augmentator.add_function(augmentator.mirror_image(type_=mirror_type), stack_key=key)
    return augmentator


# One augmentator per mirror type; the stack key matches the augmentator number.
# NOTE(review): augmentator1 is built but never handed to an Extender in this cell.
augmentator1 = _mirror_augmentator(1, 0)
augmentator2 = _mirror_augmentator(2, 1)
augmentator3 = _mirror_augmentator(3, 2)

# Rotation (angle=5 / angle=-5) combined with stretch (factor_width=1.1, factor_height=1.2)
# under stack keys 4 and 5 is left disabled.

preprocessed_dir = Path("../data/preprocessed")
ext_dir = Path("../data/ext")

# Extend class "1" in two passes, both reading ../data/preprocessed and writing ../data/ext.
# In the recorded run the passes reported 78786 and 41165 progress-bar iterations.
# NOTE(review): the factors 2 and 1.5225 look hand-derived from the class ratio — confirm.
extender1 = Extender(preprocessed_dir, ext_dir, augmentator3)
extender1.extend_class("1", 2)
extender2 = Extender(preprocessed_dir, ext_dir, augmentator2)
extender2.extend_class("1", 1.5225)

# Extending class "0" in place inside ../data/flatten by 4/ratio is left disabled.

100%|██████████| 78786/78786 [03:52<00:00, 339.12it/s]
0it [00:00, ?it/s]
100%|██████████| 41165/41165 [01:24<00:00, 486.67it/s]


In [9]:
# Merge any augmented images still waiting in ../data/ext into ../data/preprocessed
# before building the training folder. The notebook's own merge cell is listed after
# this one, so a top-to-bottom run would otherwise call prepare_training without the
# augmented class "1" images. The recorded run moved 397475 images, which equals the
# 277524 preprocessed images plus the 78786 and 41165 augmented ones.
pending_ext_dir = Path("../data/ext")
if pending_ext_dir.is_dir():  # nothing to merge when augmentation was skipped
    # Snapshot the listing: entries are removed from the directory while we loop.
    for pending_file in list(pending_ext_dir.iterdir()):
        shutil.move(pending_file, Path("../data/preprocessed") / pending_file.name)

# Hand the preprocessed (and now merged) images to prepare_training, targeting
# ../data/ready_small/train.
prepare_training(Path("../data/preprocessed"), Path("../data/ready_small/train"))
# prepare_training(Path("../data/l_3_p"), Path("../data/l_3_ready/train"))

Moving images: 100%|██████████| 397475/397475 [16:51<00:00, 393.10 image/s]


In [8]:
# Move every entry of ../data/ext into ../data/preprocessed, keeping its file name.
# The recorded execution counts show this cell (In [8]) ran before the
# prepare_training cell (In [9]) even though it is listed after it.
preprocessed_dir = Path("../data/preprocessed")
# Snapshot the listing first: iterdir() is lazy, and what it yields is unspecified
# when entries are removed from the directory while it is being consumed.
for file in list(Path("../data/ext").iterdir()):
    shutil.move(file, preprocessed_dir / file.name)

In [3]:
# Move images out of the ready_small folder into ready_small_test using shutil.move.
# The recorded run shows two progress bars of 10000 items each.
# NOTE(review): presumably this carves out the test split and 0 / 10000 bound the
# images taken — confirm against split_and_move_images.
# NOTE(review): these are absolute, machine-specific paths, unlike the relative
# ../data paths used by the other cells — confirm they point at the same data root.
target_folder_path = Path("C:/STUDIA/data/ready_small_test")
split_and_move_images(
    target_folder_path,
    0,
    10000,
    fcn=shutil.move,
    source_folder=Path("C:/STUDIA/data/ready_small"),
)


100%|██████████| 10000/10000 [00:04<00:00, 2076.96it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2015.05it/s]
