<a href="https://colab.research.google.com/github/HubertDomaros/ai-capstone-proj/blob/dev/pytorch_tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prompt: import private dataset from https://www.kaggle.com/datasets/arct22/codebrim-original, use username and password from colab secrets

import os

if not os.path.exists(os.path.join(os.getcwd(), 'kaggle/input/codebrim-original')):
    from google.colab import userdata

    # Replace with your actual Kaggle username and key
    username = userdata.get('KAGGLE_USERNAME')
    key = userdata.get('KAGGLE_KEY')

    if username is None or key is None:
      raise ValueError("Please set KAGGLE_USERNAME and KAGGLE_KEY in Colab secrets.")

    os.environ['KAGGLE_USERNAME'] = username
    os.environ['KAGGLE_KEY'] = key

    # Download the dataset
    !kaggle datasets download -d arct22/codebrim-original -p /content/datasets

    !ls /content/datasets

    # Unzip the downloaded dataset (assuming it is a zip file)
    import zipfile
    with zipfile.ZipFile('/content/datasets/codebrim-original.zip', 'r') as zip_ref:
        zip_ref.extractall('/content/kaggle/input/codebrim-original')

    !ls /content/kaggle/input/codebrim-original

In [2]:
# prompt: clone https://github.com/HubertDomaros/ai-capstone-proj/tree/dev

#!git clone https://github.com/HubertDomaros/ai-capstone-proj.git --branch dev

In [3]:
# # prompt: move contents of ai-capstone-proj to main working dir

# import shutil
# import os

# # Assuming 'ai-capstone-proj' directory exists after git clone
# source_dir = "/content/ai-capstone-proj"
# destination_dir = "/content"

# # Iterate through the contents of the source directory
# for item in os.listdir(source_dir):
#     source_path = os.path.join(source_dir, item)
#     destination_path = os.path.join(destination_dir, item)

#     # Check if the item is a file or directory and move accordingly
#     if os.path.isfile(source_path):
#         shutil.move(source_path, destination_dir)  # Move the file directly
#     elif os.path.isdir(source_path):
#         if os.path.exists(destination_path):
#             # Handle the case where a directory with the same name exists
#             print(f"Directory '{item}' already exists in destination. Skipping.")
#             # You may choose to merge the contents, delete the destination directory first, or skip.
#             # Example of merging:
#             # for subitem in os.listdir(source_path):
#             #     shutil.move(os.path.join(source_path, subitem), destination_path)
#         else:
#             shutil.move(source_path, destination_dir) # Move the directory

# # Remove the original cloned repository if needed
# !rm -rf /content/ai-capstone-proj


In [4]:
# # prompt: remove sample_data folder from working dir

# import os
# from google.colab import userdata
# import zipfile
# import shutil

# # Remove sample_data
# !rm -rf /content/sample_data

In [5]:
import os
from multiprocessing import Pool, cpu_count

from src import annotation_extractor as an_ext
from src import image_processing as improc
from src import utils as u
from src import constants as c

import pandas as pd
import numpy as np
import cv2

  check_for_updates()


In [6]:
# Resolve the dataset location relative to the working directory, then build
# the annotations table from the files in its 'annotations' sub-folder.
dataset_path = os.path.join(
    os.getcwd(),
    'kaggle/input/codebrim-original/original_dataset',
)
annotations_path = os.path.join(dataset_path, 'annotations')
annotations_df = an_ext.xml_annotations_to_dataframe(annotations_path)

In [7]:
# Order the rows by image file name. The sorted frame is re-bound to the same
# name instead of using inplace=True, so the cell is a plain
# "new value from old value" step and stays safe to re-run.
annotations_df = annotations_df.sort_values(by='img')
annotations_df

Unnamed: 0,img,width,height,xmin,ymin,xmax,ymax,Background,Crack,Spallation,Efflorescence,ExposedBars,CorrosionStain
1765,image_0000005.jpg,1904,2856,661,472,992,1857,0,0,0,1,0,1
1766,image_0000005.jpg,1904,2856,1507,505,1904,2856,0,0,0,1,0,1
3093,image_0000021.jpg,1904,2856,253,8,335,2108,0,0,0,1,0,0
1230,image_0000028.jpg,1752,2632,1,1,907,1043,0,0,0,1,0,1
1231,image_0000028.jpg,1752,2632,1173,1,1747,1077,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,image_0001599.jpg,4608,3456,3339,734,3736,2826,0,0,0,0,0,1
701,image_0001599.jpg,4608,3456,2092,471,2416,1178,0,0,0,0,0,1
708,image_0001599.jpg,4608,3456,1002,1097,2931,3456,0,0,0,1,0,1
704,image_0001599.jpg,4608,3456,2627,463,3066,2000,0,0,0,0,0,1


In [8]:
# Path of the 'augmented_imgs' folder inside the dataset directory.
# NOTE(review): the pipeline cell further down rebuilds the same path as
# `out_path`, so this name appears unused in the visible cells - confirm.
aug_img_folder = os.path.join(dataset_path, 'augmented_imgs')

In [9]:
def augument_image_with_metadata(input_file_path: str, output_folder: str, metadata: pd.DataFrame) -> dict:
    """Augment one image, write every variant to disk and collect their metadata.

    Args:
        input_file_path: Path of the source image handed to
            ``improc.generate_augmented_images``.
        output_folder: Directory the augmented images are written into. It is
            created when it does not exist yet.
        metadata: Rows describing this image. Must contain the columns listed
            in ``c.bbox_coordinate_names`` and ``c.defect_names``; both column
            groups are cast to ``int``.

    Returns:
        The result of ``u.unpack_lists`` applied to the ``metadata_dict`` of
        every augmented image, or an empty dict when no augmented image was
        produced.
    """
    bboxes = metadata[c.bbox_coordinate_names].astype(int).to_numpy()
    # One label list per annotation row, in the column order of c.defect_names.
    multi_hot_encoded_labels = [list(x) for x in metadata[c.defect_names].astype(int).to_numpy()]

    augmented_images: list[improc.ImageAugumentor] = improc.generate_augmented_images(image_path=input_file_path,
                                                                                      bounding_boxes=bboxes,
                                                                                      label_values=multi_hot_encoded_labels,
                                                                                      resize=True, out_width=512,
                                                                                      out_height=512)

    # cv2.imwrite does not create missing directories; it only returns False,
    # so without this the images would be dropped silently.
    os.makedirs(output_folder, exist_ok=True)

    out_dict_list = []
    for augmented_image in augmented_images:
        cv2.imwrite(os.path.join(output_folder, augmented_image.processed_image_name), augmented_image.processed_image)
        out_dict_list.append(augmented_image.metadata_dict)

    # Indexing out_dict_list[0] below would raise IndexError on an empty list.
    if not out_dict_list:
        return {}

    return u.unpack_lists(out_dict_list, out_dict_list[0].keys())

In [10]:
# prompt: write a function to delete files inside directory

import os

def delete_files_in_directory(directory_path):
    """Deletes all regular files directly inside a specified directory.

    Sub-directories and their contents are left untouched. Progress and
    failures are reported with ``print``; a file that cannot be removed does
    not stop the remaining files from being processed.

    Args:
        directory_path: The path to the directory whose files should be deleted.
    """
    if not os.path.exists(directory_path):
        print(f"Directory '{directory_path}' does not exist.")
        return

    if not os.path.isdir(directory_path):
        # os.listdir would raise NotADirectoryError for an existing non-directory path.
        print(f"'{directory_path}' is not a directory.")
        return

    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            continue
        try:
            os.remove(file_path)
        except OSError as e:
            # Best effort: report the failure and carry on with the next file.
            print(f"Error deleting file '{file_path}': {e}")
        else:
            print(f"Deleted file: {file_path}")



In [11]:
#delete_files_in_directory(out_path)

NameError: name 'out_path' is not defined

In [None]:
import os
import pandas as pd
from multiprocessing import Pool, cpu_count
from src import annotation_extractor as an_ext
from src import image_processing as improc
from src import utils as u
from src import constants as c
import cv2

def augument_image_with_metadata(input_file_path: str, output_folder: str, metadata: pd.DataFrame) -> dict:
    """Augment one image, write every variant to disk and collect their metadata.

    Args:
        input_file_path: Path of the source image handed to
            ``improc.generate_augmented_images``.
        output_folder: Directory the augmented images are written into. It is
            created when it does not exist yet.
        metadata: Rows describing this image. Must contain the columns listed
            in ``c.bbox_coordinate_names`` and ``c.defect_names``; both column
            groups are cast to ``int``.

    Returns:
        The result of ``u.unpack_lists`` applied to the ``metadata_dict`` of
        every augmented image, or an empty dict when no augmented image was
        produced.
    """
    bboxes = metadata[c.bbox_coordinate_names].astype(int).to_numpy()
    # One label list per annotation row, in the column order of c.defect_names.
    multi_hot_encoded_labels = [list(x) for x in metadata[c.defect_names].astype(int).to_numpy()]

    augmented_images: list[improc.ImageAugumentor] = improc.generate_augmented_images(image_path=input_file_path,
                                                                                      bounding_boxes=bboxes,
                                                                                      label_values=multi_hot_encoded_labels,
                                                                                      resize=True, out_width=512,
                                                                                      out_height=512)

    # cv2.imwrite does not create missing directories; it only returns False,
    # so without this the images would be dropped silently. exist_ok=True keeps
    # the call safe when several worker processes reach it at the same time.
    os.makedirs(output_folder, exist_ok=True)

    out_dict_list = []
    for augmented_image in augmented_images:
        cv2.imwrite(os.path.join(output_folder, augmented_image.processed_image_name), augmented_image.processed_image)
        out_dict_list.append(augmented_image.metadata_dict)

    # Nothing was generated: return an empty dict rather than indexing an empty list.
    if not out_dict_list:
        return {}

    return u.unpack_lists(out_dict_list, out_dict_list[0].keys())


def process_image(args):
    """Worker entry point: unpack one task tuple and augment the image it names.

    Args:
        args: Tuple ``(fname, interim_df, input_imgs_folder, output_imgs_folder)``;
            packed into a single argument so the function can be used with
            ``Pool.map``.

    Returns:
        Whatever ``augument_image_with_metadata`` returns for that image.
    """
    image_name, image_rows, source_dir, target_dir = args
    image_path = os.path.join(source_dir, str(image_name))
    print(f'Processing: {image_name}')
    return augument_image_with_metadata(image_path, target_dir, image_rows)


def augument_images_in_folder_parallel(input_imgs_folder, output_imgs_folder, metadata: pd.DataFrame) -> list[dict]:
    """Augment every image referenced in ``metadata`` using a process pool.

    The rows of ``metadata`` are grouped by the ``'img'`` column; each group
    becomes one task that is handed to ``process_image`` in a worker process.

    Args:
        input_imgs_folder: Folder containing the source images.
        output_imgs_folder: Folder the augmented images are written into.
        metadata: Annotation rows; must contain an ``'img'`` column and the
            columns listed in ``c.columns_list``.

    Returns:
        The result of ``u.unpack_lists`` applied to the non-empty per-image results.
        NOTE(review): the ``list[dict]`` annotation looks inconsistent with
        ``augument_image_with_metadata``, which annotates the same helper's
        result as ``dict`` - confirm against ``u.unpack_lists``.

    Raises:
        ValueError: If no image produced any augmentation metadata.
    """
    # One task per source image, carrying only the columns the worker needs.
    args_list = [
        (fname, interim_df[list(c.columns_list)], input_imgs_folder, output_imgs_folder)
        for fname, interim_df in metadata.groupby('img')
    ]

    with Pool(cpu_count()) as pool:
        out_dict_list = pool.map(process_image, args_list)

    # Workers return an empty dict for images without augmentations; drop those.
    out_dict_list = [d for d in out_dict_list if d]

    if not out_dict_list:
        raise ValueError(
            f'No augmentation metadata was produced for any image in {input_imgs_folder!r}; '
            'check the input folder and the annotations.'
        )

    return u.unpack_lists(out_dict_list, out_dict_list[0].keys())

# Resolve every dataset location relative to the working directory.
dataset_path = os.path.join(
    os.getcwd(),
    'kaggle/input/codebrim-original/original_dataset',
)
annotations_path = os.path.join(dataset_path, 'annotations')
inp_path = os.path.join(dataset_path, 'images')
out_path = os.path.join(dataset_path, 'augmented_imgs')

# Build the annotations table, then augment all annotated images in parallel.
annotations_df = an_ext.xml_annotations_to_dataframe(annotations_path)
aug_metadata = augument_images_in_folder_parallel(inp_path, out_path, annotations_df)

Processing: image_0000005.jpg
Processing: image_0000352.jpg
Processing: image_0000021.jpg
Processing: image_0000353.jpg
Processing: image_0000028.jpg
Processing: image_0000354.jpg
Processing: image_0000032.jpg
Processing: image_0000034.jpg
Processing: image_0000355.jpg
Processing: image_0000042.jpg
Processing: image_0000356.jpg
Processing: image_0000046.jpg
Processing: image_0000050.jpg
Processing: image_0000364.jpg
Processing: image_0000052.jpg
Processing: image_0000365.jpg
Processing: image_0000054.jpg
Processing: image_0000367.jpg
Processing: image_0000064.jpg
Processing: image_0000368.jpg
Processing: image_0000074.jpg
Processing: image_0000077.jpg
Processing: image_0000369.jpg
Processing: image_0000079.jpg
Processing: image_0000378.jpg
Processing: image_0000083.jpg
Processing: image_0000379.jpg
Processing: image_0000084.jpg
Processing: image_0000380.jpg
Processing: image_0000085.jpg
Processing: image_0000382.jpg
Processing: image_0000087.jpg
Processing: image_0000384.jpg
Processing

In [None]:
# prompt: save aug_metadata in main working dir as json

import json

# Serialise first, so a non-serialisable value fails before the output file
# is opened (and truncated).
aug_metadata_json = json.dumps(aug_metadata, indent=4)

# Write the JSON text to 'aug_metadata.json' in the current working directory.
with open('aug_metadata.json', 'w') as json_file:
    json_file.write(aug_metadata_json)