Approach:
Split all images into sub_images
    a. Combine all csvs into master
    b. Filter out undesirable classes
    c. Use csv to collect image (add image name to columns)
    d. Resize all images to same size
    e. PCA by category

In [22]:
# Load necessary libraries
import numpy as np
import pandas as pd
import os
import cv2
from pathlib import Path
import matplotlib.pyplot as plt

# Environment variables
Filepath variables and constants for data processing update these as necessary

In [5]:
# Location of individual CSVs for each lake
SRC_CSV_FOLDERS = [r"C:\Users\Welcome\OneDrive - University of Toronto\Sta2453\HURON_OverlapTiffsWithPP\HURONOvlerap_csv",
                     r"C:\Users\Welcome\OneDrive - University of Toronto\Sta2453\SIMC_OverlapTiffsWithPP\SIMC.Overlap.csv"]

# Location of tif mosaics for each lake
SRC_TIFF_FOLDERS = [r"C:\Users\Welcome\OneDrive - University of Toronto\Sta2453\HURON_OverlapTiffsWithPP",
                      r"C:\Users\Welcome\OneDrive - University of Toronto\Sta2453\SIMC_OverlapTiffsWithPP"]

# Destination for master CSV file and demosaiced tifs
DEST_CSV_FOLDER = r"C:\Users\Welcome\Documents\MScAC\STA2453\Project\CodeRepo\STA2453-Zooplankton\data\merged_csv"
DEST_TIFF_FOLDER = r"C:\Users\Welcome\Documents\MScAC\STA2453\Project\CodeRepo\STA2453-Zooplankton\data\combined_pics"

# CSV pre-processing
Merging all csvs together and adding a few helper columns

In [3]:
def merge_csv_files(SRC_CSV_FOLDERS, SRC_TIF_FOLDERS):
    """
    Merges all CSV files from the folders provided into a single dataframe. 
    Adds in new columns:
        - tiff_fp: folder containing the relevent tif file
        - csv_filepath: full filepath for the source csv file
        - lake: Source lake (currently hardcoded based on existing lake folder struct.)

    Args:
        SRC_FOLDERS (list): List of folders containing CSV files to merge

    Returns:
        pandas.DataFrame: DataFrame containing the merged CSV data

    Raises:
        FileNotFoundError: If any of the specified paths do not exist
        pd.errors.EmptyDataError: If any CSV files are empty
    
    """
    csvs = [] #Empty list to be populated with individual dfs for each csv file
    
    # Loop through each folder
    for i,filepath in enumerate(SRC_CSV_FOLDERS):
        folder_path = Path(filepath)
        tiff_path = SRC_TIF_FOLDERS[i]

        # Loop through each file within the given folder
        for file_path in folder_path.iterdir():
            if file_path.is_file():
                csv = pd.read_csv(file_path)
                csv['tiff_fp'] = tiff_path
                
                csv['csv_filepath'] = file_path.name[:-4]
                if filepath == SRC_CSV_FOLDERS[0]:
                    csv['lake'] = 'Huron'
                else:
                    csv['lake'] = 'Simcoe'
                csvs.append(csv)
    
    # Append all individual csvs together
    master_csv = pd.concat(csvs)
    return master_csv

# TIFF Demosaicing
Split out the combined tiff files into individual images for each of the samples

In [4]:
def crop_save_image(image, start_point, size, csv_file_name, particle_id):
    """
    Crop out sample from mosaic and save new image
    
    Args:
        image (numpy.ndarray): Mosaic containing sample
        start_point (tuple): (x,y) coordinates of top-left corner of crop region
        size (tuple): (width, height) dimensions of crop region
        csv_file_name (str): Name of CSV file associated with this image
        particle_id (int): Particle ID of sample being cropped
    
    """
    # Determine the bottom right of bounding box
    end_point = tuple(a + b for a, b in zip(start_point, size))
    
    # Crop image
    cropped_image = image[start_point[1]:end_point[1],start_point[0]:end_point[0]]
    
    # Save new image using csv and particle ID as the identifier
    tif_file = fr"{DEST_TIFF_FOLDER}\{csv_file_name}_{particle_id}.tif"
    cv2.imwrite(tif_file, cropped_image)

    return

def split_csv(master_df):
    """
    Split the sample out of the mosaic for every sample with the csv file
    
    Args:
        master_df (pandas.DataFrame): source file for samples, generated by merge_csv_files

    Returns:
        list: collection of all tiffs that were not found
   
    """

    prev_tiff = ""
    # Lists to track failed demosaicing
    failed_tiffs = [] # Cases where the tiff file wasn't found
    failed_other = [] # Cases where something else went wrong

    N = master_df.shape[0]
    # Iterate through each sample in the master df
    for i, row in master_df.iterrows():
        if i % 114562 == 0:
            print(f"Completed: {int(i*100/N)+1}%")
        # Load and crop tiff file
        try:
            tiff_filename_i = row['tiff_fp'] + "\\" + row['Image.File']
            # To improve run time only read in the the tif mosaic when it is a new mosaic
            if prev_tiff != row['Image.File']:
                image = cv2.imread(tiff_filename_i, cv2.IMREAD_GRAYSCALE)
                prev_tiff = row['Image.File']
            
            # Collect bounding box coords from csv
            loc = (row['Image.X'],row['Image.Y'])
            size = (row['Image.Width'],row['Image.Height'])
            csv_filename = row['csv_filepath']

            # If tiff doesn't exist skip trying to crop it
            if image is None:
                failed_tiffs.append([csv_filename,row['Particle.ID'],row['Image.File']])
                continue
            else:
                crop_save_image(image,loc,size,csv_filename,row['Particle.ID'])

        # in case of failure record missing tif
        except:
            failed_other.append([csv_filename,row['Particle.ID'],row['Image.File']])
    
    #return collection of all the tiff files that weren't found
    return failed_tiffs, failed_other

In [5]:
master_df = merge_csv_files(SRC_CSV_FOLDERS, SRC_TIFF_FOLDERS)

# Uncomment below line to save master_df
# master_df.to_csv(r'C:\Users\Welcome\Documents\MScAC\STA2453\Project\CodeRepo\STA2453-Zooplankton\data\merged_csv\combined_lakes.csv', index=False)

In [None]:
# Warning this call takes a very long time
failed_tiffs, failed_other = split_csv(master_df)

Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Completed: 1%
Comple

Below code cell made to output images for EDA writeup

In [21]:
try: 
    master_df
except:
    master_df = pd.read_csv(r'C:\Users\Welcome\Documents\MScAC\STA2453\Project\CodeRepo\STA2453-Zooplankton\data\merged_csv\combined_lakes.csv')
image_fp = master_df.iloc[0,:]['Image.File']


row = master_df.iloc[0,:]

loc = (row['Image.X'],row['Image.Y'])
size = (row['Image.Width'],row['Image.Height'])

end_point = tuple(a + b for a, b in zip(loc, size))

example_img = cv2.imread(SRC_TIFF_FOLDERS[0] + "\\" +image_fp)
cv2.rectangle(example_img, loc, end_point,color = [0,255,0], thickness = 2)

if False:
    cv2.imshow(winname="blah", mat= example_img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
else:
    cv2.imwrite(r"C:\Users\Welcome\Documents\MScAC\STA2453\Project\EDA Writeup\mosaic.jpg", example_img)
