# Find Fish Sex

In [1]:
# Imports
import os

# Address Memory Leak in K-means
# Set Before the Numeric Libraries are Imported: an OpenMP Runtime Reads
# OMP_NUM_THREADS when it Starts, so Setting it After numpy / scipy / sklearn
# are Loaded may Have No Effect
os.environ["OMP_NUM_THREADS"] = "1"

import csv
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from scipy import ndimage as ndi
from scipy.ndimage import measurements, center_of_mass, binary_dilation, zoom
from skimage.segmentation import clear_border
from skimage import measure
from skimage.measure import label, regionprops
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

In [2]:
# Select Videos
CONTOUR_VIDEO_SUFFIX = '_CONTOUR.mp4'


def is_source_video(filename):
    """Return True when *filename* is an input video for the contour step.

    The contour step writes its result as f"{video}_CONTOUR.mp4", which also
    ends in '.mp4'. Those generated files are excluded so that re-running this
    cell does not feed them back in as inputs.
    """
    return filename.endswith('.mp4') and not filename.endswith(CONTOUR_VIDEO_SUFFIX)


# Sorted so the Processing Order is the Same on Every Run
selected_videos = sorted(name for name in os.listdir('.') if is_source_video(name))

In [None]:
# Create NPY Files with Contours for Videos

# Output Settings
HUE_CSV_PATH = "hue_comparison.csv"
OUTPUT_FPS = 30

# OpenCV Stores 8-Bit Hue as 0-179, so Hue Differences Wrap at 180
HUE_WRAP = 180

# NOTE(review): cv2.VideoCapture.read() returns BGR frames, so COLOR_RGB2HSV
# treats the blue channel as red. It is kept as-is because the thresholds
# below were presumably chosen on hues from this conversion; switch to
# cv2.COLOR_BGR2HSV when the thresholds are re-tuned.
FRAME_TO_HSV = cv2.COLOR_RGB2HSV

# Contour Filters
MIN_CONTOUR_AREA = 300   # Removes Noise and Smaller Fish Tail Contours
MAX_CONTOUR_AREA = 4500  # Removes Tank Detections
MAX_ASPECT_RATIO = 1.5   # Bounding Box Width / Height Must be Below This
MORPH_KERNEL = np.ones((4, 4), np.uint8)

# Label Thresholds (Value Below Threshold is FEMALE, Otherwise MALE)
HUE_DIF_THRESHOLD = 19
HUE_STD_DEV_THRESHOLD = 10
HUE_RANGE_THRESHOLD = 50
BETWEEN_CLUSTER_THRESHOLD = 10.1
INTRA_CLUSTER_THRESHOLD = 5

# Columns of the Inspection CSV
HUE_CSV_HEADER = ["File Name",
                  "Final Hue Dif", "Hue Range", "Hue Std Dev",
                  "Img Seg Label", "Hue Range Label", "Hue Std Dev Label",
                  "Between Cluster Distance", "Intracluster Distances",
                  "Between Cluster Label", "Intracluster Label",
                  "Manual True Label",
                  "Img Seg Label Correct", "Hue Range Label Correct",
                  "Hue Std Dev Label Correct",
                  "Btwn Cluster Correct", "Intra Cluster Correct"
                  ]


def true_label_from_name(video_name):
    """Return 0 (female) or 1 (male) from the file name, or None if neither appears."""
    lowered = video_name.lower()
    # "female" is Tested First because it Contains "male"
    if "female" in lowered:
        return 0
    if "male" in lowered:
        return 1
    return None


def circular_hue_difference(first_hue, second_hue):
    """Return the shortest distance between two hues on the 0-179 hue circle.

    The inputs are converted to Python ints first: subtracting two uint8
    pixel values directly wraps modulo 256 instead of modulo 180.
    """
    gap = abs(int(first_hue) - int(second_hue)) % HUE_WRAP
    return min(gap, HUE_WRAP - gap)


def check_label(threshold, value, true_label):
    """Return (predicted_label, correct) for one feature.

    predicted_label is 0 (FEMALE) when value < threshold, otherwise 1 (MALE).
    correct is 1 when the prediction equals true_label, otherwise 0.
    """
    predicted = 0 if value < threshold else 1
    return predicted, 1 if true_label == predicted else 0


def find_fish_contours(hsv_frame):
    """Return the contours of *hsv_frame* that pass the area and aspect filters."""

    # Apply Blur, then Convert to Gray for Contouring
    blurred_img = cv2.GaussianBlur(hsv_frame, (3, 3), 0)
    gray_frame = cv2.cvtColor(blurred_img, cv2.COLOR_HSV2BGR)
    gray_frame = cv2.cvtColor(gray_frame, cv2.COLOR_BGR2GRAY)

    # Find Edges, then Dilate and Erode them for Better Contours
    edges = cv2.Canny(gray_frame, 50, 100)
    dilated_edges = cv2.dilate(edges, MORPH_KERNEL, iterations=2)
    eroded_dilated_edges = cv2.erode(dilated_edges, MORPH_KERNEL, iterations=1)

    # Invert the Edges so that Fish are Contoured instead of Sand
    inverted_edges = cv2.bitwise_not(eroded_dilated_edges)
    contours, _ = cv2.findContours(inverted_edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    kept = []
    for contour in contours:

        # Filter Out Small and Big Contours
        if not MIN_CONTOUR_AREA < cv2.contourArea(contour) < MAX_CONTOUR_AREA:
            continue

        # NOTE(review): the original comment said this filters out round-ish
        # contours (like sand), but the condition keeps width/height below
        # 1.5, which includes round ones. Behaviour is unchanged; confirm intent.
        _, _, w, h = cv2.boundingRect(contour)
        aspect_ratio = float(w) / h if h != 0 else 0
        if aspect_ratio < MAX_ASPECT_RATIO:
            kept.append(contour)
    return kept


def measure_contour_hues(hsv_frame, points):
    """Return (head_point, tail_point, features) for one smoothed contour.

    points is the (N, 2) float32 array of contour vertices as (x, y).
    features is (hue_dif, hue_range, hue_std_dev, between_cluster_distance,
    intra_cluster_distances), all computed from channel 0 of hsv_frame.
    """

    # Use PCA to Find the Head and Tail
    # PCA Usually Finds Longest Dimension of Object (Length of Fish)
    mean, eigenvectors = cv2.PCACompute(points, mean=None)
    projections = np.dot(points - mean, eigenvectors[0])

    # Plain Python ints Keep the Points Usable as Indices and cv2.circle Centers
    head_x, head_y = (int(v) for v in points[np.argmax(projections)])
    tail_x, tail_y = (int(v) for v in points[np.argmin(projections)])

    # Collect Hues from All Points (One Column, as KMeans Expects)
    cols = points[:, 0].astype(int)
    rows = points[:, 1].astype(int)
    hue_all_pts = hsv_frame[rows, cols, 0].reshape(-1, 1)

    # Cluster
    kmeans = KMeans(n_clusters=2, random_state=42)
    kmeans.fit(hue_all_pts)
    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_

    # Find Intra and Between Cluster Distances
    intra_cluster_distances = sum(
        np.mean(np.linalg.norm(hue_all_pts[labels == i] - centroids[i], axis=1))
        for i in range(2)
    )
    between_cluster_distance = np.linalg.norm(centroids[0] - centroids[1])

    # Take the SD and Range of the Hue Values
    hue_std_dev = np.std(hue_all_pts)
    hue_range = int(hue_all_pts.max()) - int(hue_all_pts.min())

    # Difference in Hue btwn Head and Tail, Measured Around the Hue Circle
    hue_dif = circular_hue_difference(hsv_frame[head_y, head_x, 0],
                                      hsv_frame[tail_y, tail_x, 0])

    features = (hue_dif, hue_range, hue_std_dev,
                between_cluster_distance, intra_cluster_distances)
    return (head_x, head_y), (tail_x, tail_y), features


file_count = 1
for video in selected_videos:

    # Create True Label Based on File Name (Once per Video)
    manual_true_label = true_label_from_name(video)
    if manual_true_label is None:
        print(f"Skipping {video}: File Name has Neither 'female' nor 'male'.")
        continue

    # Use OpenCV for Frame Processing
    cap = cv2.VideoCapture(video)
    frames = []
    csv_rows = []
    try:
        while cap.isOpened():

            # Get Frame unless End of Video is Reached
            ret, frame = cap.read()
            if not ret:
                break

            # Convert Color Space to HSV and Find the Contours
            hsv_frame = cv2.cvtColor(frame, FRAME_TO_HSV)
            contours = find_fish_contours(hsv_frame)

            # Needed to Save the Contours to Array
            contour_frame = frame.copy()

            # Smooth Contours for Less Jagged Edges
            smoothed_contours = []
            for contour in contours:
                epsilon = 0.004 * cv2.arcLength(contour, True)
                approx_contour = cv2.approxPolyDP(contour, epsilon, True)
                smoothed_contours.append(approx_contour)

                # Remove Extra "Wrapper Array" from OpenCV
                points = approx_contour.reshape(-1, 2).astype(np.float32)

                # Need At Least 2 Points for PCA and 2 Clusters
                if len(points) < 2:
                    continue

                head_point, tail_point, features = measure_contour_hues(hsv_frame, points)
                hue_dif, hue_range, hue_std_dev, between_cluster_distance, intra_cluster_distances = features

                # Label Each Feature and Check it Against the True Label
                hue_dif_label, hue_dif_correct = check_label(HUE_DIF_THRESHOLD, hue_dif, manual_true_label)
                hue_std_dev_label, hue_std_dev_correct = check_label(HUE_STD_DEV_THRESHOLD, hue_std_dev, manual_true_label)
                hue_range_label, hue_range_correct = check_label(HUE_RANGE_THRESHOLD, hue_range, manual_true_label)
                between_cluster_distance_label, between_cluster_distance_correct = check_label(BETWEEN_CLUSTER_THRESHOLD, between_cluster_distance, manual_true_label)
                intra_cluster_distance_label, intra_cluster_distance_correct = check_label(INTRA_CLUSTER_THRESHOLD, intra_cluster_distances, manual_true_label)

                # Queue Data for the CSV (Written Once per Video Below)
                csv_rows.append([video,
                                 hue_dif, hue_range, hue_std_dev,
                                 hue_dif_label, hue_range_label, hue_std_dev_label,
                                 between_cluster_distance, intra_cluster_distances,
                                 between_cluster_distance_label, intra_cluster_distance_label,
                                 manual_true_label,
                                 hue_dif_correct, hue_range_correct,
                                 hue_std_dev_correct,
                                 between_cluster_distance_correct, intra_cluster_distance_correct])

                # Draw Points on Frame for Reference
                cv2.circle(contour_frame, head_point, 5, (255, 0, 0), -1)
                cv2.circle(contour_frame, tail_point, 5, (0, 0, 0), -1)

            # Draw Contours
            cv2.drawContours(contour_frame, smoothed_contours, -1, (0, 0, 0), 2)

            # Append Frames
            frames.append(contour_frame)
    finally:
        # Release Resource Even if a Frame Fails
        cap.release()

    # Nothing to Save if the Video Could Not be Read
    if not frames:
        print(f"Skipping {video}: No Frames Could be Read.")
        continue

    # Print Hues to CSV for Inspection
    if csv_rows:
        write_header = not os.path.exists(HUE_CSV_PATH)
        with open(HUE_CSV_PATH, mode='a', newline='') as file:
            writer = csv.writer(file)
            if write_header:
                writer.writerow(HUE_CSV_HEADER)
            writer.writerows(csv_rows)

    # Save Frames to NPY File
    frames_np = np.array(frames, dtype=np.uint8)
    np.save(f"{video}.npy", frames_np)

    # Get Video Info for Contour Video
    fps = OUTPUT_FPS
    height, width = frames_np.shape[1:3]

    # Video Output Name(s)
    output_file = f"{video}_CONTOUR.mp4"

    # Video Codec Info
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')

    # Write Frames
    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
    try:
        for frame in frames:
            out.write(frame)
    finally:
        out.release()

    # Print Progress
    print(f"File Number {file_count} Complete.")
    file_count = file_count + 1

In [4]:
# Load New Files
# Collect the Name of Every NPY File in the Working Directory
selected_videos = [entry for entry in os.listdir('.') if entry.endswith('.npy')]

In [5]:
# Read in CSV of Labels and Check Frame and Track Accuracy
from collections import defaultdict

# One Counter Triple (correct / incorrect / total) per Feature
# The Empty Prefix is the Head/Tail Hue Difference Feature
_FEATURE_PREFIXES = ('', 'range_', 'stddev_', 'btwn_', 'intra_')
_TALLY_SUFFIXES = ('correct', 'incorrect', 'total')


def _empty_tally():
    """Return a fresh dict with every per-feature counter set to 0."""
    tally = {}
    for prefix in _FEATURE_PREFIXES:
        for suffix in _TALLY_SUFFIXES:
            tally[prefix + suffix] = 0
    return tally


# Maps File Name -> Its Own Counter Dict, Created on First Access
file_counts = defaultdict(_empty_tally)

# Find the Count of Correct and Incorrect Rows
def update_counts(file_counts, file_name, label_name, correct_key, incorrect_key, total_key, row=None):
    """Tally one "... Correct" column of one CSV row into file_counts.

    Args:
        file_counts: mapping of file name -> dict of counters; mutated in place.
        file_name: key of the counter dict to update.
        label_name: column to read from the row.
        correct_key: counter incremented when the column holds '1'.
        incorrect_key: counter incremented when the column holds '0'.
        total_key: counter incremented for every call, whatever the value.
        row: the CSV row as a dict. When omitted, the notebook-level variable
            `row` is used, which is how existing calls inside the reader
            loop work.

    Raises:
        NameError: if row is omitted and no notebook-level `row` exists.
    """
    if row is None:
        # Legacy Call Style: Fall Back to the Reader Loop's Variable
        row = globals().get("row")
        if row is None:
            raise NameError("update_counts needs a row: pass row=... or call it inside the reader loop")

    tally = file_counts[file_name]
    label_correct = row.get(label_name)
    if label_correct == '1':
        tally[correct_key] += 1
    elif label_correct == '0':
        tally[incorrect_key] += 1

    # Total Counts Every Row, Including Ones that are Neither '1' nor '0'
    tally[total_key] += 1
        
# Read Each Row
# Each Entry Pairs a "... Correct" Column with the Three Counters it Feeds
feature_columns = (
    ("Img Seg Label Correct", 'correct', 'incorrect', 'total'),
    ("Hue Range Label Correct", 'range_correct', 'range_incorrect', 'range_total'),
    ("Hue Std Dev Label Correct", 'stddev_correct', 'stddev_incorrect', 'stddev_total'),
    ("Btwn Cluster Correct", 'btwn_correct', 'btwn_incorrect', 'btwn_total'),
    ("Intra Cluster Correct", 'intra_correct', 'intra_incorrect', 'intra_total'),
)

with open('hue_comparison.csv', mode='r') as file:
    reader = csv.DictReader(file)

    # The Loop Variable Must Stay Named `row`: update_counts Reads it as a Global
    for row in reader:
        file_name = row.get("File Name")

        # Update Counts for Different Features
        for column, correct_key, incorrect_key, total_key in feature_columns:
            update_counts(file_counts, file_name, column, correct_key, incorrect_key, total_key)

# Create Results CSV
results_csv = "results.csv"
write_header = not os.path.exists(results_csv)

# Calculate the Percentage of Rows Correct per File
def calculate_percentage_and_predict(counts, correct_key, total_key, threshold=50, file_name=None):
    """Return (percentage_correct, prediction) for one feature of one file.

    Args:
        counts: dict of counters for a single file.
        correct_key: key in counts holding the number of correct rows.
        total_key: key in counts holding the number of rows.
        threshold: prediction is 1 only when the percentage is strictly
            above this value, otherwise 0.
        file_name: name shown in the printed progress line. When omitted, the
            notebook-level variable `file_name` is used if it exists, which
            is how existing calls inside the results loop work.

    Raises:
        ZeroDivisionError: if counts[total_key] is 0.
    """
    if file_name is None:
        # Legacy Call Style: Fall Back to the Results Loop's Variable
        file_name = globals().get("file_name", "<unknown file>")

    percentage_correct = (counts[correct_key] / counts[total_key]) * 100
    print(f"File Name: {file_name} - {correct_key.replace('_', ' ').title()} Percentage Correct: {percentage_correct:.2f}%")
    prediction = 1 if percentage_correct > threshold else 0
    return percentage_correct, prediction

# Write Results to CSV
# NOTE(review): the file is opened in append mode, so re-running this cell adds
# the same files to results.csv again; delete the file first for a clean run.
with open(results_csv, mode='a', newline='') as file:
    writer = csv.writer(file)

    # Write Header
    if write_header:
        writer.writerow(["File Name", "Head Tail Hue Percent Correct", "Head Tail Hue Trackwise Correct",
                        "Hue Range Percent Correct", "Hue Range Trackwise Correct",
                        "Hue Std Dev Percent Correct", "Hue Std Dev Trackwise Correct",
                        "Btwn Cluster Percent Correct", "Btwn Cluster Trackwise Correct",
                        "Intracluster Percent Correct", "Intracluster Trackwise Correct"
                        ])

    # Get Percentage Correct and Write to CSV
    # The Loop Variable Must Stay Named `file_name`: the Helper Prints it
    for file_name, counts in file_counts.items():

        # Skip Files with No Rows; Writing Here Would Reuse the Previous
        # File's Values (or Fail if there was No Previous File)
        if counts['total'] <= 0:
            continue

        percentage_correct, prediction = calculate_percentage_and_predict(counts, 'correct', 'total')
        hue_range_percent_correct, hue_range_predict = calculate_percentage_and_predict(counts, 'range_correct', 'range_total')
        hue_stddev_percent_correct, hue_stddev_predict = calculate_percentage_and_predict(counts, 'stddev_correct', 'stddev_total')
        btwn_percent_correct, btwn_predict = calculate_percentage_and_predict(counts, 'btwn_correct', 'btwn_total')
        intra_percent_correct, intra_predict = calculate_percentage_and_predict(counts, 'intra_correct', 'intra_total')
        print()

        writer.writerow([file_name, percentage_correct, prediction,
                         hue_range_percent_correct, hue_range_predict,
                         hue_stddev_percent_correct, hue_stddev_predict,
                         btwn_percent_correct, btwn_predict,
                         intra_percent_correct, intra_predict])

File Name: MC_singlenuc23_1_Tk33_021220__0001_vid__1330_female.mp4 - Correct Percentage Correct: 92.48%
File Name: MC_singlenuc23_1_Tk33_021220__0001_vid__1330_female.mp4 - Range Correct Percentage Correct: 99.40%
File Name: MC_singlenuc23_1_Tk33_021220__0001_vid__1330_female.mp4 - Stddev Correct Percentage Correct: 90.33%
File Name: MC_singlenuc23_1_Tk33_021220__0001_vid__1330_female.mp4 - Btwn Correct Percentage Correct: 61.62%
File Name: MC_singlenuc23_1_Tk33_021220__0001_vid__1330_female.mp4 - Intra Correct Percentage Correct: 61.99%

File Name: MC_singlenuc23_1_Tk33_021220__0002_vid__12986_male.mp4 - Correct Percentage Correct: 13.89%
File Name: MC_singlenuc23_1_Tk33_021220__0002_vid__12986_male.mp4 - Range Correct Percentage Correct: 0.00%
File Name: MC_singlenuc23_1_Tk33_021220__0002_vid__12986_male.mp4 - Stddev Correct Percentage Correct: 8.91%
File Name: MC_singlenuc23_1_Tk33_021220__0002_vid__12986_male.mp4 - Btwn Correct Percentage Correct: 80.80%
File Name: MC_singlenuc23_1

In [6]:
# Get Total Result Counts
df = pd.read_csv("results.csv")

# Use the Same NaN-Free Rows for Both the Counts and the Total, so a Row with
# a Missing Value Cannot be Counted as Correct but Left Out of the Total
valid_rows = df.dropna()
total_valid_rows = valid_rows.shape[0]

# Get Counts for Each Feature
headtailhue_trackwise_correct_count = (valid_rows['Head Tail Hue Trackwise Correct'] == 1).sum()
huerange_trackwise_correct_count = (valid_rows['Hue Range Trackwise Correct'] == 1).sum()
huestddev_trackwise_correct_count = (valid_rows['Hue Std Dev Trackwise Correct'] == 1).sum()
btwn_trackwise_correct_count = (valid_rows['Btwn Cluster Trackwise Correct'] == 1).sum()
intra_trackwise_correct_count = (valid_rows['Intracluster Trackwise Correct'] == 1).sum()

# Percentage Correct
# The Printed Values are Fractions Between 0 and 1, Not Multiplied by 100
print(f"Head Tail Hue Dif Trackwise Percentage Correct: {headtailhue_trackwise_correct_count / total_valid_rows}")
print(f"Hue Range Trackwise Percentage Correct: {huerange_trackwise_correct_count / total_valid_rows}")
print(f"Hue Std Dev Trackwise Percentage Correct: {huestddev_trackwise_correct_count / total_valid_rows}")
print(f"Btwn Cluster Trackwise Percentage Correct: {btwn_trackwise_correct_count / total_valid_rows}")
print(f"Intracluster Trackwise Percentage Correct: {intra_trackwise_correct_count / total_valid_rows}")

Head Tail Hue Dif Trackwise Percentage Correct: 0.8214285714285714
Hue Range Trackwise Percentage Correct: 0.7214285714285714
Hue Std Dev Trackwise Percentage Correct: 0.7357142857142858
Btwn Cluster Trackwise Percentage Correct: 0.7285714285714285
Intracluster Trackwise Percentage Correct: 0.75


TODO Next: 
1) Choose train and test. 
2) Using train, choose threshold values for head/tail hue difference, hue range, hue std dev, between-cluster distance, and intracluster distance.
3) Sort intracluster distances by size and try thresholding both OR try new clustering algorithm.
4) Choose the 2 or 3 best methods and run through neural net or something simple that can be trained with around 100 samples in train.
5) Integrate with Eric's existing method for the outline. This may take some time. Note that the errors appear to be mostly from the coloration being difficult, not tank detections. 
6) Tune contour filtering, contour smoothing, and contour aspect ratio elimination.

Sources are listed in their associated cells. Some generative AI was used to assist in writing the code. The following tutorial was used as a starting point: https://www.youtube.com/watch?v=UIgaLDgb2fYv.