In [None]:
# Import libraries
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm
import yaml
import matplotlib.pyplot as plt
from ultralytics import YOLO
import numpy as np
from PIL import Image
import torch

In [None]:
# Folder that holds the CSV metadata files read below
INPUT_DATA_DIR = Path('dataset')
# Root folder of the YOLO-style dataset layout (images/ and labels/ sub-folders).
# Nothing is deleted or re-created here; this is the same folder as INPUT_DATA_DIR.
DATASETS_DIR = Path('dataset')
# Image & label directories for each split (no labels directory is defined for the test split)
TRAIN_IMAGES_DIR = DATASETS_DIR / 'images' / 'train'
TRAIN_LABELS_DIR = DATASETS_DIR / 'labels'/ 'train'
TEST_IMAGES_DIR = DATASETS_DIR / 'images' / 'test'
VAL_IMAGES_DIR = DATASETS_DIR / 'images' /'val'
VAL_LABELS_DIR = DATASETS_DIR / 'labels' /'val'

# Load the train, validation and test tables plus the sample submission file
train = pd.read_csv(INPUT_DATA_DIR / 'Train_df.csv')
val = pd.read_csv(INPUT_DATA_DIR / 'Val_df.csv')
test = pd.read_csv(INPUT_DATA_DIR / 'Test.csv')
ss = pd.read_csv(INPUT_DATA_DIR / 'SampleSubmission.csv')

# Strip stray whitespace from the class labels BEFORE deriving the mapping.
# Building the map from the raw labels first would leave padded keys such as
# 'healthy ' in class_map, so the stripped labels would fail to map (NaN class_id)
# and the map would disagree with the class names derived from the cleaned column.
train['class'] = train['class'].str.strip()

# Map every class name to its index in sorted order,
# e.g. {'anthracnose': 0, 'cssvd': 1, 'healthy': 2}
class_map = {cls: i for i, cls in enumerate(sorted(train['class'].unique().tolist()))}
train['class_id'] = train['class'].map(class_map)

# Aliases used by the rest of the notebook (same objects, no copy)
train_df = train
val_df = val

# Build the contents of the data.yaml file required by YOLO.
# Only the dictionary is assembled here; nothing in this cell writes it to disk.
class_names = sorted(train['class'].unique().tolist())
num_classes = len(class_names)
data_yaml = {
    "path" : str(DATASETS_DIR.absolute()),
    'train': str(TRAIN_IMAGES_DIR.absolute()),
    'val': str(VAL_IMAGES_DIR.absolute()),
    'test': str(TEST_IMAGES_DIR.absolute()),
    'nc': num_classes,
    'names': class_names
}

# File stems (name without extension) of the images referenced by each split.
# NOTE(review): the validation table is read through 'Image_ID' while the training
# table is read through 'ImagePath' - confirm both columns exist in the CSV files.
val_image_names = [str(Path(name).stem) for name in val_df['Image_ID'].unique()]
train_image_names = [str(Path(name).stem) for name in train['ImagePath'].unique()]

In [None]:
from glob import glob

# Locate the weights of the most recent training run.
# A plain lexicographic sort ranks 'train10' before 'train2', so taking the last
# sorted entry can silently pick an older run. Choose the checkpoint that was
# written most recently instead.
candidate_weights = glob("runs/detect/train*/weights/best.pt")
if not candidate_weights:
    raise FileNotFoundError(
        "No checkpoint matches runs/detect/train*/weights/best.pt - train a model first"
    )
BEST_PATH = max(candidate_weights, key=os.path.getmtime)
BEST_PATH

In [None]:
# Load the trained YOLO model
model = YOLO(BEST_PATH)

# Folder whose images are scored. Despite the `test_` prefix this is the
# validation image folder (VAL_IMAGES_DIR).
test_dir_path = VAL_IMAGES_DIR

# Only feed image files to the model: os.listdir also returns sub-folders and
# stray files (hidden OS files, caches, ...). Sorting makes the row order of the
# resulting table reproducible between runs.
IMAGE_SUFFIXES = ('.bmp', '.jpeg', '.jpg', '.png', '.tif', '.tiff', '.webp')
image_files = sorted(
    file_name for file_name in os.listdir(test_dir_path)
    if not file_name.startswith('.') and file_name.lower().endswith(IMAGE_SUFFIXES)
)

# One dict per detection, or one placeholder dict per image without detections
all_data = []

# Iterate through each image in the directory
for image_file in tqdm(image_files):
    # Full path to the image
    img_path = os.path.join(test_dir_path, image_file)

    # Make predictions on the image; the first element is the result for this image
    result = model(img_path, verbose=False)[0]
    detections = result.boxes
    names = result.names  # Class names dictionary

    if detections:  # If detections are found
        for box, cls, conf in zip(detections.xyxy.tolist(),   # Bounding boxes in xyxy format
                                  detections.cls.tolist(),    # Class indices
                                  detections.conf.tolist()):  # Confidence scores
            x1, y1, x2, y2 = box

            all_data.append({
                'Image_ID': str(image_file),
                'class': names[int(cls)],  # Class name looked up from the names dictionary
                'confidence': conf,
                'ymin': y1,
                'xmin': x1,
                'ymax': y2,
                'xmax': x2
            })
    else:  # If no objects are detected, keep a placeholder row for the image
        all_data.append({
            'Image_ID': str(image_file),
            'class': "None",
            'confidence': None,
            'ymin': None,
            'xmin': None,
            'ymax': None,
            'xmax': None
        })


In [None]:
# Turn the per-detection records collected in `all_data` into a single DataFrame
# (one row per detection, or one placeholder row for an image without detections)
sub = pd.DataFrame(all_data)

In [None]:
# Preview the first rows of the prediction table
sub.head()

In [None]:
# Number of rows per predicted class ("None" marks images without any detection)
sub['class'].value_counts()

In [None]:
from PIL import Image

def load_yolo_labels(label_folder):
	"""Read every ``*.txt`` YOLO label file of a folder into one DataFrame.

	Each line made of exactly five whitespace-separated numeric fields
	(``class_id x_center y_center width height``) becomes one row. Lines with
	any other number of fields are skipped, and a file without such lines
	contributes no rows.

	Args:
		label_folder: Directory (str or Path) containing the label files.

	Returns:
		DataFrame with the columns ``Image_ID`` (label file stem), ``class_id``
		(int), ``x_center``, ``y_center``, ``width`` and ``height`` (floats as
		written in the file). The columns are present even when no label was
		found, so callers can always index them.

	Raises:
		ValueError: If a five-field line contains a non-numeric field.
	"""
	columns = ["Image_ID", "class_id", "x_center", "y_center", "width", "height"]
	records = []

	# Sorted so the row order does not depend on the file system's listing order
	for label_file in sorted(Path(label_folder).glob("*.txt")):
		with open(label_file, "r") as file:
			for line in file:
				parts = line.strip().split()
				if len(parts) != 5:
					continue
				class_id, x_center, y_center, width, height = map(float, parts)
				records.append({
					"Image_ID": label_file.stem,
					"class_id": int(class_id),
					"x_center": x_center,
					"y_center": y_center,
					"width": width,
					"height": height
				})

	# Explicit columns keep the schema stable for an empty result
	return pd.DataFrame(records, columns=columns)

# Example usage: load the validation labels into a DataFrame
label_folder = VAL_LABELS_DIR
labels = load_yolo_labels(label_folder)
# NOTE(review): this expression is not the last statement of the cell, so its
# result is not displayed; sample(5) presumably also needs at least 5 rows - confirm.
labels.sample(5)

def _read_image_size(image_folder, image_id):
	"""Return ``(width, height)`` of the image named ``image_id``, or None if no file is found.

	The extensions are tried in the order .jpg, .jpeg, .png.
	"""
	for extension in (".jpg", ".jpeg", ".png"):
		image_path = image_folder / f"{image_id}{extension}"
		if image_path.exists():
			with Image.open(image_path) as img:
				return img.size
	return None


def yolo_to_bbox(image_folder, labels_df: pd.DataFrame):
	"""Convert centre/size label rows into pixel corner boxes.

	The centre and size values of ``labels_df`` are multiplied by the width and
	height of the matching image, i.e. they are treated as fractions of the
	image size.

	Args:
		image_folder: Directory (str or Path) containing the images.
		labels_df: DataFrame with the columns ``Image_ID``, ``class_id``,
			``x_center``, ``y_center``, ``width`` and ``height``. A frame
			without any columns is treated as "no labels".

	Returns:
		DataFrame with the columns ``Image_ID``, ``class_id``, ``x_min``,
		``y_min``, ``x_max`` and ``y_max``. Every file of ``image_folder`` whose
		stem has no label row gets one row with ``class_id`` -1 and empty box
		coordinates. Label rows whose image cannot be found (.jpg, .jpeg or
		.png) are skipped. The columns are present even for an empty result.
	"""
	image_folder = Path(image_folder)
	columns = ["Image_ID", "class_id", "x_min", "y_min", "x_max", "y_max"]

	# A set gives constant-time membership tests instead of scanning the whole
	# column once per image file
	labelled_ids = set(labels_df["Image_ID"]) if "Image_ID" in labels_df.columns else set()

	converted_bboxes = []
	for image_file in sorted(image_folder.glob("*.*")):
		image_id = image_file.stem
		if image_id not in labelled_ids:
			converted_bboxes.append({
				"Image_ID": image_id,
				"class_id": -1,  # Indicating no label
				"x_min": None,
				"y_min": None,
				"x_max": None,
				"y_max": None
			})

	# Each image is opened once, however many label rows refer to it
	size_by_image = {}
	for _, row in labels_df.iterrows():
		image_id = row['Image_ID']
		if image_id not in size_by_image:
			size_by_image[image_id] = _read_image_size(image_folder, image_id)
		image_size = size_by_image[image_id]
		if image_size is None:
			continue
		img_width, img_height = image_size

		x_center = row['x_center'] * img_width
		y_center = row['y_center'] * img_height
		width = row['width'] * img_width
		height = row['height'] * img_height

		converted_bboxes.append({
			"Image_ID": image_id,
			"class_id": row['class_id'],
			"x_min": x_center - (width / 2),
			"y_min": y_center - (height / 2),
			"x_max": x_center + (width / 2),
			"y_max": y_center + (height / 2)
		})

	return pd.DataFrame(converted_bboxes, columns=columns)

# Example usage: convert the validation labels to pixel corner boxes and show a random sample
converted_labels = yolo_to_bbox(VAL_IMAGES_DIR, labels)
converted_labels.sample(5)

In [None]:
# Number of ground-truth rows per class id (-1 marks images without a label row)
converted_labels['class_id'].value_counts()

In [None]:
# Show the class-name -> class-id mapping derived from the training table
class_map

In [None]:
# Invert class_map (name -> id) so that numeric ids can be turned back into class names
id_class_map = dict(zip(class_map.values(), class_map.keys()))

# Attach the class name to every ground-truth row; ids missing from the map become NaN
converted_labels['class'] = converted_labels['class_id'].map(id_class_map)
converted_labels['class'].value_counts()

In [None]:
# Random sample of the ground-truth table, now including the class name column
converted_labels.sample(5)

In [None]:
# Random sample of the prediction table
sub.sample(5)

In [None]:
# Reduce every Image_ID to its file stem (name without extension) so that it can
# be compared with the ground-truth table, which is keyed by label-file stem
sub.loc[:, "Image_ID"] = sub["Image_ID"].map(lambda file_name: str(Path(file_name).stem))

sub.sample(3)

In [None]:
from sklearn.metrics import precision_recall_curve
import numpy as np

def calculate_iou(pred_box, gt_box):
	"""
	Intersection over Union (IoU) of two axis-aligned boxes.
	pred_box: [xmin, ymin, xmax, ymax]
	gt_box: [xmin, ymin, xmax, ymax]
	Returns intersection area divided by union area, or 0 when the union is not positive.
	"""
	def _area(box):
		# Width times height of a single box
		return (box[2] - box[0]) * (box[3] - box[1])

	# Extent of the overlap along each axis, clamped at 0 for disjoint boxes
	overlap_width = max(0, min(pred_box[2], gt_box[2]) - max(pred_box[0], gt_box[0]))
	overlap_height = max(0, min(pred_box[3], gt_box[3]) - max(pred_box[1], gt_box[1]))
	intersection = overlap_width * overlap_height

	union = _area(pred_box) + _area(gt_box) - intersection
	if union > 0:
		return intersection / union
	return 0

def evaluate_model(predictions, ground_truth, iou_threshold=0.5):
	"""
	Find the confidence threshold that maximises F1 for the given detections.

	predictions: DataFrame with columns [Image_ID, class, confidence, ymin, xmin, ymax, xmax].
		Rows without a confidence or box (placeholders for images without any
		detection) are ignored: they are not detections.
	ground_truth: DataFrame with columns [Image_ID, class_id, x_min, y_min, x_max, y_max, class].
		Rows without a class or box (images without labels) are ignored.
	iou_threshold: IoU threshold to consider a prediction as a true positive.

	Predictions are visited from the highest to the lowest confidence. A
	prediction is a true positive when its best still-unclaimed ground-truth box
	of the same image and class reaches ``iou_threshold``; that box is then
	claimed, so duplicate detections of one object count as false positives.
	Recall is measured against the total number of ground-truth boxes, so
	objects that were never detected lower it.

	Returns (best_confidence, precision, recall, f1) at the operating point with
	the highest F1; on ties the highest confidence wins.

	Raises ValueError when ``predictions`` holds no scored detection.
	"""
	# Ground truth: keep real boxes only and group them per (image, class)
	gt_valid = ground_truth.dropna(subset=['class', 'x_min', 'y_min', 'x_max', 'y_max'])
	total_gt = len(gt_valid)

	gt_boxes = {}
	for image_id, cls, x_min, y_min, x_max, y_max in zip(
		gt_valid['Image_ID'], gt_valid['class'],
		gt_valid['x_min'], gt_valid['y_min'], gt_valid['x_max'], gt_valid['y_max'],
	):
		gt_boxes.setdefault((image_id, cls), []).append([x_min, y_min, x_max, y_max])
	# Parallel flags recording which ground-truth boxes are already matched
	gt_claimed = {key: [False] * len(boxes) for key, boxes in gt_boxes.items()}

	# Predictions: drop placeholder rows and rank by confidence
	scored = predictions.dropna(subset=['confidence', 'xmin', 'ymin', 'xmax', 'ymax'])
	scored = scored.sort_values('confidence', ascending=False, kind='stable')
	if scored.empty:
		raise ValueError("predictions contains no detection with a confidence score")

	y_true = []
	y_scores = []
	for image_id, cls, confidence, xmin, ymin, xmax, ymax in zip(
		scored['Image_ID'], scored['class'], scored['confidence'],
		scored['xmin'], scored['ymin'], scored['xmax'], scored['ymax'],
	):
		pred_box = [xmin, ymin, xmax, ymax]
		key = (image_id, cls)

		# Best unclaimed ground-truth box for this prediction
		best_iou, best_idx = -1.0, -1
		for idx, gt_box in enumerate(gt_boxes.get(key, [])):
			if gt_claimed[key][idx]:
				continue
			iou = calculate_iou(pred_box, gt_box)
			if iou > best_iou:
				best_iou, best_idx = iou, idx

		is_true_positive = best_idx >= 0 and best_iou >= iou_threshold
		if is_true_positive:
			gt_claimed[key][best_idx] = True
		y_true.append(1 if is_true_positive else 0)
		y_scores.append(confidence)

	scores = np.asarray(y_scores, dtype=float)
	hits = np.asarray(y_true, dtype=float)
	cum_tp = np.cumsum(hits)
	cum_fp = np.cumsum(1.0 - hits)

	# One operating point per distinct confidence value: the last index of each
	# run of equal scores, because "confidence >= t" keeps the whole run
	cut_points = np.r_[np.flatnonzero(np.diff(scores)), scores.size - 1]
	thresholds = scores[cut_points]
	precision = cum_tp[cut_points] / (cum_tp[cut_points] + cum_fp[cut_points])
	if total_gt > 0:
		recall = cum_tp[cut_points] / total_gt
	else:
		recall = np.zeros_like(precision)

	# thresholds, precision and recall have the same length, so the index of the
	# best F1 is always a valid threshold index
	f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)
	best_index = int(np.argmax(f1_scores))

	return thresholds[best_index], precision[best_index], recall[best_index], f1_scores[best_index]

# Example usage: score the validation predictions against the ground-truth boxes
# and report the confidence threshold with the best F1 score
best_confidence, best_precision, best_recall, best_f1 = evaluate_model(sub, converted_labels)
print(f"Best Confidence: {best_confidence:.2f}, Precision: {best_precision:.2f}, Recall: {best_recall:.2f}, F1 Score: {best_f1:.2f}")

In [None]:
# Persist the validation predictions. to_csv does not create missing parent
# folders, so make sure the target directory exists before writing.
evaluation_dir = Path('dataset/evaluations')
evaluation_dir.mkdir(parents=True, exist_ok=True)
sub.to_csv(evaluation_dir / 'validation.csv', index=False)