# Object Detection Model Training with YOLOv8

This notebook demonstrates the process of training a YOLOv8 object detection model on a custom dataset. The dataset is downloaded from Google Drive, unzipped, and then used for training. The training process involves sampling a subset of the training data for each epoch, evaluating the model periodically, and tracking metrics.

## 1. Downloading and Extracting the Dataset

In [None]:
# Download the dataset from Google Drive
#!gdown --folder "Link to the drive folder with the consolidated dataset" --remaining-ok

In [None]:
import zipfile
import os

# Location of the archive fetched from Google Drive
zip_file_name = '/content/Consolidated Dataset/combined_dataset.zip'
# Folder that receives the unpacked dataset
extract_dir = './extracted cmbined data'

# Unpack the archive when it is present; otherwise report what is missing
if not os.path.exists(zip_file_name):
    print(f"'{zip_file_name}' not found. Please ensure the download was successful.")
else:
    with zipfile.ZipFile(zip_file_name, 'r') as archive:
        archive.extractall(extract_dir)
    print(f"'{zip_file_name}' unzipped successfully to '{extract_dir}'.")

## 2. Installing Dependencies

Install the necessary libraries, including Ultralytics for YOLOv8.

In [None]:
# Install the ultralytics package; the training cell imports its YOLO class
%pip install ultralytics

## 3. Training the YOLOv8 Model

Train the YOLOv8 model on the extracted dataset. The training uses a sampling strategy to select a subset of images for each epoch.

In [None]:
# This notebook trains a YOLOv8 object detection model on a custom dataset.
# The dataset is downloaded from a Google Drive folder, unzipped, and then used to train the model.
# The training process involves sampling a subset of the training data for each epoch and evaluating the model periodically.
# Metrics such as precision, recall, and mAP are tracked and saved to a CSV file.

from ultralytics import YOLO
import pandas as pd
import os
import yaml
import random
import shutil

# Path to the dataset description file inside the extracted dataset folder
data_yaml_path = './extracted cmbined data/data.yaml'

# Rewrite the split locations so they are relative to the folder that holds
# data.yaml. This notebook reads the train and valid images from
# '<dataset>/train/images' and '<dataset>/valid/images', i.e. directly below
# that folder, so the entries must not climb out of it with a leading '../'.
# NOTE(review): the test split is assumed to follow the same layout - confirm.
try:
    with open(data_yaml_path, 'r') as file:
        # An empty YAML file parses to None; fall back to an empty mapping so
        # the assignments below still work.
        data = yaml.safe_load(file) or {}
except FileNotFoundError:
    print(f"Error: {data_yaml_path} not found. Please ensure the dataset is extracted correctly.")
    # Stop the cell: nothing below can work without data.yaml
    raise

data['train'] = 'train/images'
data['val'] = 'valid/images'
data['test'] = 'test/images'

with open(data_yaml_path, 'w') as file:
    yaml.dump(data, file)
print(f"Updated '{data_yaml_path}' with relative paths.")

# Start from the pretrained YOLOv8n weights. The training loop replaces this
# object with a model loaded from 'last.pt' whenever such a checkpoint exists.
model = YOLO('yolov8n.pt')

# Schedule: number of training rounds, how often to evaluate, and how many
# training images are drawn for each round
images_per_epoch = 2000
eval_interval = 5
total_epochs = 50

# Evaluation results are accumulated in memory and written to this CSV file
metrics_data = []
metrics_csv_path = 'training_metrics.random_sample.csv'

# Collect the full path of every .jpg/.jpeg/.png file in the training image folder
train_images_dir = './extracted cmbined data/train/images'
all_train_images = []
for img in os.listdir(train_images_dir):
    if not img.endswith(('.jpg', '.jpeg', '.png')):
        continue
    all_train_images.append(os.path.join(train_images_dir, img))

# Scratch dataset that holds the images sampled for the current epoch plus a
# copy of the validation split. Layout:
#   temp_sampled_data/train/{images,labels}
#   temp_sampled_data/valid/{images,labels}
temp_data_dir = './temp_sampled_data'

temp_train_dir = os.path.join(temp_data_dir, 'train')
temp_valid_dir = os.path.join(temp_data_dir, 'valid')

temp_train_images_dir = os.path.join(temp_train_dir, 'images')
temp_train_labels_dir = os.path.join(temp_train_dir, 'labels')
temp_valid_images_dir = os.path.join(temp_valid_dir, 'images')
temp_valid_labels_dir = os.path.join(temp_valid_dir, 'labels')

# Creating the four leaf folders also creates every parent folder above them
for _leaf_dir in (
    temp_train_images_dir,
    temp_train_labels_dir,
    temp_valid_images_dir,
    temp_valid_labels_dir,
):
    os.makedirs(_leaf_dir, exist_ok=True)


# Copy the validation split into the temporary dataset once, before the
# training loop, so it is not copied again on every epoch.
val_images_dir = './extracted cmbined data/valid/images'
val_labels_dir = './extracted cmbined data/data/valid/labels' # Path chosen by the original author

# NOTE(review): the configured label folder does not follow the
# '<split>/images' + '<split>/labels' layout this notebook uses for the train
# split. If it is missing, fall back to the 'labels' folder next to the
# validation images instead of silently copying no labels at all.
if not os.path.isdir(val_labels_dir):
    _sibling_labels_dir = os.path.join(os.path.dirname(val_images_dir), 'labels')
    if os.path.isdir(_sibling_labels_dir):
        print(f"'{val_labels_dir}' not found; using '{_sibling_labels_dir}' for validation labels.")
        val_labels_dir = _sibling_labels_dir

_val_labels_copied = 0
for img_name in os.listdir(val_images_dir):
    if not img_name.endswith(('.jpg', '.jpeg', '.png')):
        continue

    img_path = os.path.join(val_images_dir, img_name)
    # Swap only the final extension for '.txt'. Chained str.replace would also
    # rewrite '.jpg' / '.png' occurring earlier in the file name.
    label_name = os.path.splitext(img_name)[0] + '.txt'
    label_path = os.path.join(val_labels_dir, label_name)

    shutil.copy(img_path, temp_valid_images_dir)
    # Images without a label file are still copied, just without a label
    if os.path.exists(label_path):
        shutil.copy(label_path, temp_valid_labels_dir)
        _val_labels_copied += 1

if _val_labels_copied == 0:
    # Without any labels the evaluation metrics carry no information
    print(f"Warning: no validation label files were found in '{val_labels_dir}'.")

# Write a data.yaml for the temporary dataset. It starts as a copy of the
# original description (class names, class count, ...) and then points the
# splits at the temporary folders.
temp_data_yaml_path = os.path.join(temp_data_dir, 'data.yaml')
temp_data = data.copy()
# Pin the dataset root explicitly. Ultralytics resolves 'train'/'val' against
# 'path' when that key is present, so a 'path' inherited from the original
# data.yaml would otherwise redirect training away from the sampled data.
temp_data['path'] = os.path.abspath(temp_data_dir)
temp_data['train'] = 'train/images'  # Relative to 'path'
temp_data['val'] = 'valid/images'    # Relative to 'path'
# The temporary dataset has no test split; drop the inherited entry rather
# than leave a reference to a folder that does not exist there.
temp_data.pop('test', None)
with open(temp_data_yaml_path, 'w') as file:
    yaml.dump(temp_data, file)

# Class-count handling.
# The model object is intentionally NOT patched here. The previous code set
# `nc` on the model and `no = nc + 5` on the last layer; `nc + 5` is the
# anchor-based YOLOv5 output size, whereas the YOLOv8 Detect head uses
# `nc + 4 * reg_max`, and changing attributes does not resize the layers.
# When training starts, the Ultralytics trainer builds the detection head from
# the class count declared in the dataset YAML, so no manual change is needed.
#
# What can go wrong is a data.yaml whose 'nc' and 'names' disagree, so report
# that early instead of part-way into the first training call.
_declared_nc = data.get('nc')
_class_names = data.get('names')
if _declared_nc is not None and _class_names is not None and len(_class_names) != _declared_nc:
    print(f"Warning: data.yaml declares nc={_declared_nc} but lists {len(_class_names)} class names.")


# Training loop: every "epoch" draws a fresh random subset of the training
# images, trains on it for one Ultralytics epoch, and every `eval_interval`
# epochs evaluates the model and appends the metrics to the CSV file.

if not all_train_images:
    raise ValueError("No training images were found; cannot sample a training subset.")

# Never ask random.sample for more images than exist (it raises ValueError)
sample_size = min(images_per_epoch, len(all_train_images))
if sample_size < images_per_epoch:
    print(f"Only {len(all_train_images)} training images available; using {sample_size} per epoch.")

train_labels_dir = './extracted cmbined data/train/labels'

# Checkpoint written by the previous training call. The run folder is pinned
# with project/name/exist_ok in model.train() below. Without that, Ultralytics
# creates 'train', 'train2', 'train3', ... and this path would keep pointing at
# the first run, so each epoch would restart from the epoch-1 weights.
last_weights_path = 'runs/detect/train/weights/last.pt'

try:
    for epoch in range(1, total_epochs + 1):
        print(f"Starting epoch {epoch}/{total_epochs}")

        # Drop the previous epoch's sample and start from empty folders
        shutil.rmtree(temp_train_images_dir, ignore_errors=True)
        shutil.rmtree(temp_train_labels_dir, ignore_errors=True)
        os.makedirs(temp_train_images_dir, exist_ok=True)
        os.makedirs(temp_train_labels_dir, exist_ok=True)

        # Randomly sample images for this epoch's training
        sampled_images = random.sample(all_train_images, sample_size)

        # Copy each sampled image and, when it exists, its label file
        for img_path in sampled_images:
            img_name = os.path.basename(img_path)
            # Swap only the final extension for '.txt'
            label_name = os.path.splitext(img_name)[0] + '.txt'
            label_path = os.path.join(train_labels_dir, label_name)

            shutil.copy(img_path, temp_train_images_dir)
            if os.path.exists(label_path):
                shutil.copy(label_path, temp_train_labels_dir)

        # Continue from the most recent checkpoint when one exists. This also
        # picks up a checkpoint left behind by an earlier session.
        # The class count is not patched onto the model: the trainer sizes the
        # detection head from the dataset YAML, and the former `nc + 5` output
        # size does not apply to the YOLOv8 head.
        if os.path.exists(last_weights_path):
            model = YOLO(last_weights_path)

        # NOTE(review): every call is an independent 1-epoch run, so optimizer
        # state and the learning-rate schedule presumably restart each time -
        # confirm this is acceptable for the experiment.
        model.train(
            data=temp_data_yaml_path,
            epochs=1,
            imgsz=640,
            fliplr=0.5,
            degrees=10,
            project='runs/detect',
            name='train',
            exist_ok=True,  # Reuse the same run folder so last.pt is always current
        )

        # Evaluate the model every eval_interval epochs
        if epoch % eval_interval == 0:
            print(f"Evaluating model after epoch {epoch}...")
            # Name the dataset explicitly instead of relying on the arguments
            # stored with the model.
            metrics = model.val(data=temp_data_yaml_path)

            # Extract key metrics from the evaluation results
            results = metrics.results_dict
            metrics_data.append({
                'epoch': epoch,
                'precision': results['metrics/precision(B)'],
                'recall': results['metrics/recall(B)'],
                'mAP@0.5': results['metrics/mAP50(B)'],
                'mAP@0.5:0.95': results['metrics/mAP50-95(B)']
            })

            # Rewrite the CSV with everything collected so far, so partial
            # results survive an interrupted run
            metrics_df = pd.DataFrame(metrics_data)
            metrics_df.to_csv(metrics_csv_path, index=False)
            print(f"Metrics saved to {metrics_csv_path}")

    print("Training and evaluation complete.")
finally:
    # Remove the temporary dataset even when training fails or is interrupted.
    # ignore_errors keeps a cleanup problem from hiding the original exception.
    shutil.rmtree(temp_data_dir, ignore_errors=True)