# I developed several strategies to handle the dataset.

### 3. Model Training with Tracking (MLflow)
### 4. Model Validation & Evaluation (offline, metrics-based)
### 5. Model Fine-Tuning
### 6. Model Prediction & Exporting (real-world & deployment readiness)
### 7. Model Monitoring & Maintenance (post-deployment)
### 8. Flask API (serving layer)

## 3. Model Training with Tracking (MLflow)

### Experiment on MLflow Dataset Tracking

In [None]:
# from collections import defaultdict
# from tqdm import tqdm
# from pathlib import Path

In [None]:
# # YOLO metadata extraction

# def extract_yolo_metadata(splits_path):
#     images_path = os.path.join(splits_path, "images")
#     labels_path = os.path.join(splits_path, "labels")
    
#     images_dir = os.listdir(images_path)
#     labels_dir = os.listdir(labels_path)
    
#     # Storage variables
#     image_count = 0
#     label_count = 0
#     other_format_image = 0
#     total_bboxes = 0

#     images_per_class = defaultdict(int)
#     objects_per_image = defaultdict(int)

#     missing_label_files = []
#     empty_label_files = []
#     images_without_annotations = []
#     bbox_areas = []

    
#     for img_file in tqdm(images_dir):
#         if not img_file.lower().endswith((".jpg", ".png", ".jpeg")):
#             other_format_image += 1
#             continue
        
#         image_count += 1
#         image_name = os.path.splitext(img_file)[0]
#         label_file = os.path.join(labels_path, f"{image_name}.txt")
        
#         # Check if label file exists
#         if not os.path.exists(label_file):
#             missing_label_files.append(img_file)
#             images_without_annotations.append(img_file)
#             continue
#         else:
#             label_count += 1
        
#         # Read annotations
#         with open(label_file, "r") as f:
#             lines = f.readlines()
        
#         if lines == [''] or len(lines) == 0:
#             empty_label_files.append(img_file)
#             images_without_annotations.append(img_file)
#             continue
        
#         # Count objects
#         objects_per_image[img_file] = len(lines)
#         total_bboxes += len(lines)
        
#         # Count instances
#         for line in lines:
#             cls, x, y, w, h = map(float, line.split())
#             cls = int(cls)
#             images_per_class[cls] += 1
            
#             # Calculate bbox area
#             bbox_area = w * h
#             bbox_areas.append(bbox_area)
    
#     return {
#         "num_images": image_count,
#         "num_labels": label_count,
#         "total_bboxes": total_bboxes,
#         "images_per_class": images_per_class,
#         "num_classes": len(images_per_class),
#         "other_format_images": other_format_image,
#         "missing_label_files": missing_label_files,
#         "empty_label_files": empty_label_files,
#         "images_without_annotations": images_without_annotations,
#         "objects_per_image": objects_per_image,
#         "bbox_areas": bbox_areas 
#     }

In [None]:
# # Save metadata as JSON artifact

# import json

# meta_all = {}

# for split in ["train", "valid", "test"]:
#     meta_all[split] = extract_yolo_metadata(f"../datasets/data/{split}")

# with open("dataset_metadata.json", "w") as f:
#     json.dump(meta_all, f, indent=2)

# mlflow.log_artifact("dataset_metadata.json", artifact_path="dataset_info")

In [None]:
# # Dataset hashing (split-level) for dataset versioning

# import hashlib

# def hash_directory(path):
#     h = hashlib.sha256()
#     for root, _, files in os.walk(path):
#         for f in sorted(files):
#             with open(os.path.join(root, f), "rb") as file:
#                 h.update(file.read())
#     return h.hexdigest()

In [None]:
# # Log metadata to MLflow

# from mlflow.data.dataset import Dataset
# from mlflow.data.filesystem_dataset_source import FileSystemDatasetSource
# from mlflow.data.dataset_source import DatasetSource

# with mlflow.start_run(run_name="yolo_dataset_v1"):

#     # Log datasets
#     DatasetSource()
#     training_dataset = Dataset.from_source(FileSystemDatasetSource("../datasets/data/train", digest=hash_directory("../datasets/data/train")))
#     validation_dataset = Dataset(FileSystemDatasetSource("../datasets/data/valid", digest=hash_directory("../datasets/data/valid")))
#     evaluation_dataset = Dataset(FileSystemDatasetSource("../datasets/data/test", digest=hash_directory("../datasets/data/test")))
    
#     mlflow.log_input(training_dataset, context="training")
#     mlflow.log_input(validation_dataset, context="validation")
#     mlflow.log_input(evaluation_dataset, context="evaluation")

#     # Metadata + versioning
#     for split in ["train", "valid", "test"]:
#         meta = extract_yolo_metadata(f"../datasets/data/{split}")
#         mlflow.log_metric(f"{split}_images", meta["num_images"])
#         mlflow.log_metric(f"{split}_labels", meta["num_labels"])
#         mlflow.log_metric(f"{split}_boxes", meta["total_bboxes"])
#         mlflow.log_metric(f"{split}_classes_distribution", meta["images_per_class"])
#         mlflow.log_metric(f"{split}_other_formats", meta["other_format_images"])
#         mlflow.log_metric(f"{split}_missing_labels", meta["missing_label_files"])
#         mlflow.log_metric(f"{split}_empty_labels", meta["empty_label_files"])
#         mlflow.log_metric(f"{split}_images_without_annotations", meta["images_without_annotations"])
#         mlflow.log_metric(f"{split}_objects_per_image", meta["objects_per_image"])
#         mlflow.log_metric(f"{split}_bbox_area", meta["bbox_areas"])
#         mlflow.log_param(
#             f"{split}_dataset_hash",
#             hash_directory(f"../datasets/data/{split}")
#         )

#     mlflow.log_param("dataset_version", "v1.0")

### 3.1. Model Training with Tracking

In [None]:
print("ok")

In [None]:
# Importing libraries
import os
import mlflow
from tqdm import tqdm
from pathlib import Path

# Disable python warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Configure the local MLflow experiment.

# Local directory used as the experiment's artifact location.
MODEL_REGISTRY = Path("./mlruns")
MODEL_REGISTRY.mkdir(parents=True, exist_ok=True)

experiment_name = "Waste-Material-System"
artifact_location = str(MODEL_REGISTRY)
tags = {"version": "v1"}

# mlflow.create_experiment raises when the name is already taken, so only
# create the experiment if it does not exist yet; this keeps the cell safe to
# re-run.
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(
        experiment_name,
        artifact_location=artifact_location,
        tags=tags,
    )

In [None]:
# Print connection information

print(f"Active Experiment: {mlflow.get_experiment_by_name(experiment_name)}")

In [None]:
mlflow.set_experiment(experiment_name)

In [None]:
from ultralytics import YOLO

In [None]:
from ultralytics import settings

In [None]:
# Point the Ultralytics weights and runs directories at ./notebooks/ and set
# the "mlflow" setting to False.
yolo_settings = {
    "weights_dir": "./notebooks/",
    "runs_dir": "./notebooks/",
    "mlflow": False,
}
settings.update(yolo_settings)

In [None]:
# View all settings
print(settings)

In [None]:
# Turn on MLflow autologging with the options listed below.

autolog_options = {
    "log_input_examples": True,
    "log_model_signatures": True,
    "log_models": True,
    "log_datasets": True,
    "log_traces": True,
    # Tag attached by autologging to identify the model family.
    "extra_tags": {"model_type": "YOLOv10b"},
}
mlflow.autolog(**autolog_options)

In [None]:
# Load the dataset configuration from the YAML file
import yaml

# safe_load builds only plain Python types (dict, list, str, numbers) and never
# instantiates tagged Python objects, which is the recommended way to read a
# plain config file. NOTE(review): assumes data.yaml uses no custom YAML tags.
with open("../datasets/data.yaml", "r", encoding="utf-8") as f:
    dataset = yaml.safe_load(f)

In [None]:
dataset

In [None]:
def train_yolov10b_model(
    data_yaml: str,
    epochs: int,
    batch: int,
    optimizer: str,
    patience: int,
    deviced: str,
    save_period: int,
):
    """Train the "yolov10b.pt" model inside an MLflow run.

    The run is opened with ``mlflow.start_run()`` and closed as soon as
    training finishes. Training uses a fixed image size of 640 and the
    project/name pair "yolov10b_runs" / "yolov10b_baseline".

    Args:
        data_yaml: Passed to ``model.train`` as ``data``.
        epochs: Passed as ``epochs``.
        batch: Passed as ``batch``.
        optimizer: Passed as ``optimizer``.
        patience: Passed as ``patience``.
        deviced: Passed as ``device``.
        save_period: Passed as ``save_period``.

    Returns:
        Whatever ``model.train`` returns.
    """
    # Collect every training option up front so the run body stays short.
    training_options = {
        "data": data_yaml,
        "epochs": epochs,
        "imgsz": 640,
        "batch": batch,
        "optimizer": optimizer,
        "patience": patience,
        "project": "yolov10b_runs",
        "name": "yolov10b_baseline",
        "device": deviced,
        "save_period": save_period,
    }

    with mlflow.start_run():
        detector = YOLO("yolov10b.pt")
        outcome = detector.train(**training_options)

    return outcome

In [None]:
# Hyper-parameters for the MLflow-tracked baseline run.
data_yaml = "../datasets/data.yaml"
epochs = 3
batch = 16
optimizer = "AdamW"
patience = 20
deviced = 'cpu'
save_period = 10

# Keyword arguments make the mapping onto the function's parameters explicit.
results = train_yolov10b_model(
    data_yaml=data_yaml,
    epochs=epochs,
    batch=batch,
    optimizer=optimizer,
    patience=patience,
    deviced=deviced,
    save_period=save_period,
)

### 3.2. Model Training

In [None]:
print("ok")

In [None]:
# import libraries

from ultralytics import YOLO
from ultralytics import settings

In [None]:
# Point the Ultralytics weights and runs directories at ./notebooks/.
yolo_settings = {
    "weights_dir": "./notebooks/",
    "runs_dir": "./notebooks/",
}
settings.update(yolo_settings)

In [None]:
# View all settings
print(settings)

In [None]:
def train_yolov10b_model(
        data_yaml: str,
        epochs: int,
        batch: int,
        optimizer: str,
        patience: int,
        deviced: str,
        save_period: int
    ):
    """Train the "yolov10b.pt" detection model and return the training results.

    Training uses a fixed image size of 640, ``cache=True`` and the
    project/name pair "yolov10b_train_runs" / "yolov10b_baseline".

    Args:
        data_yaml: Passed to ``train`` as ``data``.
        epochs: Passed as ``epochs``.
        batch: Passed as ``batch``.
        optimizer: Passed as ``optimizer``.
        patience: Passed as ``patience``.
        deviced: Passed as ``device``.
        save_period: Passed as ``save_period``.

    Returns:
        Whatever ``YOLO.train`` returns.
    """
    # Build the detector and start training in a single expression.
    return YOLO(model="yolov10b.pt", task="detect").train(
        data=data_yaml,
        epochs=epochs,
        imgsz=640,
        batch=batch,
        optimizer=optimizer,
        patience=patience,
        project="yolov10b_train_runs",
        name="yolov10b_baseline",
        device=deviced,
        cache=True,
        save_period=save_period,
    )

In [None]:
# Hyper-parameters for the baseline training run.
data_yaml = "../datasets/data.yaml"
epochs = 3
batch = 16
optimizer = "AdamW"
patience = 20
# "gpu" is not a device string Ultralytics accepts. Valid values include
# "cpu", "mps", and CUDA indices such as "0" or "0,1"; "0" selects the first
# CUDA GPU. Switch back to "cpu" on a machine without CUDA.
deviced = "0"
save_period = 10

train_results = train_yolov10b_model(data_yaml, epochs, batch, optimizer, patience, deviced, save_period)

### 4. Model Validation and Evaluation (Offline metrics)

In [None]:
# Load the best custom checkpoint for evaluation.
# NOTE(review): "path/to/best.pt" looks like a placeholder - replace it with
# the real checkpoint path. This also rebinds `train_results` to a YOLO model
# object, which the cells below call .val() and .predict() on.
train_results = YOLO("path/to/best.pt")

In [None]:
def test_yolov10b_model(
        data_yaml: str,
        batch: int,
        save_json: bool,
        deviced: str,
        model=None,
    ):
    """Run validation for a YOLO model and return the resulting metrics.

    Results are written under the project/name pair
    "yolov10b_test_runs" / "yolov10b_baseline".

    Args:
        data_yaml: Passed to ``val`` as ``data``.
        batch: Passed as ``batch``.
        save_json: Passed as ``save_json``.
        deviced: Passed as ``device``.
        model: Object exposing ``val(...)``. When omitted, the notebook-level
            ``train_results`` variable is used, so existing four-argument
            calls behave as before.

    Returns:
        Whatever ``model.val`` returns.
    """
    # Fall back to the notebook-level model so callers that do not pass one
    # keep working; passing it explicitly avoids the hidden global dependency.
    if model is None:
        model = train_results

    test_results = model.val(
        data=data_yaml,
        batch=batch,
        save_json=save_json,
        device=deviced,
        project="yolov10b_test_runs",
        name="yolov10b_baseline",
    )

    return test_results

In [None]:
# Forwarded to validation as its `save_json` option.
save_json = True

# data_yaml, batch and deviced are reused from the training cell above.
test_results = test_yolov10b_model(
    data_yaml=data_yaml,
    batch=batch,
    save_json=save_json,
    deviced=deviced,
)

In [None]:
test_results.box.map

In [None]:
test_results.box.maps

In [None]:
# mAP at IoU 0.50. The box metrics object exposes `map`, `map50`, `map75` and
# `maps`; there is no `mapiou` attribute (use `map75` for the IoU 0.75 value).
test_results.box.map50

In [None]:
test_results.summary()

### 6. Model Prediction & Exporting

In [None]:
train_results.predict()