In [1]:
from ultralytics import YOLO
import os
import json
from glob import glob
import csv
import cv2
import pytesseract
import torch
import re
import numpy as np
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image 

  from .autonotebook import tqdm as notebook_tqdm


### Convertimos del formato Labelme a formato YOLO (Aplicar a Train, Valid y Test)

In [None]:
input_folder = "plates_model/train"
output_folder = "plates_model/train/labels"
os.makedirs(output_folder, exist_ok=True)

label2id = {
    "plate": 0
}

def convert_points_to_yolo(points, img_w, img_h):
    xs = [p[0] for p in points]
    ys = [p[1] for p in points]
    x_min, x_max = min(xs), max(xs)
    y_min, y_max = min(ys), max(ys)

    x_center = ((x_min + x_max) / 2) / img_w
    y_center = ((y_min + y_max) / 2) / img_h
    width = (x_max - x_min) / img_w
    height = (y_max - y_min) / img_h

    return x_center, y_center, width, height

for json_file in glob(os.path.join(input_folder, "*.json")):
    with open(json_file, "r") as f:
        data = json.load(f)

    img_w = data.get("imageWidth", 1)
    img_h = data.get("imageHeight", 1)
    yolo_lines = []

    for shape in data["shapes"]:
        label = shape["label"]
        if label not in label2id:
            continue
        cls_id = label2id[label]

        x_center, y_center, w, h = convert_points_to_yolo(shape["points"], img_w, img_h)
        yolo_lines.append(f"{cls_id} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}")

    txt_path = os.path.join(output_folder, os.path.basename(json_file).replace(".json", ".txt"))
    with open(txt_path, "w") as f:
        f.write("\n".join(yolo_lines))

### Entrenamiento del modelo Yolo para detección de matrículas

In [None]:
model = YOLO("yolo11n.pt")

"""
Este fue el entrenamiento que le realizamos a 
nuestro modelo para la detección de matriculas
que usaremos posteriormente en la práctica.
"""
results = model.train(
    data="plates_model/data.yaml",  
    epochs=60,                
    imgsz=640,                
    batch=2,                  
    device=0,
    workers=2                  
)

print(results.save_dir) 

### Cargamos los modelos y se lo aplicamos al vídeo de prueba, generando el fichero csv correspondiente para la entrega. (pytesseract)

In [4]:
model_coco = YOLO("yolo11n.pt")  
model_custom = YOLO("runs/detect/train/weights/best.pt")  

video_path = "../videoTest.mp4"
output_path = "salida_detecciones_pytesseract.mp4"
csv_path = "detecciones_pytesseract.csv"

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    print("Error al abrir el video")
    exit()

fps = int(cap.get(cv2.CAP_PROP_FPS))
width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

with open(csv_path, mode='w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([
        "fotograma", "tipo_objeto", "confianza", "identificador_tracking",
        "x1", "y1", "x2", "y2",
        "matricula_en_su_caso", "confianza_matricula",
        "mx1", "my1", "mx2", "my2",
        "texto_matricula"
    ])

    frame_num = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_num += 1
        display_frame = frame.copy()

        results_coco = model_coco.track(
            frame, classes=[0, 2], persist=True, tracker="bytetrack.yaml"
        )
        display_frame = results_coco[0].plot()
        for box in results_coco[0].boxes:
            cls_id = int(box.cls[0].item())
            conf = box.conf[0].item()
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            tipo = "persona" if cls_id == 0 else "coche"
            track_id = int(box.id[0]) if box.id is not None else None

            matricula_en_su_caso = None
            conf_plate = None
            mx1 = my1 = mx2 = my2 = None
            texto_matricula = None

        
            if tipo == "coche":
                car_roi = frame[y1:y2, x1:x2]
                results_plate = model_custom.predict(car_roi, save=False, show=False)

                if len(results_plate[0].boxes) > 0:
                    matricula_en_su_caso = True
                    plate_plot = results_plate[0].plot()
                    display_frame[y1:y2, x1:x2] = plate_plot
                    plate_box = results_plate[0].boxes[0]
                    px1, py1, px2, py2 = map(int, plate_box.xyxy[0])
                    conf_plate = plate_box.conf[0].item()
                    mx1, my1, mx2, my2 = x1 + px1, y1 + py1, x1 + px2, y1 + py2
                    plate_roi = car_roi[py1:py2, px1:px2]
                    plate_roi_rgb = cv2.cvtColor(plate_roi, cv2.COLOR_BGR2RGB)
                    texto_matricula = pytesseract.image_to_string(plate_roi_rgb).replace("\n", " ").strip()



            writer.writerow([
                frame_num, tipo, conf, track_id,
                x1, y1, x2, y2,
                matricula_en_su_caso, conf_plate,
                mx1, my1, mx2, my2,
                texto_matricula
            ])
            
        out.write(display_frame)

        if cv2.waitKey(1) & 0xFF == 27:
            break

cap.release()
out.release()
cv2.destroyAllWindows()

print(f"Video procesado guardado en: {output_path}")
print(f"CSV generado en: {csv_path}")


0: 384x640 4 cars, 106.5ms
Speed: 7.9ms preprocess, 106.5ms inference, 20.2ms postprocess per image at shape (1, 3, 384, 640)

0: 512x640 (no detections), 64.0ms
Speed: 2.1ms preprocess, 64.0ms inference, 1.5ms postprocess per image at shape (1, 3, 512, 640)

0: 416x640 (no detections), 59.5ms
Speed: 2.2ms preprocess, 59.5ms inference, 1.0ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 (no detections), 8.2ms
Speed: 3.2ms preprocess, 8.2ms inference, 1.2ms postprocess per image at shape (1, 3, 416, 640)

0: 480x640 (no detections), 62.5ms
Speed: 1.4ms preprocess, 62.5ms inference, 0.6ms postprocess per image at shape (1, 3, 480, 640)

0: 384x640 4 cars, 10.0ms
Speed: 3.1ms preprocess, 10.0ms inference, 4.3ms postprocess per image at shape (1, 3, 384, 640)

0: 512x640 (no detections), 11.0ms
Speed: 1.6ms preprocess, 11.0ms inference, 0.8ms postprocess per image at shape (1, 3, 512, 640)

0: 416x640 (no detections), 9.6ms
Speed: 1.7ms preprocess, 9.6ms inference, 0.7ms pos

## Cargamos el modelo y se lo aplicamos al vídeo de prueba, generando el fichero csv correspondiente para la entrega. (SmolVLM)

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

print("Cargando SmolVLM...")
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForImageTextToText.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="./offload",
    offload_state_dict=True
)

model_coco = YOLO("yolo11n.pt")  
model_custom = YOLO("runs/detect/train/weights/best.pt")


messages_prompt = "User: <image> Can you give me the text in the license plate of the image?(only plate text)\nAssistant:"

def extract_plate_text_smolvlm(plate_image_bgr):
    """Extrae el texto de la matrícula usando SmolVLM."""
    if plate_image_bgr is None or plate_image_bgr.size == 0:
        return None
    
    plate_rgb = cv2.cvtColor(plate_image_bgr, cv2.COLOR_BGR2RGB)
    image_pil = Image.fromarray(plate_rgb)
    

    inputs = processor(text=messages_prompt, images=[image_pil], return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    generated_ids = model.generate(**inputs, max_new_tokens=8)
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    
    respuesta = generated_texts[0].split("Assistant:")[-1].strip()
    

    match = re.search(r'\b\d{3,4}\s?[A-Z]{1,3}\b', respuesta)
    if match:
        respuesta_limpia = match.group(0)
    else:
        respuesta_limpia = respuesta
    
    respuesta_limpia = respuesta_limpia.replace(" ", "").replace("\n", "").replace("-", "")
    return respuesta_limpia


video_path = "../videoTest.mp4"
output_path = "salida_detecciones_SmolVLM.mp4"
csv_path = "detecciones_SmolVLM.csv"

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    print("Error al abrir el video")
    exit()

fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

with open(csv_path, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow([
        "fotograma", "tipo_objeto", "confianza", "identificador_tracking",
        "x1", "y1", "x2", "y2",
        "matricula_en_su_caso", "confianza_matricula",
        "mx1", "my1", "mx2", "my2",
        "texto_matricula"
    ])

    frame_num = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        frame_num += 1
        print(f"Procesando frame {frame_num}...", end='\r')
        
        display_frame = frame.copy()

        results_coco = model_coco.track(
            frame, classes=[0, 2], persist=True, tracker="bytetrack.yaml"
        )
        display_frame = results_coco[0].plot()
        
        for box in results_coco[0].boxes:
            cls_id = int(box.cls[0].item())
            conf = box.conf[0].item()
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            tipo = "persona" if cls_id == 0 else "coche"
            track_id = int(box.id[0]) if box.id is not None else None

            matricula_en_su_caso = None
            conf_plate = None
            mx1 = my1 = mx2 = my2 = None
            texto_matricula = None

            if tipo == "coche":
                car_roi = frame[y1:y2, x1:x2]
                results_plate = model_custom.predict(car_roi, save=False, verbose=False)

                if len(results_plate[0].boxes) > 0:
                    matricula_en_su_caso = True
                    plate_plot = results_plate[0].plot()
                    display_frame[y1:y2, x1:x2] = plate_plot
                    
                    plate_box = results_plate[0].boxes[0]
                    px1, py1, px2, py2 = map(int, plate_box.xyxy[0])
                    conf_plate = plate_box.conf[0].item()
                    mx1, my1, mx2, my2 = x1 + px1, y1 + py1, x1 + px2, y1 + py2
                    
                    plate_roi = car_roi[py1:py2, px1:px2]
                    texto_matricula = extract_plate_text_smolvlm(plate_roi)
                    
                    if texto_matricula:
                        print(f"\nFrame {frame_num} - Matrícula detectada: {texto_matricula}")

            writer.writerow([
                frame_num, tipo, conf, track_id,
                x1, y1, x2, y2,
                matricula_en_su_caso, conf_plate,
                mx1, my1, mx2, my2,
                texto_matricula
            ])

        out.write(display_frame)

        if cv2.waitKey(1) & 0xFF == 27:  
            break

cap.release()
out.release()
cv2.destroyAllWindows()

print(f"\n Video procesado guardado en: {output_path}")
print(f" CSV generado en: {csv_path}")

Using device: cuda
Cargando SmolVLM...


`torch_dtype` is deprecated! Use `dtype` instead!
Some parameters are on the meta device because they were offloaded to the cpu.


Procesando frame 1...
0: 384x640 4 cars, 77.2ms
Speed: 53.1ms preprocess, 77.2ms inference, 60.4ms postprocess per image at shape (1, 3, 384, 640)
Procesando frame 2...
0: 384x640 4 cars, 7.6ms
Speed: 2.0ms preprocess, 7.6ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)
Procesando frame 3...
0: 384x640 4 cars, 9.5ms
Speed: 2.0ms preprocess, 9.5ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)
Procesando frame 4...
0: 384x640 4 cars, 9.2ms
Speed: 1.9ms preprocess, 9.2ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)
Procesando frame 5...
0: 384x640 4 cars, 10.2ms
Speed: 2.0ms preprocess, 10.2ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)
Procesando frame 6...
0: 384x640 4 cars, 8.8ms
Speed: 3.9ms preprocess, 8.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)
Procesando frame 7...
0: 384x640 4 cars, 10.2ms
Speed: 1.9ms preprocess, 10.2ms inference, 1.8ms postprocess per image at shape (1, 3, 3

### Contamos el número de coche y personas que aparecieron en el vídeo

In [3]:
def countNumberOfUniqueCarsAndPersons(archivo_csv):
    tuplas = []

    with open(archivo_csv, newline='', encoding='utf-8') as csvfile:
        lector = csv.DictReader(csvfile)
        for fila in lector:
            tipo_objeto = fila['tipo_objeto']
            identificador_tracking = fila['identificador_tracking']  
            tuplas.append((tipo_objeto, identificador_tracking))

    car_set = set()
    person_set = set()

    for item in tuplas:
        if item[0] == 'coche':
            car_set.add(item[1])
        elif item[0] == 'persona':
            person_set.add(item[1])

    print(f"Número de coches únicos detectados: {len(car_set)}")
    print(f"Número de personas únicas detectadas: {len(person_set)}")

countNumberOfUniqueCarsAndPersons("detecciones_SmolVLM.csv")


Número de coches únicos detectados: 256
Número de personas únicas detectadas: 47
