# 1. Brute Force

In [1]:
import pyautogui
import threading
import tkinter as tk
import string
import time
import ctypes  # Used for switching keyboard input language on Windows

# Global variable to control the typing loop
running = False

# Keyboard layout ID for English (EN-US) in Windows
ENGLISH_KEYBOARD_LAYOUT = 0x0409

def is_english_keyboard():
    """Check if the current keyboard layout is set to English."""
    current_layout = ctypes.windll.user32.GetKeyboardLayout(0) & 0xFFFF
    return current_layout == ENGLISH_KEYBOARD_LAYOUT

def switch_to_english_keyboard():
    """Switch the keyboard layout to English (EN-US)."""
    hwnd = ctypes.windll.user32.GetForegroundWindow()
    ctypes.windll.user32.PostMessageW(hwnd, 0x0050, 0, ENGLISH_KEYBOARD_LAYOUT)
    print("Switched keyboard layout to English.")

def countdown(seconds):
    """Display a countdown before starting the typing process."""
    for i in range(seconds, 0, -1):
        print(f"Starting in {i} seconds...")
        time.sleep(1)

def type_characters():
    """Automated typing function that continuously types characters while 'running' is True."""
    global running

    # Ensure the keyboard layout is set to English before starting
    if not is_english_keyboard():
        print("Current keyboard layout is not English. Switching to English...")
        switch_to_english_keyboard()

    # Define the set of characters to be typed
    characters = string.ascii_lowercase + "0123456789'-"
    combined_text = characters + "0123456789"  # Merged text to be typed in each loop

    # Countdown before starting to type
    countdown(5)

    while running:
        try:
            # Type all characters at once without delay
            pyautogui.typewrite(combined_text, interval=0)

            # Press 'Tab' key after each sequence
            pyautogui.press('tab')

        except pyautogui.FailSafeException:
            print("Fail-safe triggered. Stopping typing process.")
            break

def start_typing():
    """Start the automated typing process in a separate thread."""
    global running
    running = True
    threading.Thread(target=type_characters, daemon=True).start()

def stop_typing():
    """Stop the automated typing process."""
    global running
    running = False

# Create GUI using Tkinter
root = tk.Tk()
root.title("Type Characters Controller")

# Start button
start_button = tk.Button(root, text="Start", command=start_typing, width=10)
start_button.pack(pady=10)

# Stop button
stop_button = tk.Button(root, text="Stop", command=stop_typing, width=10)
stop_button.pack(pady=10)

# Start the Tkinter event loop
root.mainloop()


Starting in 5 seconds...
Starting in 4 seconds...
Starting in 3 seconds...
Starting in 2 seconds...
Starting in 1 seconds...


# 2. YOLO11+EasyOCR

In [2]:
import mss
import cv2
import numpy as np
import tkinter as tk
from tkinter import ttk
from PIL import Image, ImageTk
from ultralytics import YOLO
import torch
import re
from easyocr import Reader
import threading
import queue
import time
import pyautogui
import random
from collections import defaultdict
import os
import sys
import csv
from datetime import datetime

# Suppress YOLO print outputs
class SuppressStdout:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout.close()
        sys.stdout = self._original_stdout

# ฟังก์ชัน Crop กลาง Bounding Box
def center_crop(image, x1, y1, x2, y2, scale=0.3):
    center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
    width = int((x2 - x1) * scale)
    height = int((y2 - y1) * scale)

    new_x1 = max(center_x - width // 2, 0)
    new_y1 = max(center_y - height // 2, 0)
    new_x2 = min(center_x + width // 2, image.shape[1])
    new_y2 = min(center_y + height // 2, image.shape[0])

    return image[new_y1:new_y2, new_x1:new_x2]

# OCR Cleaning Function for textbox
def clean_and_correct_text(text):
    cleaned_text = re.sub(r"[^a-zA-Z0-9\s']", "", text)  # Remove unwanted characters
    return text.strip()

class ObjectDetectionApp:
    def __init__(self, model_path):
        self.root = tk.Tk()
        self.root.title("Object Detection with OCR")
        self.photo = None
        self.frame_queue = queue.Queue(maxsize=2)
        self.running = False
        self.capture_thread = None
        self.monitor = None
        self.color_map = defaultdict(lambda: [random.randint(0, 255) for _ in range(3)])  # Random color for each class
        
        # Confidence thresholds for each class
        self.confidence_thresholds = {
            "quick_text": 0.5,  # Confidence for quick_text
            "textbox": 0.7,     # Confidence for textbox
            "other": 0.1        # Confidence for other classes
        }
        
        self.setup_ui()
        self.setup_model(model_path)
        self.setup_monitor()
        self.easyocr_reader = Reader(['en'], gpu=torch.cuda.is_available())
        self.log_file = "detection_log.csv"  # Log file
        self.setup_logging()

    def setup_ui(self):
        self.image_label = ttk.Label(self.root)
        self.image_label.pack(padx=10, pady=10)

        controls_frame = ttk.Frame(self.root)
        controls_frame.pack(pady=10)

        ttk.Button(controls_frame, text="Start", command=self.start_detection).pack(side=tk.LEFT, padx=5)
        ttk.Button(controls_frame, text="Stop", command=self.stop_detection).pack(side=tk.LEFT, padx=5)

    def setup_model(self, model_path):
        with SuppressStdout():
            self.model = YOLO(model_path)
        if torch.cuda.is_available():
            self.model.to('cuda')
            print("YOLO model running on GPU.")
        else:
            print("YOLO model running on CPU.")

    def setup_monitor(self):
        with mss.mss() as screen:
            for monitor in screen.monitors:
                if monitor["width"] == 1920 and monitor["height"] == 1080 and monitor["left"] == 0 and monitor["top"] == 0:
                    self.monitor = monitor
                    print(f"Selected Monitor: {self.monitor}")
                    return
        raise ValueError("Monitor with resolution 1920x1080 and position (0, 0) not found.")

    def setup_logging(self):
        with open(self.log_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Timestamp", "Class", "Text", "Action", "Confidence"])

    def log_detection(self, cls_name, text, action, confidence):
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open(self.log_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow([timestamp, cls_name, text, action, f"{confidence:.2f}"])

    def start_detection(self):
        if not self.running:
            self.running = True
            self.capture_thread = threading.Thread(target=self.detect_objects, daemon=True)
            self.capture_thread.start()
            self.update_gui()

    def stop_detection(self):
        self.running = False
        if self.capture_thread and self.capture_thread.is_alive():
            self.capture_thread.join(timeout=1)
        print("Detection stopped and resources cleared.")

    def detect_objects(self):
        with mss.mss() as screen:
            while self.running:
                try:
                    screen_shot = screen.grab(self.monitor)
                    frame = np.array(screen_shot)
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)

                    results = self.model(frame, conf=0.1)
                    quick_texts = []
                    textbox_texts = []
                    other_classes = []

                    for r in results:
                        for box in r.boxes:
                            x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                            cls_id = int(box.cls[0])
                            cls_name = self.model.names[cls_id]
                            confidence = float(box.conf[0])

                            # Use class-specific confidence threshold
                            threshold = self.confidence_thresholds.get(cls_name, self.confidence_thresholds["other"])
                            if confidence < threshold:
                                continue  # Skip low-confidence detections

                            color = self.color_map[cls_name]

                            # Draw bounding box
                            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                            label = f"{cls_name} {confidence:.2f}"
                            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                            if cls_name == 'quick_text':
                                cropped_frame = center_crop(frame, x1, y1, x2, y2, scale=0.3)
                                gray = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2GRAY)
                                ocr_results = self.easyocr_reader.readtext(gray, detail=1)
                                quick_text = " ".join([result[1] for result in ocr_results]).strip()
                                self.log_detection(cls_name, quick_text, "Typed", confidence)
                                quick_texts.append(quick_text)

                                if quick_text:
                                    cv2.putText(frame, quick_text, (x1, y2 + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                            elif cls_name == 'textbox':
                                cropped_frame = frame[y1:y2, x1:x2]
                                gray = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2GRAY)
                                ocr_results = self.easyocr_reader.readtext(gray, detail=1)
                                merged_text = " ".join([result[1] for result in ocr_results])
                                corrected_text = clean_and_correct_text(merged_text)
                                self.log_detection(cls_name, corrected_text, "Typed", confidence)
                                textbox_texts.append(corrected_text)

                                if corrected_text:
                                    cv2.putText(frame, corrected_text, (x1, y2 + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                            else:
                                self.log_detection(cls_name, "", "Tab Pressed", confidence)
                                other_classes.append(cls_name)

                    for text in quick_texts:
                        pyautogui.typewrite(text, interval=0.0001)

                    for text in textbox_texts:
                        pyautogui.typewrite(text, interval=0.0001)

                    if other_classes:
                        pyautogui.press('tab')

                    resized_frame = cv2.resize(frame, None, fx=0.7, fy=0.7, interpolation=cv2.INTER_AREA)
                    frame_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
                    image = Image.fromarray(frame_rgb)

                    if not self.frame_queue.full():
                        self.frame_queue.put(image)

                except Exception as e:
                    print(f"Detection error: {e}")
                    self.running = False

        cv2.destroyAllWindows()

    def update_gui(self):
        if not self.running:
            return
        try:
            if not self.frame_queue.empty():
                image = self.frame_queue.get()
                self.photo = ImageTk.PhotoImage(image=image)
                self.image_label.configure(image=self.photo)
        except Exception as e:
            print(f"GUI update error: {e}")
        self.root.after(30, self.update_gui)

    def run(self):
        self.root.protocol("WM_DELETE_WINDOW", self.on_close)
        self.root.mainloop()

    def on_close(self):
        self.stop_detection()
        self.root.destroy()

def main():
    model_path = r'c:\is_project_totd\textbox_label-4\models\type_frame_detector\weights\best.pt'
    app = ObjectDetectionApp(model_path)
    try:
        app.run()
    except Exception as e:
        print(f"Error: {e}")
    finally:
        print("Application closed successfully.")

if __name__ == "__main__":
    main()


YOLO model running on GPU.
Selected Monitor: {'left': 0, 'top': 0, 'width': 1920, 'height': 1080}
Detection stopped and resources cleared.
Detection stopped and resources cleared.
Application closed successfully.


# 3. YOLO11+Tesseract

In [11]:
import mss
import cv2
import numpy as np
import tkinter as tk
from tkinter import ttk
from PIL import Image, ImageTk
from ultralytics import YOLO
import torch
import re
import pytesseract
import threading
import queue
import time
import pyautogui
import random
from collections import defaultdict
import os
import sys
import csv
from datetime import datetime

# ตั้งค่า path ของ Tesseract OCR
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Suppress YOLO print outputs
class SuppressStdout:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout.close()
        sys.stdout = self._original_stdout

# ฟังก์ชัน Crop กลาง Bounding Box
def center_crop(image, x1, y1, x2, y2, scale=0.3):
    center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
    width = int((x2 - x1) * scale)
    height = int((y2 - y1) * scale)

    new_x1 = max(center_x - width // 2, 0)
    new_y1 = max(center_y - height // 2, 0)
    new_x2 = min(center_x + width // 2, image.shape[1])
    new_y2 = min(center_y + height // 2, image.shape[0])

    return image[new_y1:new_y2, new_x1:new_x2]

# OCR Cleaning Function for textbox
def clean_and_correct_text(text):
    cleaned_text = re.sub(r"[^a-zA-Z0-9\s']", "", text)  # Remove unwanted characters
    return text.strip()

class ObjectDetectionApp:
    def __init__(self, model_path):
        self.root = tk.Tk()
        self.root.title("Object Detection with OCR")
        self.photo = None
        self.frame_queue = queue.Queue(maxsize=2)
        self.running = False
        self.capture_thread = None
        self.monitor = None
        self.color_map = defaultdict(lambda: [random.randint(0, 255) for _ in range(3)])  # Random color for each class
        
        # Confidence thresholds for each class
        self.confidence_thresholds = {
            "quick_text": 0.5,  # Confidence for quick_text
            "textbox": 0.7,     # Confidence for textbox
            "other": 0.1        # Confidence for other classes
        }
        
        self.setup_ui()
        self.setup_model(model_path)
        self.setup_monitor()
        self.log_file = "detection_log.csv"  # Log file
        self.setup_logging()

    def setup_ui(self):
        self.image_label = ttk.Label(self.root)
        self.image_label.pack(padx=10, pady=10)

        controls_frame = ttk.Frame(self.root)
        controls_frame.pack(pady=10)

        ttk.Button(controls_frame, text="Start", command=self.start_detection).pack(side=tk.LEFT, padx=5)
        ttk.Button(controls_frame, text="Stop", command=self.stop_detection).pack(side=tk.LEFT, padx=5)

    def setup_model(self, model_path):
        with SuppressStdout():
            self.model = YOLO(model_path)
        if torch.cuda.is_available():
            self.model.to('cuda')
            print("YOLO model running on GPU.")
        else:
            print("YOLO model running on CPU.")

    def setup_monitor(self):
        with mss.mss() as screen:
            for monitor in screen.monitors:
                if monitor["width"] == 1920 and monitor["height"] == 1080 and monitor["left"] == 0 and monitor["top"] == 0:
                    self.monitor = monitor
                    print(f"Selected Monitor: {self.monitor}")
                    return
        raise ValueError("Monitor with resolution 1920x1080 and position (0, 0) not found.")

    def setup_logging(self):
        with open(self.log_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Timestamp", "Class", "Text", "Action", "Confidence"])

    def log_detection(self, cls_name, text, action, confidence):
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open(self.log_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow([timestamp, cls_name, text, action, f"{confidence:.2f}"])

    def start_detection(self):
        if not self.running:
            self.running = True
            self.capture_thread = threading.Thread(target=self.detect_objects, daemon=True)
            self.capture_thread.start()
            self.update_gui()

    def stop_detection(self):
        self.running = False
        if self.capture_thread and self.capture_thread.is_alive():
            self.capture_thread.join(timeout=1)
        print("Detection stopped and resources cleared.")

    def detect_objects(self):
        with mss.mss() as screen:
            while self.running:
                try:
                    screen_shot = screen.grab(self.monitor)
                    frame = np.array(screen_shot)
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)

                    results = self.model(frame, conf=0.1)
                    quick_texts = []
                    textbox_texts = []
                    other_classes = []

                    for r in results:
                        for box in r.boxes:
                            x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                            cls_id = int(box.cls[0])
                            cls_name = self.model.names[cls_id]
                            confidence = float(box.conf[0])

                            # Use class-specific confidence threshold
                            threshold = self.confidence_thresholds.get(cls_name, self.confidence_thresholds["other"])
                            if confidence < threshold:
                                continue  # Skip low-confidence detections

                            color = self.color_map[cls_name]

                            # Draw bounding box
                            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                            label = f"{cls_name} {confidence:.2f}"
                            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                            if cls_name == 'quick_text':
                                cropped_frame = center_crop(frame, x1, y1, x2, y2, scale=0.3)
                                gray = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2GRAY)

                                quick_text = pytesseract.image_to_string(gray, lang='eng').strip()
                                self.log_detection(cls_name, quick_text, "Typed", confidence)
                                quick_texts.append(quick_text)

                                if quick_text:
                                    cv2.putText(frame, quick_text, (x1, y2 + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                            elif cls_name == 'textbox':
                                cropped_frame = frame[y1:y2, x1:x2]
                                gray = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2GRAY)
                                merged_text = pytesseract.image_to_string(gray, lang='eng').strip()
                                corrected_text = clean_and_correct_text(merged_text)
                                self.log_detection(cls_name, corrected_text, "Typed", confidence)
                                textbox_texts.append(corrected_text)

                                if corrected_text:
                                    cv2.putText(frame, corrected_text, (x1, y2 + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                            else:
                                self.log_detection(cls_name, "", "Tab Pressed", confidence)
                                other_classes.append(cls_name)

                    for text in quick_texts:
                        pyautogui.typewrite(text, interval=0.0001)

                    for text in textbox_texts:
                        pyautogui.typewrite(text, interval=0.0001)

                    if other_classes:
                        pyautogui.press('tab')

                    resized_frame = cv2.resize(frame, None, fx=0.7, fy=0.7, interpolation=cv2.INTER_AREA)
                    frame_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
                    image = Image.fromarray(frame_rgb)

                    if not self.frame_queue.full():
                        self.frame_queue.put(image)

                except Exception as e:
                    print(f"Detection error: {e}")
                    self.running = False

        cv2.destroyAllWindows()

    def update_gui(self):
        if not self.running:
            return
        try:
            if not self.frame_queue.empty():
                image = self.frame_queue.get()
                self.photo = ImageTk.PhotoImage(image=image)
                self.image_label.configure(image=self.photo)
        except Exception as e:
            print(f"GUI update error: {e}")
        self.root.after(30, self.update_gui)

    def run(self):
        self.root.protocol("WM_DELETE_WINDOW", self.on_close)
        self.root.mainloop()

    def on_close(self):
        self.stop_detection()
        self.root.destroy()

def main():
    model_path = r'c:\is_project_totd\textbox_label-4\models\type_frame_detector\weights\best.pt'
    app = ObjectDetectionApp(model_path)
    try:
        app.run()
    except Exception as e:
        print(f"Error: {e}")
    finally:
        print("Application closed successfully.")

if __name__ == "__main__":
    main()

YOLO model running on GPU.
Selected Monitor: {'left': 0, 'top': 0, 'width': 1920, 'height': 1080}

0: 384x640 1 textbox, 50.8ms
Speed: 11.8ms preprocess, 50.8ms inference, 128.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 textbox, 39.5ms
Speed: 2.0ms preprocess, 39.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 textboxs, 37.5ms
Speed: 2.5ms preprocess, 37.5ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 textbox, 29.9ms
Speed: 2.4ms preprocess, 29.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 textbox, 32.1ms
Speed: 2.1ms preprocess, 32.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 textboxs, 28.2ms
Speed: 1.1ms preprocess, 28.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 textboxs, 22.1ms
Speed: 1.5ms preprocess, 22.1ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384

# 4. YOLO11+EasyOCR+SymSpell

In [1]:
import mss
import cv2
import numpy as np
import tkinter as tk
from tkinter import ttk
from PIL import Image, ImageTk
from ultralytics import YOLO
import torch
import re
from easyocr import Reader
import threading
import queue
import time
import pyautogui
import random
from collections import defaultdict
import os
import sys
import csv
from datetime import datetime
from symspellpy import SymSpell, Verbosity

# Suppress YOLO print outputs
class SuppressStdout:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout.close()
        sys.stdout = self._original_stdout

def center_crop(image, x1, y1, x2, y2, scale=0.5):
    center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
    width = int((x2 - x1) * scale)
    height = int((y2 - y1) * scale)

    new_x1 = max(center_x - width // 2, 0)
    new_y1 = max(center_y - height // 2, 0)
    new_x2 = min(center_x + width // 2, image.shape[1])
    new_y2 = min(center_y + height // 2, image.shape[0])

    return image[new_y1:new_y2, new_x1:new_x2]

# OCR Cleaning Function for textbox
def clean_and_correct_text(text):
    cleaned_text = re.sub(r"[^a-zA-Z0-9\s']", "", text)  # Remove unwanted characters
    return text.strip()


def setup_symspell(dictionary_path):
    max_edit_distance = 2
    prefix_length = 7
    sym_spell = SymSpell(max_dictionary_edit_distance=max_edit_distance, prefix_length=prefix_length)
    if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
        raise FileNotFoundError(f"Dictionary file not found: {dictionary_path}")
    return sym_spell

def correct_with_symspell(sym_spell, text):
    """
    Text correction that skips words starting with capital letters
    """
    if not text:
        return ""
    
    words = text.split()
    corrected_words = []
    
    for word in words:
        if word and word[0].isupper():
            corrected_words.append(word)
            continue
            
        if word.isdigit():
            corrected_words.append(word)
            continue
        
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        if suggestions:
            corrected_words.append(suggestions[0].term)
        else:
            corrected_words.append(word)  # ถ้าไม่มีคำแนะนำ ให้ใช้คำเดิม
    
    return " ".join(corrected_words)

class ObjectDetectionApp:
    def __init__(self, model_path, dictionary_path):
        self.root = tk.Tk()
        self.root.title("Object Detection with OCR")
        self.photo = None
        self.frame_queue = queue.Queue(maxsize=2)
        self.running = False
        self.capture_thread = None
        self.monitor = None
        self.color_map = defaultdict(lambda: [random.randint(0, 255) for _ in range(3)])  # Random color for each class
        
        # Confidence thresholds for each class
        self.confidence_thresholds = {
            "quick_text": 0.5,  # Confidence for quick_text
            "textbox": 0.7,     # Confidence for textbox
            "other": 0.1        # Confidence for other classes
        }

        self.setup_ui()
        self.setup_model(model_path)
        self.setup_monitor()
        self.easyocr_reader = Reader(['en'], gpu=torch.cuda.is_available())
        self.sym_spell = setup_symspell(dictionary_path)  # ตั้งค่า SymSpell
        self.log_file = "detection_log.csv"  # Log file
        self.setup_logging()

    def setup_ui(self):
        self.image_label = ttk.Label(self.root)
        self.image_label.pack(padx=10, pady=10)

        controls_frame = ttk.Frame(self.root)
        controls_frame.pack(pady=10)

        ttk.Button(controls_frame, text="Start", command=self.start_detection).pack(side=tk.LEFT, padx=5)
        ttk.Button(controls_frame, text="Stop", command=self.stop_detection).pack(side=tk.LEFT, padx=5)

    def setup_model(self, model_path):
        with SuppressStdout():
            self.model = YOLO(model_path)
        if torch.cuda.is_available():
            self.model.to('cuda')
            print("YOLO model running on GPU.")
        else:
            print("YOLO model running on CPU.")

    def setup_monitor(self):
        with mss.mss() as screen:
            for monitor in screen.monitors:
                if monitor["width"] == 1920 and monitor["height"] == 1080 and monitor["left"] == 0 and monitor["top"] == 0:
                    self.monitor = monitor
                    print(f"Selected Monitor: {self.monitor}")
                    return
        raise ValueError("Monitor with resolution 1920x1080 and position (0, 0) not found.")

    def setup_logging(self):
        with open(self.log_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Timestamp", "Class", "Text", "Action", "Confidence"])

    def log_detection(self, cls_name, text, action, confidence):
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open(self.log_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow([timestamp, cls_name, text, action, f"{confidence:.2f}"])

    def start_detection(self):
        if not self.running:
            self.running = True
            self.capture_thread = threading.Thread(target=self.detect_objects, daemon=True)
            self.capture_thread.start()
            self.update_gui()

    def stop_detection(self):
        self.running = False
        if self.capture_thread and self.capture_thread.is_alive():
            self.capture_thread.join(timeout=1)
        print("Detection stopped and resources cleared.")

    def detect_objects(self):
        with mss.mss() as screen:
            while self.running:
                try:
                    screen_shot = screen.grab(self.monitor)
                    frame = np.array(screen_shot)
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)

                    results = self.model(frame, conf=0.1)
                    quick_texts = []
                    textbox_texts = []
                    other_classes = []  # Initialize other_classes list

                    for r in results:
                        for box in r.boxes:
                            x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                            cls_id = int(box.cls[0])
                            cls_name = self.model.names[cls_id]
                            confidence = float(box.conf[0])

                            # Use class-specific confidence threshold
                            threshold = self.confidence_thresholds.get(cls_name, self.confidence_thresholds["other"])
                            if confidence < threshold:
                                continue  # Skip low-confidence detections

                            color = self.color_map[cls_name]
                            # Draw bounding box and annotate
                            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                            label = f"{cls_name} {confidence:.2f}"
                            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                            if cls_name == 'quick_text':
                                cropped_frame = center_crop(frame, x1, y1, x2, y2, scale=0.5)
                                gray = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2GRAY)
                                ocr_results = self.easyocr_reader.readtext(gray, detail=1)
                                quick_text = " ".join([result[1] for result in ocr_results]).strip()
                                self.log_detection(cls_name, quick_text, "Typed", confidence)
                                quick_texts.append(quick_text)

                                if quick_text:
                                    cv2.putText(frame, quick_text, (x1, y2 + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                            elif cls_name == 'textbox':
                                cropped_frame = frame[y1:y2, x1:x2]
                                gray = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2GRAY)
                                ocr_results = self.easyocr_reader.readtext(gray, detail=1)
                                merged_text = " ".join([result[1] for result in ocr_results])
                                corrected_text = clean_and_correct_text(merged_text)

                                # SymSpell correction with capital letter handling
                                corrected_text = correct_with_symspell(self.sym_spell, corrected_text)
                                
                                self.log_detection(cls_name, corrected_text, "Typed", confidence)
                                textbox_texts.append(corrected_text)

                                if corrected_text:
                                    cv2.putText(frame, corrected_text, (x1, y2 + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                            else:
                                self.log_detection(cls_name, "", "Tab Pressed", confidence)
                                other_classes.append(cls_name)  # Add to other_classes list

                    # Process detected texts and actions
                    for text in quick_texts:
                        pyautogui.typewrite(text, interval=0.0001)

                    for text in textbox_texts:
                        pyautogui.typewrite(text, interval=0.0001)

                    # Press tab if other classes were detected
                    if other_classes:
                        pyautogui.press('tab')

                    resized_frame = cv2.resize(frame, None, fx=0.7, fy=0.7, interpolation=cv2.INTER_AREA)
                    frame_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
                    image = Image.fromarray(frame_rgb)

                    if not self.frame_queue.full():
                        self.frame_queue.put(image)

                except Exception as e:
                    print(f"Detection error: {e}")
                    self.running = False

        cv2.destroyAllWindows()

    def update_gui(self):
        if not self.running:
            return
        try:
            if not self.frame_queue.empty():
                image = self.frame_queue.get()
                self.photo = ImageTk.PhotoImage(image=image)
                self.image_label.configure(image=self.photo)
        except Exception as e:
            print(f"GUI update error: {e}")
        self.root.after(30, self.update_gui)

    def run(self):
        self.root.protocol("WM_DELETE_WIDOW", self.on_close)
        self.root.mainloop()

    def on_close(self):
        self.stop_detection()
        self.root.destroy()

def main():
    model_path = r'c:\is_project_totd\textbox_label-4\models\type_frame_detector\weights\best.pt'
    dictionary_path = r"C:\is_project_totd\frequency_dictionary_en_82_765.txt"
    app = ObjectDetectionApp(model_path, dictionary_path)
    try:
        app.run()
    except Exception as e:
        print(f"Error: {e}")
    finally:
        print("Application closed successfully.")

if __name__ == "__main__":
    main()

YOLO model running on GPU.
Selected Monitor: {'left': 0, 'top': 0, 'width': 1920, 'height': 1080}

0: 384x640 1 health, 37.5ms
Speed: 3.3ms preprocess, 37.5ms inference, 52.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 47.6ms
Speed: 2.0ms preprocess, 47.6ms inference, 3.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 51.0ms
Speed: 1.5ms preprocess, 51.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 16.7ms
Speed: 2.0ms preprocess, 16.7ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 16.6ms
Speed: 2.1ms preprocess, 16.6ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 15.8ms
Speed: 2.5ms preprocess, 15.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 15.1ms
Speed: 2.3ms preprocess, 15.1ms inference, 1.0ms postprocess per image at 

# 5. YOLO11+EasyOCR+SymSpell+SortTrack+Bruteforce

In [2]:
import mss
import cv2
import numpy as np
import tkinter as tk
from tkinter import ttk
from PIL import Image, ImageTk
from ultralytics import YOLO
import torch
import re
from easyocr import Reader
import threading
import queue
import time
import pyautogui
import random
from collections import defaultdict
import os
import sys
import csv
from datetime import datetime
from symspellpy import SymSpell, Verbosity
from sort_tracker import Sort

# Suppress YOLO print outputs
class SuppressStdout:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout.close()
        sys.stdout = self._original_stdout

def center_crop(image, x1, y1, x2, y2, scale=0.5):
    center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
    width = int((x2 - x1) * scale)
    height = int((y2 - y1) * scale)

    new_x1 = max(center_x - width // 2, 0)
    new_y1 = max(center_y - height // 2, 0)
    new_x2 = min(center_x + width // 2, image.shape[1])
    new_y2 = min(center_y + height // 2, image.shape[0])

    return image[new_y1:new_y2, new_x1:new_x2]

def clean_and_correct_text(text):
    cleaned_text = re.sub(r"[^a-zA-Z0-9\s']", "", text)
    return text.strip()

def setup_symspell(dictionary_path):
    max_edit_distance = 2
    prefix_length = 7
    sym_spell = SymSpell(max_dictionary_edit_distance=max_edit_distance, prefix_length=prefix_length)
    if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
        raise FileNotFoundError(f"Dictionary file not found: {dictionary_path}")
    return sym_spell

def correct_with_symspell(sym_spell, text):
    if not text:
        return ""
    
    words = text.split()
    corrected_words = []
    
    for word in words:
        if word and word[0].isupper():
            corrected_words.append(word)
            continue
            
        if word.isdigit():
            corrected_words.append(word)
            continue
        
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        if suggestions:
            corrected_words.append(suggestions[0].term)
        else:
            corrected_words.append(word)
    
    return " ".join(corrected_words)

class ObjectDetectionApp:
    def __init__(self, model_path, dictionary_path):
        self.root = tk.Tk()
        self.root.title("Object Detection with OCR")
        self.photo = None
        self.frame_queue = queue.Queue(maxsize=2)
        self.running = False
        self.capture_thread = None
        self.monitor = None
        self.color_map = defaultdict(lambda: [random.randint(0, 255) for _ in range(3)])
        
        # Tracking related attributes
        self.tracker = Sort(max_age=15, min_hits=2, iou_threshold=0.05)
        self.tracked_ids = {}
        self.id_attempts = {}
        self.id_original_texts = {}
        self.brute_force_states = {}
        
        # Constants for tracking
        self.MAX_ATTEMPTS = 3
        self.STUCK_THRESHOLD = 3
        self.ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        
        # Confidence thresholds
        self.confidence_thresholds = {
            "quick_text": 0.5,
            "textbox": 0.7,
            "other": 0.1
        }

        # Initialize OCR and models first
        self.easyocr_reader = Reader(['en'], gpu=torch.cuda.is_available())
        self.sym_spell = setup_symspell(dictionary_path)
        self.setup_model(model_path)
        self.setup_monitor()
        self.setup_logging()
        
        # Setup UI last
        self.setup_ui()

    def start_detection(self):
        if not self.running:
            self.running = True
            self.capture_thread = threading.Thread(target=self.detect_objects, daemon=True)
            self.capture_thread.start()
            self.update_gui()
            print("Detection started")

    def stop_detection(self):
        self.running = False
        if self.capture_thread and self.capture_thread.is_alive():
            self.capture_thread.join(timeout=1)
        print("Detection stopped and resources cleared.")

    def setup_ui(self):
        self.image_label = ttk.Label(self.root)
        self.image_label.pack(padx=10, pady=10)

        controls_frame = ttk.Frame(self.root)
        controls_frame.pack(pady=10)

        ttk.Button(controls_frame, text="Start", command=self.start_detection).pack(side=tk.LEFT, padx=5)
        ttk.Button(controls_frame, text="Stop", command=self.stop_detection).pack(side=tk.LEFT, padx=5)

    def setup_model(self, model_path):
        with SuppressStdout():
            self.model = YOLO(model_path)
        if torch.cuda.is_available():
            self.model.to('cuda')
            print("YOLO model running on GPU.")
        else:
            print("YOLO model running on CPU.")

    def setup_monitor(self):
        with mss.mss() as screen:
            for monitor in screen.monitors:
                if monitor["width"] == 1920 and monitor["height"] == 1080 and monitor["left"] == 0 and monitor["top"] == 0:
                    self.monitor = monitor
                    print(f"Selected Monitor: {self.monitor}")
                    return
        raise ValueError("Monitor with resolution 1920x1080 and position (0, 0) not found.")

    def setup_logging(self):
        self.log_file = "detection_log.csv"
        with open(self.log_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Timestamp", "Class", "Text", "Action", "Confidence"])

    def log_detection(self, cls_name, text, action, confidence):
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open(self.log_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow([timestamp, cls_name, text, action, f"{confidence:.2f}"])

    # def try_brute_force_quick_text(self, track_id):
    #     if track_id not in self.brute_force_states:
    #         self.brute_force_states[track_id] = 0
        
    #     current_pos = self.brute_force_states[track_id]
        
    #     if current_pos < len(self.ALPHABET):
    #         char_to_try = self.ALPHABET[current_pos]
    #         self.brute_force_states[track_id] = (current_pos + 1) % len(self.ALPHABET)
    #         return char_to_try
        
    #     return None
    
    def try_brute_force_quick_text(self, track_id):
        if track_id not in self.brute_force_states:
            self.brute_force_states[track_id] = 0
        
        current_pos = self.brute_force_states[track_id]
        char_to_try = self.ALPHABET[current_pos]
        
        # เปลี่ยนการ reset ตัวนับเมื่อถึง Z ให้กลับไปเริ่มที่ A ใหม่
        self.brute_force_states[track_id] = (current_pos + 1) % len(self.ALPHABET)
        
        return char_to_try

    def process_tracked_object(self, cls_name, frame, box, track_id, confidence):
        x1, y1, x2, y2 = box
        
        if track_id not in self.tracked_ids:
            self.tracked_ids[track_id] = {
                'frames_seen': 1,
                'last_success': False,
                'attempts': 0,
                'tried_original': False
            }
            self.id_attempts[track_id] = 0
        else:
            self.tracked_ids[track_id]['frames_seen'] += 1
        
        is_stuck = (self.tracked_ids[track_id]['frames_seen'] > self.STUCK_THRESHOLD and 
                   not self.tracked_ids[track_id]['last_success'])
        
        if cls_name == 'quick_text':
            if is_stuck:
                char_to_try = self.try_brute_force_quick_text(track_id)
                if char_to_try:
                    print(f"Quick text ID {track_id}: Trying brute force with {char_to_try}")
                    return char_to_try
                return None
            else:
                cropped_frame = center_crop(frame, x1, y1, x2, y2, scale=0.5)
                gray = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2GRAY)
                ocr_results = self.easyocr_reader.readtext(
                    gray
                )
                if ocr_results:
                    text = " ".join([result[1] for result in ocr_results]).strip()
                    if text:
                        self.tracked_ids[track_id]['last_success'] = True
                        if track_id in self.brute_force_states:
                            del self.brute_force_states[track_id]
                        return text
                
                self.id_attempts[track_id] += 1
                return None
                
        elif cls_name == 'textbox':
            if is_stuck:
                track_data = self.tracked_ids[track_id]
                
                if not track_data['tried_original'] and track_id in self.id_original_texts:
                    track_data['tried_original'] = True
                    original_text = self.id_original_texts[track_id]
                    return original_text
                
                char_to_try = self.try_brute_force_quick_text(track_id)
                if char_to_try:
                    print(f"Textbox ID {track_id}: Trying brute force with {char_to_try}")
                    return char_to_try
                return None
            
            cropped_frame = frame[y1:y2, x1:x2]
            gray = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2GRAY)
            ocr_results = self.easyocr_reader.readtext(gray, detail=1)
            
            if ocr_results:
                merged_text = " ".join([result[1] for result in ocr_results])
                cleaned_text = clean_and_correct_text(merged_text)
                self.id_original_texts[track_id] = cleaned_text
                
                corrected_text = correct_with_symspell(self.sym_spell, cleaned_text)
                
                if corrected_text:
                    self.tracked_ids[track_id]['last_success'] = True
                    if track_id in self.brute_force_states:
                        del self.brute_force_states[track_id]
                    self.tracked_ids[track_id]['tried_original'] = False
                    return corrected_text
            
            self.id_attempts[track_id] += 1
            return None

    def detect_objects(self):
        with mss.mss() as screen:
            while self.running:
                try:
                    screen_shot = screen.grab(self.monitor)
                    frame = np.array(screen_shot)
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)

                    results = self.model(frame, conf=0.1)
                    detections = []
                    quick_texts = []
                    textbox_texts = []
                    other_classes = []

                    # Prepare detections for tracker
                    for r in results:
                        for box in r.boxes:
                            x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                            cls_id = int(box.cls[0])
                            cls_name = self.model.names[cls_id]
                            confidence = float(box.conf[0])

                            threshold = self.confidence_thresholds.get(cls_name, self.confidence_thresholds["other"])
                            if confidence < threshold:
                                continue

                            if cls_name in ['textbox', 'quick_text']:
                                detections.append([x1, y1, x2, y2, confidence])
                            else:
                                other_classes.append(cls_name)

                    # Update tracker
                    if detections:
                        tracked_objects = self.tracker.update(np.array(detections))
                        
                        for x1, y1, x2, y2, track_id in tracked_objects.astype(int):
                            # Get class name for this detection
                            for r in results:
                                for box in r.boxes:
                                    box_x1, box_y1, box_x2, box_y2 = map(int, box.xyxy[0].cpu().numpy())
                                    if (abs(x1 - box_x1) < 5 and abs(y1 - box_y1) < 5):
                                        cls_id = int(box.cls[0])
                                        cls_name = self.model.names[cls_id]
                                        confidence = float(box.conf[0])
                                        
                                        color = self.color_map[cls_name]
                                        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                                        label = f"{cls_name} ID:{track_id} {confidence:.2f}"
                                        cv2.putText(frame, label, (x1, y1 - 10), 
                                                  cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                                        result_text = self.process_tracked_object(
                                            cls_name, frame, (x1, y1, x2, y2), 
                                            track_id, confidence
                                        )
                                        
                                        if result_text:
                                            if cls_name == 'quick_text':
                                                quick_texts.append(result_text)
                                            else:
                                                textbox_texts.append(result_text)
                                                
                                            cv2.putText(frame, result_text, (x1, y2 + 20), 
                                                      cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

                    # Process texts and actions
                    for text in quick_texts:
                        pyautogui.typewrite(text, interval=0.0001)

                    for text in textbox_texts:
                        pyautogui.typewrite(text, interval=0.0001)
                        print(f"Typing textbox text: {text}")  # Added debug print

                    if other_classes:
                        pyautogui.press('tab')

                    # Clean up old tracking data
                    current_ids = set(obj[4] for obj in tracked_objects) if len(detections) > 0 else set()
                    self._cleanup_tracking_data(current_ids)

                    resized_frame = cv2.resize(frame, None, fx=0.7, fy=0.7, interpolation=cv2.INTER_AREA)
                    frame_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
                    image = Image.fromarray(frame_rgb)

                    if not self.frame_queue.full():
                        self.frame_queue.put(image)

                except Exception as e:
                    print(f"Detection error: {e}")
                    self.running = False

        cv2.destroyAllWindows()

    def _cleanup_tracking_data(self, current_ids):
        """Clean up tracking data for IDs that are no longer visible"""
        all_tracking_dicts = [
            self.tracked_ids, 
            self.id_attempts, 
            self.id_original_texts,
            self.brute_force_states
        ]
        for tracking_dict in all_tracking_dicts:
            obsolete_ids = set(tracking_dict.keys()) - current_ids
            for old_id in obsolete_ids:
                tracking_dict.pop(old_id, None)

    def update_gui(self):
        if not self.running:
            return
        try:
            if not self.frame_queue.empty():
                image = self.frame_queue.get()
                self.photo = ImageTk.PhotoImage(image=image)
                self.image_label.configure(image=self.photo)
        except Exception as e:
            print(f"GUI update error: {e}")
        self.root.after(30, self.update_gui)

    def run(self):
        self.root.protocol("WM_DELETE_WINDOW", self.on_close)
        self.root.mainloop()

    def on_close(self):
        self.stop_detection()
        self.root.destroy()

def main():
    model_path = r'c:\is_project_totd\textbox_label-4\models\type_frame_detector\weights\best.pt'
    dictionary_path = r"C:\is_project_totd\frequency_dictionary_en_82_765.txt"
    app = ObjectDetectionApp(model_path, dictionary_path)
    try:
        app.run()
    except Exception as e:
        print(f"Error: {e}")
    finally:
        print("Application closed successfully.")

if __name__ == "__main__":
    main()

YOLO model running on GPU.
Selected Monitor: {'left': 0, 'top': 0, 'width': 1920, 'height': 1080}
Detection started

0: 384x640 (no detections), 69.8ms
Speed: 7.0ms preprocess, 69.8ms inference, 56.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 28.9ms
Speed: 2.0ms preprocess, 28.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 comic, 32.5ms
Speed: 1.0ms preprocess, 32.5ms inference, 131.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 24.3ms
Speed: 1.5ms preprocess, 24.3ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 24.3ms
Speed: 2.0ms preprocess, 24.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 25.3ms
Speed: 2.1ms preprocess, 25.3ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 24.9ms
Speed: 1.5ms preprocess, 24.9ms inference, 0.0ms postpr