In [51]:
import numpy as np
import onnxruntime as ort
from PIL import Image
import cv2
from typing import List, Tuple
import os
from llama_cpp import Llama
import gc

In [52]:
# --- Setup YOLO for Object Detection ---
def setup_yolo(model_path: str) -> ort.InferenceSession:
    try:
        # Initialize ONNX runtime session
        session = ort.InferenceSession(model_path, providers=['CPUExecutionProvider'])  # Use 'NNAPIExecutionProvider' for Android
        return session
    except Exception as e:
        print(f"Error setting up YOLO: {e}")
        return None

In [53]:
# --- Setup Gemma with llama_cpp ---
def setup_gemma():
    os.environ['LLAMA_NUMA'] = '1'
    os.environ['LLAMA_MMX_NTHREADS'] = '8'

    model = Llama(
        model_path=r"C:\Users\voutl\OneDrive\Documents\LifeEase\gemma-2-2b-it-Q5_K_M.gguf",  # Ensure this path is correct
        n_ctx=512,
        n_threads=8,
        n_gpu_layers=-1 
    )
    return model

In [54]:
# --- Setup BLIP for Image Captioning ---
def setup_blip():
    try:
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        # Move to GPU if available, else CPU (mobile may need CPU-only)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        model.eval()
        return processor, model, device
    except Exception as e:
        print(f"Error setting up BLIP: {e}")
        return None, None, None

In [55]:
# --- Preprocess Image for YOLO ---
def preprocess_yolo(image_path: str, input_size: Tuple[int, int] = (640, 640)) -> np.ndarray:
    try:
        # Load and preprocess image
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        h, w = image.shape[:2]
        
        # Resize while maintaining aspect ratio
        ratio = min(input_size[0] / w, input_size[1] / h)
        new_w, new_h = int(w * ratio), int(h * ratio)
        image = cv2.resize(image, (new_w, new_h))
        
        # Pad to input_size
        padded = np.zeros((input_size[1], input_size[0], 3), dtype=np.uint8)
        padded[:new_h, :new_w] = image
        image = padded
        
        # Normalize and convert to batch format (1, 3, H, W)
        image = image.astype(np.float32) / 255.0
        image = np.transpose(image, (2, 0, 1))  # HWC to CHW
        image = np.expand_dims(image, axis=0)  # Add batch dimension
        return image, (w, h), ratio
    except Exception as e:
        print(f"Error preprocessing image for YOLO: {e}")
        return None, None, None

In [46]:
# --- Postprocess YOLO Outputs ---
def postprocess_yolo(outputs: np.ndarray, original_size: Tuple[int, int], ratio: float, conf_thres: float = 0.5) -> List[dict]:
    try:
        # Assuming YOLOv8 output: [batch, num_boxes, (x, y, w, h, conf, class_probs)]
        detections = outputs[0]  # Shape: [num_boxes, 5 + num_classes]
        boxes, scores, classes = [], [], []
        
        for det in detections:
            x, y, w, h, conf = det[:5]
            class_id = np.argmax(det[5:])
            if conf > conf_thres:
                # Convert to original image coordinates
                x = (x - w / 2) / ratio
                y = (y - h / 2) / ratio
                w /= ratio
                h /= ratio
                boxes.append([x, y, w, h])
                scores.append(conf)
                classes.append(class_id)
        
        # Convert to descriptive format
        results = []
        class_names = [...]  # Load your class names (e.g., COCO classes)
        for box, score, cls in zip(boxes, scores, classes):
            x, y, w, h = box
            results.append({
                "class": class_names[cls],
                "confidence": float(score),
                "box": {"x": float(x), "y": float(y), "w": float(w), "h": float(h)}
            })
        return results
    except Exception as e:
        print(f"Error postprocessing YOLO outputs: {e}")
        return []

In [47]:
# --- Modified Generate Response with YOLO ---
def generate_response(gemma_model, blip_processor=None, blip_model=None, yolo_session=None, device=None, user_input=None, image_path=None):
    caption, yolo_results = None, []
    
    if image_path and blip_processor and blip_model:  # Image mode
        # BLIP Caption
        try:
            image = Image.open(image_path).convert("RGB")
            inputs = blip_processor(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                caption_ids = blip_model.generate(**inputs, max_length=50)
            caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
            print(f"BLIP Caption: {caption}")
        except Exception as e:
            print(f"Error processing image with BLIP: {e}")
            caption = "Image processing failed."
        
        # YOLO Object Detection
        if yolo_session:
            try:
                image, original_size, ratio = preprocess_yolo(image_path)
                if image is not None:
                    inputs = {yolo_session.get_inputs()[0].name: image}
                    outputs = yolo_session.run(None, inputs)
                    yolo_results = postprocess_yolo(outputs, original_size, ratio)
                    print(f"YOLO Results: {yolo_results}")
            except Exception as e:
                print(f"Error processing image with YOLO: {e}")
        
        # Combine BLIP and YOLO for Gemma
        yolo_desc = ""
        if yolo_results:
            yolo_desc = "Objects detected: " + "; ".join(
                [f"{res['class']} at ({res['box']['x']:.1f}, {res['box']['y']:.1f})" for res in yolo_results]
            )
        prompt = f"<start_of_turn>user\nProvide a detailed description of this scene: {caption}. {yolo_desc}<end_of_turn>\n<start_of_turn>model\n"
    
    elif user_input:  # Text mode
        prompt = f"<start_of_turn>user\n{user_input}<end_of_turn>\n<start_of_turn>model\n"
    else:
        return "Please provide text or an image."

    # Generate response with Gemma
    response = ""
    try:
        for chunk in gemma_model(
            prompt,
            max_tokens=500,
            temperature=0.7,
            stream=True
        ):
            text_chunk = chunk["choices"][0]["text"]
            response += text_chunk
            print(text_chunk, end="", flush=True)
    except Exception as e:
        print(f"Error during generation: {e}")
    
    return response

In [48]:
# --- Cleanup Resources ---
def cleanup(gemma_model, blip_model=None):
    # Cleanup Gemma
    gemma_model.reset()
    del gemma_model
    
    # Cleanup BLIP if loaded
    if blip_model:
        del blip_model
        torch.cuda.empty_cache()  # Clear GPU memory if used
    
    gc.collect()

In [49]:
# --- Modified Main Function ---
def main():
    try:
        # Initialize models
        gemma_model = setup_gemma()
        blip_processor, blip_model, device = setup_blip()
        yolo_session = setup_yolo("yolov8n.onnx")  # Replace with your YOLO ONNX model path

        # Example: Image input
        print("\n--- Image Example ---")
        image_path = r"C:\Users\voutl\OneDrive\Desktop\download (1).jpg"
        response = generate_response(
            gemma_model,
            blip_processor,
            blip_model,
            yolo_session,
            device,
            image_path=image_path
        )

    except Exception as e:
        print(f"Error in main: {e}")
    finally:
        # Cleanup
        cleanup(gemma_model, blip_model)



In [50]:
if __name__ == "__main__":
    main()

llama_model_loader: loaded meta data with 39 key-value pairs and 288 tensors from C:\Users\voutl\OneDrive\Documents\LifeEase\gemma-2-2b-it-Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Gemma 2 2b It
llama_model_loader: - kv   3:                           general.finetune str              = it
llama_model_loader: - kv   4:                           general.basename str              = gemma-2
llama_model_loader: - kv   5:                         general.size_label str              = 2B
llama_model_loader: - kv   6:                            general.license str              = gemma
llama_model_loader: - kv   7:    

Error setting up BLIP: name 'BlipProcessor' is not defined
Error setting up YOLO: [ONNXRuntimeError] : 3 : NO_SUCHFILE : Load model from yolov8n.onnx failed:Load model yolov8n.onnx failed. File doesn't exist

--- Image Example ---
