In [50]:
import numpy as np
import onnxruntime as ort
from PIL import Image
import cv2
from typing import List, Tuple
import os
from llama_cpp import Llama
import torch
import gc
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

In [51]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [52]:
# --- Setup YOLO for Object Detection ---
def setup_yolo(model_path: str) -> ort.InferenceSession:
    try:
        # Initialize ONNX runtime session
        session = ort.InferenceSession(model_path, providers=['CPUExecutionProvider'])  # Use 'NNAPIExecutionProvider' for Android
        return session
    except Exception as e:
        print(f"Error setting up YOLO: {e}")
        return None

In [53]:
# --- Setup Gemma with llama_cpp ---
def setup_gemma():
    os.environ['LLAMA_NUMA'] = '1'
    os.environ['LLAMA_MMX_NTHREADS'] = '8'

    model = Llama(
        model_path=r"C:\Users\voutl\OneDrive\Documents\LifeEase\gemma-2-2b-it-Q5_K_M.gguf",  # Ensure this path is correct
        n_ctx=512,
        n_threads=8,
        n_gpu_layers=-1 
    )
    return model

In [54]:
# --- Setup BLIP for Image Captioning ---
def setup_blip():
    try:
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        # Move to GPU if available, else CPU (mobile may need CPU-only)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        model.eval()
        return processor, model, device
    except Exception as e:
        print(f"Error setting up BLIP: {e}")
        return None, None, None

In [55]:
# --- Preprocess Image for YOLO ---
def preprocess_yolo(image_path: str, input_size: Tuple[int, int] = (640, 640)) -> np.ndarray:
    try:
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        h, w = image.shape[:2]
        
        ratio = min(input_size[0] / w, input_size[1] / h)
        new_w, new_h = int(w * ratio), int(h * ratio)
        image = cv2.resize(image, (new_w, new_h))
        
        padded = np.zeros((input_size[1], input_size[0], 3), dtype=np.uint8)
        padded[:new_h, :new_w] = image
        image = padded
        
        image = image.astype(np.float32) / 255.0
        image = np.transpose(image, (2, 0, 1))
        image = np.expand_dims(image, axis=0)
        return image, (w, h), ratio
    except Exception as e:
        print(f"Error preprocessing image for YOLO: {e}")
        return None, None, None

In [56]:
# --- Postprocess YOLO Outputs ---
def postprocess_yolo(outputs: np.ndarray, original_size: Tuple[int, int], ratio: float, conf_thres: float = 0.4, iou_thres: float = 0.5) -> List[dict]:
    try:
        detections = outputs[0]  # Shape: (1, 84, 8400)
        print(f"Raw detections shape: {detections.shape}")
        
        # Squeeze and transpose to (8400, 84)
        detections = np.transpose(np.squeeze(detections), (1, 0))  # (8400, 84)
        print(f"Reshaped detections shape: {detections.shape}")
        
        if detections.size == 0:
            print("No detections found.")
            return []

        boxes, scores, classes = [], [], []
        for det in detections:
            # Extract box coordinates, confidence, and class scores
            x_center, y_center, width, height, conf = det[:5]
            class_scores = det[5:]  # 80 class probabilities
            class_id = np.argmax(class_scores)
            score = conf * class_scores[class_id]  # Combined confidence
            
            if score > conf_thres:
                # Convert to corner coordinates
                x = (x_center - width / 2) / ratio
                y = (y_center - height / 2) / ratio
                w = width / ratio
                h = height / ratio
                boxes.append([x, y, x + w, y + h])  # [x_min, y_min, x_max, y_max]
                scores.append(float(score))
                classes.append(int(class_id))

        if not boxes:
            return []

        # Simple NMS (approximation using NumPy)
        indices = np.argsort(scores)[::-1]
        suppress = set()
        for i in range(len(indices)):
            if i in suppress:
                continue
            for j in range(i + 1, len(indices)):
                if j in suppress:
                    continue
                iou = compute_iou(boxes[indices[i]], boxes[indices[j]])
                if iou > iou_thres:
                    suppress.add(j)

        # Filter valid detections
        valid_indices = [i for i in indices if i not in suppress]
        results = []
        class_names = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
                       "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
                       "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
                       "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
                       "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                       "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
                       "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
                       "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
                       "hair drier", "toothbrush"]  # COCO 80 classes
        for i in valid_indices:
            x, y, x_max, y_max = boxes[i]
            results.append({
                "class": class_names[classes[i]],
                "confidence": scores[i],
                "box": {"x": float(x), "y": float(y), "w": float(x_max - x), "h": float(y_max - y)}
            })
        return results
    except Exception as e:
        print(f"Error postprocessing YOLO outputs: {e}")
        return []

In [57]:
# --- Helper Function for IoU ---
def compute_iou(box1: List[float], box2: List[float]) -> float:
    x1, y1, x1_max, y1_max = box1
    x2, y2, x2_max, y2_max = box2
    
    inter_x1 = max(x1, x2)
    inter_y1 = max(y1, y2)
    inter_x2 = min(x1_max, x2_max)
    inter_y2 = min(y1_max, y2_max)
    
    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
    box1_area = (x1_max - x1) * (y1_max - y1)
    box2_area = (x2_max - x2) * (y2_max - y2)
    union_area = box1_area + box2_area - inter_area
    
    return inter_area / union_area if union_area > 0 else 0

In [58]:
# --- Modified Generate Response with YOLO ---
def generate_response(gemma_model, blip_processor=None, blip_model=None, yolo_session=None, device=None, user_input=None, image_path=None):
    caption, yolo_results = None, []
    
    if image_path and blip_processor and blip_model:
        try:
            image = Image.open(image_path).convert("RGB")
            inputs = blip_processor(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                caption_ids = blip_model.generate(**inputs, max_length=50)
            caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
            print(f"BLIP Caption: {caption}")
        except Exception as e:
            print(f"Error processing image with BLIP: {e}")
            caption = "Image processing failed."
        
        if yolo_session:
            try:
                image, original_size, ratio = preprocess_yolo(image_path)
                if image is not None:
                    inputs = {yolo_session.get_inputs()[0].name: image}
                    outputs = yolo_session.run(None, inputs)
                    yolo_results = postprocess_yolo(outputs, original_size, ratio)
                    print(f"YOLO Results: {yolo_results}")
            except Exception as e:
                print(f"Error processing image with YOLO: {e}")
        
        yolo_desc = ""
        if yolo_results:
            yolo_desc = "Objects detected: " + "; ".join(
                [f"{res['class']} at ({res['box']['x']:.1f}, {res['box']['y']:.1f})" for res in yolo_results]
            )
        prompt = f"<start_of_turn>user\nDescribe the scene based only on the caption and detected objects with their approximate locations. Provide a short, factual description without poetic language or unnecessary details: {caption}. Objects detected: {yolo_desc}<end_of_turn>\n<start_of_turn>model\n"
    elif user_input:
        prompt = f"<start_of_turn>user\n{user_input}<end_of_turn>\n<start_of_turn>model\n"
    else:
        return "Please provide text or an image."

    response = ""
    try:
        for chunk in gemma_model(
            prompt,
            max_tokens=500,
            temperature=0.7,
            stream=True
        ):
            text_chunk = chunk["choices"][0]["text"]
            response += text_chunk
            print(text_chunk, end="", flush=True)
    except Exception as e:
        print(f"Error during generation: {e}")
    
    return response

In [59]:
# --- Cleanup Resources ---
def cleanup(gemma_model, blip_model=None):
    # Cleanup Gemma
    gemma_model.reset()
    del gemma_model
    
    # Cleanup BLIP if loaded
    if blip_model:
        del blip_model
        torch.cuda.empty_cache()  # Clear GPU memory if used
    
    gc.collect()

In [60]:
# --- Modified Main Function ---
def main():
    try:
        # Initialize models
        gemma_model = setup_gemma()
        blip_processor, blip_model, device = setup_blip()
        yolo_session = setup_yolo(r"C:\Users\voutl\OneDrive\Documents\LifeEase\yolov8n.onnx")  # Replace with your YOLO ONNX model path

        # Example: Image input
        print("\n--- Image Example ---")
        image_path = r"C:\Users\voutl\OneDrive\Desktop\download (1).jpg"
        response = generate_response(
            gemma_model,
            blip_processor,
            blip_model,
            yolo_session,
            device,
            image_path=image_path
        )

    except Exception as e:
        print(f"Error in main: {e}")
    finally:
        # Cleanup
        cleanup(gemma_model, blip_model)



In [61]:
if __name__ == "__main__":
    main()

llama_model_loader: loaded meta data with 39 key-value pairs and 288 tensors from C:\Users\voutl\OneDrive\Documents\LifeEase\gemma-2-2b-it-Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Gemma 2 2b It
llama_model_loader: - kv   3:                           general.finetune str              = it
llama_model_loader: - kv   4:                           general.basename str              = gemma-2
llama_model_loader: - kv   5:                         general.size_label str              = 2B
llama_model_loader: - kv   6:                            general.license str              = gemma
llama_model_loader: - kv   7:    


--- Image Example ---
BLIP Caption: a street scene with a man walking down the street
Raw detections shape: (1, 84, 8400)
Reshaped detections shape: (8400, 84)
YOLO Results: []
Please provide the caption! 😊  I need the text to describe the scene and identify the objects. 


llama_perf_context_print:        load time =    3968.55 ms
llama_perf_context_print: prompt eval time =    3967.72 ms /    53 tokens (   74.86 ms per token,    13.36 tokens per second)
llama_perf_context_print:        eval time =    1870.62 ms /    22 runs   (   85.03 ms per token,    11.76 tokens per second)
llama_perf_context_print:       total time =    5936.33 ms /    75 tokens
