In [None]:
import os
import json
import torch
from PIL import Image
from transformers import LlavaForConditionalGeneration, LlavaProcessor
from transformers import AutoProcessor, AutoModelForVision2Seq
import accelerate

In [None]:
# Root directory holding one sub-folder of extracted frames per video segment
frames_dir = "frames/stabilizer pressure control/"
# Destination file for the combined per-frame records
output_json_path = "combined_frames_blip2.json"
# Object-detection / tracking results (JSON) that are merged into the prompts
object_json_path = r"C:\Users\maxim\OneDrive\01-Opleidingen\03-MAAI\Afstuderen\Objectdetectie\detections_with_tracking_YOLOv12mv8_stabilizer_pressure_control.json"

# Checkpoint used by initialize_llava_model(). The Transformers LLaVA classes need the
# converted "llava-hf" weights; the original "liuhaotian/llava-v1.5-7b" repository is in the
# layout of the LLaVA research code base and cannot be loaded by LlavaForConditionalGeneration.
model_id = "llava-hf/llava-1.5-7b-hf"

#### InstructBLIP

In [None]:
import os
import json
from PIL import Image
import torch
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

from transformers import BitsAndBytesConfig


# Locations used by the InstructBLIP run: detections in, frames in, combined JSON out
object_json_path = r"C:\Users\maxim\OneDrive\01-Opleidingen\03-MAAI\Afstuderen\Objectdetectie\detections_with_tracking_YOLOv12mv8_stabilizer_pressure_control.json"
frames_dir = "frames/stabilizer pressure control/"
output_json_path = "combined_frames_instructblip.json"

# Model initialiseren
def initialize_instruct_blip_model():
    """Load the InstructBLIP (Flan-T5-XL) processor and the 8-bit quantized model.

    Returns:
        tuple: ``(model, processor)``.
    """
    checkpoint = "Salesforce/instructblip-flan-t5-xl"

    print("Initialiseren van InstructBLIP model in 8-bit...")
    processor = InstructBlipProcessor.from_pretrained(checkpoint)

    # 8-bit weights; the offload flag permits falling back to the CPU
    load_options = {
        "device_map": "auto",
        "quantization_config": BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_enable_fp32_cpu_offload=True,
        ),
    }
    model = InstructBlipForConditionalGeneration.from_pretrained(checkpoint, **load_options)

    print("Model geladen in 8-bit.")
    return model, processor


# Objecten laden
def load_object_detection(object_json_path):
    """Read the object-detection JSON file.

    Args:
        object_json_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content, or an empty dict (after printing a warning)
        when the path does not exist.
    """
    if not os.path.exists(object_json_path):
        print(f"Warning: {object_json_path} niet gevonden.")
        return {}

    with open(object_json_path, mode='r', encoding='utf-8') as handle:
        return json.loads(handle.read())

# Caption genereren
def generate_caption(image_path, object_list, model, processor, device):
    """Generate an InstructBLIP caption for a single frame.

    Args:
        image_path: Path of the image file to describe.
        object_list: Class names of the objects detected around this frame; may be
            empty or None. A class that occurs several times is named once in the prompt.
        model: Model exposing ``generate(**inputs, ...)``.
        processor: Processor that builds the model inputs; its ``tokenizer`` decodes
            the generated ids.
        device: Passed to ``inputs.to(device, torch.float16)``.

    Returns:
        The stripped caption, or ``""`` when anything fails (the traceback is printed,
        so one bad frame does not abort the whole run).
    """
    import traceback

    try:
        # Open inside a context manager so the file is closed as soon as the RGB copy exists
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Several tracked instances can share one class name; mention each class
        # once, keeping the order in which it was first seen
        unique_objects = list(dict.fromkeys(object_list or []))

        if unique_objects:
            prompt = (
                f"In deze afbeelding zijn de volgende objecten aanwezig: {', '.join(unique_objects)}. "
                "Beschrijf wat de operator doet met deze objecten."
            )
        else:
            prompt = "Beschrijf in detail wat er in de afbeelding gebeurt."

        inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16)

        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            num_beams=5,
            early_stopping=True
        )

        caption = processor.tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        print(caption)
        return caption

    except Exception:
        print(f"Fout bij genereren caption voor {image_path}:")
        traceback.print_exc()
        return ""

# Alle frames verwerken
def process_all_frames(frames_dir, model, processor, objects, device, fps=30.0):
    """Caption every frame of every segment folder and attach the objects seen at that time.

    Args:
        frames_dir: Directory that contains one sub-directory per video segment.
        model: Forwarded to ``generate_caption``.
        processor: Forwarded to ``generate_caption``.
        objects: Mapping of object id to a dict with a ``"class_name"`` and an optional
            ``"trajectory"`` list whose points carry a ``"time"`` value.
        device: Forwarded to ``generate_caption``.
        fps: Frame rate used to turn a frame index into a timestamp (default 30).

    Returns:
        A list with one dict per frame: segment, frame number, file name, time in
        seconds (rounded to 2 decimals), visible object class names and caption.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    import re  # local import: only the frame ordering below needs it

    if fps <= 0:
        raise ValueError("fps must be positive")

    def natural_key(name):
        # Odd positions of the split are the digit runs; comparing them as integers makes
        # "frame_10" sort after "frame_2". The raw name breaks ties such as "1" vs "01".
        parts = re.split(r"(\d+)", name)
        return [int(part) if pos % 2 else part for pos, part in enumerate(parts)], name

    combined_data = []
    supported_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')

    video_segments = sorted(
        d for d in os.listdir(frames_dir) if os.path.isdir(os.path.join(frames_dir, d))
    )
    print(f"Gevonden {len(video_segments)} video segmenten.")

    for seg_idx, segment in enumerate(video_segments, start=1):
        segment_path = os.path.join(frames_dir, segment)
        # The frame number is derived from the position in this list, so the order
        # must follow the numbers in the file names, not their spelling
        frame_files = sorted(
            (f for f in os.listdir(segment_path) if f.lower().endswith(supported_extensions)),
            key=natural_key,
        )

        if not frame_files:
            print(f"Warning: Geen frames gevonden in segment '{segment}'.")
            continue

        print(f"Segment {seg_idx}/{len(video_segments)}: '{segment}' met {len(frame_files)} frames.")

        for frame_number, frame_file in enumerate(frame_files):
            frame_path = os.path.join(segment_path, frame_file)
            # Divide by the frame rate; multiplying by a truncated 0.033 drifts by ~1 %
            frame_time_seconds = frame_number / fps

            # An object counts as visible when any trajectory point lies within 0.05 s
            visible_objects = [
                obj_data["class_name"]
                for obj_data in objects.values()
                if any(
                    abs(traj["time"] - frame_time_seconds) < 0.05
                    for traj in obj_data.get("trajectory", [])
                )
            ]

            caption = generate_caption(frame_path, visible_objects, model, processor, device)

            combined_data.append({
                "video_segment": segment,
                "frame_number": frame_number,
                "frame_filename": frame_file,
                "frame_time_seconds": round(frame_time_seconds, 2),
                "visible_objects": visible_objects,
                "caption": caption
            })

    return combined_data

# JSON opslaan
def save_combined_data(combined_data, output_json_path):
    """Write the combined records to ``output_json_path`` as indented UTF-8 JSON.

    A failure is reported with a printed message instead of being raised.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    try:
        with open(output_json_path, mode='w', encoding='utf-8') as out_file:
            json.dump(combined_data, out_file, **dump_options)
        print(f"Combined JSON opgeslagen naar '{output_json_path}'.")
    except Exception as err:
        print(f"Error bij opslaan JSON: {err}")

# Main functie
def main():
    """Run the InstructBLIP pipeline: load model and detections, caption all frames, save."""
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Gebruik device: {device}")

    model, processor = initialize_instruct_blip_model()
    objects = load_object_detection(object_json_path)

    combined_data = process_all_frames(
        frames_dir=frames_dir,
        model=model,
        processor=processor,
        objects=objects,
        device=device,
    )

    # Nothing captioned: report it and skip writing the output file
    if not combined_data:
        print("Geen data om op te slaan.")
        return

    save_combined_data(combined_data, output_json_path)

# Start the InstructBLIP pipeline when this code runs as the top-level module
# (__name__ equals "__main__")
if __name__ == "__main__":
    main()


#### LLaVA

In [None]:
# ====== Model laden ======
def initialize_llava_model():
    """Load the LLaVA processor and the 8-bit quantized model for the global ``model_id``.

    Returns:
        tuple: ``(model, processor)``.
    """
    # Imported here so this cell does not rely on another cell having imported it
    from transformers import BitsAndBytesConfig

    print("Initialiseren van LLaVa model...")
    processor = LlavaProcessor.from_pretrained(model_id)
    model = LlavaForConditionalGeneration.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
        # The bare ``load_in_8bit=True`` keyword is deprecated; quantization settings
        # are passed through ``quantization_config`` instead
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    )
    print("Model geladen.")
    return model, processor

# ====== Object detectie laden ======
def load_object_detection(object_json_path):
    """Parse the object-detection JSON file at ``object_json_path`` and return its content.

    Errors raised while opening or parsing the file propagate to the caller.
    """
    with open(object_json_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)

# ====== Caption genereren ======
def generate_caption_llava(image_path, object_list, model, processor):
    """Generate a LLaVA caption for a single frame.

    Args:
        image_path: Path of the image file to describe.
        object_list: Class names of the objects detected around this frame; may be
            empty or None. A class that occurs several times is named once in the prompt.
        model: Model exposing ``generate(**inputs, ...)``.
        processor: Processor that builds the model inputs and offers ``batch_decode``.

    Returns:
        The stripped, newly generated text (without the echoed prompt), or ``""`` when
        anything fails (the traceback is printed).
    """
    import traceback

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Open inside a context manager so the file is closed as soon as the RGB copy exists
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Mention each detected class once, in first-seen order
        unique_objects = list(dict.fromkeys(object_list or []))
        if unique_objects:
            objects = ", ".join(unique_objects)
            instruction = f"Beschrijf de afbeelding. De volgende objecten zijn aanwezig: {objects}."
        else:
            instruction = "Beschrijf de afbeelding."

        # LLaVA-1.5 is prompted with its chat template; the <image> placeholder marks
        # where the image features are inserted and must be present in the text
        prompt = f"USER: <image>\n{instruction} ASSISTANT:"

        # Same float16 cast as the model weights (loaded with torch_dtype=torch.float16)
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch.float16)
        outputs = model.generate(**inputs, max_new_tokens=100)

        # generate() returns the prompt tokens followed by the continuation;
        # decode only the continuation so the caption does not repeat the prompt
        prompt_length = inputs["input_ids"].shape[1]
        caption = processor.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)[0]
        return caption.strip()
    except Exception:
        print(f"Fout bij genereren caption voor {image_path}:")
        traceback.print_exc()
        return ""

# ====== Frames verwerken ======
def process_all_frames(frames_dir, model, processor, objects, fps=30.0):
    """Caption every frame of every segment folder with LLaVA and attach the visible objects.

    Args:
        frames_dir: Directory that contains one sub-directory per video segment.
        model: Forwarded to ``generate_caption_llava``.
        processor: Forwarded to ``generate_caption_llava``.
        objects: Mapping of object id to a dict with a ``"class_name"`` and an optional
            ``"trajectory"`` list whose points carry a ``"time"`` value.
        fps: Frame rate used to turn a frame index into a timestamp (default 30).

    Returns:
        A list with one dict per frame: segment, frame number, file name, time in
        seconds (rounded to 2 decimals), visible object class names and caption.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    import re  # local import: only the frame ordering below needs it

    if fps <= 0:
        raise ValueError("fps must be positive")

    def natural_key(name):
        # Odd positions of the split are the digit runs; comparing them as integers makes
        # "frame_10" sort after "frame_2". The raw name breaks ties such as "1" vs "01".
        parts = re.split(r"(\d+)", name)
        return [int(part) if pos % 2 else part for pos, part in enumerate(parts)], name

    combined_data = []
    supported_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')

    video_segments = sorted(
        d for d in os.listdir(frames_dir) if os.path.isdir(os.path.join(frames_dir, d))
    )
    print(f"Gevonden {len(video_segments)} video segmenten.")

    for seg_idx, segment in enumerate(video_segments, start=1):
        segment_path = os.path.join(frames_dir, segment)
        # The frame number is derived from the position in this list, so the order
        # must follow the numbers in the file names, not their spelling
        frame_files = sorted(
            (f for f in os.listdir(segment_path) if f.lower().endswith(supported_extensions)),
            key=natural_key,
        )

        if not frame_files:
            print(f"Geen frames gevonden in segment '{segment}'.")
            continue

        print(f"Segment {seg_idx}/{len(video_segments)}: '{segment}' met {len(frame_files)} frames.")

        for frame_number, frame_file in enumerate(frame_files):
            frame_path = os.path.join(segment_path, frame_file)

            # Exact time for matching; it is rounded only when stored, so the
            # 0.05 s tolerance is not distorted by rounding or by a truncated 0.033
            exact_time = frame_number / fps

            visible_objects = [
                obj_info["class_name"]
                for obj_info in objects.values()
                if any(
                    abs(point["time"] - exact_time) <= 0.05
                    for point in obj_info.get("trajectory", [])
                )
            ]

            caption = generate_caption_llava(frame_path, visible_objects, model, processor)

            combined_data.append({
                "video_segment": segment,
                "frame_number": frame_number,
                "frame_filename": frame_file,
                "frame_time_seconds": round(exact_time, 2),
                "visible_objects": visible_objects,
                "caption": caption
            })

    return combined_data

# ====== JSON opslaan ======
def save_combined_data(combined_data, output_json_path):
    """Store ``combined_data`` at ``output_json_path`` as indented UTF-8 JSON.

    Any exception is caught and reported with a printed message.
    """
    try:
        with open(output_json_path, mode='w', encoding='utf-8') as target:
            json.dump(
                combined_data,
                target,
                indent=4,
                ensure_ascii=False,
            )
        print(f"Combined JSON opgeslagen naar '{output_json_path}'.")
    except Exception as problem:
        print(f"Fout bij opslaan JSON: {problem}")

# ====== Main functie ======
def main():
    """Run the LLaVA pipeline: load model and detections, caption all frames, save."""
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    print(f"Gebruik device: {device}")

    model, processor = initialize_llava_model()
    objects = load_object_detection(object_json_path)

    combined_data = process_all_frames(
        frames_dir=frames_dir,
        model=model,
        processor=processor,
        objects=objects,
    )

    # Only write the output file when at least one frame was processed
    if combined_data:
        save_combined_data(combined_data, output_json_path)
    else:
        print("Geen data om op te slaan.")

# Start the LLaVA pipeline when this code runs as the top-level module
# (__name__ equals "__main__")
if __name__ == "__main__":
    main()

# Poging 2

In [None]:
import os
import json
import re
from PIL import Image
from tqdm import tqdm
import torch
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

# ==== CONFIG ====
# Inputs: extracted frames (one sub-folder per segment) and the object-detection JSON
object_json_path = r"C:\Users\maxim\OneDrive\01-Opleidingen\03-MAAI\Afstuderen\Objectdetectie\detections_with_tracking_YOLOv12mv8_stabilizer_pressure_control.json"
frames_dir = "frames/stabilizer pressure control/"

# Output: captions together with the objects and the prompt that were used
output_json_path = "instructblip_captions_with_objects.json"

# Captioning settings
base_prompt = "Briefly describe what is happening in this technical industrial scene."
frame_interval_sec = 3  # interval between frames, in seconds
device = "cpu"  # use "cuda" if a suitable GPU is available
# ================

# Read the object-detection results that supply the per-frame object context
with open(object_json_path, mode="r", encoding="utf-8") as f:
    object_data = json.loads(f.read())

# Load processor and model from the same checkpoint, in float32 on the configured device
print("Loading InstructBLIP...")
processor = InstructBlipProcessor.from_pretrained(
    "Salesforce/instructblip-flan-t5-xl"
)
model = InstructBlipForConditionalGeneration.from_pretrained(
    "Salesforce/instructblip-flan-t5-xl",
    device_map={"": device},
    torch_dtype=torch.float32,
)
model.to(device)

# Results, keyed by os.path.join(<segment folder>, <frame file name>)
captions = {}

# Walk the segment folders in sorted order
for segment_dir in sorted(os.listdir(frames_dir)):
    segment_path = os.path.join(frames_dir, segment_dir)
    if not os.path.isdir(segment_path):
        continue

    # Folder names must contain "segment_<n>_<a>_<b>"; <a> is used as the segment's
    # start time. <b> is captured but not used (presumably the end time — TODO confirm).
    match_segment = re.search(r"segment_\d+_(\d+)_(\d+)", segment_dir)
    if not match_segment:
        print(f"Ongeldig segment: {segment_dir}")
        continue
    start_time = int(match_segment.group(1))

    for filename in sorted(os.listdir(segment_path)):
        # Only .jpg and .png files are considered
        if not filename.lower().endswith((".jpg", ".png")):
            continue

        frame_path = os.path.join(segment_path, filename)
        # The frame index is read from the file name ("frame_<idx>"); other files are skipped
        match_frame = re.search(r"frame_(\d+)", filename)
        if not match_frame:
            continue
        frame_idx = int(match_frame.group(1))
        # Segment start plus index times the sampling interval; assumes start_time is in
        # seconds like frame_interval_sec — TODO confirm against the frame extraction step
        timestamp = start_time + frame_idx * frame_interval_sec

        # Determine the visible objects: those whose [first_seen, last_seen] span
        # (inclusive) contains the timestamp
        visible_objects = []
        for obj in object_data.values():
            if obj["first_seen"] <= timestamp <= obj["last_seen"]:
                visible_objects.append(obj["class_name"])

        # The prompt lists each class once, alphabetically; visible_objects itself
        # keeps one entry per matching object
        object_context = ", ".join(sorted(set(visible_objects)))
        if object_context:
            prompt = f"{base_prompt} The following objects are visible: {object_context}."
        else:
            prompt = base_prompt

        # Generate the caption; on any failure the error is printed and an empty
        # caption is stored so the remaining frames are still processed
        try:
            image = Image.open(frame_path).convert("RGB")
            inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=100,
                do_sample=False,
                num_beams=4,
                early_stopping=True
            )
            caption = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            print(caption)
        except Exception as e:
            print(f"Fout bij {filename}: {e}")
            caption = ""

        # Store the caption with the data that produced it
        key = os.path.join(segment_dir, filename)
        captions[key] = {
            "caption": caption,
            "timestamp": timestamp,
            "visible_objects": visible_objects,
            "prompt_used": prompt
        }

# Save: write all collected captions as indented UTF-8 JSON (non-ASCII characters are kept as-is)
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(captions, f, indent=4, ensure_ascii=False)

# Report how many captions were written and where
print(f"\n{len(captions)} captions opgeslagen in '{output_json_path}'")
