In [None]:
import os
from pathlib import Path
from ultralytics import YOLO
from PIL import Image
import shutil
import pandas as pd
from source import image_id_converter as img_idc
from source.db_loader import MLDataLoader
from source.db_loader import delete_images
#from source import sort_img_files as sif
from source import llm_input as llm_i
from source import llm_output as llm_o
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np

In [None]:
import ollama
import json
import re
import pickle

In [None]:
import pandas as pd
import psycopg2
from dotenv import load_dotenv


In [None]:
import glob


In [None]:
from psycopg2.extras import execute_batch
from typing import Dict, List, Optional

In [None]:
# Standard library imports
import sys
from datetime import datetime
from pathlib import Path

# Database connection library
# psycopg2: PostgreSQL adapter for Python - handles all DB communication
import psycopg2
from psycopg2 import extras  # extras provides advanced features like Json adapter

# Data manipulation
import pandas as pd  # For handling CSV files and DataFrames

# Environment variable management
# python-dotenv: Loads database credentials from .env file (keeps passwords out of code)
from dotenv import load_dotenv

In [None]:
def get_existing_image_ids(source_filter, conn=None, cur=None):
    """
    Return the distinct source_image_ids stored in the ``images`` table for one source.

    Parameters:
    -----------
    source_filter : str
        Source to filter by (e.g., 'giub')
    conn : connection object, optional
        Existing database connection. If None, a new connection is opened from
        the DB_NAME / DB_USER / DB_PASSWORD / DB_HOST / DB_PORT environment
        variables (after ``load_dotenv()``) and closed again before returning.
    cur : cursor object, optional
        Existing cursor on ``conn``. If None, a cursor is created and closed
        again before returning. Ignored when ``conn`` is None (a fresh cursor
        on the new connection is used instead).

    Returns:
    --------
    list : List of source_image_ids, in ascending order (ORDER BY in the query)
    """
    owns_conn = conn is None
    if owns_conn:
        # Imported lazily so that callers who inject their own connection do
        # not need the database driver or a .env file.
        import os
        import psycopg2
        from dotenv import load_dotenv

        load_dotenv()
        conn = psycopg2.connect(
            dbname=os.getenv('DB_NAME'),
            user=os.getenv('DB_USER'),
            password=os.getenv('DB_PASSWORD'),
            host=os.getenv('DB_HOST'),
            port=os.getenv('DB_PORT')
        )

    created_cur = None
    try:
        if owns_conn or cur is None:
            created_cur = cur = conn.cursor()

        # The source is passed as a query parameter, never interpolated into the SQL.
        cur.execute("""
            SELECT DISTINCT source_image_id
            FROM images
            WHERE source = %s
            ORDER BY source_image_id;
        """, (source_filter,))

        existing_ids = [row[0] for row in cur.fetchall()]
    finally:
        # Only release what this function created; caller-owned objects stay open.
        if created_cur is not None:
            created_cur.close()
        if owns_conn:
            conn.close()

    print(f"Existing image IDs for source '{source_filter}': {existing_ids}")

    return existing_ids

### Empty database:

### Caution! Only run this cell if you want to delete all giub images and dependent data from the database!

### Set paths:

In [None]:
# All data folders live under one test-data root that sits next to the project directory.
project_path = Path.cwd()
# Alternative root used for other runs: '../test_data_folders/test_rec_multi_object_MiniCPM'
root_path = project_path.joinpath(
    '..', 'test_data_folders', 'test_filter_out_people_multi_approach'
).resolve()

# Input folders
data_path = root_path.joinpath('data')
tif_data_path = root_path.joinpath('data_1')
jpg_data_path = root_path.joinpath('data_jpg')
# Output folders, one per classification outcome
output_dir_not_photo = root_path.joinpath('not_photo')
output_dir_with_person = root_path.joinpath('with_person')
output_dir_without_person = root_path.joinpath('without_person')

#visual_genome_path = (project_path/ '..' /'data_folders' / 'visual_genome_data').resolve()
#visual_genome_proc_path = (project_path/ '..' /'data_folders' / 'visual_genome_proc_data').resolve()


In [None]:
# List the entries of data_path, most recently modified first.
files = os.listdir(data_path)
files.sort(
    key=lambda entry_name: os.path.getmtime(os.path.join(data_path, entry_name)),
    reverse=True,
)
files

### Set file parameters:

In [None]:
file_source = 'giub' # Source tag of the images: Institute of Geography, University of Bern (giub).
file_extension = '.tif' # Extension of the image files that are collected from tif_data_path.
filename_tag = 'Oberland' # Text in the file name that directly precedes the numeric image ID.
meta_data_file = 'labels_mod.csv' # File containing ids and labels.
times_file = 'times_people_detect_multi_approach_llm_20260213_144241.pkl' # File containing timestamp and duration of analysis-run.
results_file = 'results_llm_people_detect_multi_approach_20260213_144241.pkl' # Pickled results (predictions) of the analysis run.
responses_file = 'responses_llm_people_detect_multi_approach_20260213_144241.pkl' # Pickled raw LLM responses of the analysis run.
model_name_file = 'minicpm_v_model_info.txt' # Text file from which the model version is read.


### Get info about the minicpm-v version used by Ollama:

In [None]:
# Read the minicpm-v model version that was saved alongside the analysis results.
model_name_file_path = data_path / model_name_file
with open(model_name_file_path, 'r') as f:
    text = f.read()
# The version is the text after the first ': ' separator; strip surrounding
# whitespace so a trailing newline from the file does not end up in the database.
model_version = text.split(': ')[1].strip()
model_version

### Set analysis run parameters:

In [None]:
# Metadata describing this analysis run; stored with the run record in the database.
analysis_type = 'llm_classification'
model_name = 'minicpm-v'
# model_version is taken unchanged from the model-info cell above
# (the former `model_version = model_version` self-assignment was a no-op).
python_script = 'filter_out_people_multi_approach.ipynb'


### Set database_name: 

In [None]:
# Name of the database that the results are loaded into.
database_name = 'image_analysis_dev'

### Check environment variables:

In [None]:
import os
from pathlib import Path

# Check if .env exists in current directory
env_path = Path('.env')
print(f"Current directory: {os.getcwd()}")
print(f".env exists: {env_path.exists()}")

# Load the .env file before inspecting the variables; without this call a
# fresh kernel reports None for every DB_* value even when .env is present.
load_dotenv()

# Show the connection settings that are now available (the password is deliberately not printed)
print(f"\nDB_NAME: {os.getenv('DB_NAME')}")
print(f"DB_USER: {os.getenv('DB_USER')}")
print(f"DB_HOST: {os.getenv('DB_HOST')}")
print(f"DB_PORT: {os.getenv('DB_PORT')}")

### Get the file names and IDs of the image files in the folder (to be loaded):

In [None]:
# Collect every file in tif_data_path whose name ends with file_extension, sorted by path.
# (To also pick up '.tiff' files, glob both patterns and concatenate the results.)
search_pattern = f'*{file_extension}'
tif_files = sorted(tif_data_path.glob(search_pattern))
print(len(tif_files))
print(tif_files[:2])

In [None]:
# Derive the numeric image ID from each path: take the text after the last
# occurrence of filename_tag, cut it at the first '.', and convert it to int.
file_ids_in_folder = [
    int(str(image_path).split(filename_tag)[-1].split('.')[0])
    for image_path in tif_files
]
print(len(file_ids_in_folder))
print(file_ids_in_folder[:3])

### Check image ids present in the database:

In [None]:
# Query the IDs already stored for the configured source (file_source) instead
# of repeating the 'giub' literal, so changing the source only requires one edit.
existing_ids = get_existing_image_ids(source_filter=file_source, conn=None, cur=None)
print(len(existing_ids))
print(existing_ids[0:2])


### Check differences between image files present in database and image files in folder:

In [None]:
# Show both ID collections one after the other for a visual comparison.
print('In database:', len(existing_ids), existing_ids, sep='\n')
print('In folder:', file_ids_in_folder, sep='\n')

### Delete one image file from database for testing purposes:

In [None]:
#delete_images(image_ids=[107])

### Check differences between image files in database and in folder again: 

In [None]:
# IDs that are present in the folder but not (yet) in the database.
set(file_ids_in_folder) - set(existing_ids)

### Load images into the database: 

In [None]:
import re

print("=" * 70)
# Report the real number of files instead of a hard-coded count.
print(f"STEP 1: LOADING {len(tif_files)} TIF IMAGES")
print("=" * 70)

# Initialize MLDataLoader, this also establishes the connection with the database.
# Database name and source come from the configuration cells rather than literals.
loader = MLDataLoader(db_name=database_name, source=file_source)

# Extract image information from TIF files
image_ids = []      # Integer IDs, e.g. [2, 3, 8, 15, ...]
filenames = []      # File names, e.g. ['BernerOberland002.tif', ...]
file_paths = []     # Full paths as strings
skipped_files = []  # Files whose name does not end in digits

for tif_file in tif_files:
    # The ID is the run of digits at the end of the stem: 'BernerOberland002' → 2
    match = re.search(r'(\d+)$', tif_file.stem)
    if match is None:
        skipped_files.append(tif_file.name)
        continue

    image_ids.append(int(match.group(1)))
    filenames.append(tif_file.name)
    file_paths.append(str(tif_file))

# Make dropped files visible instead of silently ignoring them.
if skipped_files:
    print(f"WARNING: skipped {len(skipped_files)} file(s) without a trailing numeric ID: {skipped_files}")

print(f"Found {len(image_ids)} images")
print(f"Image IDs (integers): {sorted(image_ids)}")
print()

# Load images using MLDataLoader
result = loader.load_images_safe(
    image_ids=image_ids,
    filenames=filenames,
    file_paths=file_paths,
    source=file_source
)

print()

id_mapping = result['id_mapping']
new_files = result['inserted_files']
reused_files = result['existing_files']

print(f"✅ ID Mapping created: {len(id_mapping)} entries")
print("Sample mappings (source, source_image_id) → database_image_id:")
for key in sorted(id_mapping.keys())[:5]:
    print(f"  {key} → {id_mapping[key]}")
print('New files:')
print(new_files)
print('Reused files:')
print(reused_files)

In [None]:
# Display the full mapping returned by the loader under result['id_mapping'].
id_mapping

### Check differences between image files in database and in folder again: 

In [None]:
# Re-query the database after loading; the function itself prints the ID list,
# so the former bare `existing_ids_after` line (which displayed nothing) is dropped.
existing_ids_after = get_existing_image_ids(source_filter=file_source, conn=None, cur=None)
# Folder IDs still missing from the database; empty if every file was loaded.
set(file_ids_in_folder) - set(existing_ids_after)

In [None]:
# Show the entries of data_path (the folder the label, times, results and responses files are read from).
os.listdir(data_path)

### Load label data: 

In [None]:
# Load labels CSV
label_data_path = os.path.join(data_path, meta_data_file)
label_data = pd.read_csv(label_data_path)

# The CSV stores image IDs in a prefixed form (e.g. 'id234') so that they are
# always read back as strings. Convert them back to the bare ID (e.g. '234').
# NOTE(review): the exact output format is defined by img_idc.reconvert_image_ids — confirm.
img_ids = list(label_data.image_id)
label_data['image_id'] = img_idc.reconvert_image_ids(img_ids)

label_data.head()

### Load times data:

In [None]:
times_path_llm = os.path.join(data_path, times_file)
# Load the pickled timing data of the analysis run.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(times_path_llm, 'rb') as f:
   times_data_llm = pickle.load(f)


### Get relevant times data:

In [None]:
# Start time of the first recorded run, formatted as YYYYMMDD_HHMMSS.
# This string is used further down as the key into the results and responses dictionaries.
timestamp_id = times_data_llm['time_stamp_start'][0].strftime('%Y%m%d_%H%M%S')
timestamp_id


In [None]:
# Duration of the first recorded run; passed on as duration_seconds of the analysis run record.
# NOTE(review): presumably seconds as a number — confirm against the script that wrote the file.
duration_seconds = times_data_llm['duration_seconds'][0]
duration_seconds


### Load results: 

In [None]:
# Load the pickled results of the analysis run.
results_tabular_path = os.path.join(data_path, results_file)
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(results_tabular_path, 'rb') as f:
   reloaded_results_tabular = pickle.load(f)

In [None]:
# Quick look at the loaded results object: its type, number of top-level entries and their keys.
print(type(reloaded_results_tabular))
print(len(reloaded_results_tabular))
print(reloaded_results_tabular.keys())

### Get number of processed images:

In [None]:
# Number of rows in the 'contains_persons' predictions of this run; passed on
# as images_processed of the analysis run record.
images_processed = reloaded_results_tabular[timestamp_id]['predictions']['contains_persons'].shape[0]
images_processed 


### Load raw results:

In [None]:

# Load the pickled dictionary with the raw LLM responses.
llm_responses_path = os.path.join(data_path, responses_file)
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(llm_responses_path, 'rb') as f:
   reloaded_image_descr = pickle.load(f)

# Sanity check of the loaded object: its type and number of top-level entries.
print(type(reloaded_image_descr))
print(len(reloaded_image_descr))

### Check results (predictions):

In [None]:
# Overview of the three objects that were loaded into the session.
print("=" * 70, "DATA LOADED IN JUPYTER SESSION", "=" * 70, sep="\n")

# 1) Results dictionary: top-level keys and the structure below the first key.
print("\n1. RESULTS (reloaded_results_tabular)")
print(f"   Type: {type(reloaded_results_tabular)}")
if isinstance(reloaded_results_tabular, dict):
    print(f"   Keys: {list(reloaded_results_tabular)}")
    first_key = list(reloaded_results_tabular)[0]
    print(f"   First key: {first_key}")
    print(f"   Structure under first key: {list(reloaded_results_tabular[first_key].keys())}")

# 2) Raw responses dictionary: top-level keys only.
print("\n2. RAW RESPONSES (reloaded_image_descr)")
print(f"   Type: {type(reloaded_image_descr)}")
if isinstance(reloaded_image_descr, dict):
    print(f"   Keys: {list(reloaded_image_descr)}")

# 3) Label table: shape, columns and the first rows.
print("\n3. LABEL DATA (label_data)")
print(f"   Type: {type(label_data)}")
if hasattr(label_data, 'shape'):
    print(f"   Shape: {label_data.shape}")
    print(f"   Columns: {list(label_data.columns)}")
    print("   First few rows:")
    print(label_data.head(3))

### Load ground truth (from label data) into the database: 

In [None]:
print("=" * 70)
print("STEP 3: TRANSFORMING AND LOADING GROUND TRUTH")
print("=" * 70)

# Transform label_data from wide (one column per label) to long format
# (one row per image/label pair).
label_data_long = label_data.melt(
    id_vars=['image_id'],
    var_name='label_name',
    value_name='value'
)

# Convert image_id from '001' to 1 (integer)
label_data_long['image_id'] = label_data_long['image_id'].astype(int)

# Anything other than 1 is stored as 'false' below, including missing values
# and strings such as '1'. Report such values so mislabelled data does not
# slip into the database unnoticed.
unexpected_mask = ~label_data_long['value'].isin([0, 1])
if unexpected_mask.any():
    print(f"WARNING: {int(unexpected_mask.sum())} label value(s) are neither 0 nor 1 "
          "and will be stored as 'false':")
    print(label_data_long.loc[unexpected_mask].head(10))
    print()

# Convert value from 0/1 to 'false'/'true' (database stores as text)
label_data_long['value'] = label_data_long['value'].apply(lambda x: 'true' if x == 1 else 'false')

print(f"Transformed {len(label_data)} rows (wide) → {len(label_data_long)} rows (long)")
print("\nFirst few rows of transformed data:")
print(label_data_long.head(10))
print()

# Load the whole long-format table in one call, using the configured source.
loader.load_ground_truth_safe(label_data_long, source=file_source)

print("\n✅ Ground truth loading complete!")

### Load analysis run meta data into database: 

In [None]:
# Register this analysis run and keep the returned ID; the prediction rows
# loaded afterwards are attached to it.
analysis_run_id = loader.load_analysis_run(
    run_timestamp=timestamp_id,
    analysis_type=analysis_type,
    model_name=model_name,
    python_script=python_script,
    model_version=model_version,
    hyperparameters=None,
    notes=None,
    start_time=None,
    duration_seconds=duration_seconds,
    images_processed=images_processed,
)

# Persist the run record before dependent rows are inserted.
loader.conn.commit()


In [None]:
# Display the ID returned by loader.load_analysis_run for this run.
analysis_run_id

### Load predictions (results) into database: 

In [None]:
print("=" * 70)
print("STEP 2: LOADING PREDICTIONS FROM RESULTS DICT")
print("=" * 70)

# The loader is already initialized and connected; the predictions are attached
# to the analysis run registered earlier and to the configured image source.
loader.load_predictions_from_dict(
    analysis_run_id=analysis_run_id,
    results_dict=reloaded_results_tabular,
    source=file_source
)

print("\n✅ Predictions loading complete!")

### Check raw response data:

In [None]:
# Walk down the nested responses dictionary: timestamp -> prompt -> image ID -> response.
print("=" * 70, "CHECKING LLM RESPONSES STRUCTURE", "=" * 70, sep="\n")

# First level: run timestamp
timestamp_key = list(reloaded_image_descr.keys())[0]
print(f"Timestamp key: {timestamp_key}")

# Second level: prompt names used in that run
prompt_keys = list(reloaded_image_descr[timestamp_key].keys())
print(f"Prompt keys: {prompt_keys}")

# Third level: responses per image for the first prompt
prompt_name = prompt_keys[0]
image_responses = reloaded_image_descr[timestamp_key][prompt_name]

print(f"\nNumber of images with responses: {len(image_responses)}")
print(f"Sample image IDs: {list(image_responses.keys())[:5]}")

# Show one complete response as an example
sample_img_id = list(image_responses.keys())[0]
print(f"\nSample response for image '{sample_img_id}':")
print(image_responses[sample_img_id])

### Load raw responses into the database:

In [None]:
print("=" * 70)
print("STEP 4: LOADING LLM RESPONSES")
print("=" * 70)

# Both the results and the responses dictionaries are keyed by the run timestamp.
timestamp_key = timestamp_id
# The prompt name recorded with the results is also the key of the responses for this run.
prompt_name = reloaded_results_tabular[timestamp_id]['prompt_id']

# Look up (or create) the prompt record for this prompt name.
prompt_id = loader.get_or_create_prompt(prompt_name, '')
print(f"Using prompt_id: {prompt_id}")

# Look up the analysis run by its timestamp.
from datetime import datetime
run_timestamp = datetime.strptime(timestamp_key, '%Y%m%d_%H%M%S')

loader.cur.execute("""
    SELECT analysis_run_id FROM analysis_runs
    WHERE run_timestamp = %s
""", (run_timestamp,))
result = loader.cur.fetchone()
if result is None:
    # Fail early instead of inserting responses with analysis_run_id = NULL.
    raise LookupError(
        f"No analysis run found for run_timestamp {run_timestamp}; "
        "load the analysis run metadata first."
    )
analysis_run_id = result[0]
print(f"Using analysis_run_id: {analysis_run_id}")

# Get the image responses
image_responses = reloaded_image_descr[timestamp_key][prompt_name]

print(f"\nLoading {len(image_responses)} LLM responses...")

loaded_count = 0
skipped_count = 0

try:
    for img_id_str, response_dict in image_responses.items():
        try:
            # Convert '002' → 2 (integer)
            source_img_id = int(img_id_str)

            # Get database image_id for the configured source.
            # NOTE(review): presumably raises ValueError for unknown images — confirm in db_loader.
            db_image_id = loader.get_database_image_id(file_source, source_img_id)

            # Insert LLM response.
            # NOTE(review): raw_response_text receives the Python repr of the
            # parsed dict, not the model's original text, and parse_success is
            # always True because only parsed responses are available here.
            loader.cur.execute("""
                INSERT INTO llm_responses (
                    analysis_run_id, image_id, prompt_id, parsed_response,
                    raw_response_text, parse_success, tokens_used
                )
                VALUES (%s, %s, %s, %s, %s, %s, %s)
            """, (
                analysis_run_id,
                db_image_id,
                prompt_id,
                extras.Json(response_dict),  # Store as JSONB
                str(response_dict),          # Store as text
                True,                        # Parse succeeded
                None                         # No token info available
            ))
            loaded_count += 1

        except ValueError as e:
            print(f"⚠️  Skipping image {img_id_str}: {e}")
            skipped_count += 1
except Exception:
    # Any other failure (e.g. a database error) leaves the transaction aborted:
    # roll back so nothing is half-written and the connection stays usable.
    loader.conn.rollback()
    raise

# Commit
loader.conn.commit()

print(f"\n✅ Loaded {loaded_count} LLM responses")
if skipped_count > 0:
    print(f"⚠️  Skipped {skipped_count} responses")

### Close database connection:

In [None]:
# Close the loader once all data is loaded; the verification cell below opens its own connection.
# NOTE(review): exact cleanup behaviour is defined by MLDataLoader.close in source.db_loader.
loader.close()

In [None]:
# ============================================================================
# VERIFICATION QUERIES - Check Data Loaded Correctly
# ============================================================================
# Run these queries after loading data to verify database state.
# The "Expected" lines are reference values for the 12-image test set.

import psycopg2
from dotenv import load_dotenv
import os
import pandas as pd

load_dotenv()


def _run_verification_query(conn, heading, query, intro=None):
    """Print one verification section and return the query result.

    Prints ``heading`` and a separator line, runs ``query`` on ``conn``, then
    prints the optional ``intro`` line followed by the result table.
    """
    print(heading)
    print("-" * 70)
    # NOTE: pandas officially supports SQLAlchemy connectables here; a raw
    # psycopg2 connection works but may emit a UserWarning.
    frame = pd.read_sql_query(query, conn)
    if intro is not None:
        print(intro)
    print(frame.to_string(index=False))
    return frame


# Connect to database
conn = psycopg2.connect(
    dbname=os.getenv('DB_NAME'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    port=os.getenv('DB_PORT')
)

# try/finally guarantees the connection is closed even if a query fails.
try:
    print("=" * 70)
    print("DATABASE VERIFICATION QUERIES")
    print("=" * 70)

    # -----------------------------------------------------------------------
    # Query 1: Overall Counts
    # -----------------------------------------------------------------------
    query1 = """
        SELECT
            (SELECT COUNT(*) FROM images WHERE source = 'giub') as images,
            (SELECT COUNT(*) FROM ground_truth_history WHERE is_current = TRUE) as ground_truth,
            (SELECT COUNT(*) FROM predictions) as predictions,
            (SELECT COUNT(*) FROM llm_responses) as llm_responses,
            (SELECT COUNT(*) FROM analysis_runs) as runs,
            (SELECT COUNT(*) FROM prompts) as prompts
    """

    df1 = _run_verification_query(conn, "\n1. OVERALL DATA COUNTS", query1)

    print("\nExpected for 12 test images:")
    print("  images: 12")
    print("  ground_truth: 60 (12 images × 5 labels)")
    print("  predictions: 24 (12 images × 2 labels)")
    print("  llm_responses: 12 (1 per image)")
    print("  runs: 1")
    print("  prompts: 1")

    # -----------------------------------------------------------------------
    # Query 2: Check Prediction Format (CRITICAL FIX VERIFICATION)
    # -----------------------------------------------------------------------
    query2 = """
        SELECT DISTINCT predicted_value
        FROM predictions
        ORDER BY predicted_value
    """

    df2 = _run_verification_query(
        conn,
        "\n\n2. PREDICTION FORMAT CHECK (Critical Fix)",
        query2,
        intro="Distinct prediction values:",
    )

    print("\nExpected: 'false' and 'true' (NOT '0' and '1')")

    # -----------------------------------------------------------------------
    # Query 3: Predictions vs Ground Truth Comparison
    # -----------------------------------------------------------------------
    query3 = """
        SELECT
            i.filename,
            p.label_name,
            p.predicted_value,
            pg_typeof(p.predicted_value) as pred_type,
            gt.value as ground_truth,
            pg_typeof(gt.value) as gt_type,
            CASE
                WHEN p.predicted_value = gt.value THEN 'MATCH ✓'
                ELSE 'MISMATCH ✗'
            END as comparison
        FROM predictions p
        JOIN images i ON p.image_id = i.image_id
        JOIN ground_truth_history gt
            ON p.image_id = gt.image_id
            AND p.label_name = gt.label_name
            AND gt.is_current = TRUE
        WHERE i.source = 'giub'
        ORDER BY i.filename, p.label_name
        LIMIT 10
    """

    df3 = _run_verification_query(
        conn, "\n\n3. PREDICTIONS VS GROUND TRUTH - Format Match Check", query3
    )

    print("\nExpected: All comparisons should show 'MATCH ✓'")
    print("Expected: Both types should be 'text'")

    # -----------------------------------------------------------------------
    # Query 4: Model Performance
    # -----------------------------------------------------------------------
    query4 = """
        SELECT
            label_name,
            total_predictions,
            correct_predictions,
            accuracy_percentage,
            avg_confidence
        FROM model_performance
        ORDER BY label_name
    """

    df4 = _run_verification_query(
        conn, "\n\n4. MODEL PERFORMANCE (Accuracy by Label)", query4
    )

    print("\nExpected for test data:")
    print("  is_photo: ~91.67% (11/12)")
    print("  with_person: 100% (12/12)")

    # -----------------------------------------------------------------------
    # Query 5: Analysis Run Details
    # -----------------------------------------------------------------------
    query5 = """
        SELECT
            analysis_run_id,
            run_timestamp,
            model_name,
            analysis_type,
            duration_seconds,
            ROUND(CAST(duration_seconds AS NUMERIC) / 60.0, 2) as duration_minutes,
            images_processed,
            start_time
        FROM analysis_runs
        ORDER BY run_timestamp DESC
    """

    df5 = _run_verification_query(conn, "\n\n5. ANALYSIS RUN DETAILS", query5)

    print("\nExpected:")
    print("  Model: MiniCPM")
    print("  Duration: ~362 seconds (~6 minutes)")
    print("  Run timestamp: 2026-01-07 00:10:27")

    # -----------------------------------------------------------------------
    # Query 6: LLM Response Summary
    # -----------------------------------------------------------------------
    query6 = """
        SELECT * FROM llm_response_summary
    """

    df6 = _run_verification_query(conn, "\n\n6. LLM RESPONSE SUMMARY", query6)

    print("\nExpected:")
    print("  total_responses: 12")
    print("  successful_parses: 12")
    print("  parse_success_rate: 100%")

    # -----------------------------------------------------------------------
    # Query 7: Ground Truth Wide Format (Sample)
    # -----------------------------------------------------------------------
    query7 = """
        SELECT * FROM ground_truth_wide
        LIMIT 5
    """

    df7 = _run_verification_query(
        conn, "\n\n7. GROUND TRUTH WIDE FORMAT (First 5 Images)", query7
    )

    print("\nExpected: Each row shows one image with all 5 labels as columns")

    # -----------------------------------------------------------------------
    # Summary
    # -----------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("VERIFICATION COMPLETE")
    print("=" * 70)
finally:
    # Close connection
    conn.close()

print("\n✅ If all queries show expected results, data loaded correctly!")