In [None]:
import os
from pathlib import Path
from ultralytics import YOLO
from PIL import Image
import shutil
import pandas as pd
from source import image_id_converter as img_idc
from source.db_loader import MLDataLoader
from source.db_loader import delete_images
#from source import sort_img_files as sif
from source import llm_input as llm_i
from source import llm_output as llm_o
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np

In [None]:
import ollama
import json
import re
import pickle

In [None]:
import glob


In [None]:
from psycopg2.extras import execute_batch
from typing import Dict, List, Optional


In [None]:
# Standard library imports
import sys
from datetime import datetime

# Database connection library
# psycopg2: PostgreSQL adapter for Python - handles all DB communication
from psycopg2 import extras  # extras provides advanced features like Json adapter
from psycopg2.extras import execute_batch
import psycopg2

# Environment variable management
# python-dotenv: Loads database credentials from .env file (keeps passwords out of code)
from dotenv import load_dotenv

In [None]:
def get_existing_image_ids(source_filter, conn=None, cur=None):
    """
    Return the distinct source_image_ids stored in the images table for one source.

    Parameters:
    -----------
    source_filter : str
        Source to filter by (e.g., 'giub')
    conn : connection object, optional
        Existing database connection. If None, a new connection is opened from
        the DB_* environment variables (after loading .env) and closed again
        before returning; a `cur` passed alongside is ignored in that case.
    cur : cursor object, optional
        Existing cursor on `conn`. If None, a cursor is created here and closed
        again before returning. A cursor supplied by the caller is left open.

    Returns:
    --------
    list : List of source_image_ids, ordered by source_image_id
    """
    owns_conn = conn is None
    if owns_conn:
        # Imported lazily: callers that pass their own connection need neither
        # the database driver nor a .env file.
        import os
        import psycopg2
        from dotenv import load_dotenv

        load_dotenv()
        conn = psycopg2.connect(
            dbname=os.getenv('DB_NAME'),
            user=os.getenv('DB_USER'),
            password=os.getenv('DB_PASSWORD'),
            host=os.getenv('DB_HOST'),
            port=os.getenv('DB_PORT')
        )
        # A cursor belonging to some other connection must not be reused.
        cur = None

    owns_cur = cur is None
    try:
        if owns_cur:
            cur = conn.cursor()

        # Parameterized query: the source value is passed separately from the SQL text.
        cur.execute("""
            SELECT DISTINCT source_image_id 
            FROM images 
            WHERE source = %s 
            ORDER BY source_image_id;
        """, (source_filter,))

        existing_ids = [row[0] for row in cur.fetchall()]
    finally:
        # Release only what this function created, even if the query failed.
        if owns_cur and cur is not None:
            cur.close()
        if owns_conn:
            conn.close()

    print(f"Existing image IDs for source '{source_filter}': {existing_ids}")
    return existing_ids

### Empty database:

### Caution! Only run this cell if you want to delete all giub images (with their dependent data) as well as all analysis runs and all prompts from the database!

In [None]:
# ============================================================================
# EMPTY DATABASE - Delete All Data
# ============================================================================
# Run this to reset database for testing data reload.
# Deletes every image with source 'giub' and ALL rows of analysis_runs and
# prompts (those two deletes are not restricted to a source).
# Dependent rows are expected to be removed through ON DELETE CASCADE foreign
# keys -- NOTE(review): the schema is not visible here; confirm.


load_dotenv()

# Connect to database
conn = psycopg2.connect(
    dbname=os.getenv('DB_NAME'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    port=os.getenv('DB_PORT')
)
cur = conn.cursor()

try:
    print("🗑️  Deleting all data from database...")

    # Delete data (CASCADE handles dependencies)
    cur.execute("DELETE FROM images WHERE source = 'giub';")
    print(f"   Deleted {cur.rowcount} images (CASCADE deleted related data)")

    cur.execute("DELETE FROM analysis_runs;")
    print(f"   Deleted {cur.rowcount} analysis runs")

    cur.execute("DELETE FROM prompts;")
    print(f"   Deleted {cur.rowcount} prompts")

    # Commit changes: all three deletes become permanent together.
    conn.commit()

    # Verify everything is deleted.
    # The image count below covers all sources, so images from sources other
    # than 'giub' will trigger the "Some data remains" warning.
    cur.execute("""
        SELECT 
            (SELECT COUNT(*) FROM images) as images,
            (SELECT COUNT(*) FROM ground_truth_history) as ground_truth,
            (SELECT COUNT(*) FROM predictions) as predictions,
            (SELECT COUNT(*) FROM llm_responses) as llm_responses,
            (SELECT COUNT(*) FROM analysis_runs) as runs,
            (SELECT COUNT(*) FROM prompts) as prompts
    """)

    result = cur.fetchone()
    print(f"\n✅ Database emptied. Current counts:")
    print(f"   Images: {result[0]}")
    print(f"   Ground truth: {result[1]}")
    print(f"   Predictions: {result[2]}")
    print(f"   LLM responses: {result[3]}")
    print(f"   Analysis runs: {result[4]}")
    print(f"   Prompts: {result[5]}")

    # All should be 0
    if all(count == 0 for count in result):
        print("\n🎉 Database is empty and ready for fresh data!")
    else:
        print("\n⚠️  Warning: Some data remains")
except Exception:
    # Undo any uncommitted deletes so a failure cannot leave a half-emptied database.
    conn.rollback()
    raise
finally:
    # Close connection, whether or not the deletes succeeded.
    cur.close()
    conn.close()

### Set paths:

In [None]:
# All folders are resolved relative to the directory the notebook runs in.
project_path = Path.cwd()
#root_path = (project_path / '..' / 'test_data_folders/test_rec_multi_object_MiniCPM').resolve()
root_path = project_path.joinpath('..', 'test_data_folders/test_filter_out_people_multi_approach').resolve()

# Input folders below the test-data root.
data_path = root_path.joinpath('data')
tif_data_path = root_path.joinpath('data_1')
jpg_data_path = root_path.joinpath('data_jpg')

# Output folders below the test-data root.
output_dir_not_photo = root_path.joinpath('not_photo')
output_dir_with_person = root_path.joinpath('with_person')
output_dir_without_person = root_path.joinpath('without_person')

#visual_genome_path = (project_path/ '..' /'data_folders' / 'visual_genome_data').resolve()
#visual_genome_proc_path = (project_path/ '..' /'data_folders' / 'visual_genome_proc_data').resolve()


In [None]:


# Names of all entries in data_path, most recently modified first.
files = os.listdir(data_path)
files.sort(key=lambda entry_name: os.path.getmtime(os.path.join(data_path, entry_name)), reverse=True)
files

### Set file parameters:

In [None]:
file_source = 'giub' # Institute of Geography, University of Berne.
file_extension = '.tif' # Extension used to glob the image files in tif_data_path.
filename_tag = 'Oberland' # Text directly preceding the numeric id in each image file name (used as split marker).
meta_data_file = 'labels_mod.csv' # File containing ids and labels.
times_file = 'times_people_detect_multi_approach_yolo_20260213_144223.pkl' # File containing timestamp and duration of analysis-run.
results_file = 'people_detect_multi_approach_labels_results_yolo_20260213_144223.csv' # CSV with the analysis results; read with pandas and handed to the loader.
model_name_file = 'yolo_model_info.txt' # Text file from which the model version is parsed.


### Get yolo model version: 

In [None]:
# Get yolo model version:
# The version is the text that follows the first ': ' separator in the info file.
model_name_file_path = data_path / model_name_file
with open(model_name_file_path, 'r') as f:
    text = f.read()

_model_info_parts = text.split(': ')
if len(_model_info_parts) < 2:
    raise ValueError(
        f"No ': ' separator found in {model_name_file_path}; cannot determine model version"
    )

# Strip surrounding whitespace so a trailing newline is not stored as part of the version.
model_version = _model_info_parts[1].strip()
model_version

### Set analysis run parameters: 

In [None]:

# Metadata describing this analysis run; passed to loader.load_analysis_run further down.
analysis_type = 'yolo_classification'
model_name = 'yolo'
# model_version is not set here: it was parsed from the model info file in the cell above.
python_script = 'filter_out_people_multi_approach.ipynb'
# duration_seconds is read from the times file in a later cell.


### Set database_name: 

In [None]:
# Database name handed to MLDataLoader(db_name=...).
# NOTE(review): the direct psycopg2 connections in this notebook use DB_NAME from
# the environment instead -- confirm both point at the same database.
database_name = 'image_analysis_dev'

### Check environment variables:

In [None]:

# Load .env before inspecting the environment; without this the values below
# are only set if some other cell happened to call load_dotenv() first.
# Variables that already exist in the environment are not overridden.
load_dotenv()

# Check if .env exists in current directory
env_path = Path('.env')
print(f"Current directory: {os.getcwd()}")
print(f".env exists: {env_path.exists()}")

# Show the connection settings that are now available (password deliberately not printed)
print(f"\nDB_NAME: {os.getenv('DB_NAME')}")
print(f"DB_USER: {os.getenv('DB_USER')}")
print(f"DB_HOST: {os.getenv('DB_HOST')}")
print(f"DB_PORT: {os.getenv('DB_PORT')}")

### Get the file names and IDs of the image files in the folder (to be loaded):

In [None]:
# Collect the image files in tif_data_path that match the configured extension, sorted by path.
search_pattern = f'*{file_extension}'
tif_files = sorted(tif_data_path.glob(search_pattern))
#tif_files = sorted(list(tif_data_path.glob('*.tif')) + list(tif_data_path.glob('*.tiff')))
print(len(tif_files))
print(tif_files[:2])

In [None]:
# Derive the numeric id of every image file: take the text after the last
# occurrence of filename_tag in the path, cut it at the first '.', convert to int.
file_ids_in_folder = [
    int(str(image_path).split(filename_tag)[-1].split('.')[0])
    for image_path in tif_files
]
print(len(file_ids_in_folder))
print(file_ids_in_folder[:3])

### Check image ids present in the database:

In [None]:

# Ask the database which image ids are already stored for this source
# (the helper opens and closes its own connection when none is passed).
existing_ids = get_existing_image_ids(file_source)
print(len(existing_ids))
print(existing_ids[:2])

### Check differences between image files present in database and image files in folder:

In [None]:
# Print both id collections one after the other for a manual comparison.
print('In database:', len(existing_ids), existing_ids, sep='\n')
print('In folder:', file_ids_in_folder, sep='\n')

### Delete one image file from database for testing purposes:

In [None]:
#delete_images(image_ids=[107])

### Check differences between image files in database and in folder again: 

In [None]:
# Ids that have an image file in the folder but are not in the database yet.
set(file_ids_in_folder) - set(existing_ids)

### Load images into the database: 

In [None]:
print("=" * 70)
print("STEP 1: LOADING IMAGES")
print("=" * 70)

# Initialize MLDataLoader, this also establishes the connection with the database:
loader = MLDataLoader(db_name=database_name, source=file_source)

# Extract image information from TIF files
image_ids = []      # Will be integers: [2, 3, 8, 15, ...]
filenames = []      # Will be: ['BernerOberland002.tif', ...]
file_paths = []     # Full paths
skipped_files = []  # Files whose name does not end in digits (no id can be derived)

for tif_file in tif_files:  # tif_files from our earlier exploration
    # Extract numeric ID from the end of the file name without extension,
    # e.g. 'BernerOberland002' -> '002' -> 2 (integer)
    match = re.search(r'(\d+)$', tif_file.stem)
    if match is None:
        # Report these below instead of dropping them silently.
        skipped_files.append(tif_file.name)
        continue

    image_ids.append(int(match.group(1)))
    filenames.append(tif_file.name)
    file_paths.append(str(tif_file))

print(f"Found {len(image_ids)} images")
if skipped_files:
    print(f"Skipped {len(skipped_files)} files without a trailing numeric id: {skipped_files}")
print(f"Image IDs (integers): {sorted(image_ids)}")
print()

# Load images using MLDataLoader.
# The source is taken from file_source so it always matches the loader created above.
#id_mapping = loader.load_images(
result = loader.load_images_safe(
    image_ids=image_ids,
    filenames=filenames,
    file_paths=file_paths,
    source=file_source
)

print()

id_mapping = result['id_mapping']
new_files = result['inserted_files']
reused_files = result['existing_files']

print(f"✅ ID Mapping created: {len(id_mapping)} entries")
print("Sample mappings (source, source_image_id) → database_image_id:")
for key in sorted(id_mapping.keys())[:5]:
    print(f"  {key} → {id_mapping[key]}")
print('New files:')
print(new_files)
print('Reused files:')
print(reused_files)

### Check differences between image files in database and in folder again: 

In [None]:
# Re-read the ids stored for this source now that the images have been loaded.
existing_ids_after = get_existing_image_ids(source_filter=file_source, conn=None, cur=None)
print('\n')
print('Images present in the tif_data_path folder but not in the database:')
# Last expression of the cell, so the notebook displays the resulting set.
set.difference(set(file_ids_in_folder), set(existing_ids_after))

### Load label data: 

In [None]:
# Load labels CSV
label_data_path = os.path.join(data_path, meta_data_file)
label_data = pd.read_csv(label_data_path)

# Reconvert image ids to integers (e.g. '234') as strings from the form they were saved in (e.g. 'id234' to ensure 
# string data type to deal with duck typing): 
img_ids = list(label_data.image_id)
label_data['image_id'] = img_idc.reconvert_image_ids(img_ids)

label_data.head()

### Load times data:

In [None]:
# Read the pickled timing record of the analysis run.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load files that this project wrote itself.
times_filename_yolo = times_file
times_path_yolo = str(data_path / times_filename_yolo)
with open(times_path_yolo, mode='rb') as times_handle:
    times_data_yolo = pickle.load(times_handle)

In [None]:
# Display the loaded timing record for inspection.
times_data_yolo

### Get relevant times data: 

In [None]:
# Format the first recorded start time as 'YYYYMMDD_HHMMSS'; this string is
# later passed as run_timestamp when the analysis run is loaded.
yolo_timestamp_id = times_data_yolo['time_stamp_start'][0].strftime('%Y%m%d_%H%M%S')
yolo_timestamp_id


In [None]:
# First recorded run duration; later passed as duration_seconds when the analysis run is loaded.
duration_seconds = times_data_yolo['duration_seconds'][0]
duration_seconds


### Load results: 

In [None]:
# load results:
results_tabular_path = data_path / results_file
reloaded_results_tabular = pd.read_csv(results_tabular_path)

In [None]:
# Quick structural check of the reloaded results: type, number of rows, column labels.
print(type(reloaded_results_tabular))
print(len(reloaded_results_tabular))
print(reloaded_results_tabular.keys())

In [None]:
# Number of distinct image ids in the results table.
len(set(reloaded_results_tabular.image_id))

### Get number of processed images: 

In [None]:
#images_processed = reloaded_results_tabular[timestamp_id]['predictions']['contains_persons'].shape[0]
# Number of processed images = number of distinct image ids in the results;
# later passed as images_processed when the analysis run is loaded.
images_processed = len(set(reloaded_results_tabular.image_id))
images_processed

### Check results (predictions):

In [None]:
# Print a short overview of the results table and the label table held in this session.
print("=" * 70)
print("DATA LOADED IN JUPYTER SESSION")
print("=" * 70)

print("\n1. RESULTS (reloaded_results_tabular)")
print(f"   Type: {type(reloaded_results_tabular)}")
# Results may be a dict keyed by run or a DataFrame; describe whichever it is.
if isinstance(reloaded_results_tabular, dict):
    print(f"   Keys: {list(reloaded_results_tabular.keys())}")
    first_key = list(reloaded_results_tabular.keys())[0]
    print(f"   First key: {first_key}")
    print(f"   Structure under first key: {list(reloaded_results_tabular[first_key].keys())}")

if isinstance(reloaded_results_tabular, pd.DataFrame):
    print(f"   Shape: {reloaded_results_tabular.shape}")
    print(f"   Columns: {list(reloaded_results_tabular.columns)}")
    print(f"   First few rows:")
    print(reloaded_results_tabular.head(3))

print("\n3. LABEL DATA (label_data)")
print(f"   Type: {type(label_data)}")
if hasattr(label_data, 'shape'):
    print(f"   Shape: {label_data.shape}")
    print(f"   Columns: {list(label_data.columns)}")
    print(f"   First few rows:")
    print(label_data.head(3))

### Load ground truth (from label data) into the database: 

In [None]:
print("=" * 70)
print("STEP 3: TRANSFORMING AND LOADING GROUND TRUTH")
print("=" * 70)

# Transform label_data from wide format (one column per label) to long format
# (one row per image_id / label_name pair).
label_data_long = label_data.melt(
    id_vars=['image_id'],
    var_name='label_name',
    value_name='value'
)

# Convert image_id from '001' to 1 (integer)
label_data_long['image_id'] = label_data_long['image_id'].astype(int)

# The conversion below turns every value other than 1 into 'false', which
# includes missing labels; report those instead of hiding them.
missing_label_count = int(label_data_long['value'].isna().sum())
if missing_label_count:
    print(f"Warning: {missing_label_count} missing label values will be stored as 'false'")

# Convert value from 0/1 to 'false'/'true' (database stores as text)
label_data_long['value'] = label_data_long['value'].apply(lambda x: 'true' if x == 1 else 'false')

print(f"Transformed {len(label_data)} rows (wide) → {len(label_data_long)} rows (long)")
print(f"\nFirst few rows of transformed data:")
print(label_data_long.head(10))
print()

# Load the long-format labels in one batch.
# The source is taken from file_source so it matches the loader and the loaded images.
#loader.load_ground_truth(label_data_long, source='giub')
loader.load_ground_truth_safe(label_data_long, source=file_source)

print("\n✅ Ground truth loading complete!")


### Load analysis run meta data into database: 

In [None]:
# Register this analysis run in the database and keep the id it returns.
# NOTE(review): start_time is passed as None although the times file holds a
# start timestamp -- confirm this is intended.
run_metadata = dict(
    run_timestamp=yolo_timestamp_id,
    analysis_type=analysis_type,
    model_name=model_name,
    python_script=python_script,
    model_version=model_version,
    hyperparameters=None,
    notes=None,
    start_time=None,
    duration_seconds=duration_seconds,
    images_processed=images_processed,
)
analysis_run_id = loader.load_analysis_run(**run_metadata)

# Commit explicitly on the loader's connection.
loader.conn.commit()


### Load predictions into database:

In [None]:
# Load the predictions from the results CSV and attach them to the analysis run created above.
# The source is taken from file_source so it matches the loader and the loaded images.
filepath = str(data_path / results_file)
loader.load_yolo_predictions(analysis_run_id, filepath, source=file_source)


In [None]:
# Close the loader; presumably this also closes its database connection -- confirm in MLDataLoader.close.
loader.close()

In [None]:
# ============================================================================
# VERIFY YOLO PREDICTIONS LOADED
# ============================================================================
# Opens a separate psycopg2 connection (settings from .env / the environment).
# The verification cells that follow reuse `conn`; it is closed in the last cell.
import pandas as pd
import psycopg2
from dotenv import load_dotenv
import os

load_dotenv()

conn = psycopg2.connect(
    dbname=os.getenv('DB_NAME'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    port=os.getenv('DB_PORT')
)

print("=" * 70)
print("YOLO PREDICTIONS VERIFICATION")
print("=" * 70)

# ---------------------------------------------------------------------------
# 1. Check Overall Prediction Counts by Model
# ---------------------------------------------------------------------------
print("\n1. PREDICTION COUNTS BY MODEL")
print("-" * 70)

# Per (model_name, analysis_type): number of prediction rows and number of distinct images.
query1 = """
SELECT 
    ar.model_name,
    ar.analysis_type,
    COUNT(*) as prediction_count,
    COUNT(DISTINCT p.image_id) as images_analyzed
FROM predictions p
JOIN analysis_runs ar ON p.analysis_run_id = ar.analysis_run_id
GROUP BY ar.model_name, ar.analysis_type
ORDER BY ar.model_name;
"""

df1 = pd.read_sql_query(query1, conn)
print(df1.to_string(index=False))

# NOTE(review): the expectation below is hard-coded; its analysis type
# ('object_detection') differs from the 'yolo_classification' set earlier in
# this notebook, and the counts depend on the data set -- confirm or update.
print("\nExpected:")
print("  YOLO (object_detection): 12 predictions, 12 images")

In [None]:
# ---------------------------------------------------------------------------
# 2. Verify YOLO Has NULL prompt_id
# ---------------------------------------------------------------------------
# Uses the connection `conn` opened in the first verification cell.
print("\n\n2. CHECK YOLO PROMPT_ID (Should be NULL)")
print("-" * 70)

# Count predictions per (model_name, prompt_id) for runs whose model name contains 'yolo'.
query2 = """
SELECT 
    ar.model_name,
    p.prompt_id,
    COUNT(*) as count
FROM predictions p
JOIN analysis_runs ar ON p.analysis_run_id = ar.analysis_run_id
WHERE ar.model_name LIKE '%yolo%'
GROUP BY ar.model_name, p.prompt_id;
"""

df2 = pd.read_sql_query(query2, conn)
print(df2.to_string(index=False))

print("\nExpected: prompt_id should be NULL (or None)")

In [None]:
# ---------------------------------------------------------------------------
# 4. YOLO Performance
# ---------------------------------------------------------------------------
# Uses the connection `conn` opened in the first verification cell.
print("\n\n4. YOLO PERFORMANCE")
print("-" * 70)

# Accuracy per model: share of predictions whose predicted_value equals the
# current ground-truth value for the same image and label. Only predictions
# that have a current ground-truth row take part (inner join).
query4 = """
SELECT 
    ar.model_name,
    COUNT(*) as total_predictions,
    SUM(CASE WHEN p.predicted_value = gt.value THEN 1 ELSE 0 END) as correct,
    ROUND(
        100.0 * SUM(CASE WHEN p.predicted_value = gt.value THEN 1 ELSE 0 END) / COUNT(*),
        2
    ) as accuracy_percent
FROM predictions p
JOIN analysis_runs ar ON p.analysis_run_id = ar.analysis_run_id
JOIN ground_truth_history gt 
    ON p.image_id = gt.image_id 
    AND p.label_name = gt.label_name 
    AND gt.is_current = TRUE
WHERE ar.model_name LIKE '%yolo%'
GROUP BY ar.model_name;
"""

df4 = pd.read_sql_query(query4, conn)
print(df4.to_string(index=False))


In [None]:

# ---------------------------------------------------------------------------
# 5. Analysis Runs Summary
# ---------------------------------------------------------------------------
# Uses the connection `conn` opened in the first verification cell.
print("\n\n5. ANALYSIS RUNS SUMMARY")
print("-" * 70)

# List all analysis runs, newest run_timestamp first.
query5 = """
SELECT 
    analysis_run_id,
    model_name,
    analysis_type,
    run_timestamp,
    model_version
FROM analysis_runs
ORDER BY run_timestamp DESC;
"""

df5 = pd.read_sql_query(query5, conn)
print(df5.to_string(index=False))


In [None]:
# Close the connection that was opened in the first verification cell.
conn.close()

print("\n" + "=" * 70)
print("VERIFICATION COMPLETE")
print("=" * 70)