In [None]:
import os
from pathlib import Path
from ultralytics import YOLO
from PIL import Image
import shutil
import pandas as pd
from source import image_id_converter as img_idc
from source.db_loader import MLDataLoader
from source.db_loader import delete_images
#from source import sort_img_files as sif
from source import llm_input as llm_i
from source import llm_output as llm_o
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np

In [None]:
import ollama
import json
import re
import pickle

In [None]:
import pandas as pd
import psycopg2
from dotenv import load_dotenv


In [None]:
import glob


In [None]:
from psycopg2.extras import execute_batch
from typing import Dict, List, Optional

In [None]:
# Standard library imports
import sys
from datetime import datetime
from pathlib import Path

# Database connection library
# psycopg2: PostgreSQL adapter for Python - handles all DB communication
import psycopg2
from psycopg2 import extras  # extras provides advanced features like Json adapter

# Data manipulation
import pandas as pd  # For handling CSV files and DataFrames

# Environment variable management
# python-dotenv: Loads database credentials from .env file (keeps passwords out of code)
from dotenv import load_dotenv

In [None]:
def get_existing_image_ids(source_filter, conn=None, cur=None):
    """
    Get all distinct source_image_ids stored in the ``images`` table for one source.

    Parameters:
    -----------
    source_filter : str
        Source to filter by (e.g., 'giub')
    conn : connection object, optional
        Existing database connection. If None, a new connection is opened from
        the DB_NAME / DB_USER / DB_PASSWORD / DB_HOST / DB_PORT environment
        variables (after loading .env) and closed again before returning.
    cur : cursor object, optional
        Existing cursor on ``conn``. It is only used when ``conn`` is given;
        if None, a cursor is created here and closed before returning.

    Returns:
    --------
    list : List of source_image_ids, sorted by the database in ascending order
    """
    owns_conn = conn is None
    if owns_conn:
        # Imported only when a connection has to be created, so that callers
        # who pass their own connection do not depend on these packages.
        import os

        import psycopg2
        from dotenv import load_dotenv

        load_dotenv()
        conn = psycopg2.connect(
            dbname=os.getenv('DB_NAME'),
            user=os.getenv('DB_USER'),
            password=os.getenv('DB_PASSWORD'),
            host=os.getenv('DB_HOST'),
            port=os.getenv('DB_PORT')
        )
        # A cursor passed without its connection cannot be used with the new one.
        cur = None

    owns_cur = cur is None
    try:
        if owns_cur:
            cur = conn.cursor()

        # The source is passed as a query parameter, never interpolated into the SQL.
        cur.execute("""
        SELECT DISTINCT source_image_id 
        FROM images 
        WHERE source = %s 
        ORDER BY source_image_id;
    """, (source_filter,))

        existing_ids = [row[0] for row in cur.fetchall()]
    finally:
        # Release only what was created here; caller-owned objects stay open.
        if owns_cur and cur is not None:
            cur.close()
        if owns_conn:
            conn.close()

    print(f"Existing image IDs for source '{source_filter}': {existing_ids}")

    return existing_ids

### Empty database:

#### If necessary use this:
`loader.conn.rollback()`

### Set paths:

In [None]:
# Directory the notebook is executed from; the data folders are located relative to it.
project_path = Path.cwd()

# Both Visual Genome folders live in ../data_folders, one level above the project directory.
visual_genome_path = project_path.joinpath('..', 'data_folders', 'visual_genome_data').resolve()
visual_genome_proc_path = project_path.joinpath('..', 'data_folders', 'visual_genome_proc_data').resolve()


### Check files: 

In [None]:
# Everything in the Visual Genome folder that is not a .jpg image (by file name),
# most recently modified first.
vg_files = [entry for entry in os.listdir(visual_genome_path) if not entry.endswith('.jpg')]
vg_files.sort(
    key=lambda entry: os.path.getmtime(os.path.join(visual_genome_path, entry)),
    reverse=True,
)
vg_files


### Set file parameters:

In [None]:
file_source = 'visual_genome' # Source name passed to MLDataLoader and used to filter images in the database.
file_extension = '.jpg' # Extension of the image files (not referenced in the cells shown in this notebook).
filename_tag = 'visual_genome_proc' # Tag for the processed data set (not referenced in the cells shown in this notebook).
labels_file = 'labels.csv' # CSV in the processed-data folder; read below for its file_paths, image_id and buildings columns.
times_file = 'times_clustering_pipeline_20260208_225215.pkl' # Pickle in the raw-data folder; its 'duration_seconds' entry is read below.
metadata_results_file = 'results_clustering_pipeline_20260208_225215.pkl' # Pickle in the raw-data folder with the run metadata and cluster assignments.
#responses_file = 'responses_llm_people_detect_multi_approach_20260213_144241.pkl'
#model_name_file = 'minicpm_v_model_info.txt'
#metadata_results_file = 'metadata_results_clustering.pkl'


### Set analysis run parameters:

In [None]:
# NOTE(review): analysis_type and python_script are not read by the cells shown in this
# notebook; the analysis run is registered with the values stored in the metadata pickle.
analysis_type = 'clustering with autoencoder'
# model_name is also the name of the folder (inside the project directory) that is
# listed below to find the trained model file.
model_name = 'cdae_model'
#model_version = model_version
python_script = 'img_to_pytorch.ipynb'


### Get file path of trained model: 

In [None]:
# Trained model files are kept in a folder named after the model, inside the project directory.
models_file_path = project_path / model_name

# Reverse alphabetical order, so the entry with the "largest" name comes first.
models = sorted(os.listdir(models_file_path), reverse=True)
print(len(models))
models[:3]

In [None]:
# Use the first entry of the reverse-sorted listing as the trained model.
# NOTE(review): this is the alphabetically last file name, which is only the newest /
# fully trained model if the file names sort that way - confirm the naming scheme.
# Raises IndexError if the model folder is empty.
fully_trained_model = models[0]
# Stored as a plain string; later passed as autoencoder_file when the run is registered.
model_file_path = str(project_path / model_name / fully_trained_model)
model_file_path


### Set database name:

In [None]:
# Database name handed to MLDataLoader below.
# NOTE(review): get_existing_image_ids and the verification cells connect with DB_NAME
# from the environment instead - confirm both refer to the same database.
database_name = 'image_analysis_dev'


### Check environment variables:

In [None]:
import os
from pathlib import Path

# Check if .env exists in current directory
env_path = Path('.env')
print(f"Current directory: {os.getcwd()}")
print(f".env exists: {env_path.exists()}")

# Load the .env file before inspecting the variables; without this the values below
# are None on a fresh kernel unless they happen to be set in the shell environment.
# load_dotenv is imported in the import cells at the top of the notebook.
load_dotenv()

# Show the connection settings that are now available (the password is deliberately not printed)
print(f"\nDB_NAME: {os.getenv('DB_NAME')}")
print(f"DB_USER: {os.getenv('DB_USER')}")
print(f"DB_HOST: {os.getenv('DB_HOST')}")
print(f"DB_PORT: {os.getenv('DB_PORT')}")

### Load images:

### Get filepaths of images in folder:

In [None]:
file_path = visual_genome_proc_path / labels_file

# Load CSV
label_data = pd.read_csv(file_path)

# Get image paths from labels file and convert image paths to Path objects
vg_files = [Path(p) for p in label_data['file_paths']]

print('Number of vg files:')
print(len(vg_files))

# Spot-check that file path and image ID of one row belong together.
# Row 400 is used when the file is long enough, otherwise the last row, so that
# a short labels file does not raise a KeyError.
print('Compare file id with filepath:')
if len(label_data) > 0:
    sample_idx = min(400, len(label_data) - 1)
    print(label_data['file_paths'].iloc[sample_idx])
    print(label_data['image_id'].iloc[sample_idx])
print(label_data.head())

### Get IDs of images in the database (filtered by source):

In [None]:
# Image IDs already stored for the 'giub' source. No connection is passed, so the
# helper opens and closes its own. The IDs are printed by the helper and also
# displayed as the cell result.
get_existing_image_ids(source_filter='giub', conn=None, cur=None)

In [None]:
# Image IDs already stored for the source configured in file_source ('visual_genome').
# No connection is passed, so the helper opens and closes its own.
get_existing_image_ids(source_filter=file_source, conn=None, cur=None)

### Load images into database:

In [None]:

print("=" * 70)
print("STEP 1: LOADING VISUAL GENOME IMAGES")
print("=" * 70)

# Initialize MLDataLoader, this also establishes the connection with the database:
loader = MLDataLoader(db_name=database_name, source=file_source)

# Collect the per-image information handed to the loader, one entry per file
image_ids = []      # Integer IDs parsed from the trailing digits of each file stem
filenames = []      # File names including the extension
file_paths = []     # Full paths as strings
skipped_files = []  # Files whose stem does not end in digits, so no ID can be derived

for file in vg_files:  # Path objects built from the labels file
    # Extract the numeric ID from the end of the file name (extension removed)
    name = file.stem
    match = re.search(r'(\d+)$', name)

    if match is None:
        # Remember the file instead of dropping it silently
        skipped_files.append(str(file))
        continue

    id_int = int(match.group(1))  # e.g. '002' -> 2 (integer)

    image_ids.append(id_int)
    filenames.append(file.name)
    file_paths.append(str(file))

if skipped_files:
    print(f"WARNING: skipped {len(skipped_files)} file(s) without a trailing numeric ID, e.g. {skipped_files[:3]}")

# Load images using MLDataLoader
result = loader.load_images_safe(
    image_ids=image_ids,
    filenames=filenames,
    file_paths=file_paths,
    source=file_source
)

print()

# The result is read as a dict with the ID mapping and the lists of inserted / existing files
id_mapping = result['id_mapping']
new_files = result['inserted_files']
reused_files = result['existing_files']

print(f"‚úÖ ID Mapping created: {len(id_mapping)} entries")
print("Sample mappings (source, source_image_id) ‚Üí database_image_id:")
for key in sorted(id_mapping.keys())[:5]:
    print(f"  {key} ‚Üí {id_mapping[key]}")
print('New files:')
print(len(new_files))
print(new_files[0:3])
print('Reused files:')
print(len(reused_files))
print(reused_files[0:3])

### Load ground truth into database:

In [None]:
import pandas as pd

print("=" * 70)
print("STEP 3: LOADING GROUND TRUTH (BUILDINGS LABEL)")
print("=" * 70)

# Keep only the ID column and the single label column that is loaded as ground truth.
# Note that this replaces the full labels frame loaded earlier.
label_data = label_data.loc[:, ['image_id', 'buildings']]

print(f"Loaded {len(label_data)} images with 'buildings' label")

# Reshape from wide to long format: one row per (image_id, label_name) pair
label_data_long = pd.melt(
    label_data,
    id_vars=['image_id'],
    var_name='label_name',
    value_name='value',
)

# IDs become integers; label values become the strings 'true' / 'false'
# (anything that does not compare equal to 1 is written as 'false').
label_data_long['image_id'] = label_data_long['image_id'].astype(int)
label_data_long['value'] = [
    'true' if flag == 1 else 'false' for flag in label_data_long['value']
]

print(f"Transformed {len(label_data)} rows (wide) ‚Üí {len(label_data_long)} rows (long)")
print(f"\nFirst few rows:")
print(label_data_long.head(10))
print()

# Load ground truth; the result is read as a dict of inserted / existing / skipped counts
result = loader.load_ground_truth_safe(label_data_long, source=file_source)

print("\n‚úÖ Ground truth loading complete!")
print(f"   Inserted: {result['inserted']}")
print(f"   Existing: {result['existing']}")
print(f"   Skipped: {result['skipped']}")

### Load analysis_runs data: 

In [None]:
import pickle

# Metadata and results written by the clustering pipeline.
# NOTE: pickle.load can execute arbitrary code - only open files produced by this project.
file_path = visual_genome_path / metadata_results_file
with file_path.open('rb') as f:
    metadata_results_clustering = pickle.load(f)

In [None]:
# Top-level entries available in the loaded metadata.
metadata_results_clustering.keys()

In [None]:
# Autoencoder parameters; n_train_images and n_validation_images are read from here below.
metadata_results_clustering['autoencoder_params']

In [None]:
# First rows of the cluster data that is loaded into the database further down.
metadata_results_clustering['cluster_data'].head()

### Load times data: 

In [None]:
import pickle

# Timing information written by the clustering pipeline.
# NOTE: pickle.load can execute arbitrary code - only open files produced by this project.
file_path = visual_genome_path / times_file
with file_path.open('rb') as f:
    times_data = pickle.load(f)

In [None]:
# Inspect the timing data; its 'duration_seconds' entry is used when the run is registered.
times_data

### Get numbers of train, validation, and total images processed:

In [None]:
# Image counts recorded with the autoencoder parameters
n_train_images = metadata_results_clustering['autoencoder_params']['n_train_images']
n_validation_images = metadata_results_clustering['autoencoder_params']['n_validation_images']
print(n_train_images, n_validation_images, sep='\n')

# Total = training + validation images; stored as images_processed for the run
n_images = n_train_images + n_validation_images
n_images


In [None]:
# Preview the values read from the metadata and timing pickles before they are
# written to the database. A notebook cell only displays its last bare expression,
# so the values are collected and printed explicitly. A missing key still raises
# KeyError here, before anything is written.
analysis_run_preview = {
    'run_timestamp': metadata_results_clustering['run_timestamp'],
    'analysis_type': metadata_results_clustering['analysis_type'],
    'model_name': metadata_results_clustering['model_name'],
    'python_script': metadata_results_clustering['python_script'],
    'duration_seconds': times_data['duration_seconds'],
    'autoencoder_name': metadata_results_clustering['autoencoder_name'],
    'autoencoder_implementation': metadata_results_clustering['autoencoder_implementation'],
    'autoencoder_params': metadata_results_clustering['autoencoder_params'],
    'dim_reduction_name[0]': metadata_results_clustering['dim_reduction_name'][0],
    'dim_reduction_implementation': metadata_results_clustering['dim_reduction_implementation'],
    'dim_reduction_params': metadata_results_clustering['dim_reduction_params'],
    'clustering_name': metadata_results_clustering['clustering_name'],
    'clustering_params': metadata_results_clustering['clustering_params'],
    'clustering_implementation': metadata_results_clustering['clustering_implementation'],
}
for preview_key, preview_value in analysis_run_preview.items():
    print(f"{preview_key}: {preview_value!r}")

### Load analysis runs data into database:

In [None]:
# Register the clustering run in the database. The returned value is kept as
# analysis_run_id and passed on when the clustering results are loaded.
# NOTE(review): autoencoder_name is hard-coded here although the metadata also has an
# 'autoencoder_name' entry - confirm which one should be stored.
# NOTE(review): the notes text reads 'amples' (probably 'samples'); it is stored as written.
analysis_run_id = loader.load_analysis_run(
    run_timestamp=metadata_results_clustering['run_timestamp'],
    analysis_type=metadata_results_clustering['analysis_type'],
    model_name=metadata_results_clustering['model_name'],
    python_script=metadata_results_clustering['python_script'],
    model_version=None,
    hyperparameters=None,
    notes='Analysed amples taken from training set (cheating).',
    start_time=None,
    # First element of the recorded durations
    duration_seconds=times_data['duration_seconds'][0],
    images_processed=n_images,
    # Clustering pipeline parameters (all optional)
    autoencoder_name='convolutional_autoencoder',
    autoencoder_implementation=metadata_results_clustering['autoencoder_implementation'],
    autoencoder_file=model_file_path,
    autoencoder_params=metadata_results_clustering['autoencoder_params'],
    dim_reduction_name=metadata_results_clustering['dim_reduction_name'][0],
    dim_reduction_implementation=metadata_results_clustering['dim_reduction_implementation'],
    dim_reduction_params=metadata_results_clustering['dim_reduction_params'],
    clustering_name=metadata_results_clustering['clustering_name'],
    clustering_implementation=metadata_results_clustering['clustering_implementation'],
    clustering_params=metadata_results_clustering['clustering_params'],
)
analysis_run_id


In [None]:
#loader.conn.rollback()

### Load results into database: 

In [None]:
# Store the cluster data from the metadata pickle under the run registered above.
# Note: `result` was already used for the image and ground-truth load results and is
# overwritten here.
result = loader.load_clustering_results(
    analysis_run_id=analysis_run_id,
    clustering_dataframe=metadata_results_clustering['cluster_data'],
    source=file_source
)

In [None]:
# Close cursor and connection held by the loader. After this cell the loader can no
# longer be used; the verification cells below open their own connection.
loader.cur.close()
loader.conn.close()

In [None]:
# ============================================================================
# CLUSTERING DATA VERIFICATION
# ============================================================================
# Read-only checks of what is stored in the database, run over a separate
# connection that is configured from the environment / .env file.
import os

import pandas as pd
import psycopg2
from dotenv import load_dotenv

load_dotenv()

# This connection stays open for the following verification cells and is closed
# at the end of the last one.
conn = psycopg2.connect(
    dbname=os.getenv('DB_NAME'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    port=os.getenv('DB_PORT')
)

print("=" * 80, "CLUSTERING DATA VERIFICATION", "=" * 80, sep="\n")

# ---------------------------------------------------------------------------
# 1. Check Visual Genome Images Loaded
# ---------------------------------------------------------------------------
print("\n1. VISUAL GENOME IMAGES", "-" * 80, sep="\n")

# Count and ID range of the images stored for the 'visual_genome' source
query1 = """
SELECT 
    source,
    COUNT(*) as image_count,
    MIN(source_image_id) as min_id,
    MAX(source_image_id) as max_id
FROM images
WHERE source = 'visual_genome'
GROUP BY source;
"""

df1 = pd.read_sql_query(query1, conn)
print(df1.to_string(index=False))

# GROUP BY returns no row at all when the source has no images
if df1.empty:
    print("\n‚ö†Ô∏è  WARNING: No visual_genome images found!")
else:
    print(f"\n‚úÖ Found {df1['image_count'].iloc[0]} Visual Genome images")


In [None]:
# ---------------------------------------------------------------------------
# 2. Check Clustering Analysis Run
# ---------------------------------------------------------------------------
print("\n\n2. CLUSTERING ANALYSIS RUN DETAILS", "-" * 80, sep="\n")

# Most recent run whose analysis_type is exactly 'clustering'.
# NOTE(review): this is not necessarily the run loaded above - the analysis_type stored
# for that run comes from the metadata pickle; confirm it equals 'clustering'.
query2 = """
SELECT 
    analysis_run_id,
    run_timestamp,
    model_name,
    analysis_type,
    autoencoder_name,
    autoencoder_implementation,
    dim_reduction_name,
    dim_reduction_implementation,
    clustering_name,
    clustering_implementation
FROM analysis_runs
WHERE analysis_type = 'clustering'
ORDER BY run_timestamp DESC
LIMIT 1;
"""

df2 = pd.read_sql_query(query2, conn)

# clustering_run_id stays None when no run exists; the later checks then skip themselves
clustering_run_id = None
if df2.empty:
    print("‚ö†Ô∏è  WARNING: No clustering analysis run found!")
else:
    print(df2.T.to_string())
    clustering_run_id = df2['analysis_run_id'].iloc[0]
    print(f"\n‚úÖ Found clustering analysis run (ID: {clustering_run_id})")

In [None]:
# ---------------------------------------------------------------------------
# 3. Check Clustering Results Loaded
# ---------------------------------------------------------------------------
print("\n\n3. CLUSTERING RESULTS", "-" * 80, sep="\n")

if not clustering_run_id:
    print("‚ö†Ô∏è  Skipping - no clustering run found")
else:
    # Totals for the run found in check 2; the run ID comes from the database itself
    query3 = f"""
    SELECT 
        COUNT(*) as total_assignments,
        COUNT(DISTINCT cluster_id) as num_clusters,
        COUNT(DISTINCT image_id) as images_clustered,
        MIN(cluster_id) as min_cluster,
        MAX(cluster_id) as max_cluster
    FROM clustering_results
    WHERE analysis_run_id = {clustering_run_id};
    """

    df3 = pd.read_sql_query(query3, conn)
    print(df3.to_string(index=False))

    # The aggregate query always returns one row, so an empty result shows up as a zero count
    if df3['total_assignments'].iloc[0] != 0:
        print(f"\n‚úÖ {df3['total_assignments'].iloc[0]} cluster assignments loaded")
        print(f"   Clusters: {df3['num_clusters'].iloc[0]} unique clusters")
        print(f"   Images: {df3['images_clustered'].iloc[0]} images clustered")
    else:
        print("\n‚ö†Ô∏è  WARNING: No clustering results found!")

In [None]:
# ---------------------------------------------------------------------------
# 4. Check Ground Truth (Buildings Label)
# ---------------------------------------------------------------------------
print("\n\n4. GROUND TRUTH - BUILDINGS LABEL", "-" * 80, sep="\n")

# Current 'buildings' labels of the visual_genome images, split by value
query4 = """
SELECT 
    COUNT(*) as total_labels,
    SUM(CASE WHEN value = 'true' THEN 1 ELSE 0 END) as with_buildings,
    SUM(CASE WHEN value = 'false' THEN 1 ELSE 0 END) as without_buildings
FROM ground_truth_history gt
JOIN images i ON gt.image_id = i.image_id
WHERE i.source = 'visual_genome'
  AND gt.label_name = 'buildings'
  AND gt.is_current = TRUE;
"""

df4 = pd.read_sql_query(query4, conn)
print(df4.to_string(index=False))

# The aggregate query always returns one row, so "nothing loaded" shows up as a zero count
if df4['total_labels'].iloc[0] != 0:
    print(f"\n‚úÖ {df4['total_labels'].iloc[0]} images labeled for buildings")
else:
    print("\n‚ö†Ô∏è  WARNING: No 'buildings' ground truth found!")
In [None]:

# ---------------------------------------------------------------------------
# 5. Cluster Distribution
# ---------------------------------------------------------------------------
print("\n\n5. CLUSTER DISTRIBUTION", "-" * 80, sep="\n")

if not clustering_run_id:
    print("‚ö†Ô∏è  Skipping - no clustering run found")
else:
    # Number of images per cluster and each cluster's share of all assignments of the run
    query5 = f"""
    SELECT 
        cr.cluster_id,
        COUNT(*) as image_count,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) as percentage
    FROM clustering_results cr
    WHERE cr.analysis_run_id = {clustering_run_id}
    GROUP BY cr.cluster_id
    ORDER BY cr.cluster_id;
    """

    df5 = pd.read_sql_query(query5, conn)
    print(df5.to_string(index=False))
    print(f"\n‚úÖ Cluster distribution shown above")

In [None]:

# ---------------------------------------------------------------------------
# 6. Buildings Label by Cluster (Combined View)
# ---------------------------------------------------------------------------
print("\n\n6. BUILDINGS LABEL DISTRIBUTION BY CLUSTER", "-" * 80, sep="\n")

if not clustering_run_id:
    print("‚ö†Ô∏è  Skipping - no clustering run found")
else:
    # Per cluster: image count and how many carry a current 'buildings' label of true / false.
    # The LEFT JOIN keeps images without a label; they count towards total_images only.
    query6 = f"""
    SELECT 
        cr.cluster_id,
        COUNT(*) as total_images,
        SUM(CASE WHEN gt.value = 'true' THEN 1 ELSE 0 END) as with_buildings,
        SUM(CASE WHEN gt.value = 'false' THEN 1 ELSE 0 END) as without_buildings,
        ROUND(
            100.0 * SUM(CASE WHEN gt.value = 'true' THEN 1 ELSE 0 END) / COUNT(*),
            2
        ) as buildings_percentage
    FROM clustering_results cr
    JOIN images i ON cr.image_id = i.image_id
    LEFT JOIN ground_truth_history gt 
        ON cr.image_id = gt.image_id 
        AND gt.label_name = 'buildings'
        AND gt.is_current = TRUE
    WHERE cr.analysis_run_id = {clustering_run_id}
    GROUP BY cr.cluster_id
    ORDER BY cr.cluster_id;
    """

    df6 = pd.read_sql_query(query6, conn)
    print(df6.to_string(index=False))
    print(f"\n‚úÖ Buildings distribution by cluster shown above")

In [None]:



# ---------------------------------------------------------------------------
# 7. Sample of Clustering Results with Details
# ---------------------------------------------------------------------------
print("\n\n7. SAMPLE CLUSTERING RESULTS (First 10 images)", "-" * 80, sep="\n")

if not clustering_run_id:
    print("‚ö†Ô∏è  Skipping - no clustering run found")
else:
    # Ten images with the lowest source IDs: file name, cluster and current 'buildings' label
    query7 = f"""
    SELECT 
        i.filename,
        i.source_image_id,
        cr.cluster_id,
        gt.value as has_buildings
    FROM clustering_results cr
    JOIN images i ON cr.image_id = i.image_id
    LEFT JOIN ground_truth_history gt 
        ON cr.image_id = gt.image_id 
        AND gt.label_name = 'buildings'
        AND gt.is_current = TRUE
    WHERE cr.analysis_run_id = {clustering_run_id}
    ORDER BY i.source_image_id
    LIMIT 10;
    """

    df7 = pd.read_sql_query(query7, conn)
    print(df7.to_string(index=False))

# Last verification step: release the connection opened in the first verification cell
conn.close()

print("\n" + "=" * 80, "VERIFICATION COMPLETE", "=" * 80, sep="\n")