In [None]:
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from datetime import datetime
from PIL import Image
import requests
from io import BytesIO
import aiohttp
import asyncio
import nest_asyncio
nest_asyncio.apply()
from time import time
from multiprocessing import Pool, cpu_count
import cv2
# import multiprocessing
# multiprocessing.set_start_method("spawn", force=True)

In [None]:
# Location of the camalien metadata CSV. The path is machine-specific, so it can be
# overridden through the CAMALIEN_CSV environment variable without editing the
# notebook; the original location stays the default.
csv_pth = Path(
    os.environ.get(
        "CAMALIEN_CSV",
        r"C:\Users\au761367\Datasets\classif\camalien\camalien_images_and_responsefiles_2024.csv",
    )
)

In [None]:
# The export is semicolon-separated.
df = pd.read_csv(csv_pth, sep=";")

In [None]:
df.head()

In [None]:
df[['country','timestamp']].head()

In [None]:
# Number of rows with a non-null `path` per country.
df.groupby("country")["path"].count()

In [None]:
len(df)

In [None]:
# Parse the ISO 8601 timestamp strings into pandas datetimes so the `.dt` accessor
# and the date-range comparison used further down work on this column.
# NOTE(review): the single-day filter below compares against a UTC-aware bound —
# confirm the parsed column is timezone-aware, otherwise that comparison raises.
df['timestamp'] = pd.to_datetime(df['timestamp'], format="ISO8601")

In [None]:
# Distinct values of the `country` column, in order of first appearance.
countries = df['country'].unique()

In [None]:

def plot_dist(df, max_countries=1):
    """
    Plot a histogram of `timestamp` values per country present in `df`.

    Parameters:
        df (pd.DataFrame): Frame with `country` and `timestamp` columns.
        max_countries (int | None): How many countries to plot, taken in order of
            first appearance in `df`. The default of 1 keeps the previous behaviour
            of stopping after the first figure; pass None to plot every country.

    Returns:
        None. One figure is shown per plotted country.
    """
    # Take the countries from the frame being plotted rather than from a
    # notebook-level global: the function then works on any subset (for example a
    # sampled frame) and never draws an empty histogram for a country that the
    # subset does not contain.
    country_values = df['country'].dropna().unique()
    if max_countries is not None:
        country_values = country_values[:max_countries]

    for country in country_values:
        subset = df[df['country'] == country]

        fig, ax = plt.subplots(figsize=(10, 4))
        ax.hist(subset['timestamp'], bins=30)   # adjust bins as needed
        ax.set_title(f"Timestamp Histogram – {country}")
        ax.set_xlabel("Timestamp")
        ax.set_ylabel("Count")
        ax.tick_params(axis="x", labelrotation=45)
        fig.tight_layout()
        plt.show()
# plot_dist(sampled)

In [None]:
df['timestamp'].head()

In [None]:
date = "2024-06-11"

# Half-open UTC window [start, end) covering the chosen calendar day.
start = pd.to_datetime(date, utc=True).normalize()
end = start + pd.Timedelta(1, unit="D")

in_window = df['timestamp'].between(start, end, inclusive="left")
filtered = df[in_window]

In [None]:
# Distinct calendar dates occurring in the timestamps, printed in ascending order.
unique_days = df['timestamp'].dt.date.unique()
print(sorted(unique_days))

In [None]:
# Midnight of each row's timestamp (time-of-day set to 00:00); serves as a per-day key.
df['day'] = df['timestamp'].dt.normalize()

In [None]:
# Order rows by country, then day, then time, so positional operations inside a
# (country, day) group walk the rows chronologically.
df = df.sort_values(['country', 'day', 'timestamp'])

In [None]:
df['imageurl'].tail().iloc[4]

In [None]:
df.tail(30)

In [None]:
N = 15

# Keep every N-th row of each (country, day) group. `cumcount` numbers the rows of a
# group 0, 1, 2, ... in their current order, so `% N == 0` selects rows 0, N, 2N, ...
# exactly like `g.iloc[::N]` did. Unlike `groupby(...).apply(...)`, the result keeps
# the `country` and `day` columns no matter how the installed pandas treats grouping
# columns inside `apply`, and no Python lambda is called once per group.
position_in_group = df.groupby(['country', 'day']).cumcount()

sampled = (
    df[position_in_group % N == 0]
    # apply() returned the groups ordered by key; sorting on the keys reproduces
    # that order while leaving the row order inside each group untouched.
    .sort_values(['country', 'day'], kind='stable')
    .reset_index(drop=True)
)

In [None]:
len(sampled)

In [None]:
sampled.head()

In [None]:
async def fetch_image(session, url, semaphore):
    """
    Download one image and return it as a PIL image.

    The request runs while holding `semaphore`. Any failure (HTTP error status,
    network error, undecodable payload) is reported on stdout and turned into a
    `None` result instead of being raised.
    """
    async with semaphore:
        try:
            async with session.get(url) as response:
                response.raise_for_status()
                payload = await response.read()
                buffer = BytesIO(payload)
                return Image.open(buffer)
        except Exception as exc:
            print(f"Error fetching {url}: {exc}")
            return None

async def fetch_many_images(urls, max_concurrency=16, timeout_seconds=10):
    """
    Download all `urls` concurrently and return the results in input order.

    At most `max_concurrency` requests are in flight at once (enforced both by the
    connector limit and by a semaphore); each request is subject to a total timeout
    of `timeout_seconds`. Entries are whatever `fetch_image` returns for the URL.
    """
    gate = asyncio.Semaphore(max_concurrency)
    session_options = {
        "timeout": aiohttp.ClientTimeout(total=timeout_seconds),
        "connector": aiohttp.TCPConnector(limit=max_concurrency),
    }

    async with aiohttp.ClientSession(**session_options) as session:
        pending = []
        for url in urls:
            pending.append(fetch_image(session, url, gate))
        return await asyncio.gather(*pending)


def plot_sample_grid(df, start_idx=0, grid_size=4, url_col="imageurl"):
    """
    Download and plot a `grid_size` x `grid_size` grid of images from `df`.

    Parameters:
        df (pd.DataFrame): Frame holding the image URLs.
        start_idx (int): Positional index of the first row to show.
        grid_size (int): Number of rows and columns of the grid (default 4 -> 16 images).
        url_col (str): Name of the column containing the image URLs.

    Images that fail to download are replaced by a "Load error" label; grid cells
    beyond the available rows are left blank.
    """
    n_images = grid_size * grid_size
    urls = df[url_col].iloc[start_idx : start_idx + n_images]

    # run async loader
    images = asyncio.run(fetch_many_images(urls, max_concurrency=16))

    # squeeze=False always returns a 2-D array of axes, so grid_size=1 (where
    # plt.subplots would otherwise return a bare Axes) can be flattened as well.
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(16, 16), squeeze=False)
    axes = axes.flatten()

    for ax, img in zip(axes, images):
        if img is None:
            ax.text(0.5, 0.5, "Load error", ha="center", va="center")
            ax.axis("off")
            continue

        ax.imshow(img)
        ax.axis("off")

    # Fewer images than grid cells (end of the frame): blank the unused cells
    # instead of leaving empty axis frames in the figure.
    for ax in axes[len(images):]:
        ax.axis("off")

    plt.tight_layout()
    plt.show()

In [None]:
plot_sample_grid(sampled)

In [None]:
df['imageurl'].head(11).iloc[10]

In [None]:
len(sampled)

In [None]:
async def download_and_save_image(session, url, save_path, semaphore):
    """
    Fetch `url` and write the raw response bytes to `save_path`.

    Runs under `semaphore`. Failures are printed and swallowed; the coroutine
    always resolves to None.
    """
    async with semaphore:
        try:
            async with session.get(url) as response:
                response.raise_for_status()
                payload = await response.read()
                with open(save_path, "wb") as sink:
                    sink.write(payload)
        except Exception as exc:
            print(f"Failed to download image {url}: {exc}")

async def download_and_compress_image(session, url, save_path, semaphore, target_width=1920, jpeg_quality=95):
    """
    Download an image, shrink it to at most `target_width` pixels wide and save it as JPEG.

    Parameters:
        session: HTTP client session used for the request.
        url (str): Image URL.
        save_path (str | Path): Destination of the JPEG file.
        semaphore (asyncio.Semaphore): Limits the number of concurrent downloads.
        target_width (int): Maximum width of the saved image; narrower images keep their size.
        jpeg_quality (int): JPEG quality (1-100).

    Returns:
        (orig_size, new_size) as (width, height) tuples, or None when the download
        or the conversion failed (the error is printed, not raised).
    """
    async with semaphore:
        try:
            async with session.get(url) as resp:
                resp.raise_for_status()
                data = await resp.read()
                # Load image
                img = Image.open(BytesIO(data))
                orig_size = img.size
                # Resize proportionally
                w, h = img.size
                if w > target_width:
                    # max(1, ...) keeps the height valid for extremely wide images.
                    new_h = max(1, int(target_width * h / w))
                    img = img.resize((target_width, new_h), Image.LANCZOS)
                # JPEG cannot store alpha or palette images (e.g. RGBA / P from PNG
                # sources); convert those to RGB instead of failing on save.
                if img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
                    img = img.convert("RGB")
                # Save compressed
                img.save(save_path, format="JPEG", quality=jpeg_quality)
                return orig_size, img.size
        except Exception as e:
            print(f"Failed to download/compress {url}: {e}")
            return None

async def download_and_save_json(session, url, save_path, semaphore):
    """
    Fetch `url` as text and store it UTF-8 encoded at `save_path`.

    Runs under `semaphore`. Failures are printed and swallowed; the coroutine
    always resolves to None. The body is written as received, without being
    parsed as JSON.
    """
    async with semaphore:
        try:
            async with session.get(url) as response:
                response.raise_for_status()
                body = await response.text()
                with open(save_path, "w", encoding="utf-8") as sink:
                    sink.write(body)
        except Exception as exc:
            print(f"Failed to download JSON {url}: {exc}")

async def save_images_and_json(df, img_dir:Path, json_dir:Path, max_concurrency:int=16):
    """
    Download the image and the JSON response of every row in `df`.

    For each row, `imageurl` is saved to `img_dir/<imagedataid>.jpg` and
    `pn_response` to `json_dir/<imagedataid>.json`. All downloads share one HTTP
    session (60 s total timeout per request) and at most `max_concurrency` of them
    run at the same time.
    """
    gate = asyncio.Semaphore(max_concurrency)
    client_timeout = aiohttp.ClientTimeout(total=60)
    tcp_connector = aiohttp.TCPConnector(limit=max_concurrency)

    async with aiohttp.ClientSession(timeout=client_timeout, connector=tcp_connector) as session:
        jobs = []

        for _, record in df.iterrows():
            stem = record['imagedataid']
            # Image first, then JSON, for every row.
            jobs.extend((
                download_and_save_image(session, record['imageurl'], img_dir / f"{stem}.jpg", gate),
                download_and_save_json(session, record['pn_response'], json_dir / f"{stem}.json", gate),
            ))

        await asyncio.gather(*jobs)

In [None]:
# "D:" on its own is a drive-relative path: joining onto it yields "D:images", which
# Windows resolves against the current working directory of drive D rather than the
# drive root. "D:/" anchors the output at the root, i.e. D:\images and D:\json, the
# locations the later cells read from.
out_dir = Path("D:/")
img_dir = out_dir / "images"
json_dir = out_dir / "json"
img_dir.mkdir(exist_ok=True)
json_dir.mkdir(exist_ok=True)

In [None]:
# Download every sampled image and its JSON response into img_dir / json_dir.
# NOTE(review): asyncio.run() is presumably invoked while the notebook kernel's own
# event loop is running; that only works because nest_asyncio.apply() is executed in
# the imports cell.
asyncio.run(save_images_and_json(sampled, img_dir, json_dir, max_concurrency=16))

In [None]:

def compress_image_on_disk(input_path, output_path, target_width=1920, jpeg_quality=95):
    """
    Compress an image on disk, resizing proportionally to target_width
    and saving as JPEG with specified quality.

    Images narrower than or equal to target_width keep their size. Images whose
    mode JPEG cannot store (e.g. RGBA or palette images) are converted to RGB.

    Parameters:
        input_path (str | Path): Path to the original image
        output_path (str | Path): Path to save the compressed image
        target_width (int): Maximum width of compressed image
        jpeg_quality (int): JPEG quality (1-100)

    Returns:
        orig_size (tuple): (width, height) of original image
        compressed_size (tuple): (width, height) of compressed image
    """
    # The context manager releases the source file handle even when resizing or
    # saving raises.
    with Image.open(input_path) as source:
        orig_size = source.size
        img = source

        w, h = orig_size
        if w > target_width:
            # max(1, ...) keeps the height valid for extremely wide images.
            new_h = max(1, int(target_width * h / w))
            img = img.resize((target_width, new_h), Image.LANCZOS)

        # JPEG has no alpha channel or palette; convert such images instead of
        # letting the save call fail.
        if img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
            img = img.convert("RGB")

        img.save(output_path, format="JPEG", quality=jpeg_quality)
        compressed_size = img.size

    return orig_size, compressed_size

In [None]:
# Try the compression on one downloaded image, writing the result to a scratch file.
input_path = Path(r"D:\images\0a72b807-ecfc-3d55-98aa-f8a2f81b838f.jpg")
output_path = Path(r"D:\tmp.jpeg")
compress_image_on_disk(input_path, output_path, target_width=2048)

In [None]:

def plot_original_vs_compressed(original_path, compressed_path, zoom_box=(1000, 1000, 400, 400), output_path=None):
    """
    Display original and compressed images side by side, with a zoomed region.

    All four panels (two full views with the zoom rectangle, two zoomed crops) are
    drawn into ONE figure, so that saving to `output_path` captures the complete
    comparison rather than only the zoom panels.

    Parameters:
        original_path (str | Path): Path to original image
        compressed_path (str | Path): Path to compressed image
        zoom_box (tuple): (x, y, width, height) in ORIGINAL image coordinates
        output_path (str | Path | None): If a path is given, the figure is saved
            there; otherwise it is shown with plt.show().
    """
    from matplotlib.patches import Rectangle

    x, y, w, h = zoom_box

    # Context managers close both image files once the pixel data has been handed
    # to matplotlib.
    with Image.open(original_path) as orig, Image.open(compressed_path) as comp:
        # Compute scaling factor for compressed image
        scale_x = comp.width / orig.width
        scale_y = comp.height / orig.height
        xs = int(x * scale_x)
        ys = int(y * scale_y)
        # Keep the scaled box at least one pixel wide/high so the crop is never empty.
        ws = max(1, int(w * scale_x))
        hs = max(1, int(h * scale_y))

        fig, axes = plt.subplots(2, 2, figsize=(12, 12))

        # Top row: full images with the zoom rectangle
        full_views = (
            (axes[0, 0], orig, "Original", (x, y, w, h)),
            (axes[0, 1], comp, "Compressed", (xs, ys, ws, hs)),
        )
        for ax, img, title, (bx, by, bw, bh) in full_views:
            ax.imshow(img)
            ax.set_title(title)
            ax.axis("off")
            ax.add_patch(Rectangle((bx, by), bw, bh,
                                   edgecolor='red', facecolor='none', lw=2))

        # Bottom row: zoomed-in comparison
        axes[1, 0].imshow(orig.crop((x, y, x + w, y + h)))
        axes[1, 0].set_title("Original zoom")
        axes[1, 0].axis("off")

        axes[1, 1].imshow(comp.crop((xs, ys, xs + ws, ys + hs)))
        axes[1, 1].set_title("Compressed zoom")
        axes[1, 1].axis("off")

    if output_path is not None and isinstance(output_path, (Path, str)):
        fig.savefig(output_path)
    else:
        plt.show()

In [None]:
# Compare the original and the compressed file on a 100x100 px region and save the
# resulting figure to disk instead of showing it.
plot_original_vs_compressed(input_path, output_path, zoom_box=(1000,1000,100,100), output_path=Path(r"D:\resizing.jpeg"))

In [None]:
# parallel resizing

def resize_image_task(args):
    """
    Resize one image to a fixed size and write it to the output directory.

    Parameters:
        args (tuple): (path, output_dir, target_size) where `path` is the source
            image, `output_dir` the destination directory and `target_size` the
            (width, height) passed to cv2.resize. Packed into one tuple so the
            function can be used with pool.imap-style APIs.

    Returns:
        str: A status line starting with "OK" on success or "ERROR" on failure.
        Errors are reported through the return value, never raised, so a single
        bad file does not abort a batch.
    """
    path, output_dir, target_size = args
    try:
        path = os.fspath(path)  # accept Path objects as well as strings
        filename = os.path.basename(path)
        out_path = os.path.join(output_dir, filename)

        # cv2.imread signals an unreadable file by returning None.
        img = cv2.imread(path)
        if img is None:
            return f"ERROR loading {path}"

        resized = cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)

        # cv2.imwrite reports failure through its boolean return value; without
        # this check a failed write would be reported as "OK".
        if not cv2.imwrite(out_path, resized):
            return f"ERROR writing {out_path}"

        return f"OK   {filename}"
    except Exception as e:
        return f"ERROR {path}: {e}"


def resize_images(
    input_dir,
    output_dir,
    target_size=(2048, 1500),
    extensions=(".jpg", ".jpeg", ".png", ".bmp", ".webp", ".tif")
):
    """
    Resize every image in `input_dir` in parallel and write the results to `output_dir`.

    Parameters:
        input_dir (str | Path): Directory scanned (non-recursively) for images.
        output_dir (str | Path): Destination directory; created if missing.
        target_size (tuple): (width, height) every image is resized to. The aspect
            ratio is not preserved.
        extensions (tuple): Lower-case file extensions treated as images.

    One status line per image is printed as the workers finish.
    """
    # A thread pool is used instead of a process pool: with the "spawn" start method
    # (the default on Windows) worker processes cannot import functions defined in
    # notebook cells, so a process Pool fails or hangs when run from a notebook.
    # Threads share the notebook namespace.
    # NOTE(review): assumes the OpenCV calls release the GIL so threads still run in
    # parallel — confirm with a timing run.
    from multiprocessing.pool import ThreadPool

    # Prepare file list
    images = [
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if os.path.splitext(f)[1].lower() in extensions
    ]

    os.makedirs(output_dir, exist_ok=True)

    print(f"Found {len(images)} images…")

    # Prepare arguments for workers
    tasks = [(img, output_dir, target_size) for img in images]

    # Run the worker pool; results arrive in completion order.
    with ThreadPool(cpu_count()) as pool:
        for result in pool.imap_unordered(resize_image_task, tasks):
            print(result)

    print("Done.")

In [None]:
# Source directory of the downloaded images and destination for the resized copies.
input_dir = Path(r"D:\images")
output_dir = Path(r"E:\images")

In [None]:
# Run the batch resize. The guard keeps the job from starting if this code is ever
# imported as a module instead of being executed directly.
if __name__ == "__main__":
    resize_images(
        input_dir=input_dir,
        output_dir=output_dir,
        target_size=(2048, 1500)
    )