In [None]:
import tensorflow as tf
from tensorflow.keras.utils import img_to_array, load_img
import numpy as np
import pathlib
import json
import utils

# Project configuration, loaded through the local utils helper.
CONFIG = utils.load_config()
# Root directory of the image dataset; later cells append "/<year>" to it.
IMAGES_PATH = CONFIG["images_path"]
# NOTE(review): not referenced in any visible cell, and defined only after
# load_config() has already run — confirm whether it is still needed.
CONFIG_FILE = "../config.json"
# Default target_size of preprocess_images.
# NOTE(review): load_data_for_year / load_all_data default to (224, 224)
# instead — confirm which size is intended.
IMAGE_DIMENSIONS = (299, 299)

In [3]:
def preprocess_images(image_paths: list[pathlib.Path], target_size=IMAGE_DIMENSIONS) -> np.ndarray:
    """
    Loads images from disk and divides their pixel values by 255.

    Args:
        image_paths : list[pathlib.Path]
            Paths of the images to load. The function only iterates over
            this argument, so any iterable of paths works; plain string
            paths are passed to load_img unchanged.

        target_size : tuple[int, int], default=IMAGE_DIMENSIONS
            Size handed to load_img to resize each image (height, width).

    Returns:
        np.ndarray
            Array with one preprocessed image per path, in input order.
    """
    return np.array([
        img_to_array(load_img(path, target_size=target_size)) / 255.0
        for path in image_paths
    ])

In [None]:
# Sanity check on a single sample: load it without resizing and inspect the
# raw pixel array (no division by 255 here).
image_paths = [
    "flickr_scraper/images/1900/190756525.jpg",
]

img = load_img(image_paths[0])
arr = img_to_array(img)
arr

In [None]:
import polars as pl


def load_data_for_year(dir: pathlib.Path, target_size=(224,224)) -> tuple[np.ndarray, np.ndarray]:
    """
    Loads images and labels for a year.

    Every id listed in metadata.csv is looked up as "<id>.jpg" inside the
    directory; ids whose image file does not exist are skipped.

    Args:
        dir : pathlib.Path
            Directory containing images and a metadata.csv file. A plain
            string path is converted to a pathlib.Path.

        target_size : tuple[int, int],  default=(224,224)
            Size to resize images to (height, width).

    Returns:
        np.ndarray
            Array of preprocessed images (pixel values divided by 255).

        np.ndarray
            Array of labels (year, latitude, longitude), one row per
            returned image.
    """
    if isinstance(dir, str):
        dir = pathlib.Path(dir)

    df = pl.read_csv(dir / "metadata.csv")

    # Convert the label columns once instead of filtering the whole frame
    # again for every single image.
    label_rows = df.select(["year", "latitude", "longitude"]).to_numpy()

    images = []
    labels = []
    first_label_by_id = {}
    for image_id, row_label in zip(df["id"], label_rows):
        # Duplicate ids keep the label of their first metadata row, which is
        # what the previous per-image filter returned.
        label = first_label_by_id.setdefault(image_id, row_label)
        try:
            img = load_img(dir / f"{image_id}.jpg", target_size=target_size)
        except FileNotFoundError:
            # Metadata row without a downloaded image.
            continue
        images.append(img_to_array(img) / 255.0)
        labels.append(label)

    if not images and target_size is not None:
        # Keep the (n, height, width, channels) / (n, 3) layout even when
        # nothing was loaded, so the result can still be concatenated with
        # other years. 3 channels and float32 mirror load_img's default RGB
        # mode and img_to_array's default dtype.
        empty_images = np.empty((0, *target_size, 3), dtype=np.float32)
        empty_labels = np.empty((0, label_rows.shape[1]), dtype=label_rows.dtype)
        return empty_images, empty_labels

    return np.array(images), np.array(labels)


# Try the loader on the 1900 folder and display the resulting label rows.
images, labels = load_data_for_year(pathlib.Path(IMAGES_PATH + "/1900"))
labels

In [6]:
from sklearn.preprocessing import MinMaxScaler


def load_all_data(dir: pathlib.Path, target_size=(224, 224), scaling=True, start_year=1900, end_year=2025) -> tuple[np.ndarray, np.ndarray]:
    """
    Loads images and labels from a directory.

    Args:
        dir : pathlib.Path
            Directory containing one subdirectory per year, each named
            after its year. A plain string path is converted to a
            pathlib.Path. Entries that are not numerically named
            directories are ignored.

        target_size : tuple[int, int],  default=(224,224)
            Size to resize images to (height, width).

        scaling : bool, default=True
            Whether or not to scale coordinates.

        start_year : int, default=1900
            First year of the range to include images from (inclusive).

        end_year : int, default=2025
            Last year of the range to include images from (exclusive).

    Returns:
        np.ndarray
            array of preprocessed images

        np.ndarray
            array of labels (year, latitude, longitude)

    Raises:
        ValueError
            If no image is found for the requested range of years.
    """
    if isinstance(dir, str):
        dir = pathlib.Path(dir)

    # Only numerically named subdirectories are year folders. Filtering first
    # keeps stray entries (hidden files, notes, ...) from crashing int(), and
    # sorting by the numeric year removes the reliance on lexicographic order.
    year_dirs = sorted(
        (d for d in dir.iterdir() if d.is_dir() and d.name.isdecimal()),
        key=lambda d: int(d.name),
    )

    images = []
    labels = []
    for d in year_dirs:
        if not start_year <= int(d.name) < end_year:
            continue
        new_images, new_labels = load_data_for_year(d, target_size=target_size)
        if len(new_images) == 0:
            # A year without any image contributes nothing, and its empty
            # arrays may not have the dimensions np.concatenate needs.
            continue
        images.append(new_images)
        labels.append(new_labels)

    if not images:
        raise ValueError(
            f"No images found in {dir} for years {start_year} (inclusive) "
            f"to {end_year} (exclusive)."
        )

    image_arr = np.concatenate(images)
    label_arr = np.concatenate(labels)

    if scaling:
        # Column 0 (year) is left untouched; latitude and longitude are
        # min-max scaled. The scaler is fitted on the loaded labels only and
        # is not returned, so the scaled values depend on the selected years.
        scaler = MinMaxScaler()
        label_arr[:, 1:] = scaler.fit_transform(label_arr[:, 1:])

    return image_arr, label_arr


In [None]:
# Load the images for 1945-1959 (end_year is exclusive); latitude and
# longitude are scaled because scaling defaults to True.
images, labels = load_all_data(IMAGES_PATH, start_year=1945, end_year=1960)

In [None]:
# Display the image array returned by load_all_data.
images

In [None]:
import os

# Year folder under investigation and the metadata file inside it.
FAULTY_YEAR = IMAGES_PATH + "/1914"
METADATA_FILE = FAULTY_YEAR + "/metadata.csv"

# Number of regular files directly inside the folder. Subdirectories are
# excluded; metadata.csv itself is included in the count.
file_count = sum(
    1
    for entry in os.listdir(FAULTY_YEAR)
    if os.path.isfile(os.path.join(FAULTY_YEAR, entry))
)
# Metadata reduced to one row per distinct id.
df = pl.read_csv(METADATA_FILE).unique(["id"])
print(file_count)
df

In [None]:
faulty_dir = pathlib.Path(FAULTY_YEAR)

# Ids of the downloaded images, taken from their "<id>.jpg" file names.
downloaded_ids = [int(image_file.stem) for image_file in faulty_dir.glob("*.jpg")]

# Look ids up in a set instead of scanning the metadata column once per file.
# Comprehensions also keep the loop variables out of the notebook namespace,
# so the builtin `id` is no longer shadowed after this cell runs.
known_ids = set(df["id"])
ids_without_metadata = [
    image_id for image_id in downloaded_ids if image_id not in known_ids
]
ids_without_metadata

In [None]:
# Path.name is the final path component — the folder name that load_all_data
# converts to an int to decide whether the year is in range.
d = pathlib.Path(FAULTY_YEAR)
d.name