In [None]:
pip install -U datasets==3.6.0

In [None]:
pip install timezonefinder

In [None]:
from datasets import load_dataset
from google.colab import drive
import pandas as pd
from datetime import datetime
from itertools import islice
import json
from PIL import Image
import numpy as np
import os
import zipfile
from timezonefinder import TimezoneFinder
import pytz
from huggingface_hub import hf_hub_download

In [None]:
# Mount Google Drive inside the Colab VM; paths under this mount point persist
# across sessions.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Metadata columns that survive filtering; every other column of the frame is
# dropped before the images and time features are attached.
columns_to_keep = {
    "image_path",
    "latitude",
    "longitude",
    "country",
    "captured_at",
    "city",
}

In [None]:
from PIL import Image

# Image preprocessor
# remaps pixel values (0-255) -> [0,1]
def preprocess_image(x, size=(224, 224)):
    """Load an image, force it to RGB, resize it and scale it to [0, 1].

    Args:
        x: Path (or file object) accepted by ``PIL.Image.open``.
        size: Target ``(width, height)`` handed to ``Image.resize``.

    Returns:
        A ``float32`` numpy array holding the resized RGB pixels divided by 255.
    """
    # The context manager releases the underlying file handle even when
    # decoding fails, so a long run over many files does not leak descriptors.
    with Image.open(x) as image:
        # convert()/resize() return new, fully loaded images, so the result
        # stays valid after the source file has been closed.
        resized = image.convert("RGB").resize(size)
    return np.asarray(resized, dtype=np.float32) / 255.0

# Timezone preprocessor
# Adds column "time": local time of day at the capture location, as a fraction of a day
# Adds column "year_percent": fraction of the local calendar year that has elapsed
tf = TimezoneFinder()
def preprocess_time(output):
    """Attach local time-of-day and time-of-year features to one record.

    Args:
        output: Mapping-like row (e.g. a pandas Series) providing ``latitude``,
            ``longitude`` and ``timestamp``. The timestamp is divided by 1000
            before conversion, i.e. it is treated as epoch milliseconds.

    Returns:
        The same ``output`` object, mutated in place with two new keys:
        ``time`` (hours/24 + minutes/1440 + seconds/86400 of the local time) and
        ``year_percent`` (seconds since local Jan 1 divided by the seconds in
        that year).

    Raises:
        ValueError: If no timezone can be resolved for the coordinates.
    """
    lat, lng = output['latitude'], output['longitude']

    tz_name = tf.timezone_at(lat=lat, lng=lng)
    if tz_name is None:
        # closest_timezone_at is not provided by every timezonefinder release
        # (NOTE(review): believed removed in 6.x — confirm against the installed
        # version), so only use it when the installed version still has it.
        closest_lookup = getattr(tf, "closest_timezone_at", None)
        if closest_lookup is not None:
            tz_name = closest_lookup(lat=lat, lng=lng)
    if tz_name is None:
        # Fail with the offending coordinates instead of an opaque pytz error.
        raise ValueError(f"No timezone found for latitude={lat}, longitude={lng}")

    local_tz = pytz.timezone(tz_name)
    # Build an aware UTC datetime directly; datetime.utcfromtimestamp is deprecated.
    utc_dt = datetime.fromtimestamp(int(output['timestamp']) / 1000, tz=pytz.utc)
    # Convert to local wall-clock time, then drop tzinfo so it can be compared
    # with the naive year boundaries below.
    local_dt = utc_dt.astimezone(local_tz).replace(tzinfo=None, microsecond=0)

    output["time"] = local_dt.hour / 24.0 + local_dt.minute / 1440.0 + local_dt.second / 86400.0

    start_of_year = datetime(local_dt.year, 1, 1)
    start_of_next_year = datetime(local_dt.year + 1, 1, 1)
    elapsed = (local_dt - start_of_year).total_seconds()
    total = (start_of_next_year - start_of_year).total_seconds()  # handles leap years
    output["year_percent"] = elapsed / total  # value between 0 and 1
    return output

In [None]:
from huggingface_hub import hf_hub_download
import os
import zipfile
from datasets import Dataset
import shutil

# Download each image shard of the split, turn it into a DataFrame of
# preprocessed images + metadata + time features, and pickle it to Drive.
#
# NOTE(review): `df` (the split's metadata table; it needs at least an `id`
# column plus the columns named in `columns_to_keep`) is not defined in any
# visible cell — make sure it is loaded before running this cell.
save_dir = '/content/drive/MyDrive/Colab Notebooks/Masters/BigDataAnalysisProject/osv5m/processed/test/'
dir_location = "datasets/OpenWorld/" # Dataset directory
file_location = "images/test/" # File directory
file_start = 0
file_count = 5

# Create the output folder up front so a missing directory cannot fail the
# save after a shard has already been downloaded and processed.
os.makedirs(save_dir, exist_ok=True)

for i in range(file_start, file_start + file_count):
    file_name = f"{i:02d}"
    zip_name = dir_location + file_location + file_name + ".zip"
    # The archive is expected to unpack into a folder named after the shard.
    folder_name = dir_location + file_location + file_name
    hf_hub_download(repo_id="osv5m/osv5m", filename=file_location + file_name + ".zip", repo_type='dataset', local_dir=dir_location)
    with zipfile.ZipFile(zip_name, 'r') as zip_ref:
        zip_ref.extractall(dir_location + file_location)
    # Delete the archive only once its handle is closed; removing a file that
    # is still open fails on some platforms.
    os.remove(zip_name)

    # Keep only the metadata rows whose image is present in this shard.
    available_ids = {os.path.splitext(f)[0] for f in os.listdir(folder_name)}
    df_filtered = df[df['id'].astype(str).isin(available_ids)].reset_index(drop=True)

    df_filtered["image_path"] = df_filtered["id"].astype(str).apply(lambda x: folder_name + f"/{x}.jpg")

    df_filtered = df_filtered.drop(columns=[col for col in df_filtered.columns if col not in columns_to_keep])
    df_filtered = df_filtered.rename(columns={'captured_at': 'timestamp'})
    df_filtered["image"] = df_filtered["image_path"].apply(preprocess_image)
    df_filtered = df_filtered.apply(preprocess_time, axis=1)
    # The path and raw timestamp were only needed to derive the features above.
    df_filtered = df_filtered.drop(columns=["image_path", "timestamp"])

    df_filtered.to_pickle(f'{save_dir}{file_name}.pkl')
    print(f"saving {file_name}")

    # Free local disk space before the next shard is downloaded.
    shutil.rmtree(folder_name)

00.zip:   0%|          | 0.00/2.25G [00:00<?, ?B/s]

saving 00


01.zip:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

saving 01


02.zip:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

saving 02


03.zip:   0%|          | 0.00/2.25G [00:00<?, ?B/s]

saving 03


04.zip:   0%|          | 0.00/453M [00:00<?, ?B/s]

saving 04
