PREPROCESS DATASET

In [1]:
import os
import json
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from uuid import uuid4

# Extension appended to each image id when building image paths, and to the
# generated side-by-side sequence images.
FILE_TYPE = ".JPG"
# Root directory prepended to every annotation's image_id.
IMAGES_PATH = "/data/luiz/dataset/serengeti_images/"
# JSON annotation file loaded into `data`.
ANNOTATIONS_PATH = "/data/luiz/dataset/serengeti/SnapshotSerengeti_S1-11_v2.1.json"
# Directory where the side-by-side sequence images are written.
SEQUENCE_PATH = "/ssd/luiz/dataset/sequences/"
# Base directory of the partition CSVs: {RESULTS_PATH}{task}/{DATABASE}/{filename}.
RESULTS_PATH = "/data/luiz/dataset/partitions/"
# CSV annotations that are merged onto the JSON annotations when USE_CSV is True.
ANNOTATIONS_PATH_CSV = "/data/luiz/dataset/serengeti/SnapshotSerengeti_v2_1_annotations.csv"
# Dataset name, used as a path component of the partition CSVs.
DATABASE = "serengeti"
# CSV columns to load (keys) and the names they are renamed to after the merge (values).
CSV_FIELDS = {
    'capture_id': 'id',
    'question__standing': 'standing', 
    'question__resting': 'resting', 
    'question__moving': 'moving', 
    'question__eating': 'eating', 
    'question__interacting': 'interacting'
}
# Toggles the merge of the CSV annotations onto the JSON annotations.
USE_CSV = True

# Passed as random_state to the DataFrame.sample calls in this notebook.
SEED = 10

In [2]:
# Load the JSON annotations. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it to the garbage collector.
with open(ANNOTATIONS_PATH) as annotations_file:
    data = json.load(annotations_file)

if USE_CSV:
    # Index with a plain list: a dict_keys view is not a proper list-like
    # column indexer for pandas.
    df_csv = pd.read_csv(ANNOTATIONS_PATH_CSV)[list(CSV_FIELDS)]
    df_data = pd.DataFrame(data["annotations"])
    # Inner join: annotations whose seq_id has no matching capture_id are dropped.
    df_data = pd.merge(df_data, df_csv, left_on='seq_id', right_on='capture_id', how='inner')

    df_data = df_data.rename(columns=CSV_FIELDS)
    # Drop duplicated column names, keeping the first occurrence. Renaming
    # 'capture_id' to 'id' yields a second 'id' column whenever the JSON
    # annotations already carry one.
    df_data = df_data.loc[:, ~df_data.columns.duplicated()]

    # Only replace the annotations when the merge actually ran; df_data does
    # not exist when USE_CSV is False.
    data["annotations"] = df_data.to_dict(orient='records')

  df_csv = pd.read_csv(ANNOTATIONS_PATH_CSV)[CSV_FIELDS.keys()]


In [3]:
# Action columns summed per sequence to pick its label.
ALL_ACTIONS = ['standing', 'resting', 'moving', 'eating', 'interacting']
# NOTE(review): not referenced by the code visible in this notebook.
ACTIONS = ['resting', 'moving', 'eating']
# category_id values that make a frame invalid (see is_valid_frame).
CATEGORIES_INVALID = [1]
# Largest animal count a frame may have and still be considered valid.
MAX_ANIMALS = 1

def get_category_from_sequence(sequence):
    """Return the action with the highest summed score over a sequence.

    Args:
        sequence: iterable of frame dicts; each may hold a numeric score under
            any of the ALL_ACTIONS keys.

    Returns:
        The name of the action whose scores add up to the largest total. Ties
        go to the action listed first in ALL_ACTIONS.

    Raises:
        ValueError: if `sequence` is empty (there is nothing to take a max of).

    Missing, None and NaN scores count as 0. Without that, a single NaN turns
    the whole total into NaN, and since every comparison against NaN is False
    the result of `max` would depend only on key order.
    """
    mapper = {}
    for action in ALL_ACTIONS:
        for frame in sequence:
            score = frame.get(action, 0)
            # `score != score` is true only for NaN.
            if score is None or score != score:
                score = 0
            mapper[action] = mapper.get(action, 0) + score
    return max(mapper, key=mapper.get)

def is_valid_frame(frame):
    """Tell whether an annotation frame should be kept.

    A frame is valid when its category_id is not in CATEGORIES_INVALID, its
    count is at most MAX_ANIMALS, and frame["path"] points to an existing file.

    Args:
        frame: annotation dict with "category_id", "path" and (optionally)
            "count" entries.

    Returns:
        True when the frame passes all three checks, False otherwise.
    """
    # Cheap in-memory checks run first so the filesystem is only touched for
    # frames that can still be valid.
    if frame["category_id"] in CATEGORIES_INVALID:
        return False
    try:
        count = int(frame["count"])
    except (KeyError, TypeError, ValueError, OverflowError):
        # Missing or non-numeric counts are treated as 0 and therefore pass
        # the MAX_ANIMALS check.
        # NOTE(review): confirm this is intended for non-numeric count labels.
        count = 0
    if count > MAX_ANIMALS:
        return False
    return os.path.isfile(frame["path"])

def get_sequence_mapper(annotations):
    """Group the valid annotation frames by their sequence id.

    Every annotation gets a "path" entry written onto it (built from
    IMAGES_PATH, its image_id and FILE_TYPE), whether or not it turns out to
    be valid. Only frames accepted by is_valid_frame are grouped.

    Args:
        annotations: iterable of annotation dicts.

    Returns:
        Dict mapping each seq_id to the list of its valid frames, in input order.
    """
    grouped = {}
    print("getting sequence mapper")
    for frame in tqdm(annotations):
        sequence_id = frame.get("seq_id")
        frame["path"] = f'{IMAGES_PATH}{frame["image_id"]}{FILE_TYPE}'
        if not is_valid_frame(frame):
            continue
        grouped.setdefault(sequence_id, []).append(frame)
    return grouped

def get_frames_sequences(data):
    """Build one record per sequence from the annotation payload.

    Args:
        data: dict whose "annotations" entry is the list of frame annotations.

    Returns:
        List of dicts with the number of frames, the frames themselves, the
        datetime of the first frame and the category chosen by
        get_category_from_sequence.
    """
    frames_by_sequence = get_sequence_mapper(data["annotations"])
    print("getting categories")
    return [
        {
            "num_frames": len(frames),
            "frames": frames,
            "datetime": frames[0]["datetime"],
            "category": get_category_from_sequence(frames),
        }
        for frames in tqdm(frames_by_sequence.values())
    ]

# Build the per-sequence records from the loaded annotations.
sequences = get_frames_sequences(data)

getting sequence mapper


  0%|          | 14500/7429835 [00:00<00:51, 144986.31it/s]

100%|██████████| 7429835/7429835 [01:17<00:00, 96034.34it/s] 


getting categories


100%|██████████| 1033219/1033219 [00:11<00:00, 92183.48it/s]


In [38]:
# Keep only multi-frame sequences and fold the five annotated actions into
# three classes. The replacement is limited to the `category` column so the
# other columns (including the per-frame dict lists) are neither scanned nor
# at risk of being rewritten.
df = (
    pd.DataFrame(sequences)
    .loc[lambda frame: frame.num_frames > 1]
    .assign(
        category=lambda frame: frame["category"].replace(
            {"interacting": "moving", "standing": "resting"}
        )
    )
)
df.category.unique()

array(['resting', 'moving', 'eating'], dtype=object)

In [39]:
def balance_dataset(df, seed=None):
    """Downsample every category to the size of the smallest one and shuffle.

    Args:
        df: DataFrame with a `category` column.
        seed: optional random_state for the sampling; defaults to the
            notebook-level SEED.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the same number of
        rows for every category, in shuffled order. An empty input yields an
        empty frame instead of raising.
    """
    if df.empty:
        # Nothing to balance; the min() over the class counts would raise.
        return df.reset_index(drop=True)
    random_state = SEED if seed is None else seed
    per_class = int(df.category.value_counts().min())
    sampled = [
        df[df.category == category].sample(per_class, random_state=random_state)
        for category in df.category.unique()
    ]
    balanced = pd.concat(sampled).reset_index(drop=True)
    # Shuffle so the rows are not grouped by category.
    return balanced.sample(len(balanced), random_state=random_state).reset_index(drop=True)

def get_empty_in_frames(frames):
    """Return the path of the first frame whose category_id is 0, or None."""
    return next(
        (frame["path"] for frame in frames if frame["category_id"] == 0),
        None,
    )

def get_animal_in_frames(frames):
    """Return the path of the first frame whose category_id is not 0, or None."""
    return next(
        (frame["path"] for frame in frames if frame["category_id"] != 0),
        None,
    )

# Derive the per-sequence columns used by the dataset builders. `assign`
# returns a new frame, so the columns are never written into a filtered slice
# of another DataFrame.
df = df.assign(
    path_empty=df.frames.map(get_empty_in_frames),
    path_animal=df.frames.map(get_animal_in_frames),
    # Location is taken from the first frame of each sequence.
    location=df.frames.map(lambda frames: frames[0]['location']),
    datetime=pd.to_datetime(df['datetime']),
    # Comma-separated paths of every frame in the sequence.
    path_seq=df.frames.map(lambda frames: ",".join(frame["path"] for frame in frames)),
)

df = df[["num_frames", "frames", "category", "location", "datetime", "path_empty", "path_animal", "path_seq"]]

In [40]:
# Class counts of the unbalanced data. Global balancing is left disabled here;
# split_data_subsets balances each partition separately.
# df = balance_dataset(df)
df.category.value_counts()

resting    792905
moving      56684
eating      28968
Name: category, dtype: int64

In [41]:
# Preview the first rows of the sequence table.
df.head()

Unnamed: 0,num_frames,frames,category,location,datetime,path_empty,path_animal,path_seq
18,3,"[{'sequence_level_annotation': True, 'id': '10...",resting,B04,2010-08-12 16:28:50,/data/luiz/dataset/serengeti_images/S1/B04/B04...,,/data/luiz/dataset/serengeti_images/S1/B04/B04...
19,3,"[{'sequence_level_annotation': True, 'id': '10...",moving,B04,2010-08-20 03:37:00,,/data/luiz/dataset/serengeti_images/S1/B04/B04...,/data/luiz/dataset/serengeti_images/S1/B04/B04...
20,2,"[{'sequence_level_annotation': True, 'id': '10...",eating,B05,2010-07-20 15:19:52,,/data/luiz/dataset/serengeti_images/S1/B05/B05...,/data/luiz/dataset/serengeti_images/S1/B05/B05...
21,2,"[{'sequence_level_annotation': True, 'id': '10...",resting,B05,2010-07-20 15:23:10,/data/luiz/dataset/serengeti_images/S1/B05/B05...,,/data/luiz/dataset/serengeti_images/S1/B05/B05...
22,2,"[{'sequence_level_annotation': True, 'id': '10...",eating,B05,2010-07-20 15:26:16,,/data/luiz/dataset/serengeti_images/S1/B05/B05...,/data/luiz/dataset/serengeti_images/S1/B05/B05...


In [42]:
import numpy as np
from PIL import Image

def random_df(df, size, seed=None):
    """Return a random sample of at most `size` rows of `df`.

    Args:
        df: DataFrame to sample from.
        size: number of rows requested.
        seed: optional random_state; defaults to the notebook-level SEED.

    Returns:
        A sample of min(size, len(df)) rows. The request is clamped because
        DataFrame.sample raises when asked for more rows than exist (without
        replacement).
    """
    n_rows = min(size, len(df))
    if n_rows == 0:
        # Nothing to draw; return an empty frame with the same columns.
        return df.iloc[:0]
    random_state = SEED if seed is None else seed
    return df.sample(n=n_rows, random_state=random_state)


def create_side_by_side_image(batchs):
    """Render each sequence as one image with its frames placed left to right.

    Args:
        batchs: iterable of strings, each a comma-separated list of image paths.

    Returns:
        List with the path of the combined image written for each input entry,
        in input order. Files are named with a fresh uuid4 under SEQUENCE_PATH.
    """
    # Create the output directory up front so the first save cannot fail on a
    # missing folder.
    os.makedirs(SEQUENCE_PATH, exist_ok=True)
    response = []
    for batch in tqdm(batchs):
        output_path = f'{SEQUENCE_PATH}{uuid4()}{FILE_TYPE}'

        images = []
        try:
            for image_path in batch.split(","):
                images.append(Image.open(image_path))
            # Scale every frame to the smallest height, keeping aspect ratio.
            min_height = min(img.height for img in images)
            resized_images = [
                img.resize((int(img.width * min_height / img.height), min_height), Image.Resampling.LANCZOS)
                for img in images
            ]
        finally:
            # Release the source images even if opening or resizing a later
            # frame fails; the resized images are independent objects.
            for img in images:
                img.close()

        total_width = sum(img.width for img in resized_images)
        combined_image = Image.new("RGB", (total_width, min_height), (255, 255, 255))
        x_offset = 0
        for img in resized_images:
            combined_image.paste(img, (x_offset, 0))
            x_offset += img.width
        combined_image.save(output_path)
        response.append(output_path)
    return response

def split_data_subsets(df):
    """Split `df` into train/val/test by location, balance and subsample each.

    Locations are shuffled and divided 50% / 15% / remainder, so no location
    contributes rows to more than one subset. Each subset is then balanced by
    category and reduced through random_df.

    Args:
        df: DataFrame with `location` and `category` columns.

    Returns:
        Tuple (train, val, test) of DataFrames.
    """
    # Seeded permutation: the original unseeded shuffle produced a different
    # location split on every run even though SEED is defined.
    rng = np.random.default_rng(SEED)
    unique_locations = rng.permutation(df.location.value_counts().index.to_numpy())
    # Define partition ratios
    train_ratio = 0.5
    val_ratio = 0.15
    # Calculate split indices
    n_total = len(unique_locations)
    train_end = int(train_ratio * n_total)
    val_end = train_end + int(val_ratio * n_total)
    # Split locations into partitions
    train_locations = unique_locations[:train_end]
    val_locations = unique_locations[train_end:val_end]
    test_locations = unique_locations[val_end:]
    # Assign partitions
    train = df[df['location'].isin(train_locations)]
    val = df[df['location'].isin(val_locations)]
    test = df[df['location'].isin(test_locations)]
    train = balance_dataset(train)
    val = balance_dataset(val)
    test = balance_dataset(test)
    # Requested subset sizes: 20000 train, 3000 val, 10000 test.
    return random_df(train, 20000), random_df(val, 3000), random_df(test, 10000)

def save_results(df, task, filename):
    """Write a partition to {RESULTS_PATH}{task}/{DATABASE}/{filename} as CSV.

    Prints the destination path, the row count and the per-category counts
    before writing.

    Args:
        df: partition to save; must have a `category` column.
        task: sub-directory name of the task.
        filename: CSV file name.
    """
    path = f"{RESULTS_PATH}{task}/{DATABASE}/{filename}"
    # Create the task/database folder if needed; to_csv does not create
    # missing parent directories.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print(path, len(df))
    print(df.category.value_counts(), "\n")
    df.to_csv(path, index=False)

def save_animal_classifier_dataset(df):
    """Build and save the animal / no-animal partitions.

    A sequence is labelled "yes" when it has a string `path_animal` and "no"
    otherwise; `path` is the animal frame when present, else the empty frame.
    The caller's DataFrame is left untouched.

    Args:
        df: sequence table with num_frames, location, datetime, path_animal
            and path_empty columns.
    """
    labelled = df.assign(
        category=df.path_animal.map(lambda path: "yes" if isinstance(path, str) else "no"),
        path=df["path_animal"].combine_first(df["path_empty"]),
    )
    labelled = labelled[["num_frames", "category", "location", "datetime", "path"]]
    train, val, test = split_data_subsets(labelled)
    for subset, filename in ((train, "train.csv"), (val, "val.csv"), (test, "test.csv")):
        save_results(subset, "animal-classifier", filename)


def save_behaviour_classifier_dataset(df):
    """Build and save the behaviour partitions with a composite image per row.

    Rows with any missing value among the selected columns are dropped before
    splitting. Each partition gains a `path_seq_saved` column holding the path
    of the side-by-side image rendered from its `path_seq`.

    Args:
        df: sequence table with num_frames, category, location, datetime,
            path_animal and path_seq columns.
    """
    columns = ["num_frames", "category", "location", "datetime", "path", "path_seq"]
    complete_rows = df.rename(columns={"path_animal": "path"})[columns].dropna()
    partitions = dict(zip(("train.csv", "val.csv", "test.csv"), split_data_subsets(complete_rows)))

    # Render every composite image first, then write the CSVs.
    for subset in partitions.values():
        subset["path_seq_saved"] = create_side_by_side_image(subset["path_seq"])

    for filename, subset in partitions.items():
        save_results(subset, "behaviour-classifier", filename)

def save_species_classifier_dataset(df):
    """Build and save the species partitions, one row per frame.

    The sequences are flattened into individual frames, restricted to a fixed
    set of species, relabelled with short names, and split through
    split_data_subsets. The label mapping, the prompt text and the class
    counts are printed along the way.

    Args:
        df: sequence table whose `frames` column holds lists of frame dicts
            with category_id, location, datetime and path entries.
    """
    def build_prompt_by_dict(categories):
        """Format the categories as "0) a, 1) b and 2) c" and print the index map."""
        new_mapper = dict(enumerate(categories))
        print(new_mapper)
        items = [f"{idx}) {value}" for idx, value in new_mapper.items()]
        if len(items) > 1:
            return ", ".join(items[:-1]) + " and " + items[-1]
        return items[0] if items else ""

    def build_labels_mapper(categories):
        """Map each category id to its name from the notebook-level data["categories"]."""
        id_to_name = {item["id"]: item["name"] for item in data["categories"]}
        return {category: id_to_name[category] for category in categories}

    concatenated = [frame for sequence in df.frames.values for frame in sequence]
    df = pd.DataFrame(concatenated).rename(columns={"category_id": "category"})
    df = df[["category", "location", "datetime", "path"]]

    species = {
        'gazellegrants',
        'elephant',
        'lionfemale',
        'giraffe',
        'zebra',
        'buffalo',
        'wildebeest',
        'hyenaspotted',
        'wilddog',
        'otherbird'
    }

    categories_id = {item["id"] for item in data["categories"] if item["name"] in species}
    # .copy() gives an independent frame, so the label rewrites below are not
    # assignments into a slice of the unfiltered table.
    df = df[df['category'].isin(categories_id)].copy()

    original = build_labels_mapper(df.category.unique())
    print("original categories names:", original)

    # First ids -> dataset names, then dataset names -> short names.
    df["category"] = df["category"].replace(original)
    df["category"] = df["category"].replace({
        'lionfemale': 'lion',
        'otherbird': 'bird',
        'gazellegrants': 'gazelle',
        'hyenaspotted': 'hyena'
    })
    prompt = build_prompt_by_dict(df["category"].unique())
    print("prompt:", prompt)
    print(df.category.value_counts())

    train, val, test = split_data_subsets(df)

    for subset, filename in ((train, "train.csv"), (val, "val.csv"), (test, "test.csv")):
        save_results(subset, "species-classifier", filename)

# Only the species partitions are generated in this run; each builder receives
# its own copy of the sequence table. Uncomment the others to build them too.
save_species_classifier_dataset(df.copy())
# save_animal_classifier_dataset(df.copy())
# save_behaviour_classifier_dataset(df.copy())

original categories names: {8: 'hyenaspotted', 5: 'zebra', 13: 'giraffe', 15: 'buffalo', 2: 'gazellegrants', 18: 'wildebeest', 11: 'elephant', 21: 'lionfemale', 23: 'otherbird'}
{0: 'hyena', 1: 'zebra', 2: 'giraffe', 3: 'buffalo', 4: 'gazelle', 5: 'wildebeest', 6: 'elephant', 7: 'lion', 8: 'bird'}
prompt: 0) hyena, 1) zebra, 2) giraffe, 3) buffalo, 4) gazelle, 5) wildebeest, 6) elephant, 7) lion and 8) bird
wildebeest    136627
zebra          75555
giraffe        18477
buffalo        17806
bird           15140
elephant       14247
gazelle        13645
hyena           7470
lion            5775
Name: category, dtype: int64
/data/luiz/dataset/partitions/species-classifier/serengeti/train.csv 20000
elephant      2257
buffalo       2248
zebra         2245
gazelle       2238
lion          2220
hyena         2219
wildebeest    2209
giraffe       2203
bird          2161
Name: category, dtype: int64 

/data/luiz/dataset/partitions/species-classifier/serengeti/val.csv 3000
gazelle       350
hyen

COLLECT IMAGE PATHS TO COPY TO SSD

In [44]:
import os
import shutil
import os
import pandas as pd
from tqdm import tqdm

# Directory scanned for the partition CSVs of the species classifier.
# NOTE(review): this rebinds RESULTS_PATH, which save_results also reads;
# re-running the dataset-building cell after this one would write under this
# narrower directory.
RESULTS_PATH = "/data/luiz/dataset/partitions/species-classifier/"

# Same value as the SEED set in the first configuration cell.
SEED = 10

def list_all_csv_files(directory):
    """Return the paths of all files ending in ".csv" under `directory`, recursively."""
    return [
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        if name.endswith(".csv")
    ]

# Gather the `path` column of every partition CSV found under RESULTS_PATH.
csv_files = list_all_csv_files(RESULTS_PATH)
ssd_images = []
for file in csv_files:
    ssd_images.extend(pd.read_csv(file)["path"])

# Duplicates are kept here, so the length below is the total number of CSV rows;
# copy_images_to_ssd de-duplicates before copying.
# ssd_images = list(set(ssd_images))
len(ssd_images)

33000

REPLACE THE HD DIRECTORY WITH THE SSD DIRECTORY IN THE CSV PATHS

In [45]:
# Point every partition CSV at the SSD copy of its images. Dedicated loop
# names keep this cell from overwriting the notebook-level `df`, and the
# vectorised literal replace (regex=False) avoids a Python call per row.
for csv_path in csv_files:
    partition_df = pd.read_csv(csv_path)
    partition_df["path"] = partition_df["path"].str.replace("/data/", "/ssd/", regex=False)
    partition_df.to_csv(csv_path, index=False)

COPY IMAGES TO SSD

In [46]:
def copy_images_to_ssd(images):
    """Copy each distinct image from its "/data/" location to the "/ssd/" one.

    Paths may be given in either form: the source is the path with "/ssd/"
    replaced by "/data/" and the destination is the path with "/data/"
    replaced by "/ssd/". Destination directories are created as needed.

    Args:
        images: iterable of image path strings; duplicates are copied once.
    """
    for file_name in tqdm(list(set(images))):
        file_name_ssd = file_name.replace("/data/", "/ssd/")
        file_name_hd = file_name.replace("/ssd/", "/data/")
        if file_name_hd == file_name_ssd:
            # The path contains neither prefix, so source and destination are
            # the same file and shutil.copy would raise SameFileError.
            continue
        # exist_ok avoids the separate existence check before creating.
        os.makedirs(os.path.dirname(file_name_ssd), exist_ok=True)
        shutil.copy(file_name_hd, file_name_ssd)

# Copy every image referenced by the partition CSVs.
copy_images_to_ssd(ssd_images)

100%|██████████| 32380/32380 [11:50<00:00, 45.54it/s] 
