PREPROCESS DATASET

In [3]:
import os
import json
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from uuid import uuid4

# Extension appended to every image id when frame paths are built, and to the
# stitched sequence images written under SEQUENCE_PATH.
FILE_TYPE = ".JPG"
# Directory prefix prepended to each annotation's image_id.
IMAGES_PATH = "/data/luiz/dataset/serengeti_images/"
# Annotation JSON loaded into `data`.
ANNOTATIONS_PATH = "/data/luiz/dataset/serengeti/SnapshotSerengeti_S1-11_v2.1.json"
# Directory prefix for the side-by-side sequence images.
SEQUENCE_PATH = "/ssd/luiz/dataset/sequences/"
# Directory prefix under which the partition CSVs are written.
RESULTS_PATH = "/data/luiz/dataset/partitions/"
# CSV whose columns listed in CSV_FIELDS are merged into the JSON annotations.
ANNOTATIONS_PATH_CSV = "/data/luiz/dataset/serengeti/SnapshotSerengeti_v2_1_annotations.csv"
# Dataset name; used as a sub-directory when partition CSVs are saved.
DATABASE = "serengeti"
# CSV column name -> column name used after the merge.
CSV_FIELDS = {
    'capture_id': 'id',
    'question__standing': 'standing', 
    'question__resting': 'resting', 
    'question__moving': 'moving', 
    'question__eating': 'eating', 
    'question__interacting': 'interacting'
}
# When true, the CSV columns above are merged into the JSON annotations.
USE_CSV = True

# Random state passed to DataFrame.sample when balancing/shuffling.
SEED = 10

In [3]:
# Load the annotation JSON; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open(ANNOTATIONS_PATH) as annotations_file:
    data = json.load(annotations_file)

if USE_CSV:
    # Select the CSV columns with an explicit list of labels.
    df_csv = pd.read_csv(ANNOTATIONS_PATH_CSV)[list(CSV_FIELDS)]
    df_data = pd.DataFrame(data["annotations"])
    # Inner join: annotations whose seq_id has no matching capture_id in the
    # CSV are dropped.
    df_data = pd.merge(df_data, df_csv, left_on='seq_id', right_on='capture_id', how='inner')

    df_data = df_data.rename(columns=CSV_FIELDS)
    # Remove duplicated column names, keeping the first occurrence.
    df_data = df_data.loc[:, ~df_data.columns.duplicated()]

    # Only overwrite the annotations when the merge actually ran; previously
    # this line sat outside the `if` and raised NameError when USE_CSV was False.
    data["annotations"] = df_data.to_dict(orient='records')

  df_csv = pd.read_csv(ANNOTATIONS_PATH_CSV)[CSV_FIELDS.keys()]


In [5]:
# Behaviour fields that are summed per sequence to choose its label.
ALL_ACTIONS = ['standing', 'resting', 'moving', 'eating', 'interacting']
# NOTE(review): not referenced anywhere in the visible notebook; presumably the
# labels that remain once 'interacting' and 'standing' are merged away — confirm.
ACTIONS = ['resting', 'moving', 'eating']
# Frames whose category_id is in this list are rejected by is_valid_frame.
# NOTE(review): what category id 1 stands for is not shown here — confirm
# against the annotation file's category table.
CATEGORIES_INVALID = [1]
# Frames whose `count` is greater than this are rejected by is_valid_frame.
MAX_ANIMALS = 1

def get_category_from_sequence(sequence, actions=None):
    """Return the action with the largest summed value over a sequence.

    Args:
        sequence: iterable of frame dicts; each frame may carry one numeric
            value per action name. Missing keys count as 0.
        actions: ordered action names to consider. Defaults to ALL_ACTIONS.
            On ties the earliest name in this order wins.

    Returns:
        The name of the action with the highest total.

    Raises:
        ValueError: if `sequence` contains no frames or `actions` is empty.
    """
    if actions is None:
        actions = ALL_ACTIONS
    frames = list(sequence)
    if not frames:
        raise ValueError("sequence must contain at least one frame")

    totals = {}
    for action in actions:
        total = 0
        for frame in frames:
            value = frame.get(action, 0)
            # Ignore missing values (None, or NaN as produced by CSV columns):
            # a NaN total compares false against everything, so it would win
            # max() whenever it belongs to the first action.
            if value is None or (isinstance(value, float) and value != value):
                continue
            total += value
        totals[action] = total
    return max(totals, key=totals.get)

def is_valid_frame(frame):
    """Tell whether a frame should be kept.

    A frame is kept when its category_id is not in CATEGORIES_INVALID, its
    `count` does not exceed MAX_ANIMALS, and its image file exists on disk.

    Args:
        frame: annotation dict with "category_id", "path" and (optionally)
            "count" entries.

    Returns:
        True if the frame passes all three checks, False otherwise.
    """
    # The cheap in-memory checks run first so the filesystem is only touched
    # for frames that could still be valid.
    if frame["category_id"] in CATEGORIES_INVALID:
        return False

    try:
        count = int(frame["count"])
    except (KeyError, TypeError, ValueError, OverflowError):
        # A missing or unparseable count is treated as 0, which passes the filter.
        # NOTE(review): if the dataset encodes large groups as non-numeric
        # strings, those frames pass the filter too — confirm this is intended.
        count = 0
    if count > MAX_ANIMALS:
        return False

    return os.path.isfile(frame["path"])

def get_sequence_mapper(annotations):
    """Group the valid frames of `annotations` by their seq_id.

    Every annotation dict gets a "path" entry added in place (image prefix +
    image_id + file extension) before it is validated with is_valid_frame.

    Returns:
        dict mapping seq_id -> list of valid frame dicts, in input order.
        Sequences without any valid frame do not appear.
    """
    frames_by_sequence = {}
    print("getting sequence mapper")
    for frame in tqdm(annotations):
        sequence_id = frame.get("seq_id")
        frame["path"] = f'{IMAGES_PATH}{frame["image_id"]}{FILE_TYPE}'
        if not is_valid_frame(frame):
            continue
        frames_by_sequence.setdefault(sequence_id, []).append(frame)
    return frames_by_sequence

def get_frames_sequences(data):
    """Build one event record per sequence found in data["annotations"].

    Returns:
        list of dicts with the keys "num_frames", "frames", "datetime"
        (taken from the sequence's first frame) and "category" (from
        get_category_from_sequence).
    """
    sequence_mapper = get_sequence_mapper(data["annotations"])

    def build_event(frames):
        # Key order is kept because it becomes the DataFrame column order.
        return {
            "num_frames": len(frames),
            "frames": frames,
            "datetime": frames[0]["datetime"],
            "category": get_category_from_sequence(frames),
        }

    print("getting categories")
    return [
        build_event(sequence_mapper[sequence_id])
        for sequence_id in tqdm(sequence_mapper.keys())
    ]

# Build the per-sequence event records from the loaded annotations.
sequences = get_frames_sequences(data)

getting sequence mapper


100%|██████████| 7429835/7429835 [02:09<00:00, 57528.58it/s] 


getting categorys


100%|██████████| 1033219/1033219 [00:10<00:00, 93973.74it/s]


In [6]:
# Keep only sequences with more than one frame and fold the five behaviour
# labels into three. The relabelling is restricted to the `category` column:
# a DataFrame-wide replace would also scan every other column, including the
# list-valued `frames`.
df = (
    pd.DataFrame(sequences)
    .loc[lambda frame: frame["num_frames"] > 1]
    .assign(
        category=lambda frame: frame["category"].replace(
            {"interacting": "moving", "standing": "resting"}
        )
    )
)
df["category"].unique()

array(['resting', 'moving', 'eating'], dtype=object)

In [7]:
def balance_dataset(df):
    """Downsample every category to the size of the rarest one, then shuffle.

    Sampling and shuffling both use random_state=SEED. The returned frame has
    a fresh 0..n-1 index.
    """
    per_class = min(df["category"].value_counts())
    samples = [
        df[df["category"] == label].sample(per_class, random_state=SEED)
        for label in df["category"].unique()
    ]
    balanced = pd.concat(samples).reset_index(drop=True)
    shuffled = balanced.sample(len(balanced), random_state=SEED)
    return shuffled.reset_index(drop=True)

def get_empty_in_frames(frames):
    """Return the path of the first frame whose category_id is 0, else None."""
    return next(
        (frame["path"] for frame in frames if frame["category_id"] == 0),
        None,
    )

def get_animal_in_frames(frames):
    """Return the path of the first frame whose category_id is not 0, else None."""
    return next(
        (frame["path"] for frame in frames if frame["category_id"] != 0),
        None,
    )

# One representative path per sequence: the first frame with category_id 0
# and the first frame with any other category_id (None when absent).
df["path_empty"] = df["frames"].map(get_empty_in_frames)
df["path_animal"] = df["frames"].map(get_animal_in_frames)

# Equalise the number of sequences per behaviour label, then show the counts.
df = balance_dataset(df)
df["category"].value_counts()

eating     28968
resting    28968
moving     28968
Name: category, dtype: int64

In [8]:
# Derive the per-sequence columns in one step and keep only what the
# partition CSVs need. `location` comes from the first frame of the sequence;
# `path_seq` is the comma-joined list of all frame paths.
df = df.assign(
    location=df["frames"].map(lambda frames: frames[0]['location']),
    datetime=pd.to_datetime(df['datetime']),
    path_seq=df["frames"].map(lambda frames: ",".join(frame["path"] for frame in frames)),
)[["num_frames", "category", "location", "datetime", "path_empty", "path_animal", "path_seq"]]

In [9]:
# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0,num_frames,category,location,datetime,path_empty,path_animal,path_seq
0,3,eating,G12,2012-11-01 07:24:44,,/data/luiz/dataset/serengeti_images/S5/G12/G12...,/data/luiz/dataset/serengeti_images/S5/G12/G12...
1,3,eating,G02,2011-11-05 13:40:03,,/data/luiz/dataset/serengeti_images/S3/G02/G02...,/data/luiz/dataset/serengeti_images/S3/G02/G02...
2,3,resting,D09,2012-02-12 09:50:56,/data/luiz/dataset/serengeti_images/S4/D09/D09...,,/data/luiz/dataset/serengeti_images/S4/D09/D09...
3,3,resting,G12,2010-09-22 05:45:50,/data/luiz/dataset/serengeti_images/S1/G12/G12...,,/data/luiz/dataset/serengeti_images/S1/G12/G12...
4,3,resting,L05,2012-07-30 12:48:36,/data/luiz/dataset/serengeti_images/S5/L05/L05...,,/data/luiz/dataset/serengeti_images/S5/L05/L05...


In [35]:
import numpy as np
from PIL import Image

def create_side_by_side_image(batchs):
    """Stitch each batch of images horizontally and save the result.

    Args:
        batchs: iterable of batches. A batch is either a comma-joined string
            of image paths (the format of the `path_seq` column) or an
            iterable of image paths.

    Returns:
        list with the output path of the stitched image for every batch, in
        input order. File names are random (uuid4), so re-running writes new
        files rather than overwriting earlier ones.

    Raises:
        ValueError: if a batch contains no paths.
    """
    # Make sure the output directory exists before the first save.
    os.makedirs(SEQUENCE_PATH, exist_ok=True)

    response = []
    for batch in tqdm(batchs):
        # Iterating a string directly would yield single characters, so a
        # comma-joined batch has to be split into its paths first.
        paths = batch.split(",") if isinstance(batch, str) else list(batch)
        output_path = f'{SEQUENCE_PATH}{uuid4()}{FILE_TYPE}'

        images = []
        try:
            for path in paths:
                images.append(Image.open(path))
            # Scale every image to the smallest height, keeping aspect ratio.
            min_height = min(img.height for img in images)
            resized_images = [
                img.resize(
                    (int(img.width * min_height / img.height), min_height),
                    Image.Resampling.LANCZOS,
                )
                for img in images
            ]
        finally:
            # The resized copies live in memory; release the source files.
            for img in images:
                img.close()

        total_width = sum(img.width for img in resized_images)
        combined_image = Image.new("RGB", (total_width, min_height), (255, 255, 255))
        x_offset = 0
        for img in resized_images:
            combined_image.paste(img, (x_offset, 0))
            x_offset += img.width
        combined_image.save(output_path)
        response.append(output_path)
    return response

def split_data_subsets(df, seed=None):
    """Split `df` into train/val/test by location and balance each part.

    Locations are shuffled and then cut 50% / 15% / remainder, so a given
    location ends up in exactly one partition.

    Args:
        df: frame with `location` and `category` columns.
        seed: seed for the location shuffle. Defaults to SEED so the split is
            reproducible across runs (the previous global, unseeded
            np.random.shuffle produced a different split every time).

    Returns:
        Tuple (train, val, test), each passed through balance_dataset.
    """
    rng = np.random.default_rng(SEED if seed is None else seed)
    unique_locations = rng.permutation(df['location'].unique())
    # Define partition ratios; the test partition receives what is left.
    train_ratio = 0.5
    val_ratio = 0.15
    # Calculate split indices
    n_total = len(unique_locations)
    train_end = int(train_ratio * n_total)
    val_end = train_end + int(val_ratio * n_total)
    # Split locations into partitions
    train_locations = unique_locations[:train_end]
    val_locations = unique_locations[train_end:val_end]
    test_locations = unique_locations[val_end:]
    # Assign partitions
    train = df[df['location'].isin(train_locations)]
    val = df[df['location'].isin(val_locations)]
    test = df[df['location'].isin(test_locations)]
    return balance_dataset(train), balance_dataset(val), balance_dataset(test)

def save_results(df, task, filename):
    """Write `df` to {RESULTS_PATH}{task}/{DATABASE}/{filename} as CSV.

    Prints the destination path and the per-category row counts first. The
    target directory is created when it does not exist yet, so a first run on
    a clean machine does not fail inside to_csv.
    """
    path = f"{RESULTS_PATH}{task}/{DATABASE}/{filename}"
    print(path)
    print(df.category.value_counts(), "\n")
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path, index=False)

def save_animal_classifier_dataset(df, max_val=7000, max_test=8000):
    """Build and save the animal / no-animal partitions.

    A sequence is labelled "yes" when it has a `path_animal` string and "no"
    otherwise; its `path` is `path_animal`, falling back to `path_empty`.

    Args:
        df: frame with num_frames, location, datetime, path_animal and
            path_empty columns. It is not modified.
        max_val: maximum number of rows written to val.csv.
        max_test: maximum number of rows written to test.csv.

    Note:
        The caps are applied after balancing, so a truncated partition can be
        slightly unbalanced.
    """
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    labelled = df.assign(
        category=df["path_animal"].map(lambda a: "yes" if isinstance(a, str) else "no"),
        path=df["path_animal"].combine_first(df["path_empty"]),
    )[["num_frames", "category", "location", "datetime", "path"]]
    train, val, test = split_data_subsets(labelled)
    save_results(train, "animal-classifier", "train.csv")
    save_results(val[:max_val], "animal-classifier", "val.csv")
    save_results(test[:max_test], "animal-classifier", "test.csv")



def save_behaviour_classifier_dataset(df):
    """Build and save the behaviour partitions.

    Rows with any missing value in the selected columns are dropped. Every
    partition gets a `path_seq_saved` column holding the stitched sequence
    image produced by create_side_by_side_image, and is then written to CSV.
    """
    columns = ["num_frames", "category", "location", "datetime", "path", "path_seq"]
    complete = df.rename(columns={"path_animal": "path"})[columns].dropna()
    train, val, test = split_data_subsets(complete)
    partitions = (("train.csv", train), ("val.csv", val), ("test.csv", test))

    # Stitch all images first, then write all CSVs.
    for _, partition in partitions:
        partition["path_seq_saved"] = create_side_by_side_image(partition["path_seq"])

    for filename, partition in partitions:
        save_results(partition, "behaviour-classifier", filename)

# Each builder receives its own copy, so neither can alter the shared `df`.
save_animal_classifier_dataset(df.copy())
save_behaviour_classifier_dataset(df.copy())

/data/luiz/dataset/partitions/animal-classifier/serengeti/train.csv
no     12561
yes    12561
Name: category, dtype: int64 

/data/luiz/dataset/partitions/animal-classifier/serengeti/val.csv
yes    3517
no     3483
Name: category, dtype: int64 

/data/luiz/dataset/partitions/animal-classifier/serengeti/test.csv
no     4030
yes    3970
Name: category, dtype: int64 

/data/luiz/dataset/partitions/behaviour-classifier/serengeti/train.csv
moving     896
eating     896
resting    896
Name: category, dtype: int64 

/data/luiz/dataset/partitions/behaviour-classifier/serengeti/val.csv
moving     271
eating     271
resting    271
Name: category, dtype: int64 

/data/luiz/dataset/partitions/behaviour-classifier/serengeti/test.csv
moving     839
eating     839
resting    839
Name: category, dtype: int64 



COLLECT IMAGE PATHS TO COPY TO SSD

In [29]:
import os
import shutil

def list_all_csv_files(directory):
    """Return the paths of all files ending in ".csv" below `directory`.

    The tree is walked recursively with os.walk; a missing directory yields
    an empty list.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(".csv")
    ]

# Gather the distinct values of the `path` column across every partition CSV.
csv_files = list_all_csv_files(RESULTS_PATH)
print(csv_files)

unique_paths = set()
for file in csv_files:
    df = pd.read_csv(file)
    unique_paths.update(df["path"])

ssd_images = list(unique_paths)
len(ssd_images)

['/data/luiz/dataset/partitions/behaviour-classifier/serengeti/val.csv', '/data/luiz/dataset/partitions/behaviour-classifier/serengeti/test.csv', '/data/luiz/dataset/partitions/behaviour-classifier/serengeti/train.csv', '/data/luiz/dataset/partitions/animal-classifier/serengeti/val.csv', '/data/luiz/dataset/partitions/animal-classifier/serengeti/test.csv', '/data/luiz/dataset/partitions/animal-classifier/serengeti/train.csv']


44120

REPLACE HD DIRECTORY WITH SSD DIRECTORY IN CSV PATHS

In [25]:
# Point the `path` column of every partition CSV at the SSD copy.
# Iterate over the files themselves: wrapping `csv_files` in another list
# yields a single item (the whole list), which pd.read_csv cannot open.
for file in csv_files:
    df = pd.read_csv(file)
    # Literal (non-regex) replacement; missing paths stay missing instead of
    # raising on a float NaN.
    df["path"] = df["path"].str.replace("/data/", "/ssd/", regex=False)
    df.to_csv(file, index=False)

COPY IMAGES TO SSD

In [31]:
def copy_images_to_ssd(images):
    """Copy every distinct image from its /data/ location to its /ssd/ location.

    Each entry may be given in either form; the source and destination are
    derived by swapping the "/ssd/" and "/data/" path segments. Destination
    directories are created when they do not exist.
    """
    unique_images = list(set(images))
    for image_path in tqdm(unique_images):
        source = image_path.replace("/ssd/", "/data/")
        target = image_path.replace("/data/", "/ssd/")
        target_dir = os.path.dirname(target)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        shutil.copy(source, target)

# Copy every collected image to its SSD location.
copy_images_to_ssd(ssd_images)

100%|██████████| 44120/44120 [10:47<00:00, 68.10it/s]


In [2]:
# Scratch cell: rebuild the string one character at a time.
x = ''
for i in '1234':
    x = x + i

In [3]:
# Display the string built in the previous cell.
x

'1234'