# IMPORTS #

In [20]:
import torch
import pandas as pd
import numpy as np
#import fiftyone as fo
#import fiftyone.zoo as foz
from ultralytics import YOLO
# no longer required

# Getting the OpenImagesV7 dataset ready for use #

We are going to use the fiftyone lib to get only the penguin data. \
Then we need to make sure the data is compatible and in the right format for the YOLOv8 model.

## Get only the penguin data from the OpenImagesV7 dataset ##

### Using fiftyone - this requires MongoDB, so I am going to just work with the dataset directly ###

In [None]:
# Test cell (to be removed): try loading a small Open Images V7 sample through
# the fiftyone dataset zoo and viewing it in the fiftyone app.
# NOTE: `fo` and `foz` are the fiftyone aliases whose imports are commented out
# in the imports cell, so this cell fails with a NameError unless they are restored.
dataset = foz.load_zoo_dataset(
    "open-images-v7",
    split="validation",
    max_samples=100,
    seed=51,
    shuffle=True,
)

session = fo.launch_app(dataset.view())
# fiftyone is not working off the bat, so I am going to try something else and come back to this


### working with the dataset and annotations directly ###

data is found here: https://storage.googleapis.com/openimages/web/download_v7.html \
\
The goal is to create a list with all the image ids that specify that a penguin has been detected\
The downloader.py script that is supplied by OIV7 (and is saved in the colab directory) can be called and will only download the images in that list\
We then need to check those images and adjust the relevant annotations to only include those ids and save them.\


In [2]:
# Path to the parent folder of the raw OpenImagesV7 data; the annotation CSVs
# are read from here and the image-id list file is written here.
# NOTE: this is an absolute path on the author's machine - change it when running elsewhere.

ROOT_RAW_OI7_PATH = '/home/matthew/Desktop/Master_Dev/masters_penguin_pose_estimation/data/raw/OpenImagesV7'

#### Getting the data annotations in a dataframe and filtering to only penguins ####

In [3]:
# read the test-split annotation CSVs into dataframes:
# image-level labels first, then the bounding boxes
raw_image_labels = pd.read_csv(
    filepath_or_buffer=f'{ROOT_RAW_OI7_PATH}/test-annotations-human-imagelabels-boxable.csv',
)
raw_bbox_labels = pd.read_csv(
    filepath_or_buffer=f'{ROOT_RAW_OI7_PATH}/test-annotations-bbox.csv',
)

In [4]:
raw_image_labels.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence
0,000026e7ee790996,verification,/m/0cgh4,0
1,000026e7ee790996,verification,/m/04hgtk,0
2,000026e7ee790996,verification,/m/0d5gx,0
3,000026e7ee790996,verification,/m/07j7r,1
4,000026e7ee790996,verification,/m/04rky,0


In [5]:
raw_bbox_labels.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
0,000026e7ee790996,xclick,/m/07j7r,1,0.071875,0.145313,0.20625,0.391667,0,1,1,0,0
1,000026e7ee790996,xclick,/m/07j7r,1,0.439063,0.571875,0.264583,0.435417,0,1,1,0,0
2,000026e7ee790996,xclick,/m/07j7r,1,0.66875,1.0,0.0,0.552083,0,1,1,0,0
3,000062a39995e348,xclick,/m/015p6,1,0.206208,0.849224,0.154639,1.0,0,0,0,0,0
4,000062a39995e348,xclick,/m/05s2s,1,0.137472,0.37694,0.0,0.883652,1,1,0,0,0


the penguin id (found from the oidv7-class-descriptions-boxable.csv file):\
/m/05z6w

In [16]:
# alternatively, the image labels below could be filtered on the same LabelName and also on Confidence == 1;
# then the row-removal step further down would not be needed

In [6]:
# keep only the annotation rows whose LabelName is the penguin class id (/m/05z6w)
penguin_image_labels = raw_image_labels.loc[raw_image_labels['LabelName'].eq('/m/05z6w')]
penguin_bbox_labels = raw_bbox_labels.loc[raw_bbox_labels['LabelName'].eq('/m/05z6w')]

In [7]:
penguin_image_labels.head(97)

Unnamed: 0,ImageID,Source,LabelName,Confidence
6084,01e8e6894bf88777,verification,/m/05z6w,1
7654,025dfa9ea18bcb18,verification,/m/05z6w,0
14090,045f2a7a01277369,verification,/m/05z6w,0
17172,0555911ee31d049f,verification,/m/05z6w,0
24612,07ca230691eec10e,verification,/m/05z6w,1
...,...,...,...,...
482309,9ee32a851eba0eea,verification,/m/05z6w,1
492643,a25a74a471183cee,verification,/m/05z6w,1
495405,a32f24bc0220911f,verification,/m/05z6w,0
516197,a9ddfcc0e100968c,verification,/m/05z6w,1


In [8]:
# get number of rows (number of images)
penguin_image_labels.shape

(144, 4)

In [9]:
penguin_bbox_labels.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
7934,01e8e6894bf88777,xclick,/m/05z6w,1,0.050186,0.899628,0.014035,0.84386,0,0,0,0,0
30971,07ca230691eec10e,xclick,/m/05z6w,1,0.0,0.466814,0.625369,1.0,1,1,0,0,0
30972,07ca230691eec10e,xclick,/m/05z6w,1,0.327434,0.761062,0.122419,0.501475,1,0,0,0,0
30973,07ca230691eec10e,xclick,/m/05z6w,1,0.396018,0.818584,0.215339,0.971976,0,0,0,0,0
37464,099862b891b82c9a,xclick,/m/05z6w,1,0.526549,0.621681,0.309735,0.40118,0,0,0,1,0


In [10]:
# get number of rows (number of bboxs)
penguin_bbox_labels.shape

(165, 13)

#### Creating a text file that lists all the images that contain penguins, then downloading those images to a folder - you only need to do this once for each of the test, train and val datasets ####

In [11]:
# First I would like to just check that all the ids appear in both lists (sanity check)
# I will remove anything that does not appear in both 
def check_id_presence_in_dfs(df1, df2, id_column, df1_name='df1', df2_name='df2'):
    """
    Find the IDs that are present in only one of two dataframes.

    Nothing is printed. Two lists are always returned; both are empty when
    every ID appears in both dataframes.

    :param df1: First dataframe (any object where ``df1[id_column]`` yields the IDs).
    :param df2: Second dataframe (same requirement as ``df1``).
    :param id_column: Name of the column holding the IDs to compare.
    :param df1_name: Unused; kept so existing callers keep working.
    :param df2_name: Unused; kept so existing callers keep working.
    :return: Tuple ``(only_in_df1, only_in_df2)`` of lists with the unique IDs
        found only in ``df1`` and only in ``df2`` respectively, each in order
        of first appearance in its dataframe.
    """
    # dict.fromkeys de-duplicates like a set but keeps first-seen order, so the
    # returned lists are reproducible between runs (set iteration order is not).
    ids_df1 = dict.fromkeys(df1[id_column])
    ids_df2 = dict.fromkeys(df2[id_column])

    # IDs that are only in df1 / only in df2
    df1_list_2remove = [id_ for id_ in ids_df1 if id_ not in ids_df2]
    df2_list_2remove = [id_ for id_ in ids_df2 if id_ not in ids_df1]

    return df1_list_2remove, df2_list_2remove

In [12]:
# collect the ImageIDs that appear in only one of the two penguin dataframes;
# two empty lists mean every image is present in both
penguin_image_labels_2remove, penguin_bbox_labels_2remove = check_id_presence_in_dfs(
    df1=penguin_image_labels,
    df2=penguin_bbox_labels,
    id_column='ImageID',
    df1_name='label',
    df2_name='bbox',
)

In [13]:
print(len(penguin_image_labels_2remove))
print(len(penguin_bbox_labels_2remove))

47
0


In [14]:
# function to remove ids that don't appear in both lists
def remove_entries_by_ids(df, id_list, id_column):
    """Return the rows of ``df`` whose ``id_column`` value is not in ``id_list``."""
    # True for every row we want to keep
    keep_mask = ~df[id_column].isin(id_list)
    return df.loc[keep_mask]

In [15]:
# drop the rows whose ImageID was found in only one of the two dataframes
penguin_image_labels = remove_entries_by_ids(
    df=penguin_image_labels,
    id_list=penguin_image_labels_2remove,
    id_column='ImageID',
)
penguin_bbox_labels = remove_entries_by_ids(
    df=penguin_bbox_labels,
    id_list=penguin_bbox_labels_2remove,
    id_column='ImageID',
)

In [16]:
penguin_image_labels.shape

(97, 4)

In [17]:
penguin_bbox_labels.shape

(165, 13)

In [19]:
# create a text file with the following layout:
#$SPLIT/$IMAGE_ID, where $SPLIT is either "train", "test", "validation", or "challenge2018"; and $IMAGE_ID is the image ID that uniquely identifies the image
# so the text file will look like this:
#train/f9e0434389a1d4dd
#train/1a007563ebc18664
#test/ea8bfd4e765304db
# the following function should create the required text file:
def write_ids_to_file(location, name, df, id_column='ImageID'):
    """
    Write one 'name/id' line per ID to the text file '<location>/<name>_ids.txt'.

    The file is created or overwritten. IDs are written in the order they occur
    in the column, and duplicates are not removed.

    :param location: String, the directory where the text file will be saved.
    :param name: String, the split name ("train", "test", ...); used both as the
        prefix before each ID and in the output file name.
    :param df: DataFrame (or any object where ``df[id_column]`` yields the IDs).
    :param id_column: String, the column holding the image IDs. Defaults to
        'ImageID', the column used by the annotation dataframes in this notebook.
    """
    # Local import keeps the function self-contained within its notebook cell.
    import os

    # os.path.join avoids producing a root-level path when location is empty
    # and a doubled separator when location already ends with one.
    output_path = os.path.join(location, f"{name}_ids.txt")

    with open(output_path, "w", encoding="utf-8") as file:
        # One buffered call instead of a write() per ID.
        file.writelines(f"{name}/{id_}\n" for id_ in df[id_column])

In [11]:
# write the download list for the test images into the raw data folder
write_ids_to_file(location=ROOT_RAW_OI7_PATH, name='test', df=penguin_image_labels)

Now run the following command in the terminal:\
python3 /home/matthew/Desktop/Master_Dev/masters_penguin_pose_estimation/notebooks/OpenImagesV7_downloader.py /home/matthew/Desktop/Master_Dev/masters_penguin_pose_estimation/data/raw/OpenImagesV7/image_list_file.txt --download_folder=/home/matthew/Desktop/Master_Dev/masters_penguin_pose_estimation/data/raw/OpenImagesV7/OpenImageV7_raw_images --num_processes=5

but replace image_list_file.txt with the name of the text file you just created (test_ids.txt for the test split).\
\
The images should be copied to the /home/matthew/Desktop/Master_Dev/masters_penguin_pose_estimation/data/raw/OpenImagesV7/OpenImageV7_raw_images directory 

In [19]:
# show the image-label row(s) that belong to the given image id
input_id = '0d6a527e509d7e6e'
matching_row = penguin_image_labels.loc[penguin_image_labels['ImageID'].eq(input_id)]
print(matching_row)

Empty DataFrame
Columns: [ImageID, Source, LabelName, Confidence]
Index: []


In [18]:
# show the bounding-box row(s) that belong to the given image id
input_id = '0d6a527e509d7e6e'
matching_row = penguin_bbox_labels.loc[penguin_bbox_labels['ImageID'].eq(input_id)]
print(matching_row)

Empty DataFrame
Columns: [ImageID, Source, LabelName, Confidence, XMin, XMax, YMin, YMax, IsOccluded, IsTruncated, IsGroupOf, IsDepiction, IsInside]
Index: []


#### Create the annotation files ####
here we need to create a text file for each image with the following format for each bounding box in that image, where each bounding box is on a new line:\
0 x_centre_value y_centre_value width height \
\
note that all values are normalised between zero and 1

##### convert the bboxs to Yolo format: XCentre YCentre Width Height and save these as new columns to the df and save that dataframe to a .json file - might need to convert it to a series first

In [None]:
# first we need to create a function to get bounding boxes in an array in a format:
# x1 y1 x2 y2
# since the origin is in the top left corner, that is to say it must be:
# XMin Ymin XMax YMax

