In [1]:
import json
import numpy as np
import pandas as pd
from loguru import logger
from tqdm import tqdm
from training.train_config import CFG

In [22]:
def class_priors(df: pd.DataFrame) -> np.ndarray:
    """Calculates the class priors for a given DataFrame.

    Args:
        df (DataFrame): The DataFrame containing the class labels.

    Returns:
        class_priors (np.ndarray): An array containing the class priors.
    """
    logger.info("Calculating class priors")
    class_priors = np.zeros(len(df["class_id"].unique()))
    for species in df["class_id"].unique():
        class_priors[species] = len(df[df["class_id"] == species])

    return class_priors / sum(class_priors)


def month_distributions(df):
    """Calculates the distribution of mushroom classes for each month in the dataset.

    Args:
        df (DataFrame): The input DataFrame containing the mushroom data.

    Returns:
        dict: A dictionary containing the distribution of mushroom classes for each month.
    """
    logger.info("Calculating month distributions")
    month_distributions = {}

    for _, observation in tqdm(df.iterrows(), total=len(df)):
        month = str(observation["date"].month)
        if month not in month_distributions:
            month_distributions[month] = np.zeros(len(df["class_id"].unique()))
        else:
            class_id = observation.class_id
            month_distributions[month][class_id] += 1

    for key, value in month_distributions.items():
        month_distributions[key] = value / sum(value)
    return month_distributions


def parse_json(filepath, is_test=False, categories=None):
    """Parses a JSON file and returns relevant dataframes.

    Args:
        filepath (pathlib.Path): The path to the JSON file.
        is_test (bool, optional): Whether the JSON file is a test file. Defaults to False.
        categories (DataFrame, optional): A dataframe containing categories. Defaults to None.

    Returns:
        DataFrame: A dataframe containing information.
        DataFrame: A dataframe containing images.
        DataFrame: A dataframe containing annotations (if not a test file).
        DataFrame: A dataframe containing categories (if categories parameter is not None and not a test file).
    """
    with open(filepath, "r") as f:
        res = json.load(f)
    info = pd.DataFrame.from_dict(res["info"], orient="index")
    images = pd.DataFrame(res["images"]).set_index("id")
    if not is_test:
        annotations = pd.DataFrame(res["annotations"]).set_index("id")
        if categories:
            categories = pd.DataFrame(res["categories"]).set_index("id")
            return info, images, annotations, categories
        return info, images, annotations

    return info, images


def join_dataframes(images, annotations, categories, dset=None, locations=None):
    """Join dataframes containing information about images, annotations, categories, and locations (optional).
    Only categories with the supercategory 'Fungi' are included.

    Args:
        images (DataFrame): dataframe containing information about images
        annotations (DataFrame): dataframe containing information about annotations
        categories (DataFrame): dataframe containing information about categories
        locations (DataFrame, optional): dataframe containing information about image locations

    Returns:
        df (DataFrame): merged dataframe with selected columns dropped
    """
    categories = categories[categories["supercategory"] == "Fungi"].rename(
        columns={"id": "category_id"}
    )
    if locations is None:  # some datasets do not have location information
        df = pd.merge(
            categories, annotations, right_on="category_id", left_index=True
        ).merge(images, left_on="image_id", right_index=True)
    else:
        df = pd.merge(annotations, categories, on="category_id").set_index("image_id")
        df = df.merge(images, left_index=True, right_index=True)
        df = df.merge(locations, right_index=True, left_index=True)
        
    df = df.drop(
        ["supercategory", "kingdom", "image_id", "valid", "license", "rights_holder"],
        errors="ignore",
    )
    if dset is not None:
        df["dset"] = dset
    return df


# @flow(name='Parse2018Data')
def parse_2018_data(data_root):
    """Parses the 2018 mushroom dataset from the given data root directory.

    Args:
        data_root (pathlib.Path): The root directory of the dataset.

    Returns:
        DataFrame: A dataframe containing the parsed data.
    """
    logger.info(f"Parsing 2018 data from {data_root}")

    # Parse categories
    with open(data_root / "categories.json", "r") as f:
        cats = pd.DataFrame(json.load(f))

    # Parse train and validation data
    (timages2018, tanno2018), (vimages2018, vanno2018) = [
        parse_json(data_root / f"{s}2018.json")[1:]
        for s in ["train", "val"]
    ]

    # Parse train and validation locations
    tloc, vloc = [
        pd.read_json(data_root / "inat2018_locations" / f"{s}2018_locations.json").set_index("id")
        for s in ["train", "val"]
    ]

    # Join dataframes and save which dset they are from
    val = join_dataframes(vimages2018, vanno2018, cats, locations=vloc, dset="val")
    train = join_dataframes(timages2018, tanno2018, cats, locations=tloc, dset="train")
    df = pd.concat([train, val]).reset_index(drop=True)
    df["dataset"] = "2018"

    # Create new directories and paths
    df['file_path'] = str(data_root) + "/" + df["file_name"]
    df["file_name"] = df["file_name"].str.split("/").str[-1]
    df["specific_epithet"] = df["name"].str.split().str[-1]

    # Drop unneeded columns and rename others
    df = df.drop(["category_id", "date_c"], axis=1).rename(
        columns={
            "lon": "longitude",
            "lat": "latitude",
            "loc_uncert": "location_uncertainty",
        }
    )
    logger.debug(f"2018 dataframe shape {df.shape}")
    return df


# @flow(name='Parse2021Data')
def parse_2021_data(data_root):
    """Parses 2021 mushroom data from the given data root directory.

    Args:
        data_root (pathlib.Path): The root directory of the 2021 mushroom data.

    Returns:
        DataFrame: A concatenated dataframe of the parsed mushroom data.
    """
    logger.info(f"Parsing 2021 data from {data_root}")
    sets = ["train", "val"]

    dfs = [
        join_dataframes(
            *parse_json(data_root / f"{s}.json", categories=True)[1:], dset=s
        )
        for s in sets
    ]
    df = pd.concat(dfs, ignore_index=True)

    df["dataset"] = "2021"
    df['file_path'] = str(data_root) + "/" + df["file_name"]
    df["file_name"] = df["file_name"].str.split("/").str[-1]
    
    df = df.drop(["category_id", "common_name"], axis=1)
    logger.debug(f"2021 dataframe shape {df.shape}")
    return df


# @flow(name='JoinDatasets')
def join_datasets(CFG, root) -> tuple:
    """Join two mushroom datasets, parse date column, create file path and GCS path columns,
    create class ID column, calculate month distribution and class prior, and return the
    concatenated dataframe and month distribution as a tuple.

    Args:
        gcs_bucket (str): The name of the Google Cloud Storage bucket.
        root (pathlib.Path): The root directory of the mushroom datasets.

    Returns:
        tuple: A tuple containing the concatenated dataframe and month distribution.
    """
    df1 = parse_2018_data(root / "2018")
    df2 = parse_2021_data(root / "2021")
    logger.info("Joining all datasets")
    df = pd.concat([df1, df2], ignore_index=True)

    df["date"] = pd.to_datetime(df["date"], format="mixed", utc=True)
    df["class_id"] = df["name"].astype("category").cat.codes
    df['gcs_path'] = df.apply(lambda x: f"gs://{CFG.GCS_REPO}/data/raw/{x['dataset']}/{x['phylum']}_{x['class']}_{x['order']}_{x['family']}_{x['genus']}_{x['specific_epithet']}/{x['file_name']}", axis=1)

    month_distribution = month_distributions(df)
    class_prior = class_priors(df)

    df["class_priors"] = df["class_id"].map(dict(enumerate(class_prior)))

    return df, month_distribution

In [None]:
if __name__ == "__main__":
    from os import environ
    root = environ['PYTHONPATH']
    raw_data_root = CFG.DATA / "raw"

    df, month_distribution = join_datasets(CFG, raw_data_root)
    
    logger.debug(f"Final dataframe shape {df.shape}")
    df.to_csv(CFG.DATA / "train.csv", index=False)

    logger.info("Deleting unused images")
    total_filelist = raw_data_root.rglob('*.jpg')
    total_fileset = set([x for x in total_filelist])

    keep_set = set(df['file_name'].values.tolist())

    files_to_delete = total_fileset - keep_set

    for file in files_to_delete:
        file.unlink()


In [9]:
from os import environ
from pathlib import Path
root = Path(environ['PYTHONPATH'].split(":")[0])
raw_data_root = root / 'training' / 'data' / "raw"

In [16]:
parse_2021_data(raw_data_root / "2021")

[32m2023-11-11 04:18:33.164[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_2021_data[0m:[36m169[0m - [1mParsing 2021 data from /home/broug/Mushroom-Classifier/training/data/raw/2021[0m
[32m2023-11-11 04:18:48.197[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mparse_2021_data[0m:[36m187[0m - [34m[1m2021 dataframe shape (93458, 23)[0m


Unnamed: 0,name,supercategory,kingdom,phylum,class,order,family,genus,specific_epithet,image_dir_name,...,file_name,license,rights_holder,date,latitude,longitude,location_uncertainty,dset,dataset,file_path
0,Herpothallon rubrocinctum,Fungi,Fungi,Ascomycota,Arthoniomycetes,Arthoniales,Arthoniaceae,Herpothallon,rubrocinctum,05388_Fungi_Ascomycota_Arthoniomycetes_Arthoni...,...,43dec0a2-c4cd-4d80-8033-085e378a7bee.jpg,6,Fluff Berger,2014-02-11 16:35:28+00:00,26.57042,-81.82626,217.0,train,2021,/home/broug/Mushroom-Classifier/training/data/...
1,Herpothallon rubrocinctum,Fungi,Fungi,Ascomycota,Arthoniomycetes,Arthoniales,Arthoniaceae,Herpothallon,rubrocinctum,05388_Fungi_Ascomycota_Arthoniomycetes_Arthoni...,...,d31929ec-1d6c-4db2-a3ec-e974ec7bc0e5.jpg,4,paloma,2014-07-31 16:23:19+00:00,-0.62559,-90.38539,,train,2021,/home/broug/Mushroom-Classifier/training/data/...
2,Herpothallon rubrocinctum,Fungi,Fungi,Ascomycota,Arthoniomycetes,Arthoniales,Arthoniaceae,Herpothallon,rubrocinctum,05388_Fungi_Ascomycota_Arthoniomycetes_Arthoni...,...,c09db9e3-0c6b-48a8-bad8-ee4fbdd2eef7.jpg,0,Marisol Báez,2015-03-04 16:05:40+00:00,19.50928,-96.94537,1414.0,train,2021,/home/broug/Mushroom-Classifier/training/data/...
3,Herpothallon rubrocinctum,Fungi,Fungi,Ascomycota,Arthoniomycetes,Arthoniales,Arthoniaceae,Herpothallon,rubrocinctum,05388_Fungi_Ascomycota_Arthoniomycetes_Arthoni...,...,8acff645-131f-4e3e-889c-30f5864ffd74.jpg,1,James Bailey,2015-01-16 00:00:00+00:00,28.62446,-82.35767,58.0,train,2021,/home/broug/Mushroom-Classifier/training/data/...
4,Herpothallon rubrocinctum,Fungi,Fungi,Ascomycota,Arthoniomycetes,Arthoniales,Arthoniaceae,Herpothallon,rubrocinctum,05388_Fungi_Ascomycota_Arthoniomycetes_Arthoni...,...,aa8e75d2-2aa4-4a98-b839-5a64cb2d2caf.jpg,1,Hervin Barrios,2011-09-27 00:00:00+00:00,15.67858,-92.75620,,train,2021,/home/broug/Mushroom-Classifier/training/data/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93453,Tremella mesenterica,Fungi,Fungi,Basidiomycota,Tremellomycetes,Tremellales,Tremellaceae,Tremella,mesenterica,05728_Fungi_Basidiomycota_Tremellomycetes_Trem...,...,51f13b0b-f4f7-46ed-9e53-4d0c63ddad6b.jpg,0,Maxim Shashkov,2019-05-11 09:41:04+00:00,53.62273,35.86882,13.0,val,2021,/home/broug/Mushroom-Classifier/training/data/...
93454,Tremella mesenterica,Fungi,Fungi,Basidiomycota,Tremellomycetes,Tremellales,Tremellaceae,Tremella,mesenterica,05728_Fungi_Basidiomycota_Tremellomycetes_Trem...,...,294cf8dc-bfaa-45b7-a7a8-205d1a22e33d.jpg,1,ninakerr01,2019-06-09 04:24:29+00:00,-37.64360,143.64174,44.0,val,2021,/home/broug/Mushroom-Classifier/training/data/...
93455,Tremella mesenterica,Fungi,Fungi,Basidiomycota,Tremellomycetes,Tremellales,Tremellaceae,Tremella,mesenterica,05728_Fungi_Basidiomycota_Tremellomycetes_Trem...,...,b3b8cb90-d5d6-4118-826f-d69c6803a11a.jpg,4,megachile,2019-06-16 19:16:00+00:00,42.48023,-83.19875,197.0,val,2021,/home/broug/Mushroom-Classifier/training/data/...
93456,Tremella mesenterica,Fungi,Fungi,Basidiomycota,Tremellomycetes,Tremellales,Tremellaceae,Tremella,mesenterica,05728_Fungi_Basidiomycota_Tremellomycetes_Trem...,...,f38f1edd-34f2-4e8d-aaeb-c95ed22dd1ca.jpg,1,crazybirdy,2018-10-09 00:00:00+00:00,45.58926,-78.37099,15.0,val,2021,/home/broug/Mushroom-Classifier/training/data/...


In [17]:
parse_2018_data(raw_data_root / "2018")

[32m2023-11-11 04:19:34.967[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_2018_data[0m:[36m115[0m - [1mParsing 2018 data from /home/broug/Mushroom-Classifier/training/data/raw/2018[0m
[32m2023-11-11 04:19:38.145[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mparse_2018_data[0m:[36m155[0m - [34m[1m2018 dataframe shape (7827, 23)[0m


Unnamed: 0,kingdom,name,family,supercategory,phylum,order,genus,class,license,file_name,...,location_uncertainty,date,valid,user_id,latitude,longitude,dset,dataset,file_path,specific_epithet
0,Fungi,Phallus indusiatus,Phallaceae,Fungi,Basidiomycota,Phallales,Phallus,Agaricomycetes,2,8d841f576d05e05f0b4b5513d549630a.jpg,...,599,2014-10-01,True,25945,21.151650,-99.541750,train,2018,/home/broug/Mushroom-Classifier/training/data/...,indusiatus
1,Fungi,Phallus indusiatus,Phallaceae,Fungi,Basidiomycota,Phallales,Phallus,Agaricomycetes,3,f84f23fe93f3fde53f7193e3cc08d473.jpg,...,10000,2014-09-15,True,41331,18.417353,-95.171649,train,2018,/home/broug/Mushroom-Classifier/training/data/...,indusiatus
2,Fungi,Phallus indusiatus,Phallaceae,Fungi,Basidiomycota,Phallales,Phallus,Agaricomycetes,3,dabc337065c65a0ed19707c4a595bcb1.jpg,...,500,2016-02-22,True,23639,-8.566403,115.213867,train,2018,/home/broug/Mushroom-Classifier/training/data/...,indusiatus
3,Fungi,Phallus indusiatus,Phallaceae,Fungi,Basidiomycota,Phallales,Phallus,Agaricomycetes,3,8980532c8a1ef146bd3ed8d54f362b76.jpg,...,10000,2012-07-05,True,42137,16.841622,-93.019913,train,2018,/home/broug/Mushroom-Classifier/training/data/...,indusiatus
4,Fungi,Phallus indusiatus,Phallaceae,Fungi,Basidiomycota,Phallales,Phallus,Agaricomycetes,2,2d3e29631fa00733061111743ceb734c.jpg,...,0,2002-10-09,True,1000,-9.702458,-70.694447,train,2018,/home/broug/Mushroom-Classifier/training/data/...,indusiatus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7822,Fungi,Helvella acetabulum,Helvellaceae,Fungi,Ascomycota,Pezizales,Helvella,Pezizomycetes,3,5a6e4da01ba6797f44fe5309a485c17c.jpg,...,272,2006-04-20,True,38530,37.665648,-122.045166,val,2018,/home/broug/Mushroom-Classifier/training/data/...,acetabulum
7823,Fungi,Helvella acetabulum,Helvellaceae,Fungi,Ascomycota,Pezizales,Helvella,Pezizomycetes,3,a235a74bdf774b4a00c5990ebc777110.jpg,...,7327,2015-03-07,True,5179,36.483550,-121.714500,val,2018,/home/broug/Mushroom-Classifier/training/data/...,acetabulum
7824,Fungi,Sirococcus clavigignenti-juglandacearum,,Fungi,Ascomycota,Diaporthales,Sirococcus,Sordariomycetes,3,def3a53551106e7147705b0684a5a297.jpg,...,100,2017-05-05,True,164015,45.512544,-73.586879,val,2018,/home/broug/Mushroom-Classifier/training/data/...,clavigignenti-juglandacearum
7825,Fungi,Sirococcus clavigignenti-juglandacearum,,Fungi,Ascomycota,Diaporthales,Sirococcus,Sordariomycetes,3,a99f33cd0483f20ddae29be33af42930.jpg,...,27174,2017-04-23,True,164015,45.386961,-75.468014,val,2018,/home/broug/Mushroom-Classifier/training/data/...,clavigignenti-juglandacearum


In [23]:
df, month_d = join_datasets(CFG, raw_data_root)

[32m2023-11-11 04:28:56.841[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_2018_data[0m:[36m115[0m - [1mParsing 2018 data from /home/broug/Mushroom-Classifier/training/data/raw/2018[0m
[32m2023-11-11 04:29:00.190[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mparse_2018_data[0m:[36m152[0m - [34m[1m2018 dataframe shape (7827, 23)[0m
[32m2023-11-11 04:29:00.213[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_2021_data[0m:[36m166[0m - [1mParsing 2021 data from /home/broug/Mushroom-Classifier/training/data/raw/2021[0m
[32m2023-11-11 04:29:15.733[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mparse_2021_data[0m:[36m182[0m - [34m[1m2021 dataframe shape (93458, 23)[0m
[32m2023-11-11 04:29:15.742[0m | [1mINFO    [0m | [36m__main__[0m:[36mjoin_datasets[0m:[36m201[0m - [1mJoining all datasets[0m
[32m2023-11-11 04:29:17.585[0m | [1mINFO    [0m | [36m__main__[0m:[36mmonth_distributions[0m:[36m27[0m - [1mCalculating month dis

In [25]:
df['file_path'].values

array(['/home/broug/Mushroom-Classifier/training/data/raw/2018/train_val2018/Fungi/5156/8d841f576d05e05f0b4b5513d549630a.jpg',
       '/home/broug/Mushroom-Classifier/training/data/raw/2018/train_val2018/Fungi/5156/f84f23fe93f3fde53f7193e3cc08d473.jpg',
       '/home/broug/Mushroom-Classifier/training/data/raw/2018/train_val2018/Fungi/5156/dabc337065c65a0ed19707c4a595bcb1.jpg',
       ...,
       '/home/broug/Mushroom-Classifier/training/data/raw/2021/val/05728_Fungi_Basidiomycota_Tremellomycetes_Tremellales_Tremellaceae_Tremella_mesenterica/b3b8cb90-d5d6-4118-826f-d69c6803a11a.jpg',
       '/home/broug/Mushroom-Classifier/training/data/raw/2021/val/05728_Fungi_Basidiomycota_Tremellomycetes_Tremellales_Tremellaceae_Tremella_mesenterica/f38f1edd-34f2-4e8d-aaeb-c95ed22dd1ca.jpg',
       '/home/broug/Mushroom-Classifier/training/data/raw/2021/val/05728_Fungi_Basidiomycota_Tremellomycetes_Tremellales_Tremellaceae_Tremella_mesenterica/f8aa8d5e-6dab-4a50-9fc1-8a893d3740c6.jpg'],
      dtype=

In [None]:
total_filelist = raw_data_root.rglob('*.jpg')
total_fileset = set([x for x in total_filelist])

keep_set = set(df['file_name'].values.tolist())

files_to_delete = total_fileset - keep_set