In [22]:
import json
import numpy as np
import pandas as pd
from loguru import logger
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from training.train_config import CFG
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [23]:
from os import environ
from pathlib import Path
root = Path(environ['PYTHONPATH'].split(":")[0])
raw_data_root = root / 'training' / 'data' / "raw"

In [24]:
def stratified_sample(df: pd.DataFrame, groupby_column: str, sampling_rate: float = 0.01) -> pd.DataFrame:
    assert 0.0 < sampling_rate <= 1.0
    assert groupby_column in df.columns

    num_rows = int((df.shape[0] * sampling_rate) // 1)
    num_classes = len(df[groupby_column].unique())
    num_rows_per_class = int(max(1, ((num_rows / num_classes) // 1)))
    df_sample = df.groupby(groupby_column, group_keys=False).apply(lambda x: x.sample(min(len(x), num_rows_per_class)))

    return df_sample


# @task
def load_dataframe(root_path):
    """This function loads a dataframe from the given root path.

    Parameters:
        root_path (pathlib.Path): The root path where the dataframe is located.

    Returns:
        DataFrame: The loaded dataframe.
    """
    df = pd.read_csv(root_path.parent / "train.csv")
    if CFG.COMBINE_TRAIN_VAL:
        df = stratified_sample(df, "class_id", sampling_rate=1.0)
    else:
        val = stratified_sample(df[df["dset"] == "val"], "class_id", sampling_rate=1.0)
        train = stratified_sample(df[df["dset"] == "train"], "class_id", sampling_rate=1.0)
        
    del df
    logger.info("Loaded train and val dataframes")
    logger.debug(f"Train shape: {train.shape}  :  val shape: {val.shape}")
    return train, val

In [35]:
df = pd.read_csv(raw_data_root.parent / "train.csv")

  df = pd.read_csv(raw_data_root.parent / "train.csv")


In [36]:
df

Unnamed: 0,kingdom,name,family,supercategory,phylum,order,genus,class,license,file_name,rights_holder,height,width,location_uncertainty,date,valid,user_id,latitude,longitude,dset,dataset,file_path,specific_epithet,image_dir_name,image_id,class_id,gcs_path,class_priors
0,Fungi,Phallus indusiatus,Phallaceae,Fungi,Basidiomycota,Phallales,Phallus,Agaricomycetes,2,8d841f576d05e05f0b4b5513d549630a.jpg,alan_rockefeller,800,600,599.0,2014-10-01 00:00:00+00:00,True,25945.0,21.151650,-99.541750,train,2018,/home/broug/Desktop/Mushroom-Classifier/traini...,indusiatus,,,316,gs://mush-img-repo/data/raw/2018/Basidiomycota...,0.002409
1,Fungi,Phallus indusiatus,Phallaceae,Fungi,Basidiomycota,Phallales,Phallus,Agaricomycetes,3,f84f23fe93f3fde53f7193e3cc08d473.jpg,angelicabecerrat,600,800,10000.0,2014-09-15 00:00:00+00:00,True,41331.0,18.417353,-95.171649,train,2018,/home/broug/Desktop/Mushroom-Classifier/traini...,indusiatus,,,316,gs://mush-img-repo/data/raw/2018/Basidiomycota...,0.002409
2,Fungi,Phallus indusiatus,Phallaceae,Fungi,Basidiomycota,Phallales,Phallus,Agaricomycetes,3,dabc337065c65a0ed19707c4a595bcb1.jpg,Jade McGraw,800,600,500.0,2016-02-22 00:00:00+00:00,True,23639.0,-8.566403,115.213867,train,2018,/home/broug/Desktop/Mushroom-Classifier/traini...,indusiatus,,,316,gs://mush-img-repo/data/raw/2018/Basidiomycota...,0.002409
3,Fungi,Phallus indusiatus,Phallaceae,Fungi,Basidiomycota,Phallales,Phallus,Agaricomycetes,3,8980532c8a1ef146bd3ed8d54f362b76.jpg,belenjd,800,600,10000.0,2012-07-05 00:00:00+00:00,True,42137.0,16.841622,-93.019913,train,2018,/home/broug/Desktop/Mushroom-Classifier/traini...,indusiatus,,,316,gs://mush-img-repo/data/raw/2018/Basidiomycota...,0.002409
4,Fungi,Phallus indusiatus,Phallaceae,Fungi,Basidiomycota,Phallales,Phallus,Agaricomycetes,2,2d3e29631fa00733061111743ceb734c.jpg,116916927065934112165,600,800,0.0,2002-10-09 00:00:00+00:00,True,1000.0,-9.702458,-70.694447,train,2018,/home/broug/Desktop/Mushroom-Classifier/traini...,indusiatus,,,316,gs://mush-img-repo/data/raw/2018/Basidiomycota...,0.002409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101280,Fungi,Tremella mesenterica,Tremellaceae,Fungi,Basidiomycota,Tremellales,Tremella,Tremellomycetes,0,51f13b0b-f4f7-46ed-9e53-4d0c63ddad6b.jpg,Maxim Shashkov,375,500,13.0,2019-05-11 09:41:04+00:00,,,53.622730,35.868820,val,2021,/home/broug/Desktop/Mushroom-Classifier/traini...,mesenterica,05728_Fungi_Basidiomycota_Tremellomycetes_Trem...,2748639.0,435,gs://mush-img-repo/data/raw/2021/Basidiomycota...,0.003505
101281,Fungi,Tremella mesenterica,Tremellaceae,Fungi,Basidiomycota,Tremellales,Tremella,Tremellomycetes,1,294cf8dc-bfaa-45b7-a7a8-205d1a22e33d.jpg,ninakerr01,375,500,44.0,2019-06-09 04:24:29+00:00,,,-37.643600,143.641740,val,2021,/home/broug/Desktop/Mushroom-Classifier/traini...,mesenterica,05728_Fungi_Basidiomycota_Tremellomycetes_Trem...,2753098.0,435,gs://mush-img-repo/data/raw/2021/Basidiomycota...,0.003505
101282,Fungi,Tremella mesenterica,Tremellaceae,Fungi,Basidiomycota,Tremellales,Tremella,Tremellomycetes,4,b3b8cb90-d5d6-4118-826f-d69c6803a11a.jpg,megachile,341,500,197.0,2019-06-16 19:16:00+00:00,,,42.480230,-83.198750,val,2021,/home/broug/Desktop/Mushroom-Classifier/traini...,mesenterica,05728_Fungi_Basidiomycota_Tremellomycetes_Trem...,2754429.0,435,gs://mush-img-repo/data/raw/2021/Basidiomycota...,0.003505
101283,Fungi,Tremella mesenterica,Tremellaceae,Fungi,Basidiomycota,Tremellales,Tremella,Tremellomycetes,1,f38f1edd-34f2-4e8d-aaeb-c95ed22dd1ca.jpg,crazybirdy,500,410,15.0,2018-10-09 00:00:00+00:00,,,45.589260,-78.370990,val,2021,/home/broug/Desktop/Mushroom-Classifier/traini...,mesenterica,05728_Fungi_Basidiomycota_Tremellomycetes_Trem...,2777349.0,435,gs://mush-img-repo/data/raw/2021/Basidiomycota...,0.003505


In [31]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
skf = StratifiedKFold(n_splits=50, shuffle=True, random_state=42)

In [37]:
for fold, (train_idx, val_idx) in enumerate(skf.split(df, df["class_id"])):
    df_train = df.iloc[train_idx]
    df_val = df.iloc[val_idx]



In [38]:
df_val

Unnamed: 0,kingdom,name,family,supercategory,phylum,order,genus,class,license,file_name,rights_holder,height,width,location_uncertainty,date,valid,user_id,latitude,longitude,dset,dataset,file_path,specific_epithet,image_dir_name,image_id,class_id,gcs_path,class_priors
53,Fungi,Niebla homalea,Ramalinaceae,Fungi,Ascomycota,Lecanorales,Niebla,Lecanoromycetes,3,bb186b1488543e954895f5039349aa03.jpg,JJ Johnson,533,800,0.0,2015-09-19 00:00:00+00:00,True,28037.0,37.718181,-122.462659,train,2018,/home/broug/Desktop/Mushroom-Classifier/traini...,homalea,,,290,gs://mush-img-repo/data/raw/2018/Ascomycota_Le...,0.000365
94,Fungi,Clavulina cristata,Clavulinaceae,Fungi,Basidiomycota,Cantharellales,Clavulina,Agaricomycetes,3,c470a61793ffec73fcb8285835266c97.jpg,Mike Leveille,607,800,197.0,2016-10-30 00:00:00+00:00,True,109854.0,45.373876,-76.091087,train,2018,/home/broug/Desktop/Mushroom-Classifier/traini...,cristata,,,91,gs://mush-img-repo/data/raw/2018/Basidiomycota...,0.000227
138,Fungi,Schizophyllum commune,Schizophyllaceae,Fungi,Basidiomycota,Agaricales,Schizophyllum,Agaricomycetes,3,b1b8631b4f049646dbd73d1163ce3630.jpg,Alan Mond,600,800,5.0,2014-12-25 00:00:00+00:00,True,61225.0,37.803173,-122.158665,train,2018,/home/broug/Desktop/Mushroom-Classifier/traini...,commune,,,395,gs://mush-img-repo/data/raw/2018/Basidiomycota...,0.003693
261,Fungi,Polyporus squamosus,Polyporaceae,Fungi,Basidiomycota,Polyporales,Polyporus,Agaricomycetes,3,74ad1c25d20b4765d55d34891ae82414.jpg,Nathan Fortunato,800,600,5.0,2016-05-19 00:00:00+00:00,True,131803.0,41.316601,-81.597709,train,2018,/home/broug/Desktop/Mushroom-Classifier/traini...,squamosus,,,349,gs://mush-img-repo/data/raw/2018/Basidiomycota...,0.001027
363,Fungi,Lobaria pulmonaria,Lobariaceae,Fungi,Ascomycota,Peltigerales,Lobaria,Lecanoromycetes,3,ae38c8a2b5783e1996a5192111163ea9.jpg,Ken-ichi Ueda,800,600,10.0,2015-02-01 00:00:00+00:00,True,1.0,37.196395,-122.294994,train,2018,/home/broug/Desktop/Mushroom-Classifier/traini...,pulmonaria,,,256,gs://mush-img-repo/data/raw/2018/Ascomycota_Le...,0.003851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100619,Fungi,Laetiporus sulphureus,Laetiporaceae,Fungi,Basidiomycota,Polyporales,Laetiporus,Agaricomycetes,1,f2352035-7e22-4915-aaad-440d38c70b4f.jpg,Nikolay Panasenko,332,500,61.0,2012-05-07 00:00:00+00:00,,,52.394200,34.179510,val,2021,/home/broug/Desktop/Mushroom-Classifier/traini...,sulphureus,05662_Fungi_Basidiomycota_Agaricomycetes_Polyp...,2740704.0,231,gs://mush-img-repo/data/raw/2021/Basidiomycota...,0.003870
100752,Fungi,Daedaleopsis confragosa,Polyporaceae,Fungi,Basidiomycota,Polyporales,Daedaleopsis,Agaricomycetes,1,eed796d8-c7ce-48d7-91db-c67551b8fe68.jpg,jeanbuckles,500,375,10.0,2019-03-09 19:46:49+00:00,,,39.449610,-76.550800,val,2021,/home/broug/Desktop/Mushroom-Classifier/traini...,confragosa,05675_Fungi_Basidiomycota_Agaricomycetes_Polyp...,2738702.0,123,gs://mush-img-repo/data/raw/2021/Basidiomycota...,0.003189
100980,Fungi,Auriscalpium vulgare,Auriscalpiaceae,Fungi,Basidiomycota,Russulales,Auriscalpium,Agaricomycetes,1,78168b39-bbea-407f-8e64-09dafede9046.jpg,Rebecca Johnson,500,375,30.0,2019-03-02 19:17:48+00:00,,,37.295150,-122.246930,val,2021,/home/broug/Desktop/Mushroom-Classifier/traini...,vulgare,05698_Fungi_Basidiomycota_Agaricomycetes_Russu...,2736936.0,43,gs://mush-img-repo/data/raw/2021/Basidiomycota...,0.002843
101222,Fungi,Allodus podophylli,Pucciniaceae,Fungi,Basidiomycota,Pucciniales,Allodus,Pucciniomycetes,1,a09d0fae-71bb-4f6c-abb6-02e89d35a2d8.jpg,bwilderman20,500,375,5.0,2019-05-21 13:49:49+00:00,,,39.370270,-82.124020,val,2021,/home/broug/Desktop/Mushroom-Classifier/traini...,podophylli,05722_Fungi_Basidiomycota_Pucciniomycetes_Pucc...,2749638.0,11,gs://mush-img-repo/data/raw/2021/Basidiomycota...,0.002488


In [9]:
df

Unnamed: 0,class_id,file_name
2658,0,a33a2b0b8da57bfeccebfc044ebebdce.jpg
81147,0,54909eac-94e9-441e-9f74-fbf3798d97bb.jpg
81088,0,b42fe3db-38fb-4af9-9b05-ee62ef94ecfe.jpg
81064,0,1e1d0aaf-e43c-4451-a58d-7659d3dfac17.jpg
81039,0,c3eb8212-c751-4688-a024-37008d3a7b43.jpg
...,...,...
16,466,28f836e97682e282424a8457c2367294.jpg
94113,466,2a945ac3-3639-45e7-8d89-5cb692ba0e52.jpg
94022,466,f263b78a-877d-4c7f-85d4-45230ea8c15f.jpg
94102,466,2cb020c9-a921-4ca2-8ef3-f1ef9351a09e.jpg
