In [22]:
import json
import numpy as np
import pandas as pd
from loguru import logger
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from training.train_config import CFG
# Notebook display settings: never truncate columns, and allow up to 500 rows
# before pandas falls back to its abbreviated repr.
pd.options.display.max_columns = None
pd.options.display.max_rows = 500

In [23]:
from os import environ, pathsep
from pathlib import Path

# The first PYTHONPATH entry is used as the root under which the
# `training/data/raw` directory is expected.
# Entries are separated by os.pathsep (":" on POSIX, ";" on Windows); splitting
# on a literal ":" would cut Windows drive letters such as "C:\..." in half.
if 'PYTHONPATH' not in environ:
    raise KeyError(
        "PYTHONPATH is not set; export it with the project root as its first "
        "entry before running this notebook"
    )
root = Path(environ['PYTHONPATH'].split(pathsep)[0])
raw_data_root = root / 'training' / 'data' / "raw"

In [24]:
def stratified_sample(
    df: pd.DataFrame,
    groupby_column: str,
    sampling_rate: float = 0.01,
    random_state=None,
) -> pd.DataFrame:
    """Draw a random sample of ``df`` with an equal row quota per class.

    The total budget is ``floor(len(df) * sampling_rate)`` rows, divided evenly
    between the distinct values of ``groupby_column``. Each class contributes
    ``max(1, floor(budget / num_classes))`` rows, or all of its rows when it has
    fewer than that. Consequently the result can be smaller than the budget,
    and even with ``sampling_rate=1.0`` classes larger than the per-class quota
    are cut down to it.

    Parameters:
        df (DataFrame): The frame to sample from.
        groupby_column (str): Name of the column holding the class label.
        sampling_rate (float): Fraction of rows used to size the budget; must
            satisfy ``0 < sampling_rate <= 1``.
        random_state (int | None): Seed for reproducible sampling. ``None``
            (the default) keeps the previous behaviour of drawing from
            numpy's global random state.

    Returns:
        DataFrame: The sampled rows with their original index labels and all
        original columns (including ``groupby_column``), ordered class by
        class in sorted class order. Rows whose class label is missing are
        not included, because ``groupby`` drops them.

    Raises:
        AssertionError: If ``sampling_rate`` is out of range or
            ``groupby_column`` is not a column of ``df``.
    """
    assert 0.0 < sampling_rate <= 1.0, "sampling_rate must be in (0, 1]"
    assert groupby_column in df.columns, f"{groupby_column!r} is not a column of df"

    num_classes = len(df[groupby_column].unique())
    if num_classes == 0:
        # Empty input: nothing to sample, and splitting the budget between
        # zero classes would divide by zero.
        return df.copy()

    num_rows = int(df.shape[0] * sampling_rate)
    num_rows_per_class = max(1, num_rows // num_classes)

    # One shared generator so that successive classes get different draws
    # while the overall result stays reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Sample each group explicitly instead of using groupby.apply: apply's
    # handling of the grouping column differs between pandas versions, whereas
    # iterating the groups always yields complete rows.
    samples = [
        group.sample(min(len(group), num_rows_per_class), random_state=rng)
        for _, group in df.groupby(groupby_column)
    ]
    if not samples:
        # Every label was missing, so groupby produced no groups.
        return df.iloc[0:0].copy()

    return pd.concat(samples)


# @task
def load_dataframe(root_path):
    """Load ``train.csv`` and return sampled train and validation frames.

    The CSV is read from ``root_path.parent / "train.csv"``. Sampling is done
    per ``class_id`` through ``stratified_sample`` with ``sampling_rate=1.0``.

    * If ``CFG.COMBINE_TRAIN_VAL`` is truthy, the training frame is sampled
      from every row of the file and the validation frame is empty.
    * Otherwise the rows are partitioned on the ``dset`` column (``"train"``
      and ``"val"``) and each partition is sampled separately.

    Parameters:
        root_path (pathlib.Path): A path whose parent directory contains
            ``train.csv``.

    Returns:
        tuple[DataFrame, DataFrame]: ``(train, val)``.
    """
    df = pd.read_csv(root_path.parent / "train.csv")
    if CFG.COMBINE_TRAIN_VAL:
        train = stratified_sample(df, "class_id", sampling_rate=1.0)
        # NOTE(review): previously neither `train` nor `val` was assigned on
        # this branch, so the function always failed here. In combined mode no
        # rows are held out, hence an empty frame with the same columns is
        # returned for `val` -- confirm that callers tolerate an empty split.
        val = train.iloc[0:0]
    else:
        val = stratified_sample(df[df["dset"] == "val"], "class_id", sampling_rate=1.0)
        train = stratified_sample(df[df["dset"] == "train"], "class_id", sampling_rate=1.0)

    logger.info("Loaded train and val dataframes")
    logger.debug(f"Train shape: {train.shape}  :  val shape: {val.shape}")
    return train, val

In [25]:
# Read the training table that sits next to the raw-data directory.
df = pd.read_csv(raw_data_root.parent.joinpath("train.csv"))

  df = pd.read_csv(raw_data_root.parent / "train.csv")


In [26]:
# Keep only the class label and the file name of each row.
df = df.loc[:, ['class_id', 'file_name']]

In [31]:
# Cross-validation splitters. Both shuffle with the same fixed seed so the
# folds they produce are reproducible between runs.
kf = KFold(5, random_state=42, shuffle=True)
skf = StratifiedKFold(50, random_state=42, shuffle=True)

In [32]:
# Walk through every stratified fold. Each iteration overwrites df_train and
# df_val, so once the loop finishes they hold the split of the final fold.
for fold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=df["class_id"])):
    df_train, df_val = (df.iloc[rows] for rows in (train_idx, val_idx))



In [34]:
# Show the training rows that the fold loop left in df_train.
df_train

Unnamed: 0,class_id,file_name
0,316,8d841f576d05e05f0b4b5513d549630a.jpg
1,316,f84f23fe93f3fde53f7193e3cc08d473.jpg
2,316,dabc337065c65a0ed19707c4a595bcb1.jpg
3,316,8980532c8a1ef146bd3ed8d54f362b76.jpg
4,316,2d3e29631fa00733061111743ceb734c.jpg
...,...,...
101280,435,51f13b0b-f4f7-46ed-9e53-4d0c63ddad6b.jpg
101281,435,294cf8dc-bfaa-45b7-a7a8-205d1a22e33d.jpg
101282,435,b3b8cb90-d5d6-4118-826f-d69c6803a11a.jpg
101283,435,f38f1edd-34f2-4e8d-aaeb-c95ed22dd1ca.jpg


In [9]:
# Show the current contents of df.
# NOTE(review): this cell's execution count (9) is lower than that of the cells
# above it, so its saved output may predate them -- re-run to confirm.
df

Unnamed: 0,class_id,file_name
2658,0,a33a2b0b8da57bfeccebfc044ebebdce.jpg
81147,0,54909eac-94e9-441e-9f74-fbf3798d97bb.jpg
81088,0,b42fe3db-38fb-4af9-9b05-ee62ef94ecfe.jpg
81064,0,1e1d0aaf-e43c-4451-a58d-7659d3dfac17.jpg
81039,0,c3eb8212-c751-4688-a024-37008d3a7b43.jpg
...,...,...
16,466,28f836e97682e282424a8457c2367294.jpg
94113,466,2a945ac3-3639-45e7-8d89-5cb692ba0e52.jpg
94022,466,f263b78a-877d-4c7f-85d4-45230ea8c15f.jpg
94102,466,2cb020c9-a921-4ca2-8ef3-f1ef9351a09e.jpg
