In [1]:
# Install wandb
!pip install wandb --quiet

In [2]:
# Show the version reported by the installed wandb command-line tool
!wandb --version

wandb, version 0.13.9


In [3]:
import os
import wandb
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedGroupKFold

import params

Initialise a run to track the split

In [4]:
# Open the W&B run that tracks this notebook. Project and entity are read
# from the shared `params` module; the job type labels the run as the
# data-splitting step.
run = wandb.init(job_type="data_split", project=params.WANDB_PROJECT, entity=params.ENTITY)

[34m[1mwandb[0m: Currently logged in as: [33mmarioparreno[0m. Use [1m`wandb login --relogin`[0m to force relogin


In the previous Notebook we saved our data to an Artifact

We will use it now and track the lineage of our dataset in this way

In [5]:
# Declare the raw-data artifact as an input of this run (this is what
# records the lineage), then fetch its files locally.
raw_data_at = run.use_artifact('marioparreno/mlops-wandb-course/oranges:latest', type='raw_data')

# `artifact_dir` is used below as the folder that contains "labels.csv"
artifact_dir = raw_data_at.download()

[34m[1mwandb[0m: Downloading large artifact oranges:latest, 2521.61MB. 796 files... 
[34m[1mwandb[0m:   796 of 796 files downloaded.  
Done. 0:0:0.1


To create the splits we will need the data filenames and labels

We already have that information in the dataset we retrieved from the artifact

In [6]:
# Read the labels file shipped inside the downloaded artifact
labels_csv = os.path.join(artifact_dir, "labels.csv")
orig_dataset = pd.read_csv(labels_csv)

# Preview the first rows
orig_dataset.head()

Unnamed: 0,image,label
0,FreshOrange (1).jpg,FreshOrange
1,FreshOrange (10).jpg,FreshOrange
2,FreshOrange (100).jpg,FreshOrange
3,FreshOrange (101).jpg,FreshOrange
4,FreshOrange (102).jpg,FreshOrange


In [7]:
# Work on an independent (deep) copy so the loaded labels stay untouched
split_df = orig_dataset.copy(deep=True)

Now we are going to fill the `stage` column to define the splits for training, validation and test

In [10]:
validation_frac = 0.2
test_frac = 0.2
# The train fraction is implicit: 1 - validation_frac - test_frac
if validation_frac < 0 or test_frac < 0 or validation_frac + test_frac > 1:
    raise ValueError(
        "validation_frac and test_frac must be non-negative and sum to at most 1"
    )

# Shuffle the rows with a fixed seed. The shuffled frame is derived from
# `orig_dataset` (never from a previous `split_df`), so this cell is
# idempotent: re-running it always produces the same split instead of
# shuffling an already shuffled frame again.
# NOTE: this is a plain random split; it is not stratified by label.
split_df = orig_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Initially assign every row to the train split
split_df['stage'] = 'train'

# Number of rows in the validation and test splits
val_size = int(len(split_df) * validation_frac)
test_size = int(len(split_df) * test_frac)

# Use positional, end-exclusive slices: `.loc` label slices include the end
# label, which would mark one row too many as validation.
stage_col = split_df.columns.get_loc('stage')

# The very first `val_size` rows are the validation split
split_df.iloc[:val_size, stage_col] = 'validation'

# The next `test_size` rows are the test split
split_df.iloc[val_size:val_size + test_size, stage_col] = 'test'

split_df.stage.value_counts()

train         240
validation     80
test           80
Name: stage, dtype: int64

Finally, we save the data split locally

In [11]:
# Persist the split assignment next to the notebook, without the row index
split_df.to_csv(path_or_buf='data_split.csv', index=False)

And log the data split by using an Artifact. Create the Artifact

In [12]:
# New artifact that will hold the split information; its name is taken
# from the shared `params` module.
processed_data_at = wandb.Artifact(name=params.PROCESSED_DATA_AT, type="split_data")

Add the data relevant to the split dataset:
- The raw data (we could process it, etc) but as we are not modifying => omit
- The split information + The dataset (labels) information

In [13]:
# The split information
# Attach the CSV with the split assignment, stored under the same name
# inside the artifact
processed_data_at.add_file(local_path='data_split.csv', name="data_split.csv")

ArtifactManifestEntry(path='data_split.csv', digest='1CGm5gi1rmC7F8JsFD91Qw==', ref=None, birth_artifact_id=None, size=16522, extra={}, local_path='/home/maparla/.local/share/wandb/artifacts/staging/tmp4y5ghzpd')

We are going to save the split information by using the Table object from W&B

In [14]:
# Wrap the split DataFrame in a W&B Table object
data_split_table = wandb.Table(dataframe=split_df)

In [15]:
# Store the table in the artifact under the name "eda_table_data_split"
processed_data_at.add(obj=data_split_table, name="eda_table_data_split")

ArtifactManifestEntry(path='eda_table_data_split.table.json', digest='Ss5yoU+VhIYx1Yng7iiwmQ==', ref=None, birth_artifact_id=None, size=21384, extra={}, local_path='/home/maparla/.local/share/wandb/artifacts/staging/tmp5w8piro7')

Now we can log our Artifact and finish the run

In [16]:
# Log the artifact as an output of this run
run.log_artifact(artifact_or_path=processed_data_at)

# Close the run
run.finish()