In [1]:
# Install wandb
!pip install wandb --quiet

In [2]:
import os
import numpy as np
from skimage import io
import pandas as pd
from tqdm.notebook import tqdm
import params

import wandb

In [3]:
DATA_DIR = 'DATASETS/oranges'

In [4]:
def _create_table(image_paths, labels):
    """
    Create a table with the dataset
    
    Args:
    - image_paths (list->str): List of image paths
    - labels (list->str): List of labels corresponding to image_paths
    
    Returns:
    A wandb table
    """
    assert len(image_paths)==len(labels), "Number of images and labels different!"
      
    table = wandb.Table(columns=["File_Name", "Images", "Dataset", "Label"])
    
    for i, image_path in tqdm(enumerate(image_paths), total=len(image_paths)):
        image_label = labels[i]
        image = io.imread(os.path.join(DATA_DIR, image_label, image_path))
        
        table.add_data(
            image_path,
            wandb.Image(image),
            "None", # we don't have a dataset split yet
            image_label
        )

    return table

### START A NEW WANDB RUN

https://docs.wandb.ai/ref/python/run

A run in W&B is a unit of computation we want to track

Normally most of the runs are experiments

But in this case we are going to use W&B to upload data

We need to pass a `project` parameter. A project in W&B is just a collection of runs

The `entity` represents a team (if we work collaboratory)

We will indicate in the `job_type` argument this run is for upload data. This will help us to **organize our runs** in the dashboard

In [5]:
run = wandb.init(
    project=params.WANDB_PROJECT,
    entity=params.ENTITY,
    job_type="upload"
)

[34m[1mwandb[0m: Currently logged in as: [33mmarioparreno[0m. Use [1m`wandb login --relogin`[0m to force relogin


### CREATE AN ARTIFACT

https://docs.wandb.ai/ref/python/artifact

We will use WANDB Artifacts to version our datasets

In [11]:
artifact = wandb.Artifact(
    params.RAW_DATA_AT,  # The name of the artifact
    type="raw_data"  # Will help us to organize our information
)

We can think of an Artifact as a version folder containing our data

We can add files or folders to our Artifact

In [12]:
artifact.add_dir(DATA_DIR, name="images")
artifact.add_file(os.path.join(DATA_DIR, "labels.csv"), name="labels.csv")

[34m[1mwandb[0m: Adding directory to artifact (./DATASETS/oranges)... Done. 0.3s


ArtifactManifestEntry(path='labels.csv', digest='A8e2FhAXBrqFxfJZmcTnRw==', ref=None, birth_artifact_id=None, size=13796, extra={}, local_path='/home/maparla/.local/share/wandb/artifacts/staging/tmpjfqvs2n_')

-------

In [13]:
df = pd.read_csv(os.path.join(DATA_DIR, "labels.csv"))
df.head()

Unnamed: 0,image,label
0,FreshOrange (1).jpg,FreshOrange
1,FreshOrange (10).jpg,FreshOrange
2,FreshOrange (100).jpg,FreshOrange
3,FreshOrange (101).jpg,FreshOrange
4,FreshOrange (102).jpg,FreshOrange


In [14]:
table = _create_table(df.image, df.label)

  0%|          | 0/400 [00:00<?, ?it/s]

-------

Add the table to our Artifact with a name

In [15]:
artifact.add(table, "eda_table")

ArtifactManifestEntry(path='eda_table.table.json', digest='3qao1HLUJjxaN9fVW2e+hg==', ref=None, birth_artifact_id=None, size=101090, extra={}, local_path='/home/maparla/.local/share/wandb/artifacts/staging/tmpsegs2uzk')

Log the Artifact to WANDB

In [16]:
run.log_artifact(artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7fa07a60dd90>

Finally finish our run. That will upload our information to WANDB servers

In [17]:
run.finish()