In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ..

/home/suriya/dcu/Group-Project


# Experiment Tracking and Visualization

In [3]:
import numpy as np
import pandas as pd

In [4]:
from src.features.rssi_distance import extract_feature2, postproc_feature_dicts
from src.featutils import aggregate_features_from_folder

In [5]:
train_dir = "data/tc4tl_training_data_v1/tc4tl/data/train/"
train_key = pd.read_csv("data/tc4tl_training_data_v1/tc4tl/docs/tc4tl_train_key.tsv", sep="\t")
test_dir = "data/tc4tl_data_v5/tc4tl/data/test/"
test_mdata = pd.read_csv("data/tc4tl_data_v5/tc4tl/docs/tc4tl_test_metadata.tsv", sep="\t")
test_key = pd.read_csv("data/tc4tl_test_key/tc4tl/docs/tc4tl_test_key.tsv", sep="\t")

In [6]:
trainset, _ = aggregate_features_from_folder(train_dir, train_key, feat_fn=extract_feature2, postproc_fn=postproc_feature_dicts, verbose=True)
testset, _  = aggregate_features_from_folder(test_dir, test_key, feat_fn=extract_feature2, postproc_fn=postproc_feature_dicts, verbose=True)

100%|██████████| 15552/15552 [00:27<00:00, 558.46it/s]
100%|██████████| 8423/8423 [00:13<00:00, 620.51it/s]


In [7]:
cg_mapping = {'Y' : 0., 'N' : 1.}
trainset.CoarseGrain = trainset.CoarseGrain.replace(cg_mapping)
testset.CoarseGrain = testset.CoarseGrain.replace(cg_mapping)

In [8]:
features = ['PredictedDistance', 'CoarseGrain']
target = 'Distance'

X_train, y_train = trainset[features].values, trainset[target].values
X_test, y_test = testset[features].values, testset[target].values
labels = trainset[target].unique()

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

In [10]:
importances

array([0.90945141, 0.09054859])

In [11]:
rf.feature_importances_

array([0.90945141, 0.09054859])

## Init wandb

In [12]:
import wandb

In [13]:
run = wandb.init(project="sklearn-integration")

[34m[1mwandb[0m: Currently logged in as: [33msuriyadeepan[0m (use `wandb login --relogin` to force relogin)


## Classifier Plots

In [14]:
wandb.sklearn.plot_roc(y_test, y_prob, labels)

In [15]:
wandb.sklearn.plot_precision_recall(y_test, y_prob, labels)

In [16]:
wandb.sklearn.plot_feature_importances(rf, feature_names=features)

<wandb.viz.Visualize at 0x12875cc40>

In [17]:
wandb.sklearn.plot_class_proportions(y_train, y_test, labels)

In [19]:
wandb.sklearn.plot_confusion_matrix(y_test, y_pred, labels)

In [21]:
wandb.sklearn.plot_learning_curve(rf, X_train, y_train)

## Weights & Biases Visualization

[suriyadeepan/workspace](https://wandb.ai/suriyadeepan/sklearn-integration/runs/1v15n1em?workspace=user-suriyadeepan)

# Dataset Versioning

In [9]:
import wandb

run = wandb.init(job_type="dataset-creation")
artifact = wandb.Artifact('tc4tl-dataset', type='dataset')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msuriyadeepan[0m (use `wandb login --relogin` to force relogin)


In [10]:
trainset.to_csv('data/processed/basic.train.csv', index=False)
testset.to_csv('data/processed/basic.test.csv', index=False)

In [11]:
artifact.add_file("data/processed/basic.train.csv")
artifact.add_file("data/processed/basic.test.csv")
run.log_artifact(artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f8164ea6748>

In [12]:
run = wandb.init(job_type="model-training")
artifact = run.use_artifact('tc4tl-dataset:latest')
artifact_dir = artifact.download()

VBox(children=(Label(value=' 1.09MB of 1.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [18]:
!tree artifacts/

[01;34martifacts/[00m
└── [01;34mtc4tl-dataset:v0[00m
    ├── basic.test.csv
    └── basic.train.csv

1 directory, 2 files


## Log a new version

### Make a new version

In [23]:
from sklearn.preprocessing import MinMaxScaler

scalar = MinMaxScaler()
norm_predicted_distance = scalar.fit_transform(trainset.PredictedDistance.values.reshape(-1, 1))
trainset['NormPredictedDistance'] = norm_predicted_distance

In [24]:
trainset.to_csv("data/processed/basic.train.csv", index=False)

### Save newer version

In [25]:
run = wandb.init(job_type="dataset-creation")
artifact = wandb.Artifact('tc4tl-dataset', type='dataset')
artifact.add_file("data/processed/basic.train.csv")
run.log_artifact(artifact)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

<wandb.sdk.wandb_artifacts.Artifact at 0x7f8130a35400>

### Get newer version of the dataset

In [26]:
run = wandb.init(job_type="model-training")
artifact = run.use_artifact('tc4tl-dataset:latest')
artifact_dir = artifact.download()

VBox(children=(Label(value=' 1.00MB of 1.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [27]:
!tree artifacts/

[01;34martifacts/[00m
├── [01;34mtc4tl-dataset:v0[00m
│   ├── basic.test.csv
│   └── basic.train.csv
└── [01;34mtc4tl-dataset:v1[00m
    └── basic.train.csv

2 directories, 3 files


## Model Versioning

[Model versioning](https://docs.wandb.ai/guides/artifacts/model-versioning) works similar to dataset versioning using the concept of `artifacts`.


# Hyperparameter Tuning with W&B sweeps

### (1) Define the sweep

In [66]:
parameters_dict = {
    'TX' : {
        'distribution' : 'uniform',
        'min' : -80, 'max' : -40
    },
    'N' : {
        'distribution' : 'uniform',
        'min' : 0.1, 'max' : 5
    }
}

In [67]:
sweep_config = {
    'method' : 'bayes',
    'metric' : {
        'name' : 'accuracy',
        'goal' : 'maximize'
    }
}

In [68]:
sweep_config['parameters'] = parameters_dict

In [69]:
sweep_config

{'method': 'bayes',
 'metric': {'name': 'accuracy', 'goal': 'maximize'},
 'parameters': {'TX': {'distribution': 'uniform', 'min': -80, 'max': -40},
  'N': {'distribution': 'uniform', 'min': 0.1, 'max': 5}}}

### (2) Initialize Sweep

In [70]:
sweep_id = wandb.sweep(sweep_config, project="sklearn-sweeps-test-2")

Create sweep with ID: yy9lpbfm
Sweep URL: https://wandb.ai/suriyadeepan/sklearn-sweeps-test-2/sweeps/yy9lpbfm


### (3) Run the sweep agent

In [71]:
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from sklearn.ensemble import RandomForestClassifier

def evaluate(features, labels, classifiers, verbose=False):
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=69)
    scores = []
    iterate_through = tqdm(classifiers) if verbose else classifiers
    for _classifier in iterate_through:
        model = _classifier()
        model.fit(X_train, y_train)
        scores.append((model.predict(X_test) == y_test).mean())
        if verbose:
            iterate_through.set_description(f"{_classifier.__name__}: {scores[-1]}")
    return max(scores)

In [74]:
def fit(config=None):
    
    with wandb.init(config=config):
        config = wandb.config
        dataset, _ = aggregate_features_from_folder(test_dir, test_key, feat_fn=extract_feature2, postproc_fn=postproc_feature_dicts,
                                      tunables={'TX' : config.TX, 'N' : config.N}, testing=False, verbose=False)
        dataset.CoarseGrain = dataset.CoarseGrain.replace({
        'Y' : 0., 'N' : 1.
        })
        train_features, train_labels = dataset[['PredictedDistance', 'CoarseGrain']], dataset['Distance']
        accuracy = evaluate(train_features, train_labels, [RandomForestClassifier])
        wandb.log({'accuracy' : accuracy})

In [75]:
wandb.agent(sweep_id, fit, count=10)

[34m[1mwandb[0m: Agent Starting Run: jywsyzxk with config:
[34m[1mwandb[0m: 	N: 4.803324604146783
[34m[1mwandb[0m: 	TX: -77.27107482331897


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁

0,1
accuracy,0.50827


[34m[1mwandb[0m: Agent Starting Run: k5vcy40r with config:
[34m[1mwandb[0m: 	N: 4.933336805154698
[34m[1mwandb[0m: 	TX: -45.11958428207836


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁

0,1
accuracy,0.50755


[34m[1mwandb[0m: Agent Starting Run: j8ih45gs with config:
[34m[1mwandb[0m: 	N: 3.71886655756538
[34m[1mwandb[0m: 	TX: -79.99016456867038


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁

0,1
accuracy,0.50791


[34m[1mwandb[0m: Agent Starting Run: 3fbv126j with config:
[34m[1mwandb[0m: 	N: 4.994103998607602
[34m[1mwandb[0m: 	TX: -72.43492206888419


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁

0,1
accuracy,0.50719


[34m[1mwandb[0m: Agent Starting Run: 6pcjh6xy with config:
[34m[1mwandb[0m: 	N: 0.6586797875247725
[34m[1mwandb[0m: 	TX: -68.92398593337349


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁

0,1
accuracy,0.50755


[34m[1mwandb[0m: Agent Starting Run: asdqdhx4 with config:
[34m[1mwandb[0m: 	N: 0.3315446283532275
[34m[1mwandb[0m: 	TX: -63.98220495367256


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁

0,1
accuracy,0.50827


[34m[1mwandb[0m: Agent Starting Run: 4mj3c83l with config:
[34m[1mwandb[0m: 	N: 3.3200260039671172
[34m[1mwandb[0m: 	TX: -75.65116800199961


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁

0,1
accuracy,0.50755


[34m[1mwandb[0m: Agent Starting Run: 2oj10os5 with config:
[34m[1mwandb[0m: 	N: 0.7875034881945091
[34m[1mwandb[0m: 	TX: -67.57199451031316


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁

0,1
accuracy,0.50719


[34m[1mwandb[0m: Agent Starting Run: s0r56tey with config:
[34m[1mwandb[0m: 	N: 0.5062857231816177
[34m[1mwandb[0m: 	TX: -69.7077102007787


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁

0,1
accuracy,0.50791


[34m[1mwandb[0m: Agent Starting Run: f294ww3z with config:
[34m[1mwandb[0m: 	N: 4.852732982739059
[34m[1mwandb[0m: 	TX: -60.65569853430385


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁

0,1
accuracy,0.50755
