<a href="https://www.kaggle.com/code/luminlab/ber-curation?scriptVersionId=182257992" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

Details of the steps can be found in the Google Slides deck linked below:

https://docs.google.com/presentation/d/1sb3QkXiYooHqi3p-tkGVUqwqFKd-601_pzU96W1drw0/edit?usp=sharing

In [127]:
from kaggle_secrets import UserSecretsClient

# Label of the Kaggle secret that holds the key passed to the training
# scripts through --wandb_key.
wandb_key_label = "WANDB_KEY"

# Look the key up at run time so it is never written into the notebook itself.
secrets_client = UserSecretsClient()
wandb_key = secrets_client.get_secret(wandb_key_label)

In [None]:
import os

import git

# Clone the project code once. Cloning into a destination that already holds
# a checkout fails, so skip the clone when this cell is re-run in the same
# session.
if not os.path.isdir('/kaggle/working/scarf/.git'):
    git.Repo.clone_from('https://github.com/Lumin-Lab/BerCuration', '/kaggle/working/scarf')

In [None]:
!pip install -r /kaggle/working/scarf/requirements.txt

In [None]:
import os
def save_csv_file(df, path):
    """Write ``df`` to ``path`` as CSV, creating the parent directory if needed.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        path: Destination file path. When it has no directory component the
            file is written relative to the current working directory and no
            directory is created.

    The index is not written (``index=False``).
    """
    dir_name = os.path.dirname(path)
    if dir_name:
        # exist_ok=True keeps re-runs harmless and avoids the window between a
        # separate os.path.exists() check and the directory creation.
        os.makedirs(dir_name, exist_ok=True)
    df.to_csv(path, index=False)

In [None]:
import pandas
import pandas as pd  # the notebook refers to pandas as `pd`; the bare import alone leaves `pd` undefined

# Keep only the first 1000 rows of the sample file.
# NOTE(review): a head slice may not preserve the stratification of the source
# file; confirm this is intended (e.g. as a quick smoke-run subset).
df = pd.read_csv("/kaggle/input/ber-stratified-samples/BER_stratified_sample.csv")[:1000]

In [None]:
# Number of folds; the cells below loop over range(n_splits) and use one
# split_<k> directory per fold.
n_splits = 2

# Filesystem layout: where per-split artefacts go and where the cloned
# repository keeps its configs.
output_dir = "/kaggle/working/output"
config_dir = "/kaggle/working/scarf/configs"

# Names handed to the repository scripts through --model_name.
scarf_model_name, mlp_model_name = "scarf", "mlp"

In [None]:
from sklearn.model_selection import KFold
import os

# Shuffled K-fold with a fixed seed, so the folds are identical on every run.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Write each fold's raw train/test rows to <output_dir>/split_<k>/ (k is 1-based).
for fold_no, (train_index, test_index) in enumerate(kf.split(df), start=1):
    fold_dir = f"{output_dir}/split_{fold_no}"
    save_csv_file(df.iloc[train_index], f"{fold_dir}/raw_train.csv")
    save_csv_file(df.iloc[test_index], f"{fold_dir}/raw_test.csv")

In [None]:
# Run the repository's preprocessing script on each split's raw train and test
# CSVs. The config location comes from config_dir so it is defined in one place.
for i in range(n_splits):
    split_dir = f"{output_dir}/split_{i+1}"

    # Train pass: --is_train is passed only here.
    # NOTE(review): presumably this is where preprocessing state is fitted;
    # confirm in get_processed_dataset.py.
    command = f"""
    python /kaggle/working/scarf/get_processed_dataset.py \
      --config_dir "{config_dir}" \
      --output_dir "{split_dir}" \
      --data_path "{split_dir}/raw_train.csv" \
      --output_csv_name "processed_train" \
      --is_train
    """
    status = os.system(command)
    # os.system does not raise on failure; stop here instead of letting later
    # cells run against missing or stale files.
    if status != 0:
        raise RuntimeError(
            f"get_processed_dataset.py failed on the train set of split {i+1} (exit status {status})"
        )

    # Test pass: same script without --is_train.
    command = f"""
    python /kaggle/working/scarf/get_processed_dataset.py \
      --config_dir "{config_dir}" \
      --output_dir "{split_dir}" \
      --data_path "{split_dir}/raw_test.csv" \
      --output_csv_name "processed_test"
    """
    status = os.system(command)
    if status != 0:
        raise RuntimeError(
            f"get_processed_dataset.py failed on the test set of split {i+1} (exit status {status})"
        )

**Train the SCARF Encoder on the Train Dataset**

In [None]:
# Settings forwarded to run_scarf.py / get_scarf_embedding.py.

# Optimisation (--batch_size, --epochs, --lr).
scarf_batch_size, scarf_epochs = 32, 1
scarf_lr = 3e-5

# Encoder shape (--emb_dim, --encoder_depth).
scarf_emb_dim, scarf_encoder_depth = 32, 3

# Value passed as --corruption_rate.
scarf_corruption_rate = 0.3

In [None]:
import os

# Train one SCARF model per split. The trained model is saved in
# /kaggle/working/output/split_{split}/scarf.pt when this cell is run.
for i in range(n_splits):
    split_dir = f"{output_dir}/split_{i+1}"

    # --lr comes from scarf_lr, the same setting the embedding cells pass, so
    # the value cannot drift between training and embedding.
    # NOTE(review): the W&B key is interpolated into the command line and may
    # be visible in process listings; consider an environment variable if
    # run_scarf.py supports one.
    command = f"""
    python /kaggle/working/scarf/run_scarf.py\
      --config_dir={config_dir} \
      --output_dir="{split_dir}" \
      --train_data_path="{split_dir}/processed_train.csv"\
      --batch_size={scarf_batch_size} \
      --epochs={scarf_epochs} \
      --lr={scarf_lr} \
      --emb_dim={scarf_emb_dim} \
      --encoder_depth={scarf_encoder_depth} \
      --model_name="{scarf_model_name}" \
      --corruption_rate={scarf_corruption_rate} \
      --wandb_project_name='SCARF_Project' \
      --wandb_entity='urbancomp' \
      --wandb_key='{wandb_key}'
    """

    status = os.system(command)
    # os.system does not raise on failure; stop here rather than continuing
    # without a trained model.
    if status != 0:
        raise RuntimeError(f"run_scarf.py failed for split {i+1} (exit status {status})")

**Obtain the SCARF embeddings for the Small Train dataset, and save the result**

In [None]:
# Compute SCARF embeddings for each split's processed train set. The generated
# embeddings are saved as a NumPy array in
# /kaggle/working/output/split_{split}/train.npy when this cell is run.
for i in range(n_splits):
    split_dir = f"{output_dir}/split_{i+1}"
    command = f"""
    python /kaggle/working/scarf/get_scarf_embedding.py \
      --config_dir={config_dir} \
      --output_dir="{split_dir}" \
      --data_path="{split_dir}/processed_train.csv" \
      --batch_size={scarf_batch_size} \
      --epochs={scarf_epochs} \
      --lr={scarf_lr} \
      --emb_dim={scarf_emb_dim} \
      --encoder_depth={scarf_encoder_depth} \
      --model_name={scarf_model_name} \
      --corruption_rate={scarf_corruption_rate} \
      --embedding_save_name="train"
    """

    status = os.system(command)
    # os.system does not raise on failure; without this check a failed run
    # would leave a missing (or stale) train.npy for the outlier filter.
    if status != 0:
        raise RuntimeError(
            f"get_scarf_embedding.py failed on the train set of split {i+1} (exit status {status})"
        )

**Obtain the SCARF embeddings for the Test dataset, and save the result**

In [None]:
# Compute SCARF embeddings for each split's processed test set. The generated
# embeddings are saved as a NumPy array in
# /kaggle/working/output/split_{split}/test.npy when this cell is run.
for i in range(n_splits):
    split_dir = f"{output_dir}/split_{i+1}"
    command = f"""
    python /kaggle/working/scarf/get_scarf_embedding.py \
      --config_dir={config_dir} \
      --output_dir="{split_dir}" \
      --data_path="{split_dir}/processed_test.csv" \
      --batch_size={scarf_batch_size} \
      --epochs={scarf_epochs} \
      --lr={scarf_lr} \
      --emb_dim={scarf_emb_dim} \
      --encoder_depth={scarf_encoder_depth} \
      --model_name={scarf_model_name} \
      --corruption_rate={scarf_corruption_rate} \
      --embedding_save_name="test"
    """

    status = os.system(command)
    # os.system does not raise on failure; without this check a failed run
    # would leave a missing (or stale) test.npy for the outlier filter.
    if status != 0:
        raise RuntimeError(
            f"get_scarf_embedding.py failed on the test set of split {i+1} (exit status {status})"
        )

**Filter outliers based on SCARF embeddings**

In [141]:
# Score cutoff for the outlier filter: rows whose out-of-distribution score is
# strictly below this value are written out as outliers.
threshold = 0.2

In [147]:
from cleanlab.outlier import OutOfDistribution
import numpy as np
import pandas as pd

# For every split: fit an out-of-distribution estimator on the train
# embeddings, score train and test, and write both the outlier rows and the
# remaining rows of the raw CSVs to <split>/cleanlab/.
for i in range(n_splits):
    split_dir = f"{output_dir}/split_{i+1}"
    cleanlab_dir = f"{split_dir}/cleanlab"

    train_emb = np.load(f"{split_dir}/train.npy")
    test_emb = np.load(f"{split_dir}/test.npy")

    # A fresh estimator per split, fitted on that split's train embeddings only.
    ood = OutOfDistribution()
    # fit_score() returns the scores of the data it was fitted on; use them
    # directly. Re-scoring the training embeddings with score() treats them as
    # new points, each of which finds itself among the fitted neighbours, which
    # biases the training scores upwards and hides outliers.
    ood_train_feature_scores = ood.fit_score(features=train_emb)
    ood_test_feature_scores = ood.score(features=test_emb)

    # Positions of the rows whose score falls strictly below the cutoff.
    train_outliers_idx = np.where(ood_train_feature_scores < threshold)[0]
    test_outliers_idx = np.where(ood_test_feature_scores < threshold)[0]

    # NOTE(review): embedding row k is assumed to correspond to row k of the
    # raw CSV; confirm preprocessing/embedding neither drops nor reorders rows.
    train_df = pd.read_csv(f"{split_dir}/raw_train.csv")
    test_df = pd.read_csv(f"{split_dir}/raw_test.csv")

    save_csv_file(train_df.iloc[train_outliers_idx], f"{cleanlab_dir}/train_outliers.csv")
    save_csv_file(test_df.iloc[test_outliers_idx], f"{cleanlab_dir}/test_outliers.csv")
    # read_csv gives a default 0..n-1 index, so index labels equal positions here.
    save_csv_file(train_df[~train_df.index.isin(train_outliers_idx)], f"{cleanlab_dir}/train_removed_outliers.csv")
    save_csv_file(test_df[~test_df.index.isin(test_outliers_idx)], f"{cleanlab_dir}/test_removed_outliers.csv")

Fitting OOD estimator based on provided features ...
Fitting OOD estimator based on provided features ...


In [None]:
# Process train and test sets after their outliers are removed. The config
# location comes from config_dir so it is defined in one place.
for i in range(n_splits):
    cleanlab_dir = f"{output_dir}/split_{i+1}/cleanlab"

    # Train pass: --is_train is passed only here.
    command = f"""
    python /kaggle/working/scarf/get_processed_dataset.py \
      --config_dir "{config_dir}" \
      --output_dir "{cleanlab_dir}" \
      --data_path "{cleanlab_dir}/train_removed_outliers.csv" \
      --output_csv_name "processed_train" \
      --is_train
    """
    status = os.system(command)
    # os.system does not raise on failure; stop here instead of letting the
    # MLP cells run against missing or stale files.
    if status != 0:
        raise RuntimeError(
            f"get_processed_dataset.py failed on the cleaned train set of split {i+1} (exit status {status})"
        )

    # Test pass: same script without --is_train.
    command = f"""
    python /kaggle/working/scarf/get_processed_dataset.py \
      --config_dir "{config_dir}" \
      --output_dir "{cleanlab_dir}" \
      --data_path "{cleanlab_dir}/test_removed_outliers.csv" \
      --output_csv_name "processed_test"
    """
    status = os.system(command)
    if status != 0:
        raise RuntimeError(
            f"get_processed_dataset.py failed on the cleaned test set of split {i+1} (exit status {status})"
        )

**Train the MLP classifier on the datasets**

In [166]:
# Settings forwarded to run_mlp.py by the MLP training cells
# (--batch_size, --epochs, --lr, --dropout).
mlp_batch_size, mlp_epochs = 32, 1
mlp_lr = 3e-5
mlp_dropout = 0.1

**Before outlier removal**

In [168]:
# Train and evaluate the MLP on each split's processed data before outlier
# removal.
for i in range(n_splits):
    split_dir = f"{output_dir}/split_{i+1}"

    # --output_dir is passed without a trailing slash so the saved model path
    # does not contain a doubled separator (split_1//mlp.pt).
    # NOTE(review): the W&B key is interpolated into the command line and may
    # be visible in process listings; consider an environment variable if
    # run_mlp.py supports one.
    command = f"""
    python /kaggle/working/scarf/run_mlp.py \
      --config_dir={config_dir} \
      --output_dir="{split_dir}" \
      --train_data_path="{split_dir}/processed_train.csv" \
      --test_data_path="{split_dir}/processed_test.csv" \
      --batch_size={mlp_batch_size} \
      --epochs={mlp_epochs} \
      --lr={mlp_lr} \
      --model_name={mlp_model_name} \
      --wandb_project_name "test" \
      --wandb_entity "urbancomp" \
      --wandb_key '{wandb_key}' \
      --hidden_layer 256 128 64 32 16 \
      --dropout={mlp_dropout} 
    """

    status = os.system(command)
    # os.system does not raise on failure; surface a failed training run here.
    if status != 0:
        raise RuntimeError(f"run_mlp.py failed for split {i+1} (exit status {status})")

wandb: Currently logged in as: dan-liu. Use `wandb login --relogin` to force relogin
wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc
wandb: Currently logged in as: dan-liu (urbancomp). Use `wandb login --relogin` to force relogin
wandb: wandb version 0.17.1 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade
wandb: Tracking run with wandb version 0.17.0
wandb: Run data is saved locally in /kaggle/working/wandb/run-20240608_181222-br9udic0
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run test
wandb: ⭐️ View project at https://wandb.ai/urbancomp/Scarf-MLP
wandb: 🚀 View run at https://wandb.ai/urbancomp/Scarf-MLP/runs/br9udic0


Model saved at /kaggle/working/output/split_1//mlp.pt
Epoch [1/1] - Train Loss: 2.700, Train Acc: 0.333, Train F1: 0.167, Test Loss: 2.692, Test Acc: 0.093, Test F1: 0.011
Test Accuracy: 0.093, Test F1: 0.011


wandb: / 0.113 MB of 0.132 MB uploaded
wandb: Run history:
wandb:      test/A1_acc ▁
wandb:      test/A2_acc ▁
wandb:      test/A3_acc ▁
wandb:      test/B1_acc ▁
wandb:      test/B2_acc ▁
wandb:      test/B3_acc ▁
wandb:      test/C1_acc ▁
wandb:      test/C2_acc ▁
wandb:      test/C3_acc ▁
wandb:      test/D1_acc ▁
wandb:      test/D2_acc ▁
wandb:      test/E1_acc ▁
wandb:      test/E2_acc ▁
wandb:       test/F_acc ▁
wandb:       test/G_acc ▁
wandb:    test/test_acc ▁
wandb:     test/test_f1 ▁
wandb:   test/test_loss ▁
wandb:         train/f1 ▄▁▁▅▅▄▄▇▅▆▅▆▆█▃
wandb:  train/train_acc ▃▁▁▅▅▃▅▇▅▆▅▆▆█▃
wandb: train/train_loss ▇▇█▇▂██▃▅▁▄▁▂▇▅
wandb: 
wandb: Run summary:
wandb:      test/A1_acc 0.0
wandb:      test/A2_acc 0.0
wandb:      test/A3_acc 0.0
wandb:      test/B1_acc 0.0
wandb:      test/B2_acc 0.0
wandb:      test/B3_acc 0.0
wandb:      test/C1_acc 0.0
wandb:      test/C2_acc 0.0
wandb:      test/C3_acc 0.0
wandb:      test/D1_acc 1.0
wandb:      test/D2_acc 0.0
wandb:      test/

Model saved at /kaggle/working/output/split_2//mlp.pt
Epoch [1/1] - Train Loss: 2.699, Train Acc: 0.000, Train F1: 0.000, Test Loss: 2.686, Test Acc: 0.095, Test F1: 0.012
Test Accuracy: 0.095, Test F1: 0.012


wandb: / 0.114 MB of 0.133 MB uploaded
wandb: Run history:
wandb:      test/A1_acc ▁
wandb:      test/A2_acc ▁
wandb:      test/A3_acc ▁
wandb:      test/B1_acc ▁
wandb:      test/B2_acc ▁
wandb:      test/B3_acc ▁
wandb:      test/C1_acc ▁
wandb:      test/C2_acc ▁
wandb:      test/C3_acc ▁
wandb:      test/D1_acc ▁
wandb:      test/D2_acc ▁
wandb:      test/E1_acc ▁
wandb:      test/E2_acc ▁
wandb:       test/F_acc ▁
wandb:       test/G_acc ▁
wandb:    test/test_acc ▁
wandb:     test/test_f1 ▁
wandb:   test/test_loss ▁
wandb:         train/f1 ▅▁▂▁▅▄▄▂▂▃▃▂▇█▂
wandb:  train/train_acc ▅▁▂▁▅▃▆▂▂▃▃▂█▇▂
wandb: train/train_loss ▃▅▄▆▃▂▄▄▅▃▄█▁▄▅
wandb: 
wandb: Run summary:
wandb:      test/A1_acc 0.0
wandb:      test/A2_acc 0.0
wandb:      test/A3_acc 0.0
wandb:      test/B1_acc 0.0
wandb:      test/B2_acc 0.0
wandb:      test/B3_acc 0.0
wandb:      test/C1_acc 0.0
wandb:      test/C2_acc 0.0
wandb:      test/C3_acc 0.0
wandb:      test/D1_acc 1.0
wandb:      test/D2_acc 0.0
wandb:      test/

**After outlier removal**

In [169]:

# Train and evaluate the MLP on each split's processed data after outlier
# removal (the files under <split>/cleanlab).
for i in range(n_splits):
    cleanlab_dir = f"{output_dir}/split_{i+1}/cleanlab"

    # NOTE(review): the W&B key is interpolated into the command line and may
    # be visible in process listings; consider an environment variable if
    # run_mlp.py supports one.
    command = f"""
    python /kaggle/working/scarf/run_mlp.py \
      --config_dir={config_dir} \
      --output_dir="{cleanlab_dir}"\
      --train_data_path="{cleanlab_dir}/processed_train.csv" \
      --test_data_path="{cleanlab_dir}/processed_test.csv" \
      --batch_size={mlp_batch_size} \
      --epochs={mlp_epochs} \
      --lr={mlp_lr} \
      --model_name={mlp_model_name} \
      --wandb_project_name "test" \
      --wandb_entity "urbancomp" \
      --wandb_key '{wandb_key}' \
      --hidden_layer 256 128 64 32 16 \
      --dropout={mlp_dropout} 
    """

    status = os.system(command)
    # os.system does not raise on failure; surface a failed training run here.
    if status != 0:
        raise RuntimeError(f"run_mlp.py failed for split {i+1} on cleaned data (exit status {status})")

wandb: Currently logged in as: dan-liu. Use `wandb login --relogin` to force relogin
wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc
wandb: Currently logged in as: dan-liu (urbancomp). Use `wandb login --relogin` to force relogin
wandb: wandb version 0.17.1 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade
wandb: Tracking run with wandb version 0.17.0
wandb: Run data is saved locally in /kaggle/working/wandb/run-20240608_181328-u6socy7m
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run test
wandb: ⭐️ View project at https://wandb.ai/urbancomp/Scarf-MLP
wandb: 🚀 View run at https://wandb.ai/urbancomp/Scarf-MLP/runs/u6socy7m


Model saved at /kaggle/working/output/split_1/cleanlab/mlp.pt
Epoch [1/1] - Train Loss: 2.700, Train Acc: 0.333, Train F1: 0.167, Test Loss: 2.692, Test Acc: 0.093, Test F1: 0.011
Test Accuracy: 0.093, Test F1: 0.011


wandb: / 0.113 MB of 0.132 MB uploaded
wandb: Run history:
wandb:      test/A1_acc ▁
wandb:      test/A2_acc ▁
wandb:      test/A3_acc ▁
wandb:      test/B1_acc ▁
wandb:      test/B2_acc ▁
wandb:      test/B3_acc ▁
wandb:      test/C1_acc ▁
wandb:      test/C2_acc ▁
wandb:      test/C3_acc ▁
wandb:      test/D1_acc ▁
wandb:      test/D2_acc ▁
wandb:      test/E1_acc ▁
wandb:      test/E2_acc ▁
wandb:       test/F_acc ▁
wandb:       test/G_acc ▁
wandb:    test/test_acc ▁
wandb:     test/test_f1 ▁
wandb:   test/test_loss ▁
wandb:         train/f1 ▄▁▁▅▅▄▄▇▅▆▅▆▆█▃
wandb:  train/train_acc ▃▁▁▅▅▃▅▇▅▆▅▆▆█▃
wandb: train/train_loss ▇▇█▇▂██▃▅▁▄▁▂▇▅
wandb: 
wandb: Run summary:
wandb:      test/A1_acc 0.0
wandb:      test/A2_acc 0.0
wandb:      test/A3_acc 0.0
wandb:      test/B1_acc 0.0
wandb:      test/B2_acc 0.0
wandb:      test/B3_acc 0.0
wandb:      test/C1_acc 0.0
wandb:      test/C2_acc 0.0
wandb:      test/C3_acc 0.0
wandb:      test/D1_acc 1.0
wandb:      test/D2_acc 0.0
wandb:      test/

Model saved at /kaggle/working/output/split_2/cleanlab/mlp.pt
Epoch [1/1] - Train Loss: 2.699, Train Acc: 0.000, Train F1: 0.000, Test Loss: 2.686, Test Acc: 0.095, Test F1: 0.012
Test Accuracy: 0.095, Test F1: 0.012


wandb: / 0.114 MB of 0.133 MB uploaded
wandb: Run history:
wandb:      test/A1_acc ▁
wandb:      test/A2_acc ▁
wandb:      test/A3_acc ▁
wandb:      test/B1_acc ▁
wandb:      test/B2_acc ▁
wandb:      test/B3_acc ▁
wandb:      test/C1_acc ▁
wandb:      test/C2_acc ▁
wandb:      test/C3_acc ▁
wandb:      test/D1_acc ▁
wandb:      test/D2_acc ▁
wandb:      test/E1_acc ▁
wandb:      test/E2_acc ▁
wandb:       test/F_acc ▁
wandb:       test/G_acc ▁
wandb:    test/test_acc ▁
wandb:     test/test_f1 ▁
wandb:   test/test_loss ▁
wandb:         train/f1 ▅▁▂▁▅▄▄▂▂▃▃▂▇█▂
wandb:  train/train_acc ▅▁▂▁▅▃▆▂▂▃▃▂█▇▂
wandb: train/train_loss ▃▅▄▆▃▂▄▄▅▃▄█▁▄▅
wandb: 
wandb: Run summary:
wandb:      test/A1_acc 0.0
wandb:      test/A2_acc 0.0
wandb:      test/A3_acc 0.0
wandb:      test/B1_acc 0.0
wandb:      test/B2_acc 0.0
wandb:      test/B3_acc 0.0
wandb:      test/C1_acc 0.0
wandb:      test/C2_acc 0.0
wandb:      test/C3_acc 0.0
wandb:      test/D1_acc 1.0
wandb:      test/D2_acc 0.0
wandb:      test/

**Train the random forest on the datasets**