In [None]:
# Work from the directory that contains the project sources so the relative
# paths used in this notebook (input/, output/, clear/pytorch/...) resolve.
import os
import sys

# Only climb to the parent directory when the source tree is not already
# reachable from the current directory: an unconditional chdir("..") moves up
# one more level every time this cell is re-run.
if not os.path.isdir("clear/pytorch"):
    os.chdir("..")
# Make the project sources importable without adding duplicates on re-run.
if "clear/pytorch" not in sys.path:
    sys.path.append("clear/pytorch")

In [None]:
# Prefer the WANDB_KEY environment variable (the same variable run_scarf reads)
# so the real key never has to be written into the notebook. The placeholder is
# kept as a fallback so the previous behaviour is unchanged when it is unset.
wandb_key = os.environ.get("WANDB_KEY") or "YOUR_WANDB_KEY"

In [None]:
import wandb
# Authenticate this session with Weights & Biases using the wandb_key defined in the previous cell.
wandb.login(key=wandb_key)

In [None]:
import pandas as pd

# Dataset folder (under input/) and the directories used throughout the notebook.
data_folder = "ber-processed-parquet"
input_dir = "input/" + data_folder
output_dir = "output"
config_dir = "clear/pytorch/configs"

# Model names; scarf_model_name is forwarded as --model_name further down.
scarf_model_name, mlp_model_name = "scarf", "mlp"

In [None]:
import os
# Count the immediate sub-directories of input_dir (expected to be the
# split_<k> folders that batch mode reads from).
# next(os.walk(...)) reads only the top level instead of materialising the
# whole tree, and the default tuple makes a missing directory count as
# 0 splits instead of raising IndexError.
n_splits = len(next(os.walk(input_dir), (input_dir, [], []))[1])
n_splits

In [None]:
# SCARF hyper-parameters. They are forwarded as command-line style arguments
# to the training and embedding functions defined later in this notebook.
scarf_batch_size, scarf_epochs = 32, 10
scarf_lr = 1e-3
scarf_emb_dim = 32
scarf_encoder_depth = 3
scarf_corruption_rate = 0.3

In [None]:
# Zero-based split indices; the batch-mode loops below address folder split_{i+1} for each index i.
splits = range(n_splits)

In [None]:
import clear.pytorch.src as src
from src.utils import set_seed, load_from_yaml, get_features_from_yaml


In [None]:
# Pipeline switches checked by the driver cells further down.
flag_train = False      # gate for the SCARF training step
flag_embedding = True   # gate for the embedding-generation step

# 'full'  -> a single processed.parquet directly under input_dir
# 'batch' -> one processed_<tag>.parquet inside every split_<k> folder
mode = 'full'
# mode = 'batch'

# Directory the trained model is loaded from / directory results are written to.
model_dir = output_dir = 'output'

In [None]:
import argparse
import os
import pandas as pd
import wandb
from src.utils import set_seed, load_from_yaml, get_features_from_yaml
from src.data_processor import DataProcessor
from src.scarf import SCARF
from src.dataloader import ScarfToDataLoader
from train import train_encoder
from dotenv import load_dotenv
# Silence pandas DtypeWarning and every FutureWarning (the second filter is
# not restricted to pandas).
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
import torch
# Set a consistent seed for reproducibility
set_seed(42)
# Load variables from a local .env file (if any) into the environment;
# run_scarf reads WANDB_KEY from the environment.
load_dotenv()


In [None]:

def run_scarf(arguments):
    """Train a SCARF encoder on one dataset file and log the run to Weights & Biases.

    Args:
        arguments: List of command-line style strings (for example
            ``["--epochs=10", "--wandb_project_name=demo"]``) parsed with
            argparse. ``--wandb_project_name`` is required.

    Returns:
        None. When ``--train_data_path`` does not end in ``csv`` or ``parquet``
        (case-insensitive) the function prints ``"wrong data format"`` and
        returns before anything is created on disk or on W&B.

    Notes:
        * The W&B key is taken from the ``WANDB_KEY`` environment variable and
          falls back to ``--wandb_key``.
        * The W&B project is fixed to ``"Scarf"``; ``--wandb_project_name`` is
          used as the *run name*.
        * No DataProcessor step is applied here; the input file is presumably
          already processed (verify against the data pipeline).
    """
    # Argument parsing
    parser = argparse.ArgumentParser(description='Train SCARF model')
    parser.add_argument('--config_dir', default='configs', help='Directory for configuration files')
    parser.add_argument('--output_dir', default='exp', help='Output directory for models and stats')
    parser.add_argument('--train_data_path', default='data/small_train.csv', help='Data directory')
    parser.add_argument('--batch_size', type=int, default=32, help='Batch size for training')
    parser.add_argument('--epochs', type=int, default=1, help='Number of training epochs')
    parser.add_argument('--lr', type=float, default=3e-5, help='Learning rate')
    parser.add_argument('--emb_dim', type=int, default=32, help='Dimensionality of the embedding space')
    parser.add_argument('--encoder_depth', type=int, default=3, help='Depth of the encoder model')
    parser.add_argument('--model_name', type=str, default="scarf", help='Name of saved model')
    parser.add_argument('--corruption_rate', type=float, default=0.3, help='Rate of corruption applied during training')
    # parser.add_argument('--device', type=str, default="cpu")
    parser.add_argument('--wandb_project_name', type=str, required=True, help='Name of wandb project')
    parser.add_argument('--wandb_entity', type=str, default="urbancomp", help='Name of wandb entity')
    parser.add_argument('--wandb_key', type=str)
    args = parser.parse_args(arguments)

    # Reject unsupported formats first, so a bad path never leaves an output
    # directory or an open W&B run behind.
    readers = {'csv': pd.read_csv, 'parquet': pd.read_parquet}
    data_format = args.train_data_path.rsplit('.', 1)[-1].lower()
    if data_format not in readers:
        print("wrong data format")
        return

    # Ensure output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    # Load the configuration that is actually used below (the values only feed
    # the W&B run config).
    target = "EnergyRating"
    energy_config = load_from_yaml(f"{args.config_dir}/energy_config.yaml")
    feature_config = load_from_yaml(f"{args.config_dir}/column_type_classification.yaml")
    features = get_features_from_yaml(feature_config, target)
    energyRatingEncoding = energy_config["original_rating_encoding"]

    # Read the data before opening the W&B run, so an unreadable file fails
    # without creating a dangling run.
    df_train = readers[data_format](args.train_data_path)

    # Environment variable wins; the CLI argument is the fallback.
    wandb_key = os.getenv('WANDB_KEY') or args.wandb_key
    wandb.login(key=wandb_key)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # The context manager finishes the run on exit (also when training raises),
    # so consecutive calls, e.g. one per split, each get a cleanly closed run.
    with wandb.init(
        project="Scarf",
        name=args.wandb_project_name,
        entity=args.wandb_entity,
        config={
            "epochs": args.epochs,
            "batch_size": args.batch_size,
            "lr": args.lr,
            "feature_num": len(features),
            "class_num": len(energyRatingEncoding),
            "features": features,
            "model_save_dir": args.output_dir,
            "model_name": args.model_name,
            "emb_dim": args.emb_dim,
            "encoder_depth": args.encoder_depth,
            "corruption_rate": args.corruption_rate,
        }
    ):
        # Initialize and train the model.
        # input_dim counts every column except one; assumes the frame holds
        # the feature columns plus the single target column (TODO confirm).
        model = SCARF(input_dim=df_train.shape[1]-1, emb_dim=args.emb_dim, encoder_depth=args.encoder_depth, corruption_rate=args.corruption_rate)
        model.to(device)
        train_encoder(df_train,
                      ScarfToDataLoader,
                      model,
                      device=device,
                      target_col=target,
                      batch_size=args.batch_size,
                      lr=args.lr,
                      epochs=args.epochs,
                      model_save_dir=args.output_dir,
                      model_name=args.model_name)


In [None]:
import os
"""The trained scarf model is saved in 
{root_path}/output/split_{split}/scarf.pt if you run the following command:
"""
def train_scarf(splits=5, batch=True, tag='train'):
    """Build the argument list for run_scarf and train SCARF once or once per split.

    Args:
        splits: Either an int ``n`` (meaning split indices ``0..n-1``) or an
            iterable of zero-based split indices. Index ``i`` maps to folder
            ``split_{i+1}``. Only used when ``batch`` is true.
        batch: When true, train one model per split, reading
            ``{input_dir}/split_{k}/processed_{tag}.parquet`` and writing to
            ``{output_dir}/split_{k}``. Otherwise train a single model on
            ``{input_dir}/processed.parquet`` and write to ``{output_dir}/``.
        tag: Suffix of the per-split parquet file name (batch mode only).

    Hyper-parameters and directories are read from the notebook-level
    variables (``config_dir``, ``scarf_*``, ``input_dir``, ``output_dir``).
    """
    arguments = [
          f'--config_dir={config_dir}',
          f"--batch_size={scarf_batch_size}",
          f"--epochs={scarf_epochs}",
          f"--lr={scarf_lr}",
          f"--emb_dim={scarf_emb_dim}",
          f"--encoder_depth={scarf_encoder_depth}",
          f"--model_name={scarf_model_name}",
          f"--corruption_rate={scarf_corruption_rate}",
          f"--wandb_project_name=SCARF_Project",
          f"--wandb_entity=urbancomp",
        ]
    if not batch:
        run_scarf(arguments + [
            f'--output_dir={output_dir}/',
            f'--train_data_path={input_dir}/processed.parquet',
        ])
        return

    # The default (and a natural call style) is an integer count, which cannot
    # be iterated directly; expand it to the indices 0..n-1.
    split_indices = range(splits) if isinstance(splits, int) else splits
    for i in split_indices:
        run_scarf(arguments + [
            f'--output_dir={output_dir}/split_{i+1}',
            f'--train_data_path={input_dir}/split_{i+1}/processed_{tag}.parquet',
        ])

# NOTE(review): this assignment overrides the flag_train value chosen in the
# configuration cell, so training always runs; delete it to honour that setting.
flag_train = True
if flag_train:
    if mode == 'batch':
        # Train on the split indices discovered from input_dir instead of
        # relying on a hard-coded count.
        train_scarf(splits=splits, batch=True)
    elif mode == 'full':
        train_scarf(batch=False)

In [None]:
import argparse
import os
import pandas as pd
import torch
from src.utils import set_seed, load_from_yaml, get_features_from_yaml, load_model
from src.data_processor import DataProcessor
from src.scarf import SCARF
from src.dataloader import ScarfToDataLoader
# Suppress Dtype and Future warnings from pandas
import warnings
import numpy as np



# Silence pandas DtypeWarning and every FutureWarning
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Set a consistent seed for reproducibility
set_seed(42)

def get_scarf_embedding(arguments):
    """Encode one dataset file with a saved SCARF model and store the result as ``.npy``.

    Args:
        arguments: List of command-line style strings parsed with argparse.
            ``--embedding_save_name`` is required; ``--data_path`` must point
            to a ``csv`` or ``parquet`` file. ``--config_dir``, ``--epochs``
            and ``--lr`` are accepted for compatibility with existing callers
            but are not used here.

    Returns:
        None. The embeddings are written to
        ``{output_dir}/{embedding_save_name}.npy``. When the data path is
        missing or has an unsupported extension the function prints
        ``"wrong data format"`` and returns without writing anything.
    """
    # Argument parsing
    parser = argparse.ArgumentParser(description='Generate Embeddings')
    parser.add_argument('--config_dir', default='configs', help='Directory for configuration files')
    parser.add_argument('--model_dir', default='exp', help='Input Model directory for models')
    parser.add_argument('--output_dir', default='exp', help='Output directory for models and stats')
    parser.add_argument('--data_path', type=str, help='Specify the data path for the file you want to convert into SCARF embeddings.')
    parser.add_argument('--batch_size', type=int, default=32, help='Batch size for training')
    parser.add_argument('--epochs', type=int, default=1, help='Number of training epochs')
    parser.add_argument('--lr', type=float, default=3e-5, help='Learning rate')
    parser.add_argument('--emb_dim', type=int, default=32, help='Dimensionality of the embedding space')
    parser.add_argument('--encoder_depth', type=int, default=3, help='Depth of the encoder model')
    parser.add_argument('--model_name', type=str, default="scarf", help='Name of saved model')
    parser.add_argument('--corruption_rate', type=float, default=0.3, help='Rate of corruption applied during training')
    # parser.add_argument('--device', type=str, default="cpu")
    parser.add_argument('--embedding_save_name', type=str, required=True)
    args = parser.parse_args(arguments)

    # --data_path has no default, so treat "not given" like an unsupported
    # format instead of failing on None.
    readers = {'csv': pd.read_csv, 'parquet': pd.read_parquet}
    data_format = (args.data_path or '').rsplit('.', 1)[-1].lower()
    if data_format not in readers:
        print("wrong data format")
        return

    # Ensure output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    target = "EnergyRating"

    # Load the dataset (no DataProcessor step is applied here; the file is
    # presumably already processed; verify against the data pipeline).
    df = readers[data_format](args.data_path)

    # Rebuild the architecture, then restore the saved weights.
    # input_dim counts every column except one; assumes the frame holds the
    # feature columns plus the single target column (TODO confirm).
    model = SCARF(input_dim=df.shape[1]-1, emb_dim=args.emb_dim, encoder_depth=args.encoder_depth, corruption_rate=args.corruption_rate)
    model = load_model(model_dir = args.model_dir,
               model_name= args.model_name,
               model = model,
               device = device)
    # Inputs are moved to `device` below, so make sure the model is there too
    # (a no-op when load_model already placed it).
    model.to(device)
    model.eval()

    dataloader = ScarfToDataLoader(df,
                                    target_col=target,
                                    batch_size=args.batch_size,
                                    shuffle=False).dataloader

    # Move each batch of embeddings to the CPU right away so accelerator
    # memory does not grow with the size of the dataset.
    embeddings = []
    with torch.no_grad():
        for f, _ in dataloader:
            embeddings.append(model.get_embeddings(f.to(device)).cpu())

    embeddings_numpy = torch.cat(embeddings, dim=0).numpy()
    np.save(f"{args.output_dir}/{args.embedding_save_name}.npy", embeddings_numpy)




In [None]:
"""The generated embeddings are saved as a NumPy array in 
output/split_{split}/train.npy for batch mode, 
or
output/processed.npy for full mode:
"""
def get_embeddings(output_dir, model_dir, splits=5, batch=True, tag='train'):
    """Build the argument list for get_scarf_embedding and run it once or once per split.

    Args:
        output_dir: Base directory the ``.npy`` files are written to.
        model_dir: Base directory the trained SCARF model is loaded from.
        splits: Either an int ``n`` (meaning split indices ``0..n-1``) or an
            iterable of zero-based split indices. Index ``i`` maps to folder
            ``split_{i+1}``. Only used when ``batch`` is true.
        batch: When true, process
            ``{input_dir}/split_{k}/processed_{tag}.parquet`` for every split,
            using ``{model_dir}/split_{k}`` and ``{output_dir}/split_{k}``.
            Otherwise process ``{input_dir}/processed.parquet`` with the model
            in ``{model_dir}/``.
        tag: Name of the saved embedding file (``{tag}.npy``) and, in batch
            mode, suffix of the per-split parquet file name.

    Hyper-parameters and ``input_dir`` / ``config_dir`` are read from the
    notebook-level variables.
    """
    arguments = [
          f"--config_dir={config_dir}",

          f"--batch_size={scarf_batch_size}",
          f"--epochs={scarf_epochs}",
          f"--lr={scarf_lr}",
          f"--emb_dim={scarf_emb_dim}",
          f"--encoder_depth={scarf_encoder_depth}",
          f"--model_name={scarf_model_name}",
          f"--corruption_rate={scarf_corruption_rate}",
          f"--embedding_save_name={tag}",
        ]
    if not batch:
        get_scarf_embedding(arguments + [
          f"--output_dir={output_dir}/",
          f"--model_dir={model_dir}/",
          f"--data_path={input_dir}/processed.parquet",
        ])
        return

    # An integer count (the default, and what existing callers pass) cannot be
    # iterated directly; expand it to the indices 0..n-1.
    split_indices = range(splits) if isinstance(splits, int) else splits
    for i in split_indices:
        get_scarf_embedding(arguments + [
          f"--output_dir={output_dir}/split_{i+1}",
          f"--model_dir={model_dir}/split_{i+1}",
          f"--data_path={input_dir}/split_{i+1}/processed_{tag}.parquet",
        ])


if flag_embedding:
    if mode == 'batch':
        # Use the split indices discovered from input_dir rather than a
        # hard-coded count.
        get_embeddings(output_dir, model_dir, splits=splits)
    elif mode == 'full':
        get_embeddings(output_dir, model_dir, batch=False, tag='processed')


In [None]:
# List the contents of the output directory to check what the steps above wrote.
!ls output