## Connect to Drive

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the run's
# model directory defined later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import repo

In [None]:
%rm -rf /content/seq2seq
import getpass
!git clone --branch feature/use_proper_classes_embedding https://{getpass.getpass()}@github.com/JoaoJanini/seq2seq

··········
Cloning into 'learning'...
remote: Enumerating objects: 223, done.[K
remote: Counting objects: 100% (223/223), done.[K
remote: Compressing objects: 100% (146/146), done.[K
remote: Total 223 (delta 108), reused 170 (delta 67), pack-reused 0[K
Receiving objects: 100% (223/223), 241.15 KiB | 6.35 MiB/s, done.
Resolving deltas: 100% (108/108), done.


In [None]:
# Pause for five seconds before moving on to the next cell.
# NOTE(review): presumably a workaround to let the clone settle on disk —
# confirm it is still needed.
!sleep 5

In [None]:
# Change the working directory to `seq2seq`; the relative paths used by the
# following cells (data, requirements.txt) are resolved from here.
%cd seq2seq

/content/learning


In [None]:
%mkdir data

In [None]:
!pip install -r requirements.txt 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Sequence to Sequence

In [None]:
import sys

# Add ./seq2seq to the module search path, once.
path = "./seq2seq"
if path not in sys.path:
  sys.path.append(path)
else:
  # Leading space keeps the path and the message apart in the output
  # ("./seq2seq already in path").
  print(path + " already in path")

In [None]:
# Display the module search path to check that "./seq2seq" is on it.
sys.path

['/content',
 '/env/python',
 '/usr/lib/python37.zip',
 '/usr/lib/python3.7',
 '/usr/lib/python3.7/lib-dynload',
 '',
 '/usr/local/lib/python3.7/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.7/dist-packages/IPython/extensions',
 '/root/.ipython',
 './seq2seq']

In [None]:
from datetime import datetime

# Output folder for this run on the mounted Drive, named with the current
# timestamp (YYYY-MM-DD_HH-MM-SS).
model_directory = (
    "/content/drive/MyDrive/Coding/seq2seq/"
    + format(datetime.now(), "%Y-%m-%d_%H-%M-%S")
)

## Training

In [None]:
from transformers import TrainingArguments, Trainer, logging
from hf_sequence_to_sequence.model import FaciesForConditionalGeneration
from hf_sequence_to_sequence.configuration import FaciesConfig
import torchmetrics
import math
import time
from torch import nn, optim
from torch.optim import Adam
import torch
from torch.utils.data import DataLoader
from dataset.dataset import WellsDataset
from torch.utils.data import random_split
from typing import List
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from utils import compute_metrics_fn, collate_fn, ray_hp_space
from ray.tune.schedulers import PopulationBasedTraining
from ray.tune.search.hyperopt import HyperOptSearch

# compute_metrics_fn (imported from utils above) computes the evaluation metrics
import numpy as np


# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

BATCH_SIZE = 256        # batch size for the DataLoader
SEQUENCE_LEN = 15       # sequence length handed to WellsDataset and the model config
TRAINING_RATIO = 0.95   # share of the training set used for training; the rest is validation

# Feature columns fed to the model.
WIRELINE_LOGS_HEADER = [
    "GR",
    "NPHI",
    "RSHA",
    "DTC",
    "RHOB",
    "SP",
]
# Label column to predict.
LABEL_COLUMN_HEADER = [
    "FORCE_2020_LITHOFACIES_LITHOLOGY",
]

# Training dataset built from the feature and label columns defined above.
train_dataset = WellsDataset(
    dataset_type="train",
    sequence_len=SEQUENCE_LEN,
    model_type="seq2seq",
    feature_columns=WIRELINE_LOGS_HEADER,
    label_columns=LABEL_COLUMN_HEADER,
)

# Sizes reported by the dataset; they drive the split and the model config.
DATA_LEN = train_dataset.train_len
d_input = train_dataset.input_len
d_output = train_dataset.output_len
d_channel = train_dataset.channel_len
# Target vocabulary size: the dataset's output_len plus its special symbols.
tgt_vocab_size = train_dataset.output_len + len(train_dataset.special_symbols)
TRAIN_DATA_LEN = int(DATA_LEN * TRAINING_RATIO)

# Seed the split so the same samples land in train/validation on every run;
# without a generator the partition changes each time the cell is executed,
# which makes validation metrics incomparable between runs.
SPLIT_SEED = 42
train_data, validation_data = random_split(
    train_dataset,
    lengths=[TRAIN_DATA_LEN, DATA_LEN - TRAIN_DATA_LEN],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# collate_fn (imported from utils above) collates data samples into batch tensors


# Keyword arguments for FaciesConfig: vocabulary/label sizes and feature
# dimensions come from the training dataset, the rest are fixed values.
facies_config = dict(
    vocab_size=tgt_vocab_size,
    max_position_embeddings=1024,
    encoder_layers=6,
    encoder_ffn_dim=1024,
    encoder_attention_heads=8,
    decoder_layers=4,
    decoder_ffn_dim=1024,
    decoder_attention_heads=8,
    encoder_layerdrop=0.0,
    decoder_layerdrop=0.0,
    activation_function="relu",
    d_model=512,
    n_input_features=d_channel,
    n_output_features=d_output,
    sequence_len=SEQUENCE_LEN,
    dropout=0.2,
    attention_dropout=0.0,
    activation_dropout=0.0,
    init_std=0.02,
    classifier_dropout=0.0,
    scale_embedding=False,
    use_cache=False,
    num_labels=tgt_vocab_size,
    # Every special-token id below is set to the dataset's PAD_IDX.
    # NOTE(review): confirm that sharing one id for pad/bos/eos is intended.
    pad_token_id=train_dataset.PAD_IDX,
    bos_token_id=train_dataset.PAD_IDX,
    eos_token_id=train_dataset.PAD_IDX,
    is_encoder_decoder=True,
    decoder_start_token_id=train_dataset.PAD_IDX,
    forced_eos_token_id=train_dataset.PAD_IDX,
)

# Build the config, save it under the run directory, then reload it from
# there so the object used below is the one read back from disk.
facies_transformer_config = FaciesConfig(**facies_config)
facies_transformer_config.save_pretrained(f"{model_directory}/facies-transformer-config")
facies_transformer_config = FaciesConfig.from_pretrained(f"{model_directory}/facies-transformer-config")


def model_init(trial):
    """Create a new model from the module-level ``facies_transformer_config``.

    Args:
        trial: Accepted so the function can be called with a trial object;
            it is not used when building the model.

    Returns:
        A newly constructed ``FaciesForConditionalGeneration``.
    """
    model = FaciesForConditionalGeneration(facies_transformer_config)
    return model


# Training settings: evaluate every 500 steps, write outputs under the run
# directory, and keep progress bars off.
training_args = TrainingArguments(
    disable_tqdm=True,
    eval_steps=500,
    evaluation_strategy="steps",
    output_dir=f"{model_directory}/facies-transformer",
)

# No model instance is passed (model=None); the Trainer is given model_init
# to construct the model instead.
trainer = Trainer(
    model=None,
    model_init=model_init,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics_fn,
    train_dataset=train_data,
    eval_dataset=validation_data,
)

In [None]:
# hyperparameter_search returns a BestRun summary (run id, objective value and
# hyperparameters) — not a trained model — so keep it under its own name.
best_run = trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    n_trials=10,
    search_alg=HyperOptSearch(metric="objective", mode="max"),
    hp_space=ray_hp_space,
    local_dir=f"{model_directory}/ray_results",
)

# Apply the winning hyperparameters to the trainer's arguments and train once
# more, so that `best_model` is an actual model exposing state_dict().
for hp_name, hp_value in best_run.hyperparameters.items():
    if not hasattr(trainer.args, hp_name):
        # Not a TrainingArguments field; nothing to set on the trainer.
        continue
    current_value = getattr(trainer.args, hp_name)
    if current_value is not None:
        # Keep the argument's existing type (e.g. int vs. float).
        hp_value = type(current_value)(hp_value)
    setattr(trainer.args, hp_name, hp_value)

trainer.train()
best_model = trainer.model

In [None]:
# Test dataset: same columns and sequence length as training, reusing the
# training dataset's scaler and output_len.
test_dataset = WellsDataset(
    dataset_type="test",
    model_type="seq2seq",
    sequence_len=SEQUENCE_LEN,
    feature_columns=WIRELINE_LOGS_HEADER,
    label_columns=LABEL_COLUMN_HEADER,
    output_len=train_dataset.output_len,
    scaler=train_dataset.scaler,
)

# Iterate the test set in its stored order (no shuffling).
test_loader = DataLoader(
    dataset=test_dataset,
    collate_fn=collate_fn,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

import os

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the weights.
state_dict_dir = f"{model_directory}/facies-transformer"
os.makedirs(state_dict_dir, exist_ok=True)

torch.save(
    best_model.state_dict(),
    f=f"{state_dict_dir}/facies_transformer_state_dict.pt",
)

data structure: [lines, timesteps, features]
train data size: [(76492, 10, 2)]
Number of classes: 12
