### How to prepare your data for the inference task

1. Create a data.csv file in the root/data/JobDescription/ folder

- Each column of data.csv is one section of a job description (JD)
- Each row of data.csv is one JD
- Each cell of data.csv contains the sentences of that section for that JD

2. Change the 'target_cols' list to the columns you want to use before running
3. Run the cells below


In [1]:
# Modules About Hydra
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from hydra.utils import instantiate
from omegaconf import DictConfig, OmegaConf

# Modules About Torch, Numpy
import numpy as np
import torch
import torch.nn.functional as F
import torchmetrics
import torchvision
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split, Dataset
from torchvision import datasets, transforms

# Modules About Pytorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, LightningDataModule
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, ProgressBar
from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS, STEP_OUTPUT

# Modules About Hugging Face Transformers
from transformers import AutoTokenizer, AutoModel, BertForMaskedLM, Trainer

# Modules About Language Pre-processing
import re
from konlpy.tag import Mecab

# Modules About Pandas, Matplotlib, Numpy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Others
from PIL import Image
from typing import List, Any
import sys
import traceback
import yaml
import ruamel.yaml
import wandb
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
'''
Input
data: {'input_ids': [tensor1, tensor2, ...], 'token_type_ids': [tensor1, tensor2, ...], 'attention_mask': [tensor1, tensor2, ...]}
'''


class CustomDataset(Dataset):
    """Map-style dataset over a mapping of index-aligned sequences.

    ``data`` is any object exposing ``keys()`` and ``data[key][index]``
    (for example ``{'input_ids': [...], 'attention_mask': [...]}``).
    Item ``i`` is the list ``[data[k][i] for k in keys]``, with keys in the
    order returned by ``data.keys()`` at construction time.
    """

    def __init__(self, data) -> None:
        super().__init__()

        self.data = data
        # Freeze the key order once so every item has the same field layout.
        self.keys = list(data.keys())

    def __len__(self):
        """Return the number of items, taken from the first key's sequence."""
        if not self.keys:
            # An empty mapping holds no samples (previously an IndexError).
            return 0
        return len(self.data[self.keys[0]])

    def __getitem__(self, index) -> Any:
        """Return the ``index``-th entry of every field, in key order."""
        return [self.data[key][index] for key in self.keys]

In [3]:
class HFBertDataModule(pl.LightningDataModule):
    """Data module that tokenizes CSV text columns for masked-LM training and inference.

    Training data: every column in ``train_target_cols`` is tokenized (without
    truncation), cut into ``max_length`` windows when longer than ``max_length``,
    and split into train/val/test subsets. Masking is applied per batch in the
    collate function.

    Prediction data: for every row, the cells of ``predict_target_cols`` are
    tokenized in column order, so consecutive groups of
    ``len(predict_target_cols)`` samples belong to the same CSV row.

    Args:
        tokenizer: Callable tokenizer; this class uses ``model_max_length``,
            ``pad_token_id``, ``mask_token_id`` and ``vocab_size`` on it.
        train_data_dir: Path passed to ``pd.read_csv`` for training data.
        pred_data_dir: Path passed to ``pd.read_csv`` for prediction data.
        max_batch_size: Batch size for train/val/test loaders and the upper
            bound for the prediction batch size.
        predict_target_cols: CSV columns used for prediction.
        train_target_cols: CSV columns used for training.
        max_length: Window length in tokens; defaults to
            ``tokenizer.model_max_length`` when falsy.
        sliding_window_interval: Step between consecutive window starts.
        train_test_ratio: Fraction of samples kept for train+val.
        train_val_ratio: Fraction of train+val samples used for training.
        masked_token_ratio: Fraction of each sample's tokens selected for masking.
    """

    def __init__(
        self,
        tokenizer,
        train_data_dir=None,
        pred_data_dir=None,
        max_batch_size=64,
        predict_target_cols=[],
        train_target_cols=[],
        max_length=None,
        sliding_window_interval=200,
        train_test_ratio=0.9,
        train_val_ratio=0.8,
        masked_token_ratio=0.15
    ) -> None:
        super().__init__()
        self.predict_target_cols = predict_target_cols
        self.train_target_cols = train_target_cols
        self.train_data_dir = train_data_dir
        self.pred_data_dir = pred_data_dir

        self.train_test_ratio = train_test_ratio
        self.train_val_ratio = train_val_ratio

        self.batch_size = max_batch_size
        # A prediction batch must hold whole rows, i.e. a multiple of the number
        # of predict columns. Always define it, and never let it drop to 0 when
        # there are more columns than max_batch_size.
        n_predict_cols = len(predict_target_cols)
        if n_predict_cols:
            self.predict_batch_size = max(
                n_predict_cols,
                (max_batch_size // n_predict_cols) * n_predict_cols)
        else:
            self.predict_batch_size = max_batch_size

        # load Bert Tokenizer
        self.tokenizer = tokenizer

        if max_length:
            self.max_length = max_length
        else:
            self.max_length = tokenizer.model_max_length

        self.sliding_window_interval = sliding_window_interval

        self.masked_token_ratio = masked_token_ratio

        # Set once the train/val/test split exists so later setup() calls reuse it.
        self._train_split_ready = False

    @staticmethod
    def _read_csv_or_none(path, missing_message):
        """Read ``path`` with pandas; print ``missing_message`` and return None on failure."""
        try:
            return pd.read_csv(path)
        except Exception:
            # Best effort: a missing/unreadable file just disables that data
            # source. Unlike a bare ``except``, KeyboardInterrupt still propagates.
            print(missing_message)
            return None

    def prepare_data(self) -> None:
        """Load the prediction CSV and tokenize the selected columns row by row."""
        # load predict data
        self.predict_data_pd = self._read_csv_or_none(
            self.pred_data_dir, 'No inference data available!')

        if self.predict_data_pd is None or not self.predict_target_cols:
            return

        # serialize columns: row-major, so each row's cells stay adjacent
        selected = self.predict_data_pd[list(self.predict_target_cols)]
        predict_data_serialized = []
        for row_cells in selected.values.tolist():
            predict_data_serialized.extend(row_cells)

        # make tokens
        self.predict_tokens = self.tokenizer(
            predict_data_serialized, return_tensors='pt', padding=True, truncation=True)

        # make predict dataset
        self.predict_dataset = CustomDataset(self.predict_tokens)
        self.predict_token_keys = self.predict_tokens.keys()

    def setup(self, stage: str) -> None:
        """Load, tokenize, window and split the training CSV.

        The split is built only once: setup can be invoked again for a later
        stage (e.g. test after fit), and re-running ``random_split`` there would
        draw a new random test subset that overlaps the data already trained on.
        """
        if self._train_split_ready:
            return

        # load train data
        self.train_data_pd = self._read_csv_or_none(
            self.train_data_dir, 'No training data available!')

        if self.train_data_pd is None or not self.train_target_cols:
            return

        # serialize columns: column-major, one sample per cell
        train_data_serialized = []
        for col in self.train_target_cols:
            train_data_serialized += list(self.train_data_pd[col])

        # make tokens (no truncation: long samples are windowed below)
        self.train_tokens = self.tokenizer(
            train_data_serialized, return_tensors='pt', padding=True)
        self.train_token_keys = self.train_tokens.keys()

        # slicing tokens by a sliding window
        current_token_length = self.train_tokens['input_ids'].shape[1]
        if current_token_length > self.max_length:
            self.train_tokens_sliced = self._make_sliced_tokens(
                self.train_tokens, current_token_length)
        else:
            self.train_tokens_sliced = self.train_tokens

        # make train dataset
        train_dataset = CustomDataset(self.train_tokens_sliced)

        # split train val test datasets
        self.train_dataset, self.val_dataset, self.test_dataset = random_split(
            train_dataset,
            [
                self.train_test_ratio * self.train_val_ratio,
                self.train_test_ratio * (1 - self.train_val_ratio),
                1 - self.train_test_ratio
            ]
        )
        self._train_split_ready = True

    @staticmethod
    def _stack_by_key(batch, keys):
        """Turn a list of per-sample field lists into ``{key: stacked tensor}``."""
        columns = list(zip(*batch))
        return {key: torch.stack(columns[i]) for i, key in enumerate(keys)}

    def _collate_fn_predict(self, batch):
        '''
        Inputs
        batch: [[tensor1_1, tensor1_2, tensor1_3], [tensor2_1, tensor2_2, tensor2_3], ...]
        self.predict_token_keys: ['input_ids', 'token_type_ids', 'attention_mask']

        Output
        dict_by_keys: {'input_ids': stacked tensor, 'token_type_ids': stacked tensor, 'attention_mask': stacked tensor}
        '''
        return self._stack_by_key(batch, self.predict_token_keys)

    def _collate_fn_train(self, batch):
        """Stack a training batch and apply masking.

        Adds a ``labels`` entry (a clone of ``input_ids``); ``_make_tokens`` then
        rewrites both ``input_ids`` and ``labels`` in place, one sample at a time.
        """
        dict_by_keys = self._stack_by_key(batch, self.train_token_keys)
        dict_by_keys['labels'] = dict_by_keys['input_ids'].clone()

        for i, tokens in enumerate(dict_by_keys['input_ids']):
            self._make_tokens(
                tokens, dict_by_keys['labels'][i], self.masked_token_ratio)

        return dict_by_keys

    def _make_sliced_tokens(self, tokens, tokens_length):
        """Cut every padded sample into ``max_length`` windows.

        Windows start at 0 and advance by ``sliding_window_interval``. A sample
        stops producing windows once a window ends on a padding token or the
        next window would run past ``tokens_length``; a trailing remainder
        shorter than a full window is not emitted.

        Returns:
            dict mapping each key of ``self.train_token_keys`` to a list of
            window tensors, index-aligned across keys.
        """
        keys = list(self.train_token_keys)
        train_tokens_sliced = {key: [] for key in keys}
        pad_token_id = self.tokenizer.pad_token_id

        # Padding must be detected on the token ids themselves; the previous
        # code tested whichever field the inner loop happened to visit last.
        for row, input_ids in enumerate(tokens['input_ids']):
            window_index = 0
            while window_index + self.max_length <= tokens_length:
                window_end = window_index + self.max_length
                for key in keys:
                    train_tokens_sliced[key].append(
                        tokens[key][row][window_index:window_end])

                if input_ids[window_end - 1] == pad_token_id:
                    # The rest of this sample is padding only.
                    break
                window_index += self.sliding_window_interval

        return train_tokens_sliced

    def _make_tokens(self, tensor1, tensor2, mask_token_ratio, masking_ratio=(0.8, 0.1, 0.1)):
        """Apply masked-LM corruption to one sample, in place.

        Args:
            tensor1: Token ids of one sample; selected positions are overwritten.
            tensor2: Labels of the same sample; set to -100 everywhere except
                the selected positions, which keep the original token id.
            mask_token_ratio: Fraction of the non-padding tokens (excluding the
                first and last one) to select. Positions are drawn with
                replacement, so duplicates can reduce the effective count.
            masking_ratio: ``[mask, random, keep]`` proportions for the selected
                positions; must sum to 1.

        Raises:
            ValueError: If ``masking_ratio`` does not sum to 1.
        """
        if abs(sum(masking_ratio) - 1) > 1e-6:
            raise ValueError('masking_ratio must sum to 1')

        # Length of the leading run of non-padding tokens.
        pad_positions = (tensor1 == self.tokenizer.pad_token_id).nonzero()
        if len(pad_positions):
            token_len = pad_positions[0].item()
        else:
            token_len = len(tensor1)

        tensor2[:] = -100

        num_to_mask = int((token_len - 2) * mask_token_ratio)
        if num_to_mask <= 0:
            # Nothing to corrupt: labels stay fully ignored (-100).
            return

        # Candidate positions skip the first and last non-padding token.
        masked_tokens = torch.tensor(
            np.random.choice(range(1, token_len - 1), num_to_mask), dtype=torch.long)
        # One draw in [1, 100] per selected position decides its treatment.
        token_types = torch.randint_like(masked_tokens, 1, 101)

        original_ids = tensor1[masked_tokens]
        tensor2[masked_tokens] = original_ids

        random_ids = torch.randint(
            0, self.tokenizer.vocab_size, (len(masked_tokens),))
        mask_cutoff = int(masking_ratio[0] * 100)
        random_cutoff = int((1 - masking_ratio[2]) * 100)
        tensor1[masked_tokens] = torch.where(
            token_types <= mask_cutoff,
            self.tokenizer.mask_token_id,
            torch.where(token_types <= random_cutoff, random_ids, original_ids))

    def train_dataloader(self) -> TRAIN_DATALOADERS:
        """Return the training loader (masking applied in the collate function)."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, collate_fn=self._collate_fn_train)

    def val_dataloader(self) -> EVAL_DATALOADERS:
        """Return the validation loader (masking applied in the collate function)."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=self._collate_fn_train)

    def test_dataloader(self) -> EVAL_DATALOADERS:
        """Return the test loader (masking applied in the collate function)."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, collate_fn=self._collate_fn_train)

    def predict_dataloader(self) -> EVAL_DATALOADERS:
        """Return the prediction loader; batches hold whole CSV rows."""
        return DataLoader(self.predict_dataset, batch_size=self.predict_batch_size, collate_fn=self._collate_fn_predict)


In [4]:
class HFBertTask(pl.LightningModule):
    """Lightning task wrapping a masked-LM model for training and an encoder for prediction.

    Args:
        tokenizer: Tokenizer used to locate ``[MASK]`` positions and decode ids.
        optimizer: Optimizer instance returned from ``configure_optimizers``.
        lr_scheduler: Optional scheduler (or Lightning scheduler dict).
        predict_model: Model called in ``predict_step``; element ``[1]`` of its
            output is used as the per-sample embedding.
        train_model: Model called in train/val/test steps; its output must
            expose ``loss`` (and ``logits`` for ``test_step``).
        predict_target_cols: Columns per CSV row used for prediction; defines
            how many consecutive embeddings are concatenated.
        train_target_cols: Columns used for training.
    """

    def __init__(self, tokenizer, optimizer=None, lr_scheduler=None, predict_model=None,
                 train_model=None, predict_target_cols=[],
                 train_target_cols=[]) -> None:
        super().__init__()
        self.predict_target_cols = predict_target_cols
        self.train_target_cols = train_target_cols

        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self.tokenizer = tokenizer

        self.save_hyperparameters(
            "optimizer", "tokenizer",
            "lr_scheduler", "predict_target_cols",
            "train_target_cols")

        self.predict_model = predict_model
        self.train_model = train_model
        # Per-step losses collected for the epoch-level averages.
        self.training_step_outputs = []
        self.validation_step_outputs = []

        self.acc_func = None

    def _run_train_model(self, batch):
        """Run ``train_model`` on ``batch``; return None when training is not configured.

        Both a model and at least one training column are required; the
        previous ``not (a or b)`` check only triggered when both were missing.
        """
        if self.train_model is None or not self.train_target_cols:
            print('No train_model or train_target_cols available!')
            return None
        return self.train_model(**batch)

    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        """Compute and log the masked-LM training loss for one batch."""
        outputs = self._run_train_model(batch)
        if outputs is None:
            return

        # Keep only a detached copy so the autograd graph of every step is not
        # held alive until the end of the epoch.
        self.training_step_outputs.append(
            {'train_loss': outputs.loss.detach()})
        self.log_dict({'train_loss': outputs.loss}, prog_bar=True)

        return outputs.loss

    def validation_step(self, batch, batch_idx) -> STEP_OUTPUT | None:
        """Compute and log the masked-LM validation loss for one batch."""
        outputs = self._run_train_model(batch)
        if outputs is None:
            return

        self.validation_step_outputs.append(
            {'val_loss': outputs.loss.detach()})
        self.log_dict({'val_loss': outputs.loss}, prog_bar=True)

        return outputs.loss

    def on_validation_epoch_end(self):
        """Log and print the average train/validation loss, then reset the buffers."""
        if not (self.training_step_outputs and self.validation_step_outputs):
            # Validation ran without any training steps (or produced nothing):
            # drop those losses so they do not leak into the next average.
            self.validation_step_outputs.clear()
            return

        train_avg_loss = torch.stack(
            [x["train_loss"] for x in self.training_step_outputs]).mean()
        self.log_dict({"train_avg_loss": train_avg_loss})

        val_avg_loss = torch.stack(
            [x["val_loss"] for x in self.validation_step_outputs]).mean()
        self.log_dict({"val_avg_loss": val_avg_loss})

        print("\n" +
              (f'Epoch {self.current_epoch}, Avg. Training Loss: {train_avg_loss:.3f} ' +
               f'Avg. Validation Loss: {val_avg_loss:.3f}'), flush=True)

        self.training_step_outputs.clear()
        self.validation_step_outputs.clear()

    def test_step(self, batch, batch_idx) -> None:
        """Log the test loss and print predictions for a random half of the batch.

        For each sampled sentence, prints the decoded predictions at its
        ``[MASK]`` positions next to the decoded (masked) input.
        """
        outputs = self._run_train_model(batch)
        if outputs is None:
            return

        self.log_dict({'test_loss': outputs.loss}, prog_bar=True)

        sentence_index, mask_token_index = (
            batch['input_ids'] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)

        # One (possibly empty) prediction list per sentence, indexed by the
        # sentence's position in the batch so it lines up with the decoded
        # inputs even when some sentences contain no [MASK].
        batch_size = len(batch['input_ids'])
        predicted_token_id = [[] for _ in range(batch_size)]
        predicted_ids = outputs.logits[sentence_index,
                                       mask_token_index].argmax(dim=-1)
        for sentence, token_id in zip(sentence_index.tolist(), predicted_ids.tolist()):
            predicted_token_id[sentence].append(
                self.tokenizer.decode(token_id))

        original_token_id = self.tokenizer.batch_decode(batch['input_ids'])

        random_numbers = torch.randint(
            low=0, high=batch_size, size=(batch_size // 2,))
        for i in random_numbers.tolist():
            print(predicted_token_id[i], original_token_id[i])

    def predict_step(self, batch, batch_idx, dataloader_idx: int = 0):
        """Return one concatenated embedding per CSV row.

        The batch holds ``len(predict_target_cols)`` consecutive samples per
        row; their embeddings (``outputs[1]``) are concatenated in order.
        Returns None when no predict model or predict columns are configured.
        """
        if self.predict_model is None or not self.predict_target_cols:
            print('No predict_model or predict_target_cols available!')
            return None

        outputs = self.predict_model(**batch)
        # these are the sentence embedding vectors (768 dim each)
        pooler_outputs = outputs[1]

        cols_per_row = len(self.predict_target_cols)
        n_rows = len(pooler_outputs) // cols_per_row
        # Concatenating sentence embedding vectors from a job description:
        # (n_rows * cols, dim) -> (n_rows, cols * dim); incomplete trailing
        # groups are dropped, as before.
        return pooler_outputs[:n_rows * cols_per_row].reshape(
            n_rows, cols_per_row * pooler_outputs.shape[-1])

    def configure_optimizers(self):
        """Return the optimizer, plus the LR scheduler when one was provided."""
        if self.lr_scheduler is None:
            return self.optimizer
        return [self.optimizer], [self.lr_scheduler]


In [5]:
def generate_train_func(cfg):
    """Build a zero-argument ``train`` function bound to ``cfg``.

    The returned function can be called directly or handed to ``wandb.agent``.
    It reads ``global.yaml`` (version/sweep counters) and ``config.yaml`` from
    the working directory, writes config snapshots under ``./configs`` and
    ``./checkpoints``, trains and tests every model in ``cfg.models``, and
    increments ``next_sweep_count`` in ``global.yaml`` on success.

    Any exception inside ``train`` is printed with its traceback and swallowed
    so that one failed sweep run does not abort the remaining ones.
    """
    def find_key(cfg, query, new_value):
        """Set the first key named ``query`` (depth-first) to ``new_value``.

        Returns True when a key was found and updated, False otherwise.
        """
        for key, value in cfg.items():
            if key == query:
                cfg[key] = new_value
                return True
            elif isinstance(value, DictConfig):
                if find_key(value, query, new_value):
                    return True
        return False

    def train():
        try:
            # Set Constant
            CHECKPOINT_PATH = "./checkpoints"
            CONFIGS_PATH = "./configs"

            # Make Folder If Not Exists
            os.makedirs(CONFIGS_PATH, exist_ok=True)

            # Get Global Version Info
            with open("global.yaml", "r", encoding="utf-8") as f:
                global_data = ruamel.yaml.safe_load(f)
            version_count = global_data["next_version_count"]
            sweep_count = global_data["next_sweep_count"]
            run_dir = f"{CHECKPOINT_PATH}/v{version_count}_s{sweep_count}"

            # Initalize Wandb
            if "name" in cfg.train.logger:
                name = cfg.train.logger.name + f"_s{sweep_count}"
            else:
                name = f"v{version_count}_s{sweep_count}"

            logger = instantiate(
                cfg.train.logger, name=name)

            # Get config.yaml file
            with open("config.yaml", "r", encoding="utf-8") as f:
                cfg_data = ruamel.yaml.safe_load(f)

            # Save Version Config Info On Configs Folder
            if sweep_count == 0:
                with open(f"{CONFIGS_PATH}/version_{version_count}_config.yaml",
                          "w", encoding="utf-8") as f:
                    ruamel.yaml.dump(cfg_data, f, Dumper=ruamel.yaml.RoundTripDumper,
                                     allow_unicode=True)

            # Set Sweeping Setting: override matching config keys with the
            # values chosen by the sweep
            for key, item in wandb.config.items():
                if not find_key(cfg, key, item):
                    print(
                        f"key: {key} in your sweeping configuration was not found in your configuration")

            # Get Universal Configuration
            tokenizer = instantiate(cfg.universal.tokenizer.train)
            train_target_cols = cfg.universal.train_target_cols
            predict_target_cols = cfg.universal.predict_target_cols

            # Load Data Module
            data_module = HFBertDataModule(
                **cfg.train_data, tokenizer=tokenizer, train_target_cols=train_target_cols)

            # Load Model Configuration
            # NOTE(review): dir() on the config node is relied on to list the
            # model names — confirm it yields exactly the keys of cfg.models.
            models = [(instantiate(cfg.models[model].train), instantiate(cfg.models[model].pred))
                      for model in dir(cfg.models)]

            # Add Callbacks
            cfg_callbacks = cfg.train.callbacks
            callbacks = [
                ModelCheckpoint(**cfg_callbacks.checkpoint_callback,
                                dirpath=f"{run_dir}/"),
                EarlyStopping(**cfg_callbacks.early_stop_callback),
            ]

            # Train
            for train_model, predict_model in models:
                # Set Optimizer
                optimizer = instantiate(
                    cfg.task.optimizer, params=train_model.parameters())

                # Set Lr Scheduler If exists
                if cfg.task.lr_scheduler.scheduler._target_ is not None:
                    lr_scheduler = {
                        "scheduler": instantiate(
                            cfg.task.lr_scheduler.scheduler, optimizer=optimizer),
                        "interval": cfg.task.lr_scheduler.interval,
                    }
                else:
                    lr_scheduler = None

                # Define Task
                cfg_task = OmegaConf.to_container(cfg.task)
                cfg_task.pop("optimizer")
                # Always drop the raw scheduler config: lr_scheduler is passed
                # explicitly below, so leaving the key in (as happened when no
                # scheduler was configured) raised a duplicate-keyword TypeError.
                cfg_task.pop("lr_scheduler", None)
                task = HFBertTask(**cfg_task,
                                  train_model=train_model, predict_model=predict_model,
                                  tokenizer=tokenizer,
                                  optimizer=optimizer, lr_scheduler=lr_scheduler,
                                  train_target_cols=train_target_cols,
                                  predict_target_cols=predict_target_cols)

                # Train and Test
                trainer = pl.Trainer(**cfg.train.trainer,
                                     callbacks=callbacks, logger=logger)
                trainer.fit(task, data_module)
                trainer.test(task, datamodule=data_module)
                # Fixed file name: each model/run overwrites the previous file.
                trainer.save_checkpoint(f"{CHECKPOINT_PATH}/best_model.ckpt")

            # Save Version Config Info On Checkpoints Folder
            # (the run directory may not exist yet if no checkpoint was written)
            os.makedirs(run_dir, exist_ok=True)
            with open(f"{run_dir}/version_config.yaml",
                      "w", encoding="utf-8") as f:
                ruamel.yaml.dump(cfg_data, f, Dumper=ruamel.yaml.RoundTripDumper,
                                 allow_unicode=True)

            # Set Sweep Info
            global_data["next_sweep_count"] += 1
            with open("global.yaml", "w", encoding="utf-8") as f:
                ruamel.yaml.dump(global_data, f, Dumper=ruamel.yaml.RoundTripDumper,
                                 allow_unicode=True)
        except Exception:
            print("An error occurred:")
            print(traceback.format_exc())
            return
        finally:
            # Finish wandb on success and on failure alike
            if cfg.train.logger._target_ == "pytorch_lightning.loggers.WandbLogger":
                wandb.finish()
    return train


# Compose the Hydra configuration object from ./config.yaml
with initialize(version_base=None, config_path="./"):
    cfg = compose(config_name="config.yaml")

if not cfg.is_sweep_enable:
    # Plain single run: build the train function and call it once
    generate_train_func(cfg)()
else:
    # Register the sweep with wandb and keep its ID
    sweep_id = wandb.sweep(
        OmegaConf.to_container(cfg.sweep),
        project=cfg.logger.project,
    )

    # Let the wandb agent drive the train function
    wandb.agent(sweep_id, function=generate_train_func(cfg))


# Advance to the next version and reset the sweep counter in global.yaml
with open("global.yaml", "r", encoding="utf-8") as f:
    global_data = ruamel.yaml.safe_load(f)

global_data["next_version_count"] = global_data["next_version_count"] + 1
global_data["next_sweep_count"] = 0

with open("global.yaml", "w", encoding="utf-8") as f:
    ruamel.yaml.dump(
        global_data,
        f,
        Dumper=ruamel.yaml.RoundTripDumper,
        allow_unicode=True,
    )

[34m[1mwandb[0m: Currently logged in as: [33mstarinhoo[0m ([33maivengersteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expec

No inference data available!


Token indices sequence length is longer than the specified maximum sequence length for this model (2457 > 512). Running this sequence through the model will result in indexing errors

  | Name          | Type            | Params
--------------------------------------------------
0 | predict_model | BertModel       | 110 M 
1 | train_model   | BertForMaskedLM | 110 M 
--------------------------------------------------
221 M     Trainable params
0         Non-trainable params
221 M     Total params
885.073   Total estimated model params size (MB)


Epoch 0: 100%|██████████| 8199/8199 [1:45:06<00:00,  1.30it/s, v_num=lbjg, train_loss=0.0378] 
Epoch 0, Avg. Training Loss: 0.060 Avg. Validation Loss: 0.048
Epoch 1: 100%|██████████| 8199/8199 [1:44:07<00:00,  1.31it/s, v_num=lbjg, train_loss=0.0324, val_loss=0.0475] 
Epoch 1, Avg. Training Loss: 0.050 Avg. Validation Loss: 0.043
Epoch 2:  14%|█▍        | 1153/8199 [18:09<1:51:00,  1.06it/s, v_num=lbjg, train_loss=0.0417, val_loss=0.0435]

wandb: Network error (ConnectionError), entering retry loop.


Epoch 2: 100%|██████████| 8199/8199 [14:01:50<00:00,  6.16s/it, v_num=lbjg, train_loss=0.0397, val_loss=0.0435]   
Epoch 2, Avg. Training Loss: 0.045 Avg. Validation Loss: 0.040
Epoch 2: 100%|██████████| 8199/8199 [14:08:28<00:00,  6.21s/it, v_num=lbjg, train_loss=0.0397, val_loss=0.0396]

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|██████████| 8199/8199 [14:08:33<00:00,  6.21s/it, v_num=lbjg, train_loss=0.0397, val_loss=0.0396]
No inference data available!
Testing DataLoader 0:   0%|          | 0/1139 [00:00<?, ?it/s]['##심', '##의', '##의학', '##지원'] [CLS] ㆍ [MASK]의학과 소프트웨어 ( Q - Dose 소프트웨어 ) 임상, 교육 및 사용자 교육 ㆍ [MASK] [MASK]과 소프트웨어 ( Q - Dose 소프트웨어 ) 소프트웨어 유지보수 및 기술 [MASK] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
test_loss,▁
train_avg_loss,█▃▁
train_loss,▄█▂▃▃▅▄▂▃▂▂▂▃▂▂▃▂▂▂▂▂▂▄▂▂▃▂▃▃▂▂▁▄▂▂▂▂▅▃▄
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_avg_loss,█▄▁
val_loss,█▄▁

0,1
epoch,3.0
test_loss,0.04015
train_avg_loss,0.04491
train_loss,0.05818
trainer/global_step,24597.0
val_avg_loss,0.03963
val_loss,0.03962
