### How to prepare your data for inference task

1. make data.csv file in root/data/JobDescription/ folder

- data.csv has columns of JD sections
- data.csv has rows of JDs
- data.csv has cells of sentences

2. change 'target_cols' list before running
3. run


In [2]:
# Modules About Hydra
# from PIL import Image
from typing import List, Any
# from hydra import initialize, initialize_config_module, initialize_config_dir, compose
# from omegaconf import DictConfig

# Modules About Torch, Numpy
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
# from torchvision import datasets, transforms

# Modules About Pytorch Lightning
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
# from lightning.pytorch.loggers import WandbLogger, TensorBoardLogger
from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS, STEP_OUTPUT

# Modules About Hugging Face Transformers
from transformers import AutoTokenizer, AutoModel, BertForMaskedLM, LongformerForMaskedLM, Trainer

# Modules About Pandas, Matplotlib, Numpy
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

# Modules About Language Pre-processing
# import re
# from konlpy.tag import Mecab

# Others
import os
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
'''
Input
data: {'input_ids': [tensor1, tensor2, ...], 'token_type_ids': [tensor1, tensor2, ...], 'attention_mask': [tensor1, tensor2, ...]}
'''


class CustomDataset(Dataset):
    def __init__(self, data) -> None:
        super().__init__()

        self.data = data
        self.keys = list(data.keys())

    def __len__(self):
        return len(self.data[self.keys[0]])

    def __getitem__(self, index) -> Any:
        item = []
        for key in self.keys:
            item.append(self.data[key][index])

        return item


In [11]:
class HFBertDataModule(pl.LightningDataModule):
    def __init__(
        self,
        tokenizer,
        max_batch_size=64,
        data_dir='../data/',
        predict_target_cols=[],
        train_target_cols=[],
        max_length=None,
        sliding_window_interval=200,
        train_test_ratio=0.9,
        train_val_ratio=0.8,
        masked_token_ratio=0.15
    ) -> None:
        super().__init__()
        self.predict_target_cols = predict_target_cols
        self.train_target_cols = train_target_cols
        self.data_dir = data_dir

        self.train_test_ratio = train_test_ratio
        self.train_val_ratio = train_val_ratio

        self.batch_size = max_batch_size
        if predict_target_cols:
            self.predict_batch_size = int(
                max_batch_size / len(predict_target_cols)) * len(predict_target_cols)

        # load Bert Tokenizer
        self.tokenizer = tokenizer

        if max_length:
            self.max_length = max_length
        else:
            self.max_length = tokenizer.model_max_length

        self.sliding_window_interval = sliding_window_interval

        self.masked_token_ratio = 0.15

    def prepare_data(self) -> None:
        # load predict data
        try:
            self.predict_data_pd = pd.read_csv(
                self.data_dir)
        except:
            print('No inference data available!')

        if self.predict_data_pd is not None and self.predict_target_cols:
            # serialize columns
            predict_data_serialized = []
            for row in range(len(self.predict_data_pd)):
                for col in self.predict_target_cols:
                    predict_data_serialized.append(
                        self.predict_data_pd.iloc[row][col])

            # make tokens
            self.predict_tokens = self.tokenizer(
                predict_data_serialized, return_tensors='pt', padding=True, truncation=True)

            # make predict dataset
            self.predict_dataset = CustomDataset(self.predict_tokens)
            self.predict_token_keys = self.predict_tokens.keys()

    def setup(self, stage: str) -> None:
        # load train data
        try:
            self.train_data_pd = pd.read_csv(
                self.data_dir)
        except:
            print('No training data available!')
            self.train_data_pd = None

        if self.train_data_pd is not None and self.train_target_cols:
            # serialize columns
            train_data_serialized = []
            for col in self.train_target_cols:
                train_data_serialized += list(self.train_data_pd[col])

            # make tokens
            self.train_tokens = self.tokenizer(
                train_data_serialized, return_tensors='pt', padding=True)
            self.train_token_keys = self.train_tokens.keys()

            # slicing tokens by a sliding window
            current_token_length = self.train_tokens['input_ids'].shape[1]
            if current_token_length > self.max_length:
                self.train_tokens_sliced = self._make_sliced_tokens(
                    self.train_tokens, current_token_length)
            else:
                self.train_tokens_sliced = self.train_tokens

            # make train dataset
            train_dataset = CustomDataset(self.train_tokens_sliced)

            # split train val test datasets
            self.train_dataset, self.val_dataset, self.test_dataset = random_split(
                train_dataset,
                [
                    self.train_test_ratio * self.train_val_ratio,
                    self.train_test_ratio * (1 - self.train_val_ratio),
                    1 - self.train_test_ratio
                ]
            )

    def _collate_fn_predict(self, batch):
        '''
        Inputs
        batch: [[tensor1_1, tensor1_2, tensor1_3], [tensor2_1, tensor2_2, tensor2_3], ...]
        self.predict_token_keys: ['input_ids', 'token_type_ids', 'attention_mask']

        Output
        dict_by_keys: {'input_ids': [tensor1_1, tensor2_1, ...], 'token_type_ids': [tensor1_2, tensor2_2, ...], 'attention_mask': [tensor1_3, tensor2_3, ...]}
        '''
        list_by_keys = list(zip(*batch))
        dict_by_keys = {}
        for i, key in enumerate(self.predict_token_keys):
            dict_by_keys[key] = torch.stack(list_by_keys[i])

        return dict_by_keys

    def _collate_fn_train(self, batch):
        list_by_keys = list(zip(*batch))
        dict_by_keys = {}
        for i, key in enumerate(self.train_token_keys):
            dict_by_keys[key] = torch.stack(list_by_keys[i])

        dict_by_keys['labels'] = dict_by_keys['input_ids'].clone()

        for i, tokens in enumerate(dict_by_keys['input_ids']):
            self._make_tokens(
                tokens, dict_by_keys['labels'][i], self.masked_token_ratio)

        return dict_by_keys

    def _make_sliced_tokens(self, tokens, tokens_length):
        train_tokens_sliced = {}
        for key in self.train_token_keys:
            train_tokens_sliced[key] = []

        for i in range(len(tokens[key])):
            window_index = 0
            while True:
                if window_index + self.max_length <= tokens_length:
                    for key in self.train_token_keys:
                        train_tokens_sliced[key].append(
                            tokens[key][i][window_index:window_index + self.max_length])

                    if tokens[key][i][window_index + self.max_length - 1] != self.tokenizer.pad_token_id:
                        window_index += self.sliding_window_interval
                        continue
                break

        return train_tokens_sliced

    def _make_tokens(self, tensor1, tensor2, mask_token_ratio, masking_ratio=[0.8, 0.1, 0.1]):
        assert sum(masking_ratio) == 1

        token_len = 0
        for token in tensor1:
            if token != self.tokenizer.pad_token_id:
                token_len += 1
                continue
            break

        masked_tokens = torch.tensor(np.random.choice(range(
            1, token_len - 1), int((token_len - 2) * mask_token_ratio)), dtype=torch.int)
        token_types = torch.randint_like(masked_tokens, 1, 101)

        tensor2[:] = -100
        tensor2[masked_tokens] = tensor1[masked_tokens]
        # tensor1[masked_tokens] = torch.where(token_types <= int(masking_ratio[0] * 100), self.tokenizer.mask_token_id, torch.where(token_types <= int((1 - masking_ratio[2]) * 100), torch.randint(0, self.tokenizer.vocab_size, (len(masked_tokens),)), tensor1[masked_tokens]))
        tensor1[masked_tokens] = self.tokenizer.mask_token_id

    def train_dataloader(self) -> TRAIN_DATALOADERS:
        return DataLoader(self.train_dataset, batch_size=self.batch_size, collate_fn=self._collate_fn_train)

    def val_dataloader(self) -> EVAL_DATALOADERS:
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=self._collate_fn_train)

    def test_dataloader(self) -> EVAL_DATALOADERS:
        return DataLoader(self.test_dataset, batch_size=self.batch_size, collate_fn=self._collate_fn_train)

    def predict_dataloader(self) -> EVAL_DATALOADERS:
        return DataLoader(self.predict_dataset, batch_size=self.predict_batch_size, collate_fn=self._collate_fn_predict)


In [5]:
class HFBertTask(pl.LightningModule):
    def __init__(self, tokenizer, predict_model=None, train_model=None, predict_target_cols=[], train_target_cols=[]) -> None:
        super().__init__()
        self.predict_target_cols = predict_target_cols
        self.train_target_cols = train_target_cols

        self.tokenizer = tokenizer

        self.predict_model = predict_model
        self.train_model = train_model
        self.training_step_outputs = []
        self.validation_step_outputs = []

        self.acc_func = None

    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        if not (self.train_model or self.train_target_cols):
            print('No train_model or train_target_cols available!')
            return

        outputs = self.train_model(**batch)

        metrics = {
            'train_loss': outputs.loss
        }
        self.training_step_outputs.append(metrics)
        self.log_dict(metrics, prog_bar=True)

        return outputs.loss

    def validation_step(self, batch, batch_idx) -> STEP_OUTPUT | None:
        if not (self.train_model or self.train_target_cols):
            print('No train_model or train_target_cols available!')
            return

        outputs = self.train_model(**batch)

        metrics = {
            'val_loss': outputs.loss
        }
        self.validation_step_outputs.append(metrics)
        self.log_dict(metrics, prog_bar=True)

        return outputs.loss

    def on_validation_epoch_end(self):
        if not (self.training_step_outputs and self.validation_step_outputs):
            return

        train_avg_loss = torch.stack([x["train_loss"]
                                      for x in self.training_step_outputs]).mean()
        metrics = {
            "train_avg_loss": train_avg_loss
        }
        self.log_dict(metrics)

        val_avg_loss = torch.stack([x["val_loss"]
                                    for x in self.validation_step_outputs]).mean()
        metrics = {
            "val_avg_loss": val_avg_loss
        }
        self.log_dict(metrics)

        print("\n" +
              (f'Epoch {self.current_epoch}, Avg. Training Loss: {train_avg_loss:.3f} ' +
               f'Avg. Validation Loss: {val_avg_loss:.3f}'), flush=True)

        self.training_step_outputs.clear()
        self.validation_step_outputs.clear()

    def test_step(self, batch, batch_idx) -> None:
        if not (self.train_model or self.train_target_cols):
            print('No train_model or train_target_cols available!')
            return

        outputs = self.train_model(**batch)

        metrics = {
            'test_loss': outputs.loss
        }
        self.log_dict(metrics, prog_bar=True)

        sentence_index, mask_token_index = (
            batch['labels'] != -100).nonzero(as_tuple=True)

        predicted_token_id = []
        for index, sentence in enumerate(sentence_index):
            if sentence >= len(predicted_token_id):
                predicted_token_id.append([])

            predicted_token_id[-1].append(self.tokenizer.decode(
                outputs.logits[sentence, mask_token_index[index]].argmax(axis=-1)))

        random_numbers = torch.randint(low=0, high=len(
            predicted_token_id), size=(int(len(batch['input_ids']) / 2),))

        original_token_id = self.tokenizer.batch_decode(batch['input_ids'])

        print('')
        for i in random_numbers:
            print(predicted_token_id[i], original_token_id[i])
            answers = batch['labels'][i]
            print(self.tokenizer.convert_ids_to_tokens(
                answers[answers != -100]))

    def predict_step(self, batch, batch_idx, dataloader_idx: int = 0):
        if not (self.predict_model or self.predict_target_cols):
            print('No predict_model or predict_target_cols available!')

        outputs = self.predict_model(**batch)
        # these are the sentence embedding vectors (768 dim each)
        pooler_outputs = outputs[1]
        outputs_concated = []
        for i in range(int(len(pooler_outputs) / len(self.predict_target_cols))):
            outputs_concated.append(torch.concat(list(
                pooler_outputs[i * len(self.predict_target_cols):(i + 1) * len(self.predict_target_cols)])))
            # Concatenating sentence embedding vectors from a job description

        return torch.stack(outputs_concated)

    def configure_optimizers(self):
        pass


In [17]:
# Inference

predict_target_cols = ['자격요건', '우대조건']

tokenizer = AutoTokenizer.from_pretrained('dkqp/AiVENGERS_BERT_FineTuned')
data_module = HFBertDataModule(
    tokenizer=tokenizer,
    max_batch_size=15,
    predict_target_cols=predict_target_cols,
    data_dir="../dataset/result_5.csv"
)

model = AutoModel.from_pretrained(
    'dkqp/AiVENGERS_BERT_FineTuned', torchscript=True)
task = HFBertTask(tokenizer=tokenizer, predict_model=model,
                  predict_target_cols=predict_target_cols)

trainer = pl.Trainer()

# this list contains tensors of each output of batch running
predicted_embedding_vectors = trainer.predict(task, datamodule=data_module)
concatenated_embedding_vectors = torch.concat(
    predicted_embedding_vectors, dim=-2)


Some weights of the model checkpoint at dkqp/AiVENGERS_BERT_FineTuned were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dkqp/AiVENGERS_BERT_FineTuned and are newly initialized: ['bert.pooler.dense.weight', 'bert.poo

In [10]:
# check the inference result

# number of sentence * embedding vector dim
concatenated_embedding_vectors.shape


torch.Size([1000, 3840])

In [5]:
# Training

train_target_cols = ['자격요건']

tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
data_module = HFBertDataModule(
    tokenizer=tokenizer,
    max_batch_size=4,
    train_target_cols=train_target_cols,
    train_test_ratio=0.9,
    train_val_ratio=0.5
)

model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
task = HFBertTask(tokenizer=tokenizer, train_model=model,
                  train_target_cols=train_target_cols)

trainer = pl.Trainer(
    max_epochs=1,
)

# this list contains tensors of each output of batch running
trainer.fit(task, datamodule=data_module)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Token indices sequence length is longer than the specified maximum sequence length for this model (595 > 512). Running this sequence through the model will result in indexing errors

  | Name        | Type

Epoch 0: 100%|██████████| 113/113 [02:50<00:00,  1.50s/it, v_num=40, train_loss=0.0565]
Epoch 0, Avg. Training Loss: 0.075 Avg. Validation Loss: 0.073
Epoch 0: 100%|██████████| 113/113 [03:40<00:00,  1.95s/it, v_num=40, train_loss=0.0565, val_loss=0.0721]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 113/113 [03:40<00:00,  1.95s/it, v_num=40, train_loss=0.0565, val_loss=0.0721]


In [6]:
# Testing

trainer.test(task, datamodule=data_module)

Testing DataLoader 0:   0%|          | 0/25 [00:00<?, ?it/s]
['##를', '애', '운영', '이상', '##D', '##의', '##의', '##험', '이', ',', '##능', ',', '경', '##험', '•', '##이고', '##적인', '##험', '##래', '##텍', '패', '##트', '있', '##신', '있', 'C', '##elo', '##per'] [CLS] • Flutter [MASK] 이용한 [MASK]플리케이션 [MASK] 경험 1년 [MASK] • Dart 언어에 능숙 • RE [MASK] [MASK] API 등 외부 API [MASK] 통신 경 [MASK] • Git 등의 버전 관리 시스템 [MASK]용 경험 [MASK] [MASK] [MASK] [MASK] [MASK] 3년 이상 • Google Play Store에 서비스 출시 혹은 운영한 경험 [MASK] 안정적 [MASK] 퍼포먼스 [MASK] 앱을 개발하기 위해 지속적으로 학습하시는 분 • 사용자 대상의 서비스 설계, 개발, 런칭 및 운영 경 [MASK]이 있으신 분 • Bloc / Provider 등의 상태관리 라이브러리 이해가 있으신 분 • MVVM, Clean Architecture 등 아 [MASK] [MASK]처 [MASK]턴 [MASK] 이해가 [MASK]으 [MASK] 분 • Firebase에 대한 이해도가 [MASK]으신 분 Azure [MASK]ytorch Python AWS Dev [MASK] [MASK] ML [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

[{'test_loss': 0.08654780685901642}]

In [19]:
# save hugging face model & tokenizer weights

hf_trainer = Trainer(
    model=model,
    tokenizer=tokenizer
)

hf_trainer.save_model('../models')