### How to prepare your data for inference task
1. make data.csv file in root/data/JobDescription/ folder
* data.csv has columns of JD sections
* data.csv has rows of JDs
* data.csv has cells of sentences
2. change 'target_cols' list before running
3. run

In [1]:
# Modules About Hydra
# from PIL import Image
from typing import List, Any
# from hydra import initialize, initialize_config_module, initialize_config_dir, compose
# from omegaconf import DictConfig

# Modules About Torch, Numpy
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
# from torchvision import datasets, transforms

# Modules About Pytorch Lightning
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
# from lightning.pytorch.loggers import WandbLogger, TensorBoardLogger
from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS, STEP_OUTPUT

# Modules About Hugging Face Transformers
from transformers import AutoTokenizer, AutoModel, BertForMaskedLM, Trainer

# Modules About Pandas, Matplotlib, Numpy
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

# Modules About Language Pre-processing
# import re
# from konlpy.tag import Mecab

# Others
import os
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
'''
Input
data: {'input_ids': [tensor1, tensor2, ...], 'token_type_ids': [tensor1, tensor2, ...], 'attention_mask': [tensor1, tensor2, ...]}
'''

class CustomDataset(Dataset):
    def __init__(self, data) -> None:
        super().__init__()

        self.data = data
        self.keys = list(data.keys())

    def __len__(self):
        return len(self.data[self.keys[0]])

    def __getitem__(self, index) -> Any:
        item = []
        for key in self.keys:
            item.append(self.data[key][index])

        return item

In [3]:
class HFBertDataModule(pl.LightningDataModule):
    def __init__(self, tokenizer, max_batch_size=64, data_dir='../data/', predict_target_cols=[], train_target_cols=[], train_test_ratio=0.9, train_val_ratio=0.8) -> None:
        super().__init__()
        self.predict_target_cols = predict_target_cols
        self.train_target_cols = train_target_cols
        self.data_dir = data_dir

        self.train_test_ratio = train_test_ratio
        self.train_val_ratio = train_val_ratio

        self.batch_size = max_batch_size
        if predict_target_cols:
            self.predict_batch_size = int(max_batch_size / len(predict_target_cols)) * len(predict_target_cols)

        # load Bert Tokenizer
        self.tokenizer = tokenizer

    def prepare_data(self) -> None:
        # load predict data
        try:
            self.predict_data_pd = pd.read_csv(self.data_dir + 'JobDescription/inference_data.csv')
        except:
            print('No inference data available!')

        if self.predict_data_pd is not None and self.predict_target_cols:
            # serialize columns
            predict_data_serialized = []
            for row in range(len(self.predict_data_pd)):
                for col in self.predict_target_cols:
                    predict_data_serialized.append(self.predict_data_pd.iloc[row][col])

            # make tokens
            self.predict_tokens = self.tokenizer(predict_data_serialized, return_tensors='pt', padding=True)

            # make predict dataset
            self.predict_dataset = CustomDataset(self.predict_tokens)
            self.predict_token_keys = self.predict_dataset.keys

    def setup(self, stage: str) -> None:
        # load train data
        try:
            self.train_data_pd = pd.read_csv(self.data_dir + 'JobDescription/training_data.csv')
        except:
            print('No training data available!')
            self.train_data_pd = None

        if self.train_data_pd is not None and self.train_target_cols:
            # serialize columns
            train_data_serialized = []
            for col in self.train_target_cols:
                train_data_serialized += list(self.train_data_pd[col])

            # make tokens
            self.train_tokens = self.tokenizer(train_data_serialized, return_tensors='pt', padding=True)

            # make train dataset
            train_dataset = CustomDataset(self.train_tokens)
            self.train_token_keys = train_dataset.keys

            # split train val test datasets
            self.train_dataset, self.val_dataset, self.test_dataset = random_split(
                train_dataset,
                [
                    self.train_test_ratio * self.train_val_ratio,
                    self.train_test_ratio * (1 - self.train_val_ratio),
                    1 - self.train_test_ratio
                ]
            )

    def _collate_fn_predict(self, batch):
        '''
        Inputs
        batch: [[tensor1_1, tensor1_2, tensor1_3], [tensor2_1, tensor2_2, tensor2_3], ...]
        self.predict_token_keys: ['input_ids', 'token_type_ids', 'attention_mask']

        Output
        dict_by_keys: {'input_ids': [tensor1_1, tensor2_1, ...], 'token_type_ids': [tensor1_2, tensor2_2, ...], 'attention_mask': [tensor1_3, tensor2_3, ...]}
        '''
        list_by_keys = list(zip(*batch))
        dict_by_keys = {}
        for i, key in enumerate(self.predict_token_keys):
            dict_by_keys[key] = torch.stack(list_by_keys[i])

        return dict_by_keys

    def _collate_fn_train(self, batch):
        list_by_keys = list(zip(*batch))
        dict_by_keys = {}
        for i, key in enumerate(self.train_token_keys):
            dict_by_keys[key] = torch.stack(list_by_keys[i])

        dict_by_keys['labels'] = dict_by_keys['input_ids'].clone()

        for tokens in dict_by_keys['input_ids']:
            token_len = 0
            for token in tokens:
                if token != self.tokenizer.pad_token_id:
                    token_len += 1
                    continue
                break
            tokens[np.random.randint(1, token_len - 2)] = self.tokenizer.mask_token_id

        dict_by_keys['labels'] = torch.where(dict_by_keys['input_ids'] == self.tokenizer.mask_token_id, dict_by_keys['labels'], -100)

        return dict_by_keys

    def train_dataloader(self) -> TRAIN_DATALOADERS:
        return DataLoader(self.train_dataset, batch_size=self.batch_size, collate_fn=self._collate_fn_train)

    def val_dataloader(self) -> EVAL_DATALOADERS:
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=self._collate_fn_train)

    def test_dataloader(self) -> EVAL_DATALOADERS:
        return DataLoader(self.test_dataset, batch_size=self.batch_size, collate_fn=self._collate_fn_train)

    def predict_dataloader(self) -> EVAL_DATALOADERS:
        return DataLoader(self.predict_dataset, batch_size=self.predict_batch_size, collate_fn=self._collate_fn_predict)

In [57]:
class HFBertTask(pl.LightningModule):
    def __init__(self, tokenizer, predict_model=None, train_model=None, predict_target_cols=[], train_target_cols=[]) -> None:
        super().__init__()
        self.predict_target_cols = predict_target_cols
        self.train_target_cols = train_target_cols

        self.tokenizer = tokenizer

        self.predict_model = predict_model
        self.train_model = train_model
        self.training_step_outputs = []
        self.validation_step_outputs = []

        self.acc_func = None

    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        if not (self.train_model or self.train_target_cols):
            print('No train_model or train_target_cols available!')
            return

        outputs = self.train_model(**batch)

        metrics = {
            'train_loss': outputs.loss
        }
        self.training_step_outputs.append(metrics)
        self.log_dict(metrics, prog_bar=True)

        return outputs.loss

    def validation_step(self, batch, batch_idx) -> STEP_OUTPUT | None:
        if not (self.train_model or self.train_target_cols):
            print('No train_model or train_target_cols available!')
            return

        outputs = self.train_model(**batch)

        metrics = {
            'val_loss': outputs.loss
        }
        self.validation_step_outputs.append(metrics)
        self.log_dict(metrics, prog_bar=True)

        return outputs.loss

    def on_validation_epoch_end(self):
        if not (self.training_step_outputs and self.validation_step_outputs):
            return

        train_avg_loss = torch.stack([x["train_loss"]
            for x in self.training_step_outputs]).mean()
        metrics = {
            "train_avg_loss": train_avg_loss
        }
        self.log_dict(metrics)

        val_avg_loss = torch.stack([x["val_loss"]
            for x in self.validation_step_outputs]).mean()
        metrics = {
            "val_avg_loss": val_avg_loss
        }
        self.log_dict(metrics)

        print("\n" +
              (f'Epoch {self.current_epoch}, Avg. Training Loss: {train_avg_loss:.3f} ' +
               f'Avg. Validation Loss: {val_avg_loss:.3f}'), flush=True)

        self.training_step_outputs.clear()
        self.validation_step_outputs.clear()

    def test_step(self, batch, batch_idx) -> None:
        if not (self.train_model or self.train_target_cols):
            print('No train_model or train_target_cols available!')
            return

        outputs = self.train_model(**batch)

        metrics = {
            'test_loss': outputs.loss
        }
        self.log_dict(metrics, prog_bar=True)

        sentence_index, mask_token_index = (batch['input_ids'] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)

        predicted_token_id = []
        for index, sentence in enumerate(sentence_index):
            if sentence >= len(predicted_token_id):
                predicted_token_id.append([])

            predicted_token_id[-1].append(self.tokenizer.decode(outputs.logits[sentence, mask_token_index[index]].argmax(axis=-1)))

        random_numbers = torch.randint(low=0, high=len(predicted_token_id), size=(20,))

        original_token_id = self.tokenizer.batch_decode(batch['input_ids'])

        for i in random_numbers:
            print(predicted_token_id[i], original_token_id[i])

    def predict_step(self, batch, batch_idx, dataloader_idx: int = 0):
        if not (self.predict_model or self.predict_target_cols):
            print('No predict_model or predict_target_cols available!')

        outputs = self.predict_model(**batch)
        pooler_outputs = outputs['pooler_output'] # these are the sentence embedding vectors (768 dim each)
        outputs_concated = []
        for i in range(int(len(pooler_outputs) / len(self.predict_target_cols))):
            outputs_concated.append(torch.concat(list(pooler_outputs[i * len(self.predict_target_cols):(i + 1) * len(self.predict_target_cols)])))
            # Concatenating sentence embedding vectors from a job description

        return torch.stack(outputs_concated)

    def configure_optimizers(self):
        pass

In [7]:
# Inference

predict_target_cols = ['sentences']

tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
data_module = HFBertDataModule(
    tokenizer=tokenizer,
    max_batch_size=64,
    predict_target_cols=predict_target_cols,
)

model = AutoModel.from_pretrained('bert-base-multilingual-cased')
task = HFBertTask(tokenizer=tokenizer, predict_model=model, predict_target_cols=predict_target_cols)

trainer = pl.Trainer()

predicted_embedding_vectors = trainer.predict(task, datamodule=data_module) # this list contains tensors of each output of batch running
concatenated_embedding_vectors = torch.concat(predicted_embedding_vectors, dim=-2)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /Use

Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  8.40it/s]


In [8]:
# check the inference result

concatenated_embedding_vectors.shape # number of sentence * embedding vector dim

torch.Size([8, 768])

In [58]:
# Training

train_target_cols = ['sentences']

tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
data_module = HFBertDataModule(
    tokenizer=tokenizer,
    max_batch_size=64,
    train_target_cols=train_target_cols,
    train_test_ratio=0.7,
    train_val_ratio=0.5
)

model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
task = HFBertTask(tokenizer=tokenizer, train_model=model, train_target_cols=train_target_cols)

trainer = pl.Trainer(
    max_epochs=10,
)

trainer.fit(task, datamodule=data_module) # this list contains tensors of each output of batch running


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /Users/tglim/Programmers_AI_dev/KDT-AiVENGERS/AIInfra/baselines/lightning_logs

  | Name        | Type            | Params
------------------------------------------------
0 | train_

Epoch 0: 100%|██████████| 1/1 [00:00<00:00,  5.48it/s, v_num=0, train_loss=0.0702]
Epoch 0, Avg. Training Loss: 0.070 Avg. Validation Loss: 0.053
Epoch 1: 100%|██████████| 1/1 [00:00<00:00,  5.94it/s, v_num=0, train_loss=0.0829, val_loss=0.0431]
Epoch 1, Avg. Training Loss: 0.083 Avg. Validation Loss: 0.008
Testing DataLoader 0:   0%|          | 0/1 [28:34<?, ?it/s]m=0, train_loss=0.0829, val_loss=0.00806]
Testing DataLoader 0:   0%|          | 0/1 [28:03<?, ?it/s]
Testing DataLoader 0:   0%|          | 0/1 [27:14<?, ?it/s]
Testing DataLoader 0:   0%|          | 0/1 [09:26<?, ?it/s]
Epoch 2: 100%|██████████| 1/1 [00:00<00:00,  5.22it/s, v_num=0, train_loss=0.0665, val_loss=0.00806]
Epoch 2, Avg. Training Loss: 0.067 Avg. Validation Loss: 0.064
Epoch 3: 100%|██████████| 1/1 [00:00<00:00,  6.00it/s, v_num=0, train_loss=0.125, val_loss=0.0638]  
Epoch 3, Avg. Training Loss: 0.125 Avg. Validation Loss: 0.076
Epoch 4: 100%|██████████| 1/1 [00:00<00:00,  5.84it/s, v_num=0, train_loss=0.0752,

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 1/1 [00:00<00:00,  1.19it/s, v_num=0, train_loss=0.0788, val_loss=0.0448]


In [59]:
# Testing

trainer.test(task, datamodule=data_module)

Testing DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]['-'] [CLS] 2. 여러분은 각 문장이 어떻게 Embedding [MASK] 로 변환되는지 test 해보실 수 있으십니다. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
['.'] [CLS] 5 [MASK] Example sentence 는 5개면 충분하겠지요? 너무 예시가 많아도 불편하니까요. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
['-'] [CLS] 2. 여러분은 각 문장이 어떻게 Embedding [MASK] 로 변환되는지 test 해보실 수 있으십니다. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
['.'] [CLS] 5 [MASK] Example sentence 는 5개면 충분하겠지요? 너무 예시가 많아도 불편하니까요. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
['.'] [CLS] 5 [MASK] Example sentence 는 5개면 충분하겠지요? 너무 예시가 많아도 불편하니까요. [SEP] 

[{'test_loss': 0.10760138183832169}]

In [19]:
# save hugging face model & tokenizer weights

hf_trainer = Trainer(
    model=model,
    tokenizer=tokenizer
)

hf_trainer.save_model('../models')