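### How to prepare your data for the inference task
1. Create a `data.csv` file in the `root/data/JobDescription/` folder.
* Each column of `data.csv` is one section of a job description (JD).
* Each row of `data.csv` is one JD.
* Each cell of `data.csv` contains the sentence text of that section for that JD.
2. Update the `target_cols` list with the column names to embed before running.
3. Run all cells in order.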

In [1]:
# Modules About Hydra

# from PIL import Image
from typing import List, Any
# from hydra import initialize, initialize_config_module, initialize_config_dir, compose
# from omegaconf import DictConfig

# Modules About Torch, Numpy
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
# from torchvision import datasets, transforms

# Modules About Pytorch Lightning
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
# from lightning.pytorch.loggers import WandbLogger, TensorBoardLogger
from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS, STEP_OUTPUT

# Modules About Hugging Face Transformers
from transformers import AutoTokenizer, AutoModel

# Modules About Pandas, Matplotlib, Numpy
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

# Modules About Language Pre-processing
# import re
# from konlpy.tag import Mecab

# Others
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CustomDatasetForInference(Dataset):
    '''
    Dataset that serves index-aligned items from a mapping of sequences.

    Input
    data: {'input_ids': [tensor1, tensor2, ...], 'token_type_ids': [tensor1, tensor2, ...], 'attention_mask': [tensor1, tensor2, ...]}

    Item
    One entry per key, in key order: [data[key_1][index], data[key_2][index], ...]
    '''

    def __init__(self, data) -> None:
        super().__init__()

        self.data = data
        # Freeze the key order once so every item lists its fields in the same order.
        self.keys = list(data.keys())

    def __len__(self):
        # The length is taken from the first field only.
        first_key = self.keys[0]
        return len(self.data[first_key])

    def __getitem__(self, index) -> Any:
        return [self.data[field][index] for field in self.keys]

In [3]:
class HFBertDataModule(pl.LightningDataModule):
    '''
    Data module that tokenizes job-description sentences for prediction.

    Reads 'JobDescription/data.csv' under data_dir, flattens the cells of
    target_cols row by row, tokenizes them with the Hugging Face tokenizer of
    pretrained_model_name and serves them through predict_dataloader().

    Args
    target_cols: names of the CSV columns to embed; must not be empty
    pretrained_model_name: name or path passed to AutoTokenizer.from_pretrained
    max_batch_size: upper bound for the batch size (see batch_size below)
    data_dir: directory that contains 'JobDescription/data.csv'

    Raises
    ValueError: if target_cols is empty
    '''

    def __init__(self, target_cols, pretrained_model_name, max_batch_size=64, data_dir='../data/') -> None:
        super().__init__()
        n_cols = len(target_cols)
        if n_cols == 0:
            raise ValueError('target_cols must contain at least one column name')

        self.target_cols = target_cols
        self.data_dir = data_dir

        # The batch size is kept a multiple of len(target_cols) so that all
        # sentences of one CSV row land in the same batch; HFBertTask.predict_step
        # regroups every len(target_cols) consecutive embeddings into one row.
        # It is at least one full row, even if that exceeds max_batch_size,
        # because a batch size of 0 is rejected by DataLoader.
        self.batch_size = max(int(max_batch_size / n_cols), 1) * n_cols

        # load Bert Tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

        # Populated by _build_predict_dataset().
        self.predict_data_pd = None
        self.tokens = None
        self.predict_dataset = None
        self.predict_token_keys = None

    def prepare_data(self) -> None:
        '''
        Intentionally assigns no state.

        Lightning recommends not assigning state in prepare_data(); the
        dataset is built in setup() instead.
        '''

    def setup(self, stage: str | None = None) -> None:
        '''Build the predict dataset once; later calls are no-ops.'''
        if self.predict_dataset is None:
            self._build_predict_dataset()

    def _build_predict_dataset(self) -> None:
        '''
        Read the CSV, serialize the target columns, tokenize them and wrap the
        tokens in a CustomDatasetForInference.

        Raises
        ValueError: if any cell of the target columns is missing
        '''
        from pathlib import Path  # local import: only this helper needs it

        # Path joining works whether or not data_dir ends with a separator.
        csv_path = Path(self.data_dir) / 'JobDescription' / 'data.csv'
        self.predict_data_pd = pd.read_csv(csv_path)

        selected = self.predict_data_pd[list(self.target_cols)]
        if selected.isna().any().any():
            raise ValueError(f'{csv_path} has missing values in the target columns {list(self.target_cols)}')

        # serialize columns in row-major order: row0/col0, row0/col1, ..., row1/col0, ...
        predict_data_serialized = selected.to_numpy().reshape(-1).tolist()

        # make tokens; truncation keeps long sentences within the model's maximum input length
        self.tokens = self.tokenizer(predict_data_serialized, return_tensors='pt', padding=True, truncation=True)

        # make predict dataset
        self.predict_dataset = CustomDatasetForInference(self.tokens)
        self.predict_token_keys = self.predict_dataset.keys

    def _collate_fn_predict(self, batch):
        '''
        Inputs
        batch: [[tensor1_1, tensor1_2, tensor1_3], [tensor2_1, tensor2_2, tensor2_3], ...]
        self.predict_token_keys: ['input_ids', 'token_type_ids', 'attention_mask']

        Output
        dict_by_keys: {'input_ids': stack of [tensor1_1, tensor2_1, ...], 'token_type_ids': stack of [tensor1_2, tensor2_2, ...], 'attention_mask': stack of [tensor1_3, tensor2_3, ...]}
        '''
        # zip(*batch) transposes the per-item lists into one tuple per key.
        return {
            key: torch.stack(tensors)
            for key, tensors in zip(self.predict_token_keys, zip(*batch), strict=True)
        }

    def train_dataloader(self) -> TRAIN_DATALOADERS:
        '''Not used: this data module only supports prediction.'''
        pass

    def val_dataloader(self) -> EVAL_DATALOADERS:
        '''Not used: this data module only supports prediction.'''
        pass

    def test_dataloader(self) -> EVAL_DATALOADERS:
        '''Not used: this data module only supports prediction.'''
        pass

    def predict_dataloader(self) -> EVAL_DATALOADERS:
        '''Return the prediction DataLoader, building the dataset first if needed.'''
        # Fallback for callers that use the data module without going through setup().
        if self.predict_dataset is None:
            self._build_predict_dataset()
        return DataLoader(self.predict_dataset, batch_size=self.batch_size, collate_fn=self._collate_fn_predict)

In [4]:
class HFBertModel(pl.LightningModule):
    '''
    Wrapper around a pretrained Hugging Face encoder loaded with AutoModel.

    Args
    pretrained_model_name: name or path passed to AutoModel.from_pretrained
    config: optional configuration object; stored on the instance and not
            otherwise used by this class
    '''

    def __init__(self, pretrained_model_name, config=None) -> None:
        super().__init__()
        self.save_hyperparameters()
        self.config = config

        # No loss is configured here; assign a callable taking (output, y)
        # before passing targets to forward().
        self.loss_func = None

        self.model = AutoModel.from_pretrained(pretrained_model_name, output_hidden_states=False)

    def forward(self, x, y=None) -> Any:
        '''
        Run the encoder on a batch of tokenizer outputs.

        Inputs
        x: mapping of keyword arguments for the encoder, e.g. {'input_ids': ..., 'attention_mask': ...}
        y: optional targets; when given, the loss is computed with self.loss_func

        Output
        output when y is None, otherwise (loss, output)

        Raises
        TypeError: if y is given while self.loss_func is None
        '''
        if y is not None and self.loss_func is None:
            # Fail before running the encoder, with a message that names the cause.
            raise TypeError('HFBertModel.loss_func is None; assign a loss function before calling forward() with targets')

        output = self.model(**x)  # put arguments by **kwargs method

        if y is None:
            return output

        loss = self.loss_func(output, y)
        return loss, output


In [5]:
class HFBertTask(pl.LightningModule):
    '''
    Prediction task that turns sentence embeddings into one vector per job description.

    Args
    model: module called as model(batch) whose result exposes 'pooler_output'
    n_target_cols: number of consecutive sentences that belong to one job
                   description; must be at least 1

    Raises
    ValueError: if n_target_cols is smaller than 1
    '''

    def __init__(self, model, n_target_cols) -> None:
        super().__init__()
        if n_target_cols < 1:
            raise ValueError('n_target_cols must be at least 1')
        self.n_target_cols = n_target_cols

        self.model = model
        self.training_step_outputs = []
        self.validation_step_outputs = []

        self.acc_func = None

    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        '''Not implemented: this task is used for prediction only.'''
        pass

    def validation_step(self, batch, batch_idx) -> STEP_OUTPUT | None:
        '''Not implemented: this task is used for prediction only.'''
        pass

    def test_step(self, batch, batch_idx) -> STEP_OUTPUT | None:
        '''Not implemented: this task is used for prediction only.'''
        pass

    def predict_step(self, batch, batch_idx, dataloader_idx: int = 0):
        '''
        Embed a batch and concatenate the embeddings of each job description.

        Inputs
        batch: mapping of tokenizer outputs passed straight to self.model

        Output
        tensor with one row per complete group of n_target_cols consecutive
        embeddings; each row is those embeddings concatenated in order
        '''
        outputs = self.model(batch)
        pooler_outputs = outputs['pooler_output'] # these are the sentence embedding vectors (768 dim each)

        group_size = self.n_target_cols
        n_groups = len(pooler_outputs) // group_size
        per_item_shape = pooler_outputs.shape[1:]

        # Concatenating sentence embedding vectors from a job description:
        # merging each run of group_size rows into one row is the same as
        # concatenating them one by one. Trailing embeddings that do not fill a
        # complete group are dropped, and a batch without any complete group
        # yields an empty tensor.
        complete = pooler_outputs[:n_groups * group_size]
        return complete.reshape(n_groups, group_size * per_item_shape[0], *per_item_shape[1:])

    def configure_optimizers(self):
        '''Not implemented: no optimizer is needed for prediction.'''
        pass

In [6]:
# Columns of data.csv to embed; edit this list before running.
target_cols = ['sentences']

# The tokenizer (data module) and the encoder (model) are loaded from the same checkpoint.
data_module = HFBertDataModule(target_cols, 'bert-base-multilingual-cased', max_batch_size=64)
model = HFBertModel(pretrained_model_name='bert-base-multilingual-cased')
task = HFBertTask(model=model, n_target_cols=len(target_cols))

trainer = pl.Trainer()

# trainer.predict returns one tensor per batch; join them along the row axis.
predicted_embedding_vectors = trainer.predict(task, datamodule=data_module)
concatenated_embedding_vectors = torch.cat(predicted_embedding_vectors, dim=-2)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  8.63it/s]


In [7]:
concatenated_embedding_vectors.shape # (number of job descriptions, len(target_cols) * embedding vector dim)

torch.Size([8, 768])