# MLM Tester
* Model 선언
* Load from folder
* Data load
* Inference


In [2]:
import logging
import math
import os
import sys
from dataclasses import dataclass, field
from datetime import datetime
from itertools import chain
from typing import Optional

## custom load
import bootstrap
import datasets
import evaluate
# custom config load
import torch
import transformers
from datasets import load_dataset
from ingt_tokenizer import IngtTokenizer
from transformers import (CONFIG_MAPPING, MODEL_FOR_MASKED_LM_MAPPING,
                          AutoConfig, AutoModelForMaskedLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, HfArgumentParser,
                          Trainer, TrainingArguments, is_torch_tpu_available,
                          set_seed)
from transformers.trainer_utils import get_last_checkpoint
# from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils import send_example_telemetry
from transformers.utils.versions import require_version

## custom load

VOCAB_CONFIG = "ingr_title"  # 'ingr_only' or 'ingr_title' (ing_title -> memory error)
# VOCAB_CONFIG    =   'ingr_title' # 'ingr_only' or 'ingr_title'
CONFIG_PATH = "/home/donghee/projects/mlm/config.json"

ingt_config = bootstrap.IngTConfig(vocab=VOCAB_CONFIG, path=CONFIG_PATH)

In [3]:
logger = logging.getLogger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

In [4]:
# class args:
#     data_folder = '/media/ssd/dh/projects/ing_mlm/test_output'
#     model_name_or_path = '/media/ssd/dh/projects/ing_mlm/test_output/checkpoint-5500'
# class args:
#     data_folder = '/media/ssd/dh/projects/ing_mlm/test_output_2023-03-06-20-47'
#     model_name_or_path = '/media/ssd/dh/projects/ing_mlm/checkpoints/v1-ing-only_2023-03-06-20-51/checkpoint-56000'

class args:
    data_folder = '/disk1/data/ing_mlm_data/processed/v2_ing_title_sample/test.txt'
    model_name_or_path = '/home/donghee/projects/mlm/checkpoints/v2-ing-title_2023-08-03-12-02/checkpoint-60000'

In [5]:
args.data_folder

'/disk1/data/ing_mlm_data/processed/v2_ing_title_sample/test.txt'

In [6]:
ingt_tokenizer = IngtTokenizer(ingt_config)
ingt_tokenizer.load()
tokenizer = ingt_tokenizer.tokenizer

In [7]:
config = AutoConfig.from_pretrained(args.model_name_or_path)
config

BertConfig {
  "_name_or_path": "/home/donghee/projects/mlm/checkpoints/v2-ing-title_2023-08-03-12-02/checkpoint-60000",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 3,
  "num_hidden_layers": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 171490
}

In [8]:
model = AutoModelForMaskedLM.from_pretrained(
            args.model_name_or_path,
            config=config,
        )

In [11]:
# inputs = tokenizer("lentils salt_and_black_pepper vegetable_broth white_pepper chili_powder extra_virgin_olive_oil garlic_powder celery carrots tomatoes onions water", return_tensors="pt")

# test -> 
# completion 맨마지막을 전부 [MASK] -> answer : water
# ideation ing1 ing2 [MASK] ing3 [MASK] ing4 ing5 -> -100 -100 ing100 -100 ing200

inputs = tokenizer("[MASK] whole_kernel_corn cheddar_cheese milk water butter[SEP]Crunchy Onion Potato Bake", return_tensors="pt")
outputs = model(**inputs )
# outputs = model(**inputs,output_attentions=True )
# outputs = model(**inputs, output_hidden_states=True)
# outputs
tokenizer.convert_ids_to_tokens(torch.argmax(outputs.logits, dim=2).reshape(-1))


['bisquick',
 'whole_kernel_corn',
 'cheddar_cheese',
 'milk',
 'water',
 'butter',
 'potatoes',
 'crunchy',
 'onion',
 'potato',
 'sour_cream']

In [12]:
outputs.logits.shape

torch.Size([1, 11, 171490])

In [13]:
outputs.logits[0]

tensor([[-5.6294, -6.9249, -5.4099,  ..., -5.8047, -6.7900, -5.0061],
        [-6.4214, -5.4866, -6.5880,  ..., -5.7604, -5.1726, -6.4609],
        [-6.0903, -7.2667, -4.9855,  ..., -6.7639, -6.0559, -5.2878],
        ...,
        [-5.5508, -5.0793, -5.7811,  ..., -4.6969, -6.7108, -6.2666],
        [-5.4562, -4.7530, -5.3175,  ..., -5.0579, -7.3074, -5.0504],
        [-7.6900, -8.9001, -7.0929,  ..., -7.8584, -8.0195, -7.1072]],
       grad_fn=<SelectBackward0>)

In [14]:
torch.argmax(outputs.logits, dim=2).reshape(-1)

tensor([3152, 2345,  732,  295,  256,  202,  498, 3408,  218,  712,  458])

In [15]:
torch.argmax(outputs.logits[0][0])

tensor(3152)

In [16]:
tokenizer.convert_ids_to_tokens(torch.topk(outputs.logits[0][4], 20).indices)

['water',
 'eggs',
 'flour',
 'onion',
 'onions',
 'butter',
 'sugar',
 'garlic_cloves',
 'olive_oil',
 'boiling_water',
 'warm_water',
 'hot_water',
 'milk',
 'pepper',
 'chicken_broth',
 'celery',
 'egg',
 'green_onions',
 'garlic',
 'garlic_powder']

['water', 'olive_oil', 'water', 'water', 'water', 'water', 'water', 'water']