In [6]:
#https://huggingface.co/learn/nlp-course/chapter7/3?fw=pt
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from utils import count_parameters, print_params_with_requires_grad, get_params_dict
import torch
from datasets import load_dataset


In [7]:
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
print_params_with_requires_grad(model)


bert.embeddings.word_embeddings.weight True
bert.embeddings.position_embeddings.weight True
bert.embeddings.token_type_embeddings.weight True
bert.embeddings.LayerNorm.weight True
bert.embeddings.LayerNorm.bias True
bert.encoder.layer.0.attention.self.query.weight True
bert.encoder.layer.0.attention.self.query.bias True
bert.encoder.layer.0.attention.self.key.weight True
bert.encoder.layer.0.attention.self.key.bias True
bert.encoder.layer.0.attention.self.value.weight True
bert.encoder.layer.0.attention.self.value.bias True
bert.encoder.layer.0.attention.output.dense.weight True
bert.encoder.layer.0.attention.output.dense.bias True
bert.encoder.layer.0.attention.output.LayerNorm.weight True
bert.encoder.layer.0.attention.output.LayerNorm.bias True
bert.encoder.layer.0.intermediate.dense.weight True
bert.encoder.layer.0.intermediate.dense.bias True
bert.encoder.layer.0.output.dense.weight True
bert.encoder.layer.0.output.dense.bias True
bert.encoder.layer.0.output.LayerNorm.weight True


In [9]:
text = "This is a great [MASK]."
inputs = tokenizer(text, return_tensors="pt")
print(inputs)
print(len(inputs["input_ids"][0]))
outputs = model(**inputs)
token_logits = model(**inputs).logits
print(token_logits.shape)

# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
print(top_5_tokens)
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
    

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 2307,  103, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
8
torch.Size([1, 8, 30522])
[2801, 2154, 2173, 2051, 2518]
'>>> This is a great idea.'
'>>> This is a great day.'
'>>> This is a great place.'
'>>> This is a great time.'
'>>> This is a great thing.'


In [10]:
imdb_dataset = load_dataset("imdb")
imdb_dataset
"""
    DatasetDict({
        train: Dataset({
            features: ['text', 'label'],
            num_rows: 25000
        })
        test: Dataset({
            features: ['text', 'label'],
            num_rows: 25000
        })
        unsupervised: Dataset({
            features: ['text', 'label'],
            num_rows: 50000
        })
    })
"""

"\n    DatasetDict({\n        train: Dataset({\n            features: ['text', 'label'],\n            num_rows: 25000\n        })\n        test: Dataset({\n            features: ['text', 'label'],\n            num_rows: 25000\n        })\n        unsupervised: Dataset({\n            features: ['text', 'label'],\n            num_rows: 50000\n        })\n    })\n"

In [11]:
sample = imdb_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...'
'>>> Label: 1'

'>>> Review: This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stu

In [12]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))] # 단어 index 저장
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
    

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 25000/25000 [00:07<00:00, 3402.62 examples/s]


In [13]:
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")


'>>> Review 0 length: 363'
'>>> Review 1 length: 304'
'>>> Review 2 length: 133'


In [14]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 800'


In [15]:
chunk_size = 128
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 32'


In [16]:
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column : 정답으로 사용할 (masking 하지 않는) 원본 sentence.
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map: 100%|██████████| 25000/25000 [01:25<00:00, 293.15 examples/s]
Map:  48%|████▊     | 12000/25000 [00:38<00:41, 314.34 examples/s]


KeyboardInterrupt: 

In [None]:
print(tokenizer.decode(lm_datasets["train"][0]["labels"]))
print(tokenizer.decode(lm_datasets["train"][0]["input_ids"]))
print(lm_datasets["train"][0]["token_type_ids"])
print(lm_datasets["train"][0]["attention_mask"])
print(lm_datasets["train"][0]["word_ids"])

[CLS] i rented i am curious - yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u. s. customs if it ever tried to enter this country, therefore being a fan of films considered " controversial " i really had to see this for myself. < br / > < br / > the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life. in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such
[CLS] i rented i am curious - yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u. s. customs if it ever tried to enter this country, therefore being a fan of films considered " controversial " i really had to see this for myself. < br / > < br / > the plot is cent

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

"""
	1.	input_ids: 원래의 텍스트 시퀀스에서 일부 토큰이 [MASK] 토큰으로 대체된 시퀀스입니다.
	2.  labels: 원래의 텍스트 시퀀스를 나타내며, 모델이 예측해야 할 목표(target)입니다. [MASK] 토큰이 아닌 위치는 -100으로 설정되어 손실 계산에서 무시됩니다.
"""

'\n\t1.\tinput_ids: 원래의 텍스트 시퀀스에서 일부 토큰이 [MASK] 토큰으로 대체된 시퀀스입니다.\n\t2.  labels: 원래의 텍스트 시퀀스를 나타내며, 모델이 예측해야 할 목표(target)입니다. [MASK] 토큰이 아닌 위치는 -100으로 설정되어 손실 계산에서 무시됩니다.\n'

In [None]:
samples = [lm_datasets["train"][i] for i in range(2)]
print(samples)
for sample in samples:
    _ = sample.pop("word_ids")
# print(data_collator(samples)['input_ids'])
for chunk in data_collator(samples)["input_ids"]:
    print(chunk)
    print(len(chunk))
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

NameError: name 'lm_datasets' is not defined

In [None]:
import torch.nn.functional as F

text = ["This is a great [MASK].", "[MASK] is happy."]
inputs = tokenizer(text, return_tensors="pt", padding= True)
outputs = model(**inputs)[0]
logits = outputs[:,-1, [8699, 4963, 12721, 3571, 6569, 12039, 4474]]
probs = F.log_softmax(logits, -1)
print(probs)
# tokenizer.decode([8699, 4963, 12721, 3571, 6569, 12039, 4474])
tokenizer.decode(torch.tensor([2748]))

tensor([[-3.1803, -2.0598, -2.1803, -1.3638, -1.3523, -2.5335, -2.0857],
        [-4.5211, -2.1576, -4.8789, -2.1684, -0.6226, -2.4729, -2.0351]],
       grad_fn=<LogSoftmaxBackward0>)


'yes'