In [3]:
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import AutoModelForTokenClassification, BertForTokenClassification
from functools import partial
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

from transformers import DataCollatorForTokenClassification
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from transformers.modeling_outputs import TokenClassifierOutput
from torch.nn.modules.loss import CrossEntropyLoss


# scalar weight
# load model checkpoint



# load kkangtong

# tokenizing

# extract hidden state
# add weight with linear layer

# pos tagging
# data = load_dataset("eriktks/conll2003")

# dependency
class My_model(nn.Module):
    def __init__(self, model, device, num_layer, hidden_dim, num_labels):
        super(My_model, self).__init__()
        self.model = model
        self.scalar = nn.Linear(in_features=num_layer, out_features=1, bias = False)
        self.dropout = nn.Dropout(p = 0.1)
        self.output = nn.Linear(in_features=hidden_dim, out_features=num_labels, bias = True)
        self.num_labels = num_labels

    def forward(self, input_ids, attention_mask, labels):
        inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        out = self.model(**inputs).hidden_states
        out = torch.stack(out)
        out = out.permute(1,2,3,0).contiguous() # layers, batch, sequence, hidden_dimension -> batch, sequence, hidden_dimension, layers
        out = self.scalar(out).squeeze() # batch, sequence, hidden_dimension, layers -> batch, sequence, hidden_dimension
        logits = self.output(self.dropout(out)) # batch, sequence, hidden_dimension -> batch, sequence, labels

        # from BertForTokenClassification
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # if not return_dict:
        #     output = (logits,) + outputs[2:]
        #     return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None
            # attentions=outputs.attentions,
        )
    


class FineTuningTrainer():
    def __init__(self, data, tokenizer, num_labels, metrics, device):
        self.data = data
        self.tokenizer = tokenizer
        self.tokenized_dataset = self.data.map(self._tokenize_and_align_labels, batched = True)
        self.num_labels = num_labels
        self.metrics = metrics
        self.collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
        self.device = device
    
    def train(self, *, checkpoint, types):
        print(f"{types} training starts")
            
        train_dataset = self.tokenized_dataset['train']
        eval_dataset = self.tokenized_dataset['test']
        model = AutoModel.from_pretrained(checkpoint, output_hidden_states = True)

        for param in model.parameters():
            param.requires_grad = False
        my_model = My_model(model, self.device, 13, 768, self.num_labels)
        my_model.to(self.device)
        
        my_model1 = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels = self.num_labels)
    
        # 이거 나중에 직접 구현해 보기

        
        training_args = TrainingArguments(output_dir = f"finetuning/{types}",
                                          eval_strategy="steps",
                                          eval_steps=10,
                                          save_steps=10,
                                          save_total_limit=1,
                                          load_best_model_at_end=True,
                                          save_only_model=True,
                                          greater_is_better=True,
                                          num_train_epochs=20,
                                          seed=42,
                                          per_device_train_batch_size=1024,
                                          per_device_eval_batch_size=1024,
                                          learning_rate=1e-6,
                                          weight_decay=0.0001,
                                          metric_for_best_model="f1"
                                          )

        trainer = Trainer(model = my_model,
                        args=training_args,
                        train_dataset=train_dataset,
                        eval_dataset=eval_dataset,
                        compute_metrics=self._compute_metrics,
                        callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
                        )
        
        trainer.train()
        print(f"{types} training ends")

    def _tokenize_fn(self, examples):
        return self.tokenizer(examples['text'], padding = 'max_length', truncation = True, max_length = 128)


    def _tokenize_and_align_labels(self, examples):
        tokenized_inputs = self.tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding = "max_length", max_length = 128, return_tensors = "pt")

        labels = []
        for i, label in enumerate(examples[f"upos"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:  # Set the special tokens to -100.
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx
            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs


    # def _compute_metrics(self, pred):
    #     logits, labels = pred
    #     predictions = np.argmax(logits, axis = 1)

    #     return self.metrics.compute(predictions = predictions, references = labels)
    
    def _compute_metrics(self, pred):

        # import numpy as np
        # labels = [label_list[i] for i in example[f"ner_tags"]]

        predictions, labels = pred
        predictions = np.argmax(predictions, axis=2)

        true_predictions = [
            [p for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [l for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_predictions = [j for i in true_predictions for j in i]
        true_labels = [j for i in true_labels for j in i]

        results = self.metrics.compute(predictions=true_predictions, references=true_labels, average="micro")
        return results
    

# def main(*, checkpoint):
def main(*, data, tokenizer, metrics, device):

    # dencies/universal_dependencies", "en_ewt")
    # tokenizer = AutoTokenizer.from_pretrained(base_model)
    # metrics= evaluate.load("accuracy")

    my_model_dependency = "checkpoint_output_dependency/checkpoint-dependency-high/"
    my_model_random = "checkpoint_output_random/checkpoint-random-high/"
    base_model = "google-bert/bert-base-uncased"

    num_labels_dependency = 18
    fineTuningTrainer = FineTuningTrainer(data, tokenizer, num_labels_dependency, metrics, device)
    fineTuningTrainer.train(checkpoint = my_model_dependency, types="dependency")
    fineTuningTrainer.train(checkpoint = my_model_random, types="random")
    fineTuningTrainer.train(checkpoint = base_model, types="base")






if __name__ == "__main__":
#loaded
    # glob
    base_model = "google-bert/bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    metrics= evaluate.load("f1")

    # in func
    data = load_dataset("universal-dependencies/universal_dependencies", "en_ewt")
    data1 = data.remove_columns(['text', 'xpos',  'feats', 'head', 'deprel', 'deps', 'misc', 'lemmas'])


    device = "cuda" if torch.cuda.is_available() else "cpu"
    out = main(data=data1, tokenizer=tokenizer, metrics=metrics, device= device)
    # out = main(checkpoint=my_model_dependency)



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Map:   0%|          | 0/12543 [00:00<?, ? examples/s]

Map:   0%|          | 0/2002 [00:00<?, ? examples/s]

Map:   0%|          | 0/2077 [00:00<?, ? examples/s]

dependency training starts


Some weights of BertModel were not initialized from the model checkpoint at checkpoint_output_dependency/checkpoint-dependency-high/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at checkpoint_output_dependency/checkpoint-dependency-high/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 3.10.0, which is below the recommended

Step,Training Loss,Validation Loss,F1
5,No log,3.010121,0.062684
10,No log,3.008893,0.062881
15,No log,3.007773,0.063078


KeyboardInterrupt: 