In [4]:
from datasets import load_dataset

# Directory that holds the chABSA splits; each split is stored as "<split>.csv".
chABSA_data_dir = "/root/workspace/my_reserch/data/chABSA-dataset-tsv"

# Map every split name to a one-element list containing the path of its CSV file.
dataset_files = {
    split: [f"{chABSA_data_dir}/{split}.csv"]
    for split in ("train", "validation", "test")
}

# Load all splits with the generic "csv" builder, keyed by the names above.
chABSA_dataset = load_dataset("csv", data_files=dataset_files)

Using custom data configuration default-a60b5795f4a82772
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-a60b5795f4a82772/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
import pprint
# Print the DatasetDict summary: split names, feature names and row counts.
print(chABSA_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1688
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 562
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 563
    })
})


In [8]:
import torch.nn as nn
from transformers import AutoModel
from transformers.modeling_outputs import ModelOutput

class RobertaWithLstmForClassification(nn.Module):
    """Classifier that stacks an LSTM and a linear head on a given encoder.

    The encoder's ``last_hidden_state`` is fed through a single-layer LSTM
    (``batch_first=True``); the LSTM output at the last sequence position is
    projected to ``num_categories`` logits.

    NOTE(review): the last sequence position (index -1) is used regardless of
    ``attention_mask``; for padded batches that position may be a padding
    token - confirm this is intended.
    """

    def __init__(self, pretrained_model, num_categories, loss_function=None):
        """Wire the encoder, LSTM and linear head together.

        Args:
            pretrained_model: encoder module; must expose ``config.hidden_size``.
            num_categories: number of output logits.
            loss_function: optional callable ``(logits, labels) -> loss``.
        """
        super().__init__()
        self.robert = pretrained_model
        self.hidden_size = self.robert.config.hidden_size
        # LSTM keeps the encoder width so the head sees `hidden_size` features.
        self.lstm = nn.LSTM(
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            batch_first=True,
        )
        self.linear = nn.Linear(
            in_features=self.hidden_size,
            out_features=num_categories,
        )
        self.loss_function = loss_function

    def forward(
        self,
        input_ids,
        attention_mask=None,
        position_ids=None,
        token_type_ids=None,
        output_attentions=False,
        output_hidden_states=False,
        labels=None,
    ):
        """Run encoder -> LSTM -> linear head.

        Args:
            input_ids: passed positionally to the encoder.
            attention_mask, position_ids, token_type_ids: forwarded to the encoder.
            output_attentions: forwarded to the encoder; when true the encoder's
                ``attentions`` are included in the result.
            output_hidden_states: forwarded to the encoder; when true the
                encoder's ``hidden_states`` are included in the result.
            labels: optional targets handed to ``loss_function``.

        Returns:
            ModelOutput with ``logits``, ``loss``, ``last_hidden_state``,
            ``attentions`` and ``hidden_states``. ``loss`` is ``None`` unless
            both ``labels`` and a loss function are available.
        """
        encoder_out = self.robert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        # LSTM starts from its default (zero) initial state.
        lstm_out, _ = self.lstm(encoder_out.last_hidden_state)
        logits = self.linear(lstm_out[:, -1, :])

        can_score = labels is not None and self.loss_function is not None
        loss = self.loss_function(logits, labels) if can_score else None

        return ModelOutput(
            logits=logits,
            loss=loss,
            last_hidden_state=encoder_out.last_hidden_state,
            attentions=encoder_out.attentions if output_attentions else None,
            hidden_states=encoder_out.hidden_states if output_hidden_states else None,
        )

In [9]:
from torch import nn
from transformers import RobertaPreTrainedModel, RobertaModel
from transformers.modeling_outputs import SequenceClassifierOutput

class CustomModelForSequenceClassification(RobertaPreTrainedModel):
    """RoBERTa encoder followed by an LSTM and a linear classification head.

    The encoder's last hidden states are passed through a single-layer LSTM
    (``batch_first=True``); the LSTM output at the last sequence position is
    projected to ``config.num_labels`` logits.

    NOTE(review): the last sequence position (index -1) is used regardless of
    ``attention_mask``; for padded batches that position may be a padding
    token - confirm this is intended.
    """

    def __init__(self, config):
        """Build the encoder, the LSTM and the classification head.

        Args:
            config: model configuration; ``hidden_size`` and ``num_labels``
                are read from it.
        """
        super().__init__(config)
        # Take the label count from the config so the class does not depend on
        # a notebook-level `num_labels` variable being defined beforehand and
        # always agrees with the size of the classifier layer below.
        self.num_labels = config.num_labels
        self.config = config

        # Encoder built with RobertaModel's default arguments; only its
        # `last_hidden_state` is consumed in `forward`.
        self.roberta = RobertaModel(config)

        # LSTM over the encoder's token representations (width unchanged).
        self.lstm = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)

        # Classification head: hidden_size -> num_labels.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing.
        # (Original note: the body receives the pretrained weights, the head
        # is randomly initialized.)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, source=None, **kwargs):
        """Compute logits and, when labels are given, the cross-entropy loss.

        Args:
            input_ids: passed positionally to the encoder.
            attention_mask: forwarded to the encoder.
            token_type_ids: forwarded to the encoder.
            labels: optional targets; flattened with ``view(-1)`` for the loss.
            source: accepted for interface compatibility; not used.
            **kwargs: forwarded unchanged to the encoder.

        Returns:
            SequenceClassifierOutput with ``loss`` (``None`` without labels),
            ``logits`` and the encoder's ``hidden_states`` / ``attentions``.
        """
        # Hidden states from the RoBERTa encoder.
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )

        # LSTM starts from its default (zero) initial state.
        out, _ = self.lstm(outputs['last_hidden_state'], None)
        sequence_output = out[:, -1, :]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [13]:
from transformers import AutoTokenizer
# Load the tokenizer published with the nlp-waseda Japanese RoBERTa checkpoint.
# NOTE(review): the "-with-auto-jumanpp" suffix suggests the tokenizer runs
# Juman++ segmentation internally; the "ValueError: malformed line" logged by
# the later tokenization cell looks related - confirm the Juman++ setup.
tokenizer = AutoTokenizer.from_pretrained("nlp-waseda/roberta-base-japanese-with-auto-jumanpp")

In [14]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch


def compute_metrics(pred):
    """Compute accuracy and weighted F1 for an evaluation prediction.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores; the arg-max over the last axis is taken
            as the predicted class).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

def tokenize(batch):
    """Tokenize the ``"text"`` column of a batch with the notebook-level tokenizer.

    Args:
        batch: mapping with a ``"text"`` entry holding the raw texts.

    Returns:
        The tokenizer's encoding for those texts, with padding and truncation
        requested.

    NOTE(review): the recorded run logged that no maximum length is available
    for this tokenizer, so ``truncation=True`` fell back to no truncation -
    consider passing an explicit ``max_length``.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

# Tokenize every split. With batched=True and batch_size=None each split is
# handed to `tokenize` as one single batch, so padding=True pads to the
# longest text of that split.
chABSA_dataset_encoded = chABSA_dataset.map(
    tokenize,
    batched=True,
    batch_size=None,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ValueError: malformed line:   \  \  特殊 1 空白 6 * 0 * 0 "代表表記:S/*"

In [21]:
# Fine-tune the RoBERTa+LSTM classifier with the Hugging Face Trainer.
num_labels = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Kept for the alternative RobertaWithLstmForClassification model, which takes
# its loss function as an argument; CustomModelForSequenceClassification
# builds its own CrossEntropyLoss inside forward().
loss_fct = nn.CrossEntropyLoss()

# Single source of truth for the checkpoint: model and tokenizer must match.
model_ckpt = "nlp-waseda/roberta-base-japanese-with-auto-jumanpp"
model = CustomModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

batch_size = 32
# Log roughly once per epoch; clamp to 1 so a train split smaller than
# `batch_size` does not produce logging_steps=0.
logging_steps = max(1, len(chABSA_dataset_encoded["train"]) // batch_size)
# NOTE(review): this is a Colab/Drive path while the data cell reads from
# /root/workspace - confirm the directory exists in the current environment.
output_dir = "/content/drive/MyDrive/Colab Notebooks/robarta-lstm"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=chABSA_dataset_encoded["train"],
    eval_dataset=chABSA_dataset_encoded["validation"],
    tokenizer=tokenizer,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4938,0.338716,0.884342,0.883623
2,0.2128,0.316431,0.900356,0.899778


TrainOutput(global_step=106, training_loss=0.3500838352824157, metrics={'train_runtime': 166.0581, 'train_samples_per_second': 20.33, 'train_steps_per_second': 0.638, 'total_flos': 459478786197696.0, 'train_loss': 0.3500838352824157, 'epoch': 2.0})