In [None]:
!pip install datasets
!pip install accelerate
!pip install peft
!pip install japanize_matplotlib

In [None]:
from datasets import list_datasets

In [None]:
# Enumerate the dataset identifiers available on the Hugging Face Hub.
# NOTE(review): `datasets.list_datasets` is deprecated in recent `datasets`
# releases in favour of `huggingface_hub.list_datasets` — confirm against the
# installed version.
all_datasets = list_datasets()
# Show the total and a short preview instead of dumping every identifier.
print(f"{len(all_datasets)} datasets available on the Hub")
all_datasets[:10]

In [None]:
from datasets import load_dataset
# Load the "dair-ai/emotion" dataset by its Hub identifier.
emotions = load_dataset("dair-ai/emotion")
# Display the loaded object; later cells index it by "train", "validation" and "test".
emotions

In [None]:
# Check which container type `load_dataset` returned.
type(emotions)

In [None]:
# Inspect the training split.
emotions["train"]

In [None]:
# Look at the first training example.
emotions["train"][0]

In [None]:
# Show the column schema of the training split; the "label" feature's `.names`
# are used further down to count the classes.
emotions["train"].features

In [None]:
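# Optional exploration (disabled): view the training split as a pandas DataFrame.
# NOTE: if re-enabled, call `emotions.reset_format()` afterwards so the later
# tokenization cells receive the default (non-pandas) output format again.
# import pandas as pd
# emotions.set_format(type="pandas")
# df = emotions["train"][:]
# df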

In [None]:
# def label_int2str(row):
#   return emotions["train"].features["label"].int2str(row)

# df["label_name"] = df["label"].apply(label_int2str)

In [None]:
# df

In [None]:
from transformers import AutoTokenizer

In [None]:
# Checkpoint name shared by the tokenizer here and the classification model loaded later.
model_name = "distilbert-base-uncased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch):
  """Tokenize the "text" column of a batch of examples.

  Args:
    batch: Mapping with a "text" entry holding the strings to encode
      (this is what `Dataset.map(..., batched=True)` passes in).

  Returns:
    The tokenizer's output for the batch, padded to the longest sequence in
    the batch and truncated to the tokenizer's maximum length.
  """
  # truncation=True keeps over-long texts within the model's maximum input
  # length instead of producing sequences the model cannot accept.
  return tokenizer(batch["text"], padding=True, truncation=True)

In [None]:
# Tokenize every split. batch_size=None hands each split to `tokenize` as a
# single batch, so padding=True pads all examples of a split to one common length.
emotions_tokenized = emotions.map(tokenize, batched=True, batch_size=None)
emotions_tokenized

In [None]:
# Inspect the tokenized training split.
emotions_tokenized["train"]

In [None]:
# Show the column schema of the tokenized training split.
emotions_tokenized["train"].features

In [None]:
from transformers import AutoModelForSequenceClassification
import numpy as np
import os

In [None]:
# Re-display the checkpoint name before loading the model.
model_name

In [None]:
# Number of target classes, read from the "label" feature of the training split.
num_labels = emotions["train"].features["label"].num_classes
num_labels

In [None]:
# Load the checkpoint as a sequence-classification model with `num_labels` output classes.
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# Display the module tree.
base_model

In [None]:
import peft

In [None]:
# LoRA adapter configuration for the sequence-classification task.
peft_config = peft.LoraConfig(
    task_type=peft.TaskType.SEQ_CLS,
    r=8,  # rank of the low-rank update matrices
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.1,  # dropout applied on the LoRA path
    inference_mode=False,  # the adapter is being set up for training
    # Apply LoRA to the model's linear layers.
    # NOTE(review): DistilBERT's `pre_classifier` layer is presumably newly
    # initialised as well — confirm whether it should be trained in full
    # (e.g. via `modules_to_save`) rather than only through a LoRA update.
    target_modules="all-linear",
)

In [None]:
# Wrap the base model with the LoRA configuration defined above.
model = peft.get_peft_model(base_model, peft_config)
# Display the wrapped module tree.
model

In [None]:
# Report how many parameters are trainable after wrapping with PEFT.
model.print_trainable_parameters()

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Hyperparameters and output settings for the Trainer.
# NOTE(review): newer `transformers` releases rename `evaluation_strategy` to
# `eval_strategy` — confirm which spelling the installed version expects.
training_args = TrainingArguments(
    output_dir = "./finetuned_result",  # where checkpoints and logs are written
    evaluation_strategy = "epoch",  # evaluate at the end of every epoch
    learning_rate = 1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay = 0.01,
    push_to_hub = False  # keep the results local
    )

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
def metrics(pred):
  """Compute accuracy and weighted F1 for a set of predictions.

  Args:
    pred: Object exposing `label_ids` (true class indices) and `predictions`
      (per-class scores whose last axis is the class axis).

  Returns:
    Dict with the keys "accuracy" and "f1".
  """
  y_true = pred.label_ids
  # The predicted class is the index of the highest score along the last axis.
  y_hat = pred.predictions.argmax(-1)
  return {
      "accuracy": accuracy_score(y_true, y_hat),
      "f1": f1_score(y_true, y_hat, average="weighted"),
  }

In [None]:
# Assemble the Trainer: LoRA-wrapped model, training arguments, the tokenized
# train/validation splits and the metric function defined above.
# NOTE(review): newer `transformers` releases deprecate the `tokenizer=`
# argument in favour of `processing_class=` — confirm for the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_tokenized["train"],
    eval_dataset=emotions_tokenized["validation"],
    tokenizer=tokenizer,
    compute_metrics=metrics
)

In [None]:
# Run fine-tuning; with the arguments above, the validation split is evaluated every epoch.
trainer.train()

In [None]:
# Run the fine-tuned model on the held-out test split.
pred_output = trainer.predict(emotions_tokenized["test"])
# Display the prediction result object.
pred_output

In [None]:
# Raw model outputs for the test split; reduced to class indices with argmax below.
pred_output.predictions

In [None]:
# Class names, used to label the confusion-matrix axes.
label_name = emotions_tokenized["test"].features["label"].names
# Ground-truth class indices of the test split.
y_test = np.array(emotions_tokenized["test"]["label"])
# Predicted class = index of the highest score in each row.
y_pred = pred_output.predictions.argmax(axis=1)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import japanize_matplotlib

In [None]:
# Confusion matrix of true vs. predicted classes on the test split.
# Passing `labels` fixes the matrix to one row/column per class, so it always
# lines up with `label_name` even if some class never occurs in y_test/y_pred.
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(label_name))))
fig, ax = plt.subplots(figsize=(8, 8))
dp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_name)
dp.plot(ax=ax, colorbar=False)
# The title is Japanese for "confusion matrix"; set it on the explicit axes
# rather than through the pyplot state machine.
ax.set_title('混同行列')
plt.show()