- trainデータセットとvalidデータセットを用いて、テキスト分類モデルを実装しなさい(学習が遅い場合、一部のtrainデータセットだけ使っても構いません)
- testデータセットでモデルの精度を検証しなさい

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the train/valid/test CSV files and bundle them into a single DatasetDict.
from datasets import Dataset, DatasetDict

# Read each split from disk into its own DataFrame.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./train.csv", "./valid.csv", "./test.csv")
)

# Wrap every DataFrame in a Hugging Face Dataset.
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, valid_df, test_df)
)

# Key the splits by name so later cells can address them as data_dict["train"], etc.
data_dict = DatasetDict(
    {"train": train_dataset, "valid": valid_dataset, "test": test_dataset}
)

In [None]:
# Echo the DatasetDict so the notebook renders its summary as the cell output.
data_dict

In [None]:
# Switch the output format to pandas so that slicing a split yields a DataFrame.
data_dict.set_format("pandas")
train_df = data_dict["train"][:]
# Preview the first five rows (head() defaults to 5).
train_df.head()

In [None]:
# Show the feature schema recorded for the train split.
data_dict["train"].features

In [None]:
# Plot the label distribution of each split, one horizontal bar chart per row.
# NOTE(review): value_counts() needs the "label" column to come back as a
# pandas Series, i.e. this relies on the pandas output format still being active.
fig, axes = plt.subplots(3, 1, figsize=(4, 6))
split_titles = (
    ("train", "Train Dataset"),
    ("valid", "Valid Dataset"),
    ("test", "Test Dataset"),
)
for ax, (split, title) in zip(axes, split_titles):
    label_counts = data_dict[split]["label"].value_counts(ascending=True)
    label_counts.plot(kind="barh", title=title, ax=ax)
plt.tight_layout()
plt.show()


In [None]:
# Add each sentence's character count as a new column, then draw one box per
# label to compare the length distributions.
train_df["text_length"]=train_df["sentence"].str.len()
train_df.boxplot(column="text_length", by="label", figsize=(3, 6))

In [None]:
# Load the tokenizer of the pretrained model and try it on a single sentence.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')

# Encode the first training sentence, then map the ids back to token strings
# so the segmentation can be inspected.
sample_text_encoded = tokenizer(train_df["sentence"][0])
sample_tokens = tokenizer.convert_ids_to_tokens(sample_text_encoded["input_ids"])

print(sample_text_encoded, sample_tokens, sep="\n")


In [None]:
# Tokenization function applied to the whole dataset through DatasetDict.map.
def tokenize(batch):
    """Tokenize the ``sentence`` column of one batch of examples.

    Args:
        batch: Mapping with a ``"sentence"`` entry holding the texts of the batch.

    Returns:
        The tokenizer output for the batch, padded to the longest sentence in
        the batch and with truncation enabled.
    """
    # No return_tensors="pt" here: Dataset.map writes the returned columns to
    # Arrow, so tensors would only be converted straight back to plain lists.
    return tokenizer(batch["sentence"], padding=True, truncation=True)

# Drop the pandas output format that was set for data exploration before mapping.
data_dict.reset_format()

# Apply tokenize in batches; the mapped result replaces data_dict.
data_dict = data_dict.map(tokenize, batched=True)

In [None]:
# Inspect one tokenized training example: ids, attention mask and token strings
# side by side, one token position per row.
sample_encoded = data_dict["train"][0]
sample_row_names = ['input_ids', 'attention_mask', "tokens"]
sample_rows = [
    sample_encoded["input_ids"],
    sample_encoded["attention_mask"],
    tokenizer.convert_ids_to_tokens(sample_encoded["input_ids"]),
]
# Built row-wise, then transposed so each field becomes a column.
pd.DataFrame(sample_rows, index=sample_row_names).transpose()

In [None]:
# Report whether PyTorch can see a CUDA device.
import torch

if not torch.cuda.is_available():
    print("CUDA is not available. :(")
else:
    print("CUDA is available! :D")
    print("CUDA version:", torch.version.cuda)

# If CUDA is installed but still reported as unavailable, the CUDA and PyTorch
# versions may not match (for example, the CPU-only build of PyTorch is installed).
# In that case, remove CUDA/PyTorch and related packages from the current
# environment and reinstall with either
#   conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
# or
#   pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# Define the hyperparameters and bookkeeping options used for training.
from transformers import TrainingArguments

batch_size = 16
# Log roughly once per epoch. Clamp to at least 1: when only a small subset of
# the training data is used (fewer rows than batch_size) the floor division
# yields 0, which TrainingArguments rejects for step-based logging.
logging_steps = max(1, len(data_dict["train"]) // batch_size)

# Directory where the training outputs are stored.
model_name = r"G:\result"


training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error"
)

In [None]:
# Pick the device (GPU when CUDA is available, CPU otherwise) and load the
# pretrained BERT model with a sequence-classification head.
from transformers import AutoModelForSequenceClassification

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Number of target classes for the classification head.
num_labels = 2

model = AutoModelForSequenceClassification.from_pretrained(
    "cl-tohoku/bert-base-japanese-v3",
    num_labels=num_labels,
)
model = model.to(device)

In [14]:
# 定义模型的性能指标
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    true_labels = pred.label_ids
    predicted_labels = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(true_labels, predicted_labels),
        "f1": f1_score(true_labels, predicted_labels, average="weighted"),
    }

In [None]:
# Define the hyperparameters and bookkeeping options used for training.
# NOTE(review): this cell repeats the TrainingArguments setup that already
# appears earlier in the notebook; running it rebinds the same names.
from transformers import TrainingArguments

batch_size = 16
# Log roughly once per epoch. Clamp to at least 1: when only a small subset of
# the training data is used (fewer rows than batch_size) the floor division
# yields 0, which TrainingArguments rejects for step-based logging.
logging_steps = max(1, len(data_dict["train"]) // batch_size)

# Directory where the training outputs are stored.
model_name = r"G:\result"


training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error"
)

In [None]:
# Check that the output directory is usable, creating it first if necessary.
import os


test_dir = r"G:\result"

# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(test_dir, exist_ok=True)

# Probe writability by writing a small file into the directory.
test_file_path = os.path.join(test_dir, 'test_file.txt')
try:
    # Explicit encoding keeps the probe independent of the platform default.
    with open(test_file_path, 'w', encoding='utf-8') as file:
        file.write('Hello, this is a test file.')
except OSError as e:
    # Only filesystem errors are expected from open()/write(); report and continue.
    print(f"写入文件时发生错误：{e}")
else:
    print("文件写入成功，目录可写。")


In [None]:
# Fine-tune the model: train on the train split, evaluate on the valid split.
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_dict["train"],
    eval_dataset=data_dict["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Left as the final expression so the notebook shows the training result.
trainer.train()

In [None]:
# Run the trained model on the test split; `preds_output.predictions` is read
# by the evaluation code that follows.
preds_output = trainer.predict(data_dict["test"])

In [None]:
# Turn the raw test-set outputs into class predictions and collect the true labels.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Predicted class = index of the highest score in each row.
y_preds = preds_output.predictions.argmax(axis=1)
# NOTE(review): despite its name, y_valid holds the labels of the *test* split.
y_valid = np.array(data_dict["test"]["label"])
# Tick labels shown on the confusion-matrix axes.
labels = ['0', '1']

def plot_confusion_matrix(y_preds, y_true, labels):
    """Draw a confusion matrix normalized over the true labels.

    Args:
        y_preds: Predicted class for each example.
        y_true: True class for each example.
        labels: Display names used for the matrix axes.
    """
    normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm,
        display_labels=labels,
    ).plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()

# Plot the matrix for the test split (y_valid is built from data_dict["test"]).
plot_confusion_matrix(y_preds, y_valid, labels)