In [1]:
import os
import json
import torch
import numpy as np
from datasets import load_dataset, Value
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
)
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Data settings: the directory holding the dataset JSON files, and the maximum
# number of tokens kept per example when tokenizing.
max_length = 256
data_dir = "data/iflytek/"

In [4]:
# Model settings: identifier of the pretrained checkpoint, passed to `from_pretrained`
# for both the tokenizer and the classification model.
model_name = "Langboat/mengzi-bert-base"

In [5]:
# Optimisation hyper-parameters (forwarded to `TrainingArguments` in Step 4).
learning_rate = 5e-5
num_train_epochs = 3
batch_size = 32  # used as both the per-device train and eval batch size

# Where the Trainer writes its output, and whether that directory may be overwritten.
output_dir = "tmp/iflytek/"
overwrite_output_dir = True

# Step 1: Read in data

In [6]:
# Paths of the three dataset files inside `data_dir`.
train_filepath, valid_filepath, test_filepath = (
    os.path.join(data_dir, filename)
    for filename in ("train.json", "dev.json", "test.json")
)
# Only train/valid are loaded together here; the test file is loaded on its own in Step 5.
split_2_filepath = dict(train=train_filepath, valid=valid_filepath)

In [7]:
# Read both JSON files; the keys of `split_2_filepath` ("train", "valid") become the split names.
dataset = load_dataset("json", data_files=split_2_filepath)

Using custom data configuration default-e245dfba59b3c0bf
Reusing dataset json (/home/studio-lab-user/.cache/huggingface/datasets/json/default-e245dfba59b3c0bf/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Cast the labels to 32-bit integers and give the description column a clearer name.
dataset = (
    dataset
    .cast_column("label", Value("int32"))
    .rename_column("label_des", "label_name")
)

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/json/default-e245dfba59b3c0bf/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426/cache-3c87beba9aa2c129.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/json/default-e245dfba59b3c0bf/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426/cache-432f08a282d4c3a1.arrow


In [9]:
# Map label ids to their human-readable names (and back). Both loaded splits are scanned
# so that a class which only occurs in the validation file is still accounted for.
label_id_2_label_name = {
    int(label_id): label_name
    for split in ("train", "valid")
    for label_id, label_name in zip(dataset[split]["label"], dataset[split]["label_name"])
}
label_name_2_label_id = {
    label_name: label_id
    for label_id, label_name in label_id_2_label_name.items()
}
# Label ids index directly into the classifier logits, so the head needs `max id + 1`
# outputs. Counting distinct ids would undersize it whenever an id is unseen or the ids
# are not contiguous; for contiguous ids starting at 0 the two values are identical.
num_classes = max(label_id_2_label_name) + 1
# Label names ordered by ascending label id.
label_names = [label_name for _, label_name in sorted(label_id_2_label_name.items())]

# Step 2: Feature Engineering

In [10]:
# Load the tokenizer belonging to the pretrained checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
# Sanity check: tokenize one sentence, padded/truncated to exactly `max_length` tokens.
tokenizer("我是一只小俊俊", padding="max_length", truncation=True, max_length=max_length)

{'input_ids': [101, 2769, 3221, 671, 1372, 2207, 916, 916, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [12]:
# Tokenize the "sentence" column of every split (tokenizer(...) returns a `BatchEncoding`
# whose fields are added as new columns).
# Sequences are only truncated to `max_length` here. Padding is left to the Trainer's
# data collator (`DataCollatorWithPadding`), which pads each batch to its own longest
# sequence. Padding inside `map` would pad a whole split to one length and required
# `batch_size=None`, i.e. tokenizing the entire split as a single in-memory batch.
dataset = dataset.map(
    lambda batch: tokenizer(batch["sentence"], truncation=True, max_length=max_length),
    batched=True,
)

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/json/default-e245dfba59b3c0bf/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426/cache-ae2a3cb2e4b69c9b.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/json/default-e245dfba59b3c0bf/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426/cache-5a29bab98e2db568.arrow


# Step 3: Build our model

In [13]:
# The classification head is not part of the pretrained checkpoint and is initialised
# randomly, so seed PyTorch first to make repeated runs start from the same weights.
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
model = model.to(device)

Some weights of the model checkpoint at Langboat/mengzi-bert-base were not used when initializing BertForSequenceClassification: ['sop.cls.bias', 'sop.cls.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

# Step 4: Train & Evaluate

In [14]:
def compute_metrics(pred):
    """Compute the evaluation metrics reported by the ``Trainer``.

    Args:
        pred: Prediction object exposing ``predictions`` (per-class scores; the
            last axis indexes the classes) and ``label_ids`` (gold labels).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"`` (micro-averaged).

    Note:
        With exactly one label per example, micro-averaged F1 equals accuracy,
        so the two reported values coincide.
    """
    predicted_ids = pred.predictions.argmax(axis=-1)
    gold_ids = pred.label_ids
    return {
        "accuracy": accuracy_score(gold_ids, predicted_ids),
        "f1": f1_score(gold_ids, predicted_ids, average="micro"),
    }

In [15]:
# Log the running training loss once per epoch. An epoch has ceil(n / batch_size)
# optimisation steps because the last, smaller batch is kept by default, so round up:
# floor division would log one step before the epoch ends and evaluates to 0 (an invalid
# logging interval) when the split is smaller than one batch.
# NOTE(review): assumes a single device and no gradient accumulation — confirm.
logging_steps = max(1, (len(dataset["train"]) + batch_size - 1) // batch_size)
train_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",  # evaluate on the validation split after every epoch
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False,
    push_to_hub=False,
)

In [16]:
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Workaround for:
#   RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
# The Trainer's default collator (a `DataCollatorWithPadding` instance) is wrapped so
# that every batch is handed over as a plain dict.
old_data_collator = trainer.data_collator


def collate_as_dict(features):
    """Collate `features` with the Trainer's original collator and return a plain dict."""
    return dict(old_data_collator(features))


trainer.data_collator = collate_as_dict

In [17]:
# Fine-tune the model; the value returned by `train()` is kept in `train_output`.
train_output = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.6484,1.948184,0.555214,0.555214
2,1.6503,1.713838,0.598692,0.598692
3,1.3156,1.67353,0.604463,0.604463


In [18]:
# Persist the fine-tuned model. No path is passed, so the Trainer's default location is
# used (presumably `output_dir` from the training arguments — confirm).
trainer.save_model()

In [19]:
# Score the validation split once more with the final model and report micro-F1.
pred_output_valid = trainer.predict(dataset["valid"])
pred_valid = np.argmax(pred_output_valid.predictions, axis=-1)
valid_f1 = f1_score(
    y_true=dataset["valid"]["label"],
    y_pred=pred_valid,
    average="micro",
)
print(f"the F1-Score of validation set is: {valid_f1:.4f}")

the F1-Score of validation set is: 0.6045


# Step 5: Predict

In [20]:
# The test file is loaded on its own. A single unnamed data file ends up in the default
# "train" split, which is selected directly here.
test_dataset = load_dataset("json", data_files=test_filepath, split="train")

Using custom data configuration default-098ae10728338e6f
Reusing dataset json (/home/studio-lab-user/.cache/huggingface/datasets/json/default-098ae10728338e6f/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


  0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
# Tokenize the test sentences. Sequences are only truncated to `max_length`; padding is
# left to the Trainer's data collator, which pads each batch to its own longest sequence.
# Padding every example to `max_length` here made prediction process full-length inputs
# regardless of the actual text length.
test_dataset = test_dataset.map(
    lambda batch: tokenizer(batch["sentence"], truncation=True, max_length=max_length),
    batched=True,
)

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/json/default-098ae10728338e6f/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426/cache-4b576133e69525bd.arrow


In [22]:
# Run the model over the test split and keep the highest-scoring class per example.
pred_output_test = trainer.predict(test_dataset)
pred_test = np.argmax(pred_output_test.predictions, axis=-1)

In [23]:
# Submission table: the id of every test example next to its predicted label id.
test_df = test_dataset.to_pandas()[["id"]].assign(label=pred_test)

In [24]:
# Write the predictions as JSON Lines: one object per test example, one per line.
with open("iflytek_predict.json", "w") as fout:
    fout.writelines(
        json.dumps(record) + "\n"
        for record in test_df.to_dict(orient="records")
    )