In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np 
import evaluate
import os 
from scipy.special import softmax

In [3]:
os.environ["WANDB_API_KEY"] = 'ca91f3fe76b1db407047964b611a44bd349c4a73'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
#load depressive speech dataset and store it
dataset = load_dataset('./data/depressive_speech')
#dataset
dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 27977
})

In [5]:
# set the base model to a bert-base-cased
model_name = "./bert-base-cased"

In [6]:
# get the tokenizer from the model and store it
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
# define the tokenize function that the tokenizer will be used for
def tokenize(samples):
    return tokenizer(samples['text'], padding="max_length", truncation=True,max_length=512)

In [8]:
#tokenize the data and store it as a new variable, then determine the number of unique labels that will be used to classify the data during training and evaluation
dataset_tokens = dataset.map(tokenize, batched=True)
label_count = np.unique(np.array(dataset_tokens['train']['label'])).size
label_count

2

In [9]:
# randomize the tokenized data, then split it up into a training set and an evaluation set
shuffle = dataset_tokens['train'].shuffle(seed=42)

train_count = int(shuffle.num_rows * 0.9) 

dataset_train = shuffle.select(range(0, train_count))
dataset_eval = shuffle.select(range(train_count, shuffle.num_rows))
print("Eval:", dataset_eval, "\nTrain:", dataset_train)

Eval: Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2798
}) 
Train: Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25179
})


In [10]:
# create an instance of the base model that will be trained
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=label_count)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# set the characteristics of how the model will be trained (ex: the similarity of the output to the ground truth will be evaluated every epoch)
#training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
training_args = TrainingArguments(
    output_dir='test_trainer',  # 保存模型和日志的目录
    num_train_epochs=1,  # 训练轮数
    per_device_train_batch_size=2,  # 训练时每个 GPU 上的 batch size
    per_device_eval_batch_size=2,  # 验证时每个 GPU 上的 batch size
    warmup_steps=100,  # 学习率 warmup 步数
    learning_rate=3e-5,  # 初始学习率
    logging_dir='./logs',  # 日志保存目录
    logging_steps=100,  # 每隔多少步打印一次训练日志
    evaluation_strategy='epoch',  # 在哪些时间步骤上评估性能：'no', 'steps', 'epoch'
    save_total_limit=3,  # 保存的模型数量上限
    save_strategy='epoch', # 模型保存策略，'steps':每隔多少步保存一次，'epoch':每个epoch保存一次
    gradient_accumulation_steps=2,  # 每多少个 batch 合并为一个，等于期望的 batch size / 
)

In [12]:
# load the accuracy metric
metrics = evaluate.combine(["./metrics/accuracy", "./metrics/f1", "./metrics/precision", "./metrics/recall"])

In [13]:
# define a fuction that will evaluate the accuracy of the model's output
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metrics.compute(predictions=predictions, references=labels)

In [14]:
# create an instance of the trainer class
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval,
    compute_metrics=compute_metrics,
)

In [15]:
# train the model!
trainer.train()
print(trainer.evaluate())


[34m[1mwandb[0m: Currently logged in as: [33midiau[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

Epoch,Training Loss,Validation Loss


Checkpoint destination directory test_trainer\checkpoint-6295 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.20620355010032654, 'eval_accuracy': 0.952108649035025, 'eval_f1': 0.9522111269614836, 'eval_precision': 0.9474804826117814, 'eval_recall': 0.956989247311828, 'eval_runtime': 164.4518, 'eval_samples_per_second': 17.014, 'eval_steps_per_second': 8.507, 'epoch': 1.0}


In [18]:
# 保存微调后的模型和分词器
model_path = "depression-bert-base-cased"
base_model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('depression-bert-base-cased\\tokenizer_config.json',
 'depression-bert-base-cased\\special_tokens_map.json',
 'depression-bert-base-cased\\vocab.txt',
 'depression-bert-base-cased\\added_tokens.json',
 'depression-bert-base-cased\\tokenizer.json')

In [2]:
model_path = "depression-bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [3]:
sample_text = "i feel completely exhausted my life isn t going anywhere and i ve got nobody to turn to"
#sample_text="i'll make fresh start i promise xtra sad puppy face"
sample_tokens = tokenizer(sample_text, return_tensors="pt", padding="max_length", truncation=True,max_length=512)
sample_out = model(**sample_tokens)
scores = sample_out[0][0].detach().numpy()
scores = softmax(scores)
print("depressive score:",scores[1], "neutral score:", scores[0])

{'input_ids': tensor([[ 101,  178, 1631, 2423, 8984, 1139, 1297, 2762,  189, 1280, 5456, 1105,
          178, 1396, 1400, 8582, 1106, 1885, 1106,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,