In [7]:
import pandas as pd
import numpy as np
import warnings
import torch
warnings.filterwarnings('ignore')

In [9]:
# import wandb
# wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


True

In [10]:
# load dataset
data_train = pd.read_csv('./sentiment-analysis-on-movie-reviews/train.tsv/train.tsv', sep='\t')
data_test = pd.read_csv('./sentiment-analysis-on-movie-reviews/test.tsv/test.tsv', sep='\t')

In [11]:
# 提取phrase:x和label:y，并划分训练集为训练集和验证集
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(data_train['Phrase'].values.tolist(), 
                                                      data_train['Sentiment'].values.tolist(), 
                                                      test_size=0.25, 
                                                      random_state=42)

In [77]:
# 加载预训练模型
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [78]:
# tokenization 向量化
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, num_labels=5)  # 五种情绪

In [79]:
train_encodings = tokenizer(list(x_train), truncation=True, padding=True, return_tensors='pt')  # max_length=512
val_encodings = tokenizer(list(x_valid), truncation=True, padding=True, return_tensors='pt')

In [80]:
# 将向量化后的data封装成torch形式的dataset
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        # 获取每一个图片
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}  # TODO 本来就是tensor了啊
        item['labels'] = torch.tensor(self.labels[idx])
        return item  # 构建的data包括input_ids, attention_mask, labels
    
    def __len__(self):
        return len(self.labels)

class SentimentTestDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}  # TODO 本来就是tensor了啊
        return item
    
    def __len__(self):
        return len(self.encodings)

In [81]:
# 生成dataloaders (tokenized data -> torch dataset)
train_dataset = SentimentDataset(train_encodings, y_train)
val_dataset = SentimentDataset(val_encodings, y_valid)

In [82]:
len(train_dataset), len(val_dataset)

(117045, 39015)

In [22]:
# 定义度量函数
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {
        'accuracy': accuracy_score(y_true=labels, y_pred=predictions),
        'f1_score': f1_score(labels, predictions, average='weighted')
}

In [33]:
# 定义训练参数
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='steps',  # evaluation strategy to adopt during training
    num_train_epochs=5,
    learning_rate=2e-5, # default=5e-5
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',
    logging_steps=100,
    save_steps=1000,
    load_best_model_at_end=True,
    report_to='tensorboard'  # 日志展示方式吧  wandb可选
)

In [4]:
# 基于Bert进行finetune
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=5) # 5类
trainer = Trainer(
    model=model, # 实例化的模型
    args=training_args, # 训练参数
    train_dataset=train_dataset, # 训练集
    eval_dataset=val_dataset, # 验证集
    compute_metrics=compute_metrics, # 评价指标
)

NameError: name 'BertForSequenceClassification' is not defined

In [2]:
trainer.train()  # 开训

NameError: name 'trainer' is not defined

In [3]:
trainer.evaluate()  # 在验证集上进行评估

NameError: name 'trainer' is not defined

In [83]:
# 在测试集上进行预测
x_test = data_test['Phrase'].values.tolist()  # 没有sentiment
test_encodings = tokenizer(x_test, truncation=True, padding=True, return_tensors='pt')
test_dataset = SentimentTestDataset(test_encodings)

In [1]:
preds = trainer.predict(test_dataset)  # 预测结果

NameError: name 'trainer' is not defined

In [91]:
preds

PredictionOutput(predictions=array([[-2.8357358,  0.663277 ,  2.5971327,  1.6330429, -2.437308 ],
       [-3.0690255,  0.2920592,  2.5112534,  1.9263854, -2.1462643],
       [-3.0371137, -0.2481   ,  5.0157795,  1.8348434, -2.6696217]],
      dtype=float32), label_ids=None, metrics={'test_runtime': 0.0925, 'test_samples_per_second': 32.416, 'test_steps_per_second': 10.805})

In [100]:
probs = torch.from_numpy(preds[0]).softmax(1)  # 转为概率

In [103]:
predictions = probs.numpy()
y_labels = predictions.argmax(axis=1)

array([2, 2, 2, ..., 1, 1, 1])

In [108]:
data_test

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,156061,8545,An intermittently pleasing but mostly routine ...,2
1,156062,8545,An intermittently pleasing but mostly routine ...,2
2,156063,8545,An,2
3,156064,8545,intermittently pleasing but mostly routine effort,2
4,156065,8545,intermittently pleasing but mostly routine,2
...,...,...,...,...
66287,222348,11855,"A long-winded , predictable scenario .",1
66288,222349,11855,"A long-winded , predictable scenario",1
66289,222350,11855,"A long-winded ,",1
66290,222351,11855,A long-winded,1


In [110]:
data_test.loc[:, 'Sentiment'] = y_labels
submit_data = data_test.loc[:, ['PhraseId', 'Sentiment']]
submit_data.to_csv('submission_bert.csv', index=False)