### Import Libraries we need

In [1]:
import os

# Use only one GPU
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np

### Use Pre-trained bert model

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_NAME = "hfl/chinese-roberta-wwm-ext-large" # "hfl/chinese-roberta-wwm-ext-large" requires at least 16GB 

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# By setting num_labels to 1 will automatically enable regression mode
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels = 1).to(device)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext-large were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the mod

### Data Preprocessing

In [3]:
file_path = "data/train.csv"
data = pd.read_csv(file_path, index_col=0)

In [4]:
def tail_truncate(text):
    if len(text) <= 510:
        return text
    
    return text[-511:-1]

data['fact'] = [tail_truncate(t) for t in data.fact.tolist()]

In [5]:
train_data, val_data = train_test_split(data, train_size=0.8, random_state=42)
train_data

Unnamed: 0_level_0,fact,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id_30893,罪犯张都树，男，1994年xx月xx日出生，汉族，山西省浑源人，现在山西省太原第三监狱服刑，...,8
id_45279,告人池杰犯运输毒品罪，判处有期徒刑十一年，并处罚金人民币一万元（已缴纳）。刑期自2017年x...,7
id_16398,教所服刑，以被告人林凯利犯盗窃罪，判处有期徒刑八年，并处罚金人民币二万。服刑期间，发现漏罪。...,12
id_13653,二年和并处罚金人民币五万元不变；于2019年xx月xx日作出（2019）桂02刑更845号刑...,6
id_13748,x日投送扎兰屯监狱服刑。执行过程中，刑期无变动。截止2018年xx月xx日，剩余刑期十个月十...,7
...,...,...
id_11284,伤害罪，判处有期徒刑三年六个月。该判决已发生法律效力，罪犯李伟于2017年xx月xx日入监服...,0
id_44732,建议书报送本院审理。本院依法组成合议庭进行了审理，现已审理终结。 执行机关提出，罪犯欧九固在...,9
id_38158,罪犯姜群，男，1971年xx月xx日出生，汉族，出生地辽宁省沈阳市，初中文化，现在辽宁省沈阳...,9
id_860,1年xx月xx日起至2021年xx月xx日止。判决发生法律效力后于2012年xx月xx日交付...,12


In [6]:
# Encode the text
train_encodings = tokenizer(train_data.fact.tolist(), truncation=True, padding=True, max_length=512)
valid_encodings = tokenizer(val_data.fact.tolist(), truncation=True, padding=True, max_length=512)

In [7]:
# Make it torch friendly
class TorchData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels = None, length = None):
        self.encodings = encodings
        self.labels = labels
        self.length = length

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        # If predicting then no label given
        if self.labels is not None:
            item["labels"] = torch.tensor([self.labels[idx]])
            item["labels"] = float(item["labels"])
        
        return item

    def __len__(self):
        if self.length is not None:
            return self.length
        
        return len(self.labels)

In [8]:
# convert our tokenized data into a torch Dataset
train_dataset = TorchData(train_encodings, train_data.label)
valid_dataset = TorchData(valid_encodings, val_data.label)

### Compute metrics

In [9]:
def exact_acc(labels, logits):
    logits = np.round(logits)
    cnt = 0
    for idx, label in enumerate(labels):
        if label == logits[idx]:
            cnt += 1
    return cnt / len(labels)

def v_standard(labels, logits):
    logits = np.round(logits)
    v_vec = np.abs(np.log(logits + 1) - np.log(labels + 1))
    
    res = []
    for v in v_vec:
        if v <= 0.2:
            res.append(1)
        elif v <= 0.4:
            res.append(0.8)
        elif v <= 0.6:
            res.append(0.6)
        elif v <=0.8:
            res.append(0.4)
        elif v <= 1.0:
            res.append(0.2)
        else:
            res.append(0)
    
    return res

def final_score(labels, logits):
    ext_acc = exact_acc(labels, logits)
    v = v_standard(labels, logits)
    return np.sum(v) * 0.7 + ext_acc * 0.3

def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    v = v_standard(labels, logits)
    ext_acc = exact_acc(labels, logits)
    score = np.sum(v) * 0.7 + ext_acc * 0.3 # final_score(labels, logits)

    return {"rmse": rmse, "mae": mae, "r2": r2, "v": v_standard, "ext_acc": ext_acc, "score": score}

### Training

In [10]:
num_epochs = 150

In [11]:
# Specifiy the arguments for the trainer  
training_args = TrainingArguments(
    output_dir ='./results',          
    num_train_epochs = num_epochs,     
    per_device_train_batch_size = 8,  # Can't be too big 
    per_device_eval_batch_size = 8,   
    weight_decay = 0.01,               
    learning_rate = 2e-5,
    logging_dir = './logs',            
    save_total_limit = 2,   # By setting this, we only save best and last model
    load_best_model_at_end = True,     
    metric_for_best_model = 'score',    
    evaluation_strategy = "epoch",
    save_strategy = "epoch"
)

# Call the Trainer
trainer = Trainer(
    model = model,                         
    args = training_args,                  
    train_dataset = train_dataset,         
    eval_dataset = valid_dataset,          
    compute_metrics = compute_metrics_for_regression,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=10)]
)

In [None]:
# Train the model
trainer.train()

In [13]:
trainer.save_model("results/best")

Saving model checkpoint to results/best
Configuration saved in results/best/config.json
Model weights saved in results/best/pytorch_model.bin


In [14]:
# Call the summary
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 10001
  Batch size = 8


  0%|          | 0/1251 [00:00<?, ?it/s]

{'eval_loss': 3.4907639026641846,
 'eval_mse': 3.4907641410827637,
 'eval_rmse': 1.8683587312698364,
 'eval_mae': 1.0228413343429565,
 'eval_r2': 0.8383862463467547,
 'eval_score': 6551.873554644536,
 'eval_runtime': 245.2143,
 'eval_samples_per_second': 40.785,
 'eval_steps_per_second': 5.102,
 'epoch': 52.0}

### Get Result for Submission

In [15]:
best_model_path = 'results/best/pytorch_model.bin'
checkpoint = torch.load(best_model_path)
model.load_state_dict(checkpoint)

<All keys matched successfully>

In [16]:
test_set = pd.read_csv('data/testA.csv')

test_set['fact'] = [tail_truncate(t) for t in test_set.fact.tolist()]

In [17]:
encodings = tokenizer(test_set.fact.tolist(), truncation=True, padding=True, max_length=512)

test_dataset = TorchData(encodings, length=25001)

In [18]:
# 对测试数据预测，结果为浮点数
pred_labels = trainer.predict(test_dataset)[0]

***** Running Prediction *****
  Num examples = 25001
  Batch size = 8


  0%|          | 0/3126 [00:00<?, ?it/s]

In [19]:
submit = pd.read_csv('data/submission.csv')
id_label_list = []

for index,row in submit.iterrows():
    idx = int(row['id'].replace("id_", '')) - 50000
    label = int(np.round(pred_labels[idx][0]))
    id_label_list.append([row['id'], label])

df = pd.DataFrame(data=id_label_list, columns=['id','label'])
df.to_csv('submission.csv', index=False)