### Import Libraries we need

In [1]:
import os

# Use only one GPU
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import torch, random
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np

### Use Pre-train bert model

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_NAME = "hfl/chinese-roberta-wwm-ext" # "hfl/chinese-roberta-wwm-ext-large" requires at least 16GB 

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# By setting num_labels to 1 will automatically enable regression mode
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels = 1).to(device)

Some weights of the model checkpoint at nghuyong/ernie-3.0-base-zh were not used when initializing ErnieForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing ErnieForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ErnieForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-3.0-base-zh and are newly initialized: ['classifier.weight', 'cla

### Data Preprocessing

In [3]:
file_path = "data/train.csv"
data = pd.read_csv(file_path, index_col=0)

In [4]:
train_data, val_data = train_test_split(data, train_size=0.8, random_state=42)
train_data

Unnamed: 0_level_0,fact,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id_30893,罪犯张都树，男，1994年xx月xx日出生，汉族，山西省浑源人，现在山西省太原第三监狱服刑，...,8
id_45279,罪犯池杰，男，1992年xx月xx日出生，汉族，福建省大田县人，初中文化，原住福建省大田县建...,7
id_16398,罪犯林凯利，男，1995年xx月xx日出生于广东省清远市英德市横石水镇，汉族。现在湖南省未成...,12
id_13653,罪犯黄贝策，男，1965年xx月xx日出生，汉族，广西壮族自治区宾阳县人，初中文化，原住广西...,6
id_13748,罪犯王卡康，男，1968年xx月xx日出生，汉族，初中文化，捕前系哈尔滨铁路局海拉尔列车段整...,7
...,...,...
id_11284,罪犯李伟，男，1997年xx月xx日出生，汉族，海南省万宁市人，初中文化，农民，捕前住海南省...,0
id_44732,罪犯欧九固，男，1972年xx月xx日生，汉族，广西壮族自治区蒙山县人，初中文化，原住广西壮...,9
id_38158,罪犯姜群，男，1971年xx月xx日出生，汉族，出生地辽宁省沈阳市，初中文化，现在辽宁省沈阳...,9
id_860,罪犯吴礼善，男，1973年xx月xx日出生，汉族，广西鹿寨县人，小学文化，原住广西鹿寨县导江...,12


In [5]:
# Encode the text
train_encodings = tokenizer(train_data.fact.tolist(), truncation=True, padding=True, max_length=512)
valid_encodings = tokenizer(val_data.fact.tolist(), truncation=True, padding=True, max_length=512)

In [6]:
# Make it torch friendly
class TorchData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels = None, length = None):
        self.encodings = encodings
        self.labels = labels
        self.length = length

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        # If predicting then no label given
        if self.labels is not None:
            item["labels"] = torch.tensor([self.labels[idx]])
            item["labels"] = float(item["labels"])
        
        return item

    def __len__(self):
        if self.length is not None:
            return self.length
        
        return len(self.labels)

In [7]:
# convert our tokenized data into a torch Dataset
train_dataset = TorchData(train_encodings, train_data.label)
valid_dataset = TorchData(valid_encodings, val_data.label)

### Compute metrics

In [8]:
def exact_acc(labels, logits):
    logits = np.round(logits)
    cnt = 0
    for idx, label in enumerate(labels):
        if label == logits[idx]:
            cnt += 1
    return cnt / len(labels)

def v_standard(labels, logits):
    logits = np.round(logits)
    v_vec = np.abs(np.log(logits + 1) - np.log(labels + 1))
    
    res = []
    for v in v_vec:
        if v <= 0.2:
            res.append(1)
        elif v <= 0.4:
            res.append(0.8)
        elif v <= 0.6:
            res.append(0.6)
        elif v <=0.8:
            res.append(0.4)
        elif v <= 1.0:
            res.append(0.2)
        else:
            res.append(0)
    
    return res

def final_score(labels, logits):
    ext_acc = exact_acc(labels, logits)
    v = v_standard(labels, logits)
    return np.sum(v) * 0.7 + ext_acc * 0.3

def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    # smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)
    score = final_score(labels, logits)

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "score": score}

### Training

In [9]:
num_epochs = 150

In [10]:
# Specifiy the arguments for the trainer  
training_args = TrainingArguments(
    output_dir ='./results',          
    num_train_epochs = num_epochs,     
    per_device_train_batch_size = 8,  # Can't be too big 
    per_device_eval_batch_size = 8,   
    weight_decay = 0.01,               
    learning_rate = 4e-5,
    logging_dir = './logs',            
    save_total_limit = 2,   # By setting this, we only save best and last model
    load_best_model_at_end = True,     
    metric_for_best_model = 'score',    
    evaluation_strategy = "epoch",
    save_strategy = "epoch"
)

# Call the Trainer
trainer = Trainer(
    model = model,                         
    args = training_args,                  
    train_dataset = train_dataset,         
    eval_dataset = valid_dataset,          
    compute_metrics = compute_metrics_for_regression,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=10)]
)

In [None]:
# Train the model
trainer.train()

In [12]:
trainer.save_model("results/best")

Saving model checkpoint to results/best
Configuration saved in results/best/config.json
Model weights saved in results/best/pytorch_model.bin


In [13]:
# Call the summary
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 10001
  Batch size = 20


  0%|          | 0/501 [00:00<?, ?it/s]

{'eval_loss': 4.0539069175720215,
 'eval_mse': 4.053907871246338,
 'eval_rmse': 2.0134317874908447,
 'eval_mae': 1.176129698753357,
 'eval_r2': 0.8123140847990389,
 'eval_score': 6471.35090690931,
 'eval_runtime': 74.5055,
 'eval_samples_per_second': 134.232,
 'eval_steps_per_second': 6.724,
 'epoch': 24.0}

### Get Result for Submission

In [14]:
best_model_path = 'results/best/pytorch_model.bin'
checkpoint = torch.load(best_model_path)
model.load_state_dict(checkpoint)

<All keys matched successfully>

In [15]:
test_set = pd.read_csv('data/testA.csv')

# 截取掉第一句话
# test_set['fact'] = test_set['fact'].map(lambda x: x[x.find('。') + 1:], na_action='ignore')

test_set

Unnamed: 0,id,fact
0,id_50000,罪犯吴智信，男，1990年xx月xx日出生，汉族，小学文化，农民，原户籍所在地广西陆川县平乐...
1,id_50001,罪犯高九峰，男，1954年xx月xx日出生于福建省南安市，汉族，初中文化，现在福建省清流监狱...
2,id_50002,罪犯罗被策，又名罗义成，男，1979年xx月xx日生，汉族，中专文化，湖南省衡阳县人，现在湖...
3,id_50003,罪犯黄笑雪，男，1983年xx月xx日出生于湖南省龙山县，汉族，现在湖南省星城监狱服刑，以被...
4,id_50004,罪犯曾先，男，1987年xx月xx日出生，汉族，四川省西昌市人，初中文化，现在四川省攀西监狱...
...,...,...
24996,id_74996,罪犯熊军，男，1971年xx月xx日生，汉族，湖北省仙桃市人，普通高中毕业，现在吉林省长春铁...
24997,id_74997,罪犯张子，男，1976年xx月xx日出生于广西壮族自治区合浦县，汉族，小学文化，现在广西壮族...
24998,id_74998,罪犯曲义伯，女，1981年xx月xx日出生，汉族，出生地辽宁省大连市，中专文化，现在辽宁省女...
24999,id_74999,罪犯马被以，女，1979年xx月xx日出生于湖南省永顺县，土家族，初中文化。现在湖南省未成年...


In [16]:
encodings = tokenizer(test_set.fact.tolist(), truncation=True, padding=True, max_length=512)

test_dataset = TorchData(encodings, length=25001)

In [17]:
# 对测试数据预测，结果为浮点数
pred_labels = trainer.predict(test_dataset)[0]

***** Running Prediction *****
  Num examples = 25001
  Batch size = 20


  0%|          | 0/1251 [00:00<?, ?it/s]

In [19]:
submit = pd.read_csv('data/submission.csv')
id_label_list = []

for index,row in submit.iterrows():
    idx = int(row['id'].replace("id_", '')) - 50000
    label = int(np.round(pred_labels[idx][0]))
    id_label_list.append([row['id'], label])

df = pd.DataFrame(data=id_label_list, columns=['id','label'])
df.to_csv('submission.csv', index=False)