### Import Libraries we need

In [1]:
import os

# Use only one GPU: enumerate devices in PCI bus order and expose only device "0".
# Both variables are set before `import torch` below — presumably so that CUDA
# picks them up when it is first initialised.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
import numpy as np

### Use a pre-trained BERT model

In [2]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hugging Face checkpoint that is fine-tuned in this notebook.
MODEL_NAME = "hfl/chinese-roberta-wwm-ext-large" # "hfl/chinese-roberta-wwm-ext-large" requires at least 16GB (presumably of GPU memory)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# num_labels = 32 builds a 32-way classification head; this should equal the
# number of distinct classes in label_dict, which is built in the preprocessing
# section. (Setting num_labels to 1 would instead enable regression mode.)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels = 32).to(device)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext-large were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the mod

### Data Preprocessing

In [3]:
# Load the training set; the first CSV column becomes the DataFrame index.
# Later cells read the `fact` (text) and `label` columns from this frame.
file_path = "data/train.csv"
data = pd.read_csv(file_path, index_col=0)

In [4]:
def tail_truncate(text, max_chars=510):
    """Keep at most the last ``max_chars`` characters of ``text``.

    Texts that already fit are returned unchanged; longer texts lose their
    beginning so that the ending is preserved in full.

    Args:
        text: The string to shorten.
        max_chars: Maximum number of characters to keep. The default of 510
            presumably leaves room for two special tokens within the 512-token
            limit used when tokenizing.

    Returns:
        ``text`` itself when it has at most ``max_chars`` characters, otherwise
        its final ``max_chars`` characters (including the very last one).
    """
    if len(text) <= max_chars:
        return text

    # Slice from an explicit start offset rather than a negative index so that
    # max_chars == 0 yields "" instead of the whole string.
    return text[len(text) - max_chars:]

# Shorten every fact text to its tail so it fits the model's input limit.
data['fact'] = data.fact.map(tail_truncate)

In [5]:
# Give every distinct label value a contiguous class id (0, 1, 2, ...),
# numbered in the order unique() returns the values.
possible_labels = data.label.unique()
label_dict = {raw_label: class_id for class_id, raw_label in enumerate(possible_labels)}

label_dict

{12: 0,
 13: 1,
 8: 2,
 7: 3,
 6: 4,
 11: 5,
 0: 6,
 14: 7,
 10: 8,
 9: 9,
 5: 10,
 4: 11,
 2: 12,
 24: 13,
 3: 14,
 17: 15,
 1: 16,
 23: 17,
 15: 18,
 18: 19,
 16: 20,
 20: 21,
 19: 22,
 21: 23,
 22: 24,
 27: 25,
 25: 26,
 26: 27,
 28: 28,
 29: 29,
 33: 30,
 30: 31}

In [6]:
# Translate each original label into its class id; every label value is a key
# of label_dict because the dict was built from this same column.
data['nlabel'] = data.label.map(label_dict)

data.nlabel

id
id_0         0
id_1         1
id_2         2
id_3         3
id_4         4
            ..
id_49996     3
id_49997    11
id_49998    15
id_49999    21
id_50000     8
Name: label, Length: 50001, dtype: int64

In [7]:
# Keep 80% of the rows for training and hold out the rest for validation;
# random_state pins the shuffle so the split is the same on every run.
train_data, val_data = train_test_split(data, train_size=0.8, random_state=42)
train_data

Unnamed: 0_level_0,fact,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id_30893,罪犯张都树，男，1994年xx月xx日出生，汉族，山西省浑源人，现在山西省太原第三监狱服刑，...,2
id_45279,告人池杰犯运输毒品罪，判处有期徒刑十一年，并处罚金人民币一万元（已缴纳）。刑期自2017年x...,3
id_16398,教所服刑，以被告人林凯利犯盗窃罪，判处有期徒刑八年，并处罚金人民币二万。服刑期间，发现漏罪。...,0
id_13653,二年和并处罚金人民币五万元不变；于2019年xx月xx日作出（2019）桂02刑更845号刑...,4
id_13748,x日投送扎兰屯监狱服刑。执行过程中，刑期无变动。截止2018年xx月xx日，剩余刑期十个月十...,3
...,...,...
id_11284,伤害罪，判处有期徒刑三年六个月。该判决已发生法律效力，罪犯李伟于2017年xx月xx日入监服...,6
id_44732,建议书报送本院审理。本院依法组成合议庭进行了审理，现已审理终结。 执行机关提出，罪犯欧九固在...,9
id_38158,罪犯姜群，男，1971年xx月xx日出生，汉族，出生地辽宁省沈阳市，初中文化，现在辽宁省沈阳...,9
id_860,1年xx月xx日起至2021年xx月xx日止。判决发生法律效力后于2012年xx月xx日交付...,0


In [8]:
# Encode the text of both splits with identical tokenizer settings:
# truncate to at most 512 tokens and pad the sequences of each call to a common length.
encode_options = {"truncation": True, "padding": True, "max_length": 512}
train_encodings = tokenizer(train_data.fact.tolist(), **encode_options)
valid_encodings = tokenizer(val_data.fact.tolist(), **encode_options)

In [9]:
# Make it torch friendly
class TorchData(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with optional labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per example, as returned by the tokenizer.
        labels: Optional per-example class ids (list, array or pandas Series).
            Omit when predicting.
        length: Optional explicit dataset size. When omitted, the size is
            taken from ``labels`` or, failing that, from ``encodings``.
    """

    def __init__(self, encodings, labels = None, length = None):
        self.encodings = encodings
        # Materialise labels as a plain list so that ``idx`` always means
        # "position". Indexing a pandas Series that has a non-integer index
        # with an int relies on a deprecated positional fallback.
        self.labels = None if labels is None else list(labels)
        self.length = length

    def __getitem__(self, idx):
        """Return the tensors (and integer label, if any) for example ``idx``."""
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        # If predicting then no label given
        if self.labels is not None:
            item["labels"] = int(self.labels[idx])

        return item

    def __len__(self):
        """Return the number of examples in the dataset."""
        if self.length is not None:
            return self.length

        if self.labels is not None:
            return len(self.labels)

        # Prediction without an explicit length: every encoding field holds
        # one entry per example, so any field gives the size.
        first_field = next(iter(self.encodings.values()), None)
        return 0 if first_field is None else len(first_field)

In [10]:
# convert our tokenized data into a torch Dataset
train_dataset = TorchData(train_encodings, train_data.nlabel)
valid_dataset = TorchData(valid_encodings, val_data.nlabel)

### Compute metrics

In [24]:
def exact_acc(labels, logits):
    """Return the fraction of positions where the prediction equals the label.

    Args:
        labels: Sequence of true values.
        logits: Sequence of predicted values, indexed in parallel with
            ``labels`` (despite the name, these are compared directly).

    Returns:
        Number of exact matches divided by ``len(labels)``.
    """
    hits = sum(1 for pos, truth in enumerate(labels) if truth == logits[pos])
    return hits / len(labels)

def v_standard(labels, logits):
    """Score each prediction by how close it is to its label on a log scale.

    The gap for one example is ``|log(prediction + 1) - log(label + 1)|``.
    A gap of at most 0.2 scores 1, and each further 0.2 of gap costs 0.2
    until gaps above 1.0 score 0.

    Args:
        labels: NumPy array of true values.
        logits: NumPy array of predicted values, same shape as ``labels``.

    Returns:
        A list with one score per example.
    """
    # (largest gap allowed, score awarded); the first band that fits wins.
    bands = ((0.2, 1), (0.4, 0.8), (0.6, 0.6), (0.8, 0.4), (1.0, 0.2))

    log_gaps = np.abs(np.log(logits + 1) - np.log(labels + 1))

    return [
        next((score for bound, score in bands if gap <= bound), 0)
        for gap in log_gaps
    ]

def final_score(labels, logits):
    """Combine the closeness scores and exact-match accuracy into one number.

    Args:
        labels: NumPy array of true values.
        logits: NumPy array of predicted values.

    Returns:
        ``0.7 * sum(v_standard scores) + 0.3 * exact_acc``.
    """
    # NOTE(review): the closeness term is a sum over examples while the
    # accuracy term is a fraction — confirm this matches the intended metric.
    closeness_total = np.sum(v_standard(labels, logits))
    return closeness_total * 0.7 + exact_acc(labels, logits) * 0.3

def compute_metrics_for_classification(eval_pred):
    """Compute the evaluation metrics reported to the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)`` where ``logits`` has one row of
            per-class scores per example and ``labels`` holds the class ids
            assigned through ``label_dict``.

    Returns:
        Dict with ``v`` (summed closeness score), ``ext_acc`` (exact-match
        accuracy) and ``score`` (``0.7 * v + 0.3 * ext_acc``).
    """
    logits, labels = eval_pred
    pred_ids = np.argmax(logits, axis=1).flatten()
    label_ids = labels.flatten().astype(np.int64)

    # Exact-match accuracy does not depend on how classes are numbered.
    ext_acc = exact_acc(label_ids, pred_ids)

    # The closeness score compares log(value + 1), which only makes sense on
    # the original label values. Class ids are assigned in an arbitrary order,
    # so translate ids back through label_dict before scoring. Ids without an
    # entry in label_dict fall back to themselves.
    id_to_label = {class_id: raw_label for raw_label, class_id in label_dict.items()}
    n_classes = max(logits.shape[1], len(id_to_label))
    decode = np.array([id_to_label.get(class_id, class_id) for class_id in range(n_classes)])

    v = v_standard(decode[label_ids], decode[pred_ids])

    # NOTE(review): `v` is summed rather than averaged, so `score` grows with
    # the size of the evaluation set — confirm against the intended metric.
    score = np.sum(v) * 0.7 + ext_acc * 0.3 # final_score(labels, logits)

    return {"v": np.sum(v), "ext_acc": ext_acc, "score": score}

### Training

In [25]:
# Upper bound on training epochs (passed to TrainingArguments as num_train_epochs);
# the EarlyStoppingCallback attached to the Trainer can end training sooner.
num_epochs = 150

In [26]:
# Specify the arguments for the trainer. The model is evaluated and a checkpoint
# is saved once per epoch; with load_best_model_at_end the checkpoint that did
# best on the "score" metric is restored when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=num_epochs,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,  # Can't be too big
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # By setting this, we only save best and last model
    load_best_model_at_end=True,
    metric_for_best_model="score",
)

# Early stopping with a patience of 10 (presumably counted in evaluations,
# i.e. epochs with the settings above).
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

# Call the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics_for_classification,
    callbacks=[early_stopping],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Train the model: runs fine-tuning with the datasets, metric function and
# early-stopping callback that were attached to `trainer` when it was built.
trainer.train()

In [None]:
# Write the trainer's current model to results/best. With load_best_model_at_end
# enabled this is presumably the best checkpoint rather than the last one.
trainer.save_model("results/best")

### Get Result for Submission

In [None]:
# Reload the weights saved under results/best into the in-memory model.
# map_location places the stored tensors on the device chosen at the top of
# the notebook, so a checkpoint written on a GPU also loads on a CPU-only machine.
# NOTE(review): newer transformers releases may write model.safetensors instead
# of pytorch_model.bin — confirm the file name for the installed version.
best_model_path = 'results/best/pytorch_model.bin'
checkpoint = torch.load(best_model_path, map_location=device)
model.load_state_dict(checkpoint)

In [30]:
# Load the test split and shorten each fact with the same tail truncation
# that was applied to the training facts.
test_set = pd.read_csv('data/testA.csv')
test_set['fact'] = test_set.fact.map(tail_truncate)

In [32]:
# Tokenize the test facts with the same settings used for the training data.
encodings = tokenizer(test_set.fact.tolist(), truncation=True, padding=True, max_length=512)

# There are no labels at prediction time, so the dataset size is passed
# explicitly — taken from the test frame instead of a hard-coded row count.
test_dataset = TorchData(encodings, length=len(test_set))

In [33]:
# Predict on the test data; the results are floating-point values.
# [0] takes the first element of predict()'s output — presumably the per-class
# scores for each example (despite the variable name, not yet class labels).
pred_labels = trainer.predict(test_dataset)[0]

***** Running Prediction *****
  Num examples = 25001
  Batch size = 128


In [41]:
# Pick the highest-scoring class id for every example
# (assumes pred_labels is a 2-D array with one row per example).
flatten_pred_labels = list(np.argmax(pred_labels, axis=1))

In [43]:
# The submission template supplies the ids (and their order) to write out.
submit = pd.read_csv('data/submission.csv')

# Class id -> original label value, to undo the remapping done for training.
label_dict_inverse = {class_id: raw_label for raw_label, class_id in label_dict.items()}

# NOTE(review): the position of a prediction is derived from the numeric part
# of the id minus 50000 — confirm this offset matches the row order of the
# test file.
id_label_list = [
    [sample_id, label_dict_inverse[flatten_pred_labels[int(sample_id.replace("id_", '')) - 50000]]]
    for sample_id in submit['id']
]

df = pd.DataFrame(data=id_label_list, columns=['id','label'])
df.to_csv('submission.csv', index=False)