In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tqdm import tqdm
from transformers import BertTokenizerFast, AutoTokenizer, BertModel, AutoModel

device = "cuda" if torch.cuda.is_available() else "cpu"

def tokenize_data(file_path, mode):
  def read_data(file_path):
    with open(file_path, 'r', encoding="utf-8") as f:
        data_in = json.load(f)
    if mode == 'test':
        datas = [{'id': x[0],'text1': x[1], 'text2': x[2]} for x in data_in]
    else:
        datas = [{'id': x[0],'text1': x[1], 'text2': x[2], 'relation': x[3]} for x in data_in]
    return datas

  datas = read_data(file_path)
  #tokenizer = BertTokenizerFast.from_pretrained("ckiplab/albert-base-chinese")
  #tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
  #tokenizer = AutoTokenizer.from_pretrained("luhua/chinese_pretrain_mrc_roberta_wwm_ext_large")
  tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-electra-180g-large-discriminator")
  
  text1_tokenized = tokenizer([data["text1"] for data in datas], add_special_tokens=False, truncation=True)
  text2_tokenized = tokenizer([data["text2"] for data in datas], add_special_tokens=False, truncation=True)

  return datas, text1_tokenized, text2_tokenized


test_datas, test_text1_tokenized, test_text2_tokenized = tokenize_data('/kaggle/input/nlp-final/Final Project Task 1/team_test.json', mode = 'test')
print(len(test_datas))

In [2]:
class text_Dataset(Dataset):
    def __init__(self, datas, tokenized_text1, tokenized_text2,mode):
        self.datas = datas
        self.tokenized_text1 = tokenized_text1
        self.tokenized_text2 = tokenized_text2
        self.max_len = 254
        self.mode = mode

        # Input sequence length = [CLS] + text1 + [SEP] + text2 + [SEP]
        self.max_seq_len = 1 + self.max_len + 1 + self.max_len + 1

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, idx):
        data = self.datas[idx]
        ID = data['id']
        tokenized_text1 = self.tokenized_text1[idx]
        tokenized_text2 = self.tokenized_text2[idx]

        # add special tokens (101: CLS, 102: SEP)
        input_ids_text1 = [101] + tokenized_text1.ids[:self.max_len] + [102]
        input_ids_text2 = tokenized_text2.ids[:self.max_len] + [102]

        # Pad sequence and obtain inputs to model
        input_ids, token_type_ids, attention_mask = self.padding(input_ids_text1, input_ids_text2)
        if self.mode != 'test':
            label = data['relation']
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), label
        else:
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), ID

    def padding(self, input_ids_text1, input_ids_text2):
        # Pad zeros if sequence length is shorter than max_seq_len
        padding_len = self.max_seq_len - len(input_ids_text1) - len(input_ids_text2)
        # Indices of input sequence tokens in the vocabulary
        input_ids = input_ids_text1 + input_ids_text2 + [0] * padding_len
        # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
        token_type_ids = [0] * len(input_ids_text1) + [1] * len(input_ids_text2) + [0] * padding_len
        # Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
        attention_mask = [1] * (len(input_ids_text1) + len(input_ids_text2)) + [0] * padding_len

        return input_ids, token_type_ids, attention_mask


In [3]:
class BERTClassifier(nn.Module):
    def __init__(self, num_classes):
        super(BERTClassifier, self).__init__()
        #self.bert = BertModel.from_pretrained("ckiplab/albert-base-chinese")
        #self.bert = BertModel.from_pretrained("bert-base-chinese")
        #self.bert = BertModel.from_pretrained("luhua/chinese_pretrain_mrc_roberta_wwm_ext_large")
        self.bert = AutoModel.from_pretrained("hfl/chinese-electra-180g-large-discriminator")
        self.d_dim = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.25)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, token_type_ids, attention_mask):
        output = self.bert(input_ids=input_ids, token_type_ids = token_type_ids, attention_mask=attention_mask)
        #For ALBERT, BERT, RoBERTa
            #x = output.pooler_output
        #For ELECTRA
        x = output.last_hidden_state[:,0,:]#cls last_hidden
        x = self.dropout(x)
        x = self.fc(x)
        return x

In [4]:
test_set = text_Dataset(test_datas, test_text1_tokenized, test_text2_tokenized, mode = 'test')
test_loader = DataLoader(test_set, batch_size=1, shuffle=False, num_workers=0, pin_memory=True)
print(len(test_loader))

815


In [9]:
import numpy as np
model_best = BERTClassifier(num_classes = 3).to(device)
model_best.load_state_dict(torch.load("ver_best.ckpt"))
model_best.eval()
prediction = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        data = [i.to(device) for i in batch]
        #print(data)
        input_ids, token_type_ids, attention_mask, ID = data
        test_pred = model_best(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        test_label = np.argmax(test_pred.cpu().data.numpy(), axis=1)
        prediction += test_label.tolist()
        

100%|██████████| 815/815 [00:53<00:00, 15.15it/s]


In [10]:
print(prediction)

[1, 2, 1, 0, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 0, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 0, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 0, 1, 1, 0, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, 0, 2, 1, 1, 1, 2, 1, 2, 0, 1, 1, 1, 2, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 0, 2, 2, 2, 0, 1, 2, 0, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 0, 0, 1, 1, 1, 0, 0, 2, 1, 1, 0, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 0, 2, 2, 2, 2, 0, 1, 1, 0, 1, 0, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 0, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 0, 1, 0, 2, 0, 1, 0, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 0, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 0, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 0, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 0, 0, 2, 2, 1, 0, 2, 1, 1, 1, 1, 1, 1, 

In [8]:
import pandas as pd
#create test csv
df = pd.DataFrame()
df["ID"] = [i+1 for i in range(len(test_set))]
df["y_pred"] = prediction
df.to_csv("ELECTRA_v1.csv",index = False)