In [1]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertConfig, BertForSequenceClassification, BertPreTrainedModel, BertModel
import json
from torch.utils.data import TensorDataset, random_split, Subset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import TrainingArguments, Trainer
from torch import nn
import torch.nn.functional as F
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, precision_recall_fscore_support
import pickle

In [2]:
# Device every tensor and the model are placed on: the GPU when PyTorch can
# see a CUDA device, the CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Load the rows whose 'label' field is used as ground truth further down.
# The file is opened in a `with` block so the handle is closed as soon as the
# JSON has been parsed.
# Alternative input (training file) used in earlier runs:
#test = json.load(open('D:\\download\\比賽\\CHEF-sbert\\Pipeline\\Data\\CHEF_train_hfl_pretraineds_0511sentBase_document_article_epoch50_0519_BM25F_V3.json', 'r', encoding='utf-8'))
test_json_path = 'D:\\download\\比賽\\CHEF-sbert\\Pipeline\\Data\\CHEF_test_hfl_pretraineds_0511sentBase_document_article_epoch50_0519_BM25F_V3.json'
with open(test_json_path, 'r', encoding='utf-8') as test_json_file:
    test = json.load(test_json_file)

# Earlier runs evaluated only a slice of the loaded rows:
#test = test[3154:3942]
#test = test[9297:11620]

# One gold label per row, in file order.
labels_ = [row['label'] for row in test]

In [4]:
# Load the fine-tuned weights and build a 3-class BERT classifier from them.
# The checkpoint is mapped straight onto `device` (instead of a hard-coded
# CUDA device) so the load target always matches where the model is moved.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model_state_dict = torch.load(
    'D:\\predict_hfl_pretraineds_0511sentBase_document_article_epoch50_0519_BM25F_V3_9297_epoch__11\\pytorch_model.bin',
    map_location=device,
)
# The directory passed here supplies the model configuration; the weights
# themselves come from `model_state_dict`.
model = BertForSequenceClassification.from_pretrained(
    'D:\\hfl_pretraineds_0511sentBase_document_article_epoch50_0519',
    state_dict=model_state_dict,
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [5]:
# Tokenizer read from the same checkpoint directory that the model
# configuration is loaded from.
# Alternatives tried previously:
#tokenizer = AutoTokenizer.from_pretrained('D:\\hfl')
#tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
tokenizer_dir = 'D:\\hfl_pretraineds_0511sentBase_document_article_epoch50_0519'
tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

In [8]:
# Load the texts that are fed to the classifier. The file is opened in a
# `with` block so the handle is closed once the JSON has been parsed.
sentence_json_path = 'D:\\download\\比賽\\CHEF-sbert\\Pipeline\\Data\\claim_cossim_hfl_pretraineds_0511sentBase_document_article_epoch50_0519_BM25F_V3.json'
with open(sentence_json_path, 'r', encoding='utf-8') as sentence_json_file:
    sentence = json.load(sentence_json_file)

# Keep only the entries from index 11620 onward.
# NOTE(review): presumably the file holds several splits back to back and the
# rows matching `test` start at 11620 — confirm against how the file was built.
# Slices used in earlier runs:
#sentence = sentence[3154:3942]
#sentence = sentence[9297:11620]
sentence = sentence[11620:]

In [9]:
# Tokenize every entry of `sentence` and pair it, by position, with the gold
# label at the same index in `labels_`.
input_ids = []
attention_masks = []
labels = []

# The pairing is purely positional, so there must be at least as many labels
# as sentences. Fail up front with a clear message instead of an IndexError
# part-way through tokenization.
if len(labels_) < len(sentence):
    raise ValueError(
        'labels_ has {} entries but sentence has {}; cannot pair them by index'.format(
            len(labels_), len(sentence)
        )
    )

# zip stops after the last sentence, so surplus labels are ignored.
for text, gold_label in zip(sentence, labels_):
    encoded_dict = tokenizer.encode_plus(
        text,  # Sentence to encode.
        # Special tokens ('[CLS]' / '[SEP]') are NOT added here.
        # NOTE(review): the classifier was presumably fine-tuned on inputs
        # built the same way; confirm against the training script before
        # changing this flag.
        add_special_tokens=False,
        max_length=512,  # Pad & truncate every sentence to 512 tokens.
        padding='max_length',
        return_attention_mask=True,  # Construct attention masks.
        return_tensors='pt',  # Return PyTorch tensors.
        truncation=True
    )
    # Token ids of the encoded sentence.
    input_ids.append(encoded_dict['input_ids'])
    # Attention mask (differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
    # To score without gold labels, a constant was appended here instead:
    #labels.append(2)
    labels.append(gold_label)

In [10]:
# Stack the per-sentence tensors into single batch tensors and wrap them,
# together with the labels, in a TensorDataset.
# All three tensors go to `device` (the labels previously used a hard-coded
# 'cuda' string) so they are always on the same device.
# Note: this cell rebinds `input_ids`, `attention_masks` and `labels` from
# lists to tensors, so re-run the tokenization cell before re-running it.
input_ids = torch.cat(input_ids, dim=0).to(device)
attention_masks = torch.cat(attention_masks, dim=0).to(device)
labels = torch.tensor(labels, device=device)
test_dataset = TensorDataset(input_ids, attention_masks, labels)

In [11]:
# Iterate over the dataset in its stored order, 8 examples per batch, so the
# i-th prediction lines up with the i-th dataset entry.
eval_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=eval_sampler, batch_size=8)

In [12]:
import time
import numpy as np
import os
from collections import Counter

import datetime
# Request synchronous CUDA kernel launches (a debugging aid).
# NOTE(review): this is set after an earlier cell has already moved the model
# to the GPU; the variable is presumably only read when CUDA initializes, so
# it may have no effect at this point — confirm, or set it before importing torch.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [13]:
def format_time(elapsed):
    """Render a duration in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds (int or float). It is rounded to the
            nearest whole second before formatting.

    Returns:
        The ``str()`` form of the matching ``datetime.timedelta``,
        e.g. ``'0:09:26'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [14]:
# Evaluate the classifier on `test_dataloader` and report micro/macro scores.
#
# Produces (module-level): all_prediction, all_ground_truth, all_logits,
# avg_val_loss, microf1, and the class-count Counter `c`.

# Bookkeeping names carried over from a training loop. Nothing in this cell
# reads them; they are kept so any other cell that expects them still runs.
best_microf1 = 0
best_macrof1 = 0
best_recall = 0
best_precision = 0
best_prediction = None
best_ground_truth = None
total_eval_accuracy = 0
nb_eval_steps = 0

t0 = time.time()
# Put the model in evaluation mode.
model.eval()
print("Running Validation...")

total_eval_loss = 0
# Per-batch results are collected in lists and concatenated once after the
# loop; concatenating onto a growing array every batch re-copies everything
# accumulated so far.
batch_predictions = []
batch_ground_truth = []
batch_logits = []
for batch in test_dataloader:
    # Unpack this evaluation batch: (input ids, attention mask, labels).
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels
        )
        # Because labels are passed, the first output is the loss and the
        # second the logits.
        loss, logits = outputs[0], outputs[1]
    # Accumulate the validation loss.
    total_eval_loss += loss.sum().item()
    # Move logits and labels to CPU.
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Predicted class = index of the largest logit in each row.
    pred_flat = np.argmax(logits, axis=1).flatten()
    labels_flat = label_ids.flatten()
    batch_predictions.append(pred_flat)
    batch_ground_truth.append(labels_flat)
    batch_logits.append(logits)

if not batch_logits:
    raise ValueError("test_dataloader yielded no batches; nothing to evaluate")

# The previous implementation grew these from an empty float array, which made
# them float64; that dtype is kept so saved predictions stay comparable with
# earlier pickles.
all_prediction = np.concatenate(batch_predictions, axis=None).astype(np.float64)
all_ground_truth = np.concatenate(batch_ground_truth, axis=None).astype(np.float64)
all_logits = np.concatenate(batch_logits, axis=0)

# Calculate the average loss over all of the batches.
avg_val_loss = total_eval_loss / len(test_dataloader)
# Measure how long the validation run took.
validation_time = format_time(time.time() - t0)
print('Validation Elapsed: {:}.'.format(validation_time))

# How many examples were assigned to each class.
c = Counter(int(pred) for pred in all_prediction)
print(c)

pre, recall, f1, _ = precision_recall_fscore_support(all_ground_truth, all_prediction, average='micro')
print("       F1 (micro): {:.2%}".format(f1))
microf1 = f1
pre, recall, f1, _ = precision_recall_fscore_support(all_ground_truth, all_prediction, average='macro')
print("Precision (macro): {:.2%}".format(pre))
print("   Recall (macro): {:.2%}".format(recall))
print("       F1 (macro): {:.2%}".format(f1))

Running Validation...
Validation Elapsed: 0:09:26.
Counter({0: 4512, 1: 2599, 2: 1927})
       F1 (micro): 21.32%
Precision (macro): 33.33%
   Recall (macro): 7.11%
       F1 (macro): 11.72%


  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
# Persist the predicted class ids from the evaluation cell so they can be
# reloaded and compared later.
prediction_dump_path = 'D:\\download\\比賽\\all_prediction_predict_hfl_pretraineds_0511sentBase_document_article_epoch50_0519_BM25F_V3_9297.pickle'
with open(prediction_dump_path, 'wb') as dump_file:
    pickle.dump(all_prediction, dump_file)

In [15]:
import pickle
# Reload a previously pickled array into `label`.
# NOTE(review): judging by the file name these are presumably saved
# predictions rather than gold labels — confirm.
# NOTE(review): pickle.load can execute code embedded in the file; only open
# pickles you produced yourself.
saved_prediction_path = 'D:\\download\\比賽\\CHEF-sbert\\Pipeline\\Data\\all_prediction_hfl_pretraineds_0511sentBase_document_article_epoch50_0519_BM25F_V3_9297_epoch_11.pickle'
with open(saved_prediction_path, 'rb') as saved_file:
    label = pickle.load(saved_file)

In [16]:
# Class distribution of the values loaded into `label`
# (each value is cast to int before counting).
c_ = Counter(int(value) for value in label)
print(c_)

Counter({2: 1070, 0: 729, 1: 524})


In [17]:
# Number of positions at which the loaded values in `label` agree with the
# predictions from the evaluation cell.
# Note: this rebinds `c`, which the evaluation cell used for its Counter.
c = sum(1 for idx in range(len(label)) if label[idx] == all_prediction[idx])
c

701

In [18]:
# Fraction of positions where the values loaded into `label` equal the gold
# labels in `labels`.
# The comparison runs over len(labels) positions, so the match count is
# divided by that same number (it was previously divided by len(label), which
# gives a wrong rate whenever the two lengths differ).
# `labels` may live on the GPU; bring it to a NumPy array once so the
# comparison is a plain vectorized one.
gold_for_loaded = labels.detach().cpu().numpy() if torch.is_tensor(labels) else np.asarray(labels)
loaded_values = np.asarray(label)
if len(loaded_values) < len(gold_for_loaded):
    raise ValueError(
        'label has {} entries but labels has {}; cannot compare by index'.format(
            len(loaded_values), len(gold_for_loaded)
        )
    )
from_py = int(np.count_nonzero(loaded_values[:len(gold_for_loaded)] == gold_for_loaded))
from_py / len(gold_for_loaded)

In [None]:
# Fraction of positions where this notebook's predictions (`all_prediction`)
# equal the gold labels in `labels`.
# The comparison runs over len(labels) positions, so the match count is
# divided by that same number (it was previously divided by len(label), an
# unrelated array).
# `labels` may live on the GPU; bring it to a NumPy array once so the
# comparison is a plain vectorized one.
gold_for_model = labels.detach().cpu().numpy() if torch.is_tensor(labels) else np.asarray(labels)
model_values = np.asarray(all_prediction)
if len(model_values) < len(gold_for_model):
    raise ValueError(
        'all_prediction has {} entries but labels has {}; cannot compare by index'.format(
            len(model_values), len(gold_for_model)
        )
    )
from_model = int(np.count_nonzero(model_values[:len(gold_for_model)] == gold_for_model))
from_model / len(gold_for_model)

epoch 1 : Counter({1: 663, 0: 124, 2: 1})
epoch 4 : Counter({0: 409, 1: 305, 2: 74})
        F1 (micro): 54.95%
        Precision (macro): 54.01%
        Recall (macro): 49.96%
        F1 (macro): 49.79%
        F1 (micro_read): 40.23%

epoch 10 : Counter({0: 334, 1: 332, 2: 122})
           F1 (micro): 62.18%
           Precision (macro): 60.65%
           Recall (macro): 58.80%
           F1 (macro): 59.32%

In [13]:
# Display the predicted class ids produced by the evaluation cell.
# NOTE(review): this prints the whole array; consider all_prediction[:20] for a shorter preview.
all_prediction

array([1., 0., 1., 1., 2., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0.,
       1., 1., 1., 2., 0., 2., 1., 0., 1., 0., 0., 1., 1., 1., 1., 0., 0.,
       0., 2., 0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 2., 0., 1., 1., 0.,
       2., 2., 2., 0., 0., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1.,
       1., 2., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0.,
       0., 2., 1., 2., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.,
       0., 1., 2., 0., 0., 0., 1., 2., 2., 0., 0., 1., 1., 0., 2., 0., 1.,
       1., 1., 2., 1., 2., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1.,
       1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
       0., 1., 1., 1., 2., 1., 1., 0., 1., 1., 1., 0., 0., 2., 1., 1., 0.,
       0., 0., 0., 1., 1., 0., 0., 0., 2., 2., 2., 0., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 2., 1., 2., 1., 0., 2., 0., 1., 1., 0., 0., 1., 0.,
       1., 0., 0., 0., 1., 0., 2., 1., 0., 0., 1., 0., 0., 0., 0., 0., 2.,
       0., 0., 1., 0., 1.

epoch 0 : Counter({1: 705, 0: 82, 2: 1})
          F1 (micro): 41.88%
          Precision (macro): 63.98%
          Recall (macro): 35.37%
          F1 (macro): 26.34%
          F1 (micro_read): 40%
epoch 1 : Counter({0: 425, 1: 339, 2: 24})
          F1 (micro): 55.71%
          Precision (macro): 59.14%
          Recall (macro): 48.51%
          F1 (macro): 46.18%
          F1 (micro_read): 40.74%
          
epoch 2 : Counter({0: 366, 1: 317, 2: 105})
          F1 (micro): 60.91%
          Precision (macro): 59.61%
          Recall (macro): 56.88%
          F1 (macro): 57.35%
          F1 (micro_read): %

In [20]:
# Number of entries loaded from the pickle into `label`.
len(label)

2323

### Counter({1: 705, 0: 82, 2: 1})
       F1 (micro): 41.88%
Precision (macro): 63.98%
   Recall (macro): 35.37%
       F1 (macro): 26.34%
            
       
Counter({0: 425, 1: 339, 2: 24})
       F1 (micro): 55.71%
Precision (macro): 59.14%
   Recall (macro): 48.51%
       F1 (macro): 46.18%