In [None]:
import torch
import torch.nn as nn
import numpy as np
from GPT2 import GPT2Model, GPT2Tokenizer

import os
# Expose only GPU 0 to this process.
# NOTE(review): this is set after `import torch`; it presumably still takes
# effect because CUDA is initialised lazily - confirm that the GPT2 package
# does not touch CUDA at import time.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Fall back to the CPU when no CUDA device is usable, so the notebook still
# runs (slowly) on machines without a GPU instead of failing at `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Files handed positionally to the tokenizer: the BPE vocab JSON first, then
# the Chinese vocab model file.
_tokenizer_files = ('GPT2/bpe/vocab.json', 'GPT2/bpe/chinese_vocab.model')
tokenizer = GPT2Tokenizer(*_tokenizer_files, max_len=512)

class GPT2classification(nn.Module):
    """GPT-2 backbone followed by a small MLP classification head.

    The backbone output at the last sequence position is passed through a
    three-layer MLP that yields one score per class.

    Args:
        vocab_size: Vocabulary size given to ``GPT2Model``. It is also used as
            the input width of the head, because the head consumes the
            backbone's last-position output (presumably vocabulary-sized
            logits - confirm against ``GPT2Model``).
        num_classes: Number of scores produced per example.

    The defaults (30000, 2) reproduce the original hard-coded architecture.
    The attribute names ``GPT2model`` and ``mlp`` are kept unchanged so that
    state dicts saved from the original class still load.
    """

    def __init__(self, vocab_size=30000, num_classes=2):
        super().__init__()

        self.GPT2model = GPT2Model(
            vocab_size=vocab_size,
            layer_size=12,
            block_size=1024,
            embedding_dropout=0.0,
            embedding_size=768,
            num_attention_heads=12,
            attention_dropout=0.0,
            residual_dropout=0.0)

        # The head's input width follows vocab_size so the two can never drift
        # apart (they were previously two independent literals).
        self.mlp = nn.Sequential(
            nn.Linear(vocab_size, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return class scores computed from the last sequence position.

        Args:
            x: Input passed unchanged to the backbone (assumed to be a
                ``(batch, seq)`` tensor of token ids - TODO confirm).

        Returns:
            The MLP output for the backbone's ``[:, -1]`` slice, i.e. one row
            of ``num_classes`` scores per batch element.
        """
        x = self.GPT2model(x)
        # Only the final position is classified.
        x = x[:, -1]
        x = self.mlp(x)
        return x

In [None]:
import json
from GPT2.samplers import RandomSampler
from torch.utils.data import TensorDataset
from tqdm import tqdm

def load_tnews_data(data_path, data_type, tokenizer, few_shot=False, seq_length=1024):
    """Load a JSON-lines sentiment file into a ``TensorDataset``.

    Reads ``<data_path>/<data_type>.json`` (one JSON object per line), encodes
    each ``'sentence'`` with ``tokenizer`` and left-pads it to ``seq_length``.

    Args:
        data_path: Directory containing the data file.
        data_type: File stem; ``'.json'`` is appended to it.
        tokenizer: Object providing ``encode(text)`` and an ``encoder`` mapping
            that contains the ``'<pad>'`` token.
        few_shot: Accepted for interface compatibility; not used here.
        seq_length: Length every token row is padded to. Sentences are
            truncated to ``seq_length - 20`` tokens first (the 20-token margin
            is inherited from the original code; its purpose is not evident
            from this function).

    Returns:
        ``TensorDataset(tokens, token_counts, labels)``, all ``torch.long``.
        ``tokens`` holds the left-padded ids, ``token_counts`` the number of
        real (non-pad) tokens per row, and ``labels`` is 1 where
        ``label_desc == 'positive'`` and 0 otherwise.
    """
    filename = os.path.join(data_path, data_type + '.json')
    objs = []
    with open(filename, 'r', encoding='utf-8') as fin:
        for line in fin:
            line = line.strip()
            # Skip blank lines (e.g. a trailing empty line), which would
            # otherwise make json.loads raise.
            if not line:
                continue
            objs.append(json.loads(line))

    pad_id = tokenizer.encoder['<pad>']
    max_tokens = seq_length - 20

    all_tokens = []
    all_last_idx = []
    all_labels = []

    for obj in tqdm(objs):
        tokens = list(tokenizer.encode(obj['sentence'])[:max_tokens])
        token_length = len(tokens)

        # Pad on the left so the final position of every row is the last real token.
        all_tokens.append([pad_id] * (seq_length - token_length) + tokens)
        all_last_idx.append(token_length)
        # Every label_desc other than 'positive' is mapped to class 0.
        all_labels.append(1 if obj['label_desc'] == 'positive' else 0)

    all_tokens = torch.tensor(all_tokens, dtype=torch.long)
    all_last_idx = torch.tensor(all_last_idx, dtype=torch.long)
    all_labels = torch.tensor(all_labels, dtype=torch.long)

    return TensorDataset(all_tokens, all_last_idx, all_labels)
  
def collect_fcn(batch):
    """Collate dataset samples into three stacked tensors.

    Args:
        batch: Sequence of samples; elements 0, 1 and 2 of each sample are the
            token tensor, the index/length tensor and the label tensor.

    Returns:
        Tuple ``(tokens, idx, labels)``, each made by ``torch.stack`` over the
        corresponding element of every sample.
    """
    token_rows = [sample[0] for sample in batch]
    idx_rows = [sample[1] for sample in batch]
    label_rows = [sample[2] for sample in batch]

    return torch.stack(token_rows), torch.stack(idx_rows), torch.stack(label_rows)

In [None]:
model = GPT2classification()

# NOTE(review): torch.load unpickles the file and can execute arbitrary code;
# only load checkpoints from a trusted source. On PyTorch releases where
# torch.load defaults to weights_only=True, loading a fully pickled model
# additionally needs weights_only=False.
# map_location='cpu' keeps the temporary checkpoint copy off the GPU, so a
# GPU-saved file also loads on a CPU-only machine and no second copy of the
# model occupies GPU memory; the weights are moved by model.to(device) below.
saved = torch.load("../models/financial_sentiment_7.pth", map_location='cpu')
# Accept a pickled module (the original format) as well as a plain state dict.
state_dict = saved.state_dict() if isinstance(saved, nn.Module) else saved
model.load_state_dict(state_dict)
model.eval()
model.to(device)

print('loaded')

In [None]:
# Tokenise and pad the evaluation split.
test_set = load_tnews_data(
    data_path='../dataset/Finbert_processed',
    data_type='test_fincpm',
    tokenizer=tokenizer,
)

# Examples are drawn through the project's RandomSampler, one per batch.
sampler = RandomSampler(test_set)
test_dataloader = torch.utils.data.DataLoader(
    test_set,
    batch_size=1,
    sampler=sampler,
    num_workers=0,
    collate_fn=collect_fcn,
    pin_memory=True,
)

In [None]:
# Per-class totals and per-class correct counts.
num_positive = 0
num_negative = 0

num_positive_true = 0
num_negative_true = 0

# Raw model outputs, grouped by the true label of the example.
list_positive = []
list_negative = []

# Evaluation needs no gradients; disabling autograd avoids building a graph
# for every forward pass, which saves memory and time.
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        token, last_idx, label = (x.to(device) for x in batch)

        output = model(token)

        # Only element 0 of each batch is scored, i.e. this loop assumes batch_size == 1.
        # A tie between the two scores counts as correct for either label.
        if label[0] == 1:
            if output[0, 1] >= output[0, 0]:
                num_positive_true += 1
            num_positive += 1
            list_positive.append(output.cpu().numpy())
        else:
            if output[0, 0] >= output[0, 1]:
                num_negative_true += 1
            num_negative += 1
            list_negative.append(output.cpu().numpy())


In [None]:
# Per-class accuracy: correctly classified / total, positive class first and
# negative class second.
for hits, total in ((num_positive_true, num_positive),
                    (num_negative_true, num_negative)):
    print(hits / total)

In [None]:
# Absolute number of positive-labelled test examples that were classified as positive.
print(num_positive_true)

In [None]:
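# TODO: replace the data figure with a table (ask whether that would be too
#       simplistic); retrain a version that uses -1 directly (presumably the
#       last-position index, as in `x[:, -1]` - confirm).
# NOTE(review): the two values below are presumably the positive / negative
#               accuracies from an earlier run - confirm.
# 0.9858585858585859
# 0.9047619047619048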