In [21]:
import pandas as pd
from datasets import Dataset

data = pd.read_csv("Data/dataset.csv", header=None)
data.columns = ["input1", "input2", "input3", "label"]

def preprocess(example):
    input1 = example['input1'] if example['input1'] else ""
    input2 = example['input2'] if example['input2'] else ""
    
    return {
        "input": input1 + input2
    }

dataset = Dataset.from_pandas(data)
dataset = dataset.map(preprocess)
dataset = dataset.remove_columns(column_names=["input1", "input2", "input3"])

dataset = dataset.train_test_split(test_size=0.15)
train_set, test_set = dataset['train'], dataset['test']

train_set, test_set

  0%|          | 0/2000 [00:00<?, ?ex/s]

(Dataset({
     features: ['label', 'input'],
     num_rows: 1700
 }),
 Dataset({
     features: ['label', 'input'],
     num_rows: 300
 }))

In [10]:
from transformers import GPT2Tokenizer, GPT2Model, XLNetTokenizer, XLNetModel

gpt2_tokenizer = GPT2Tokenizer.from_pretrained("/Data/model/gpt2")
xlnet_tokenizer = XLNetTokenizer.from_pretrained("/Data/model/xlnet-base-cased")

gpt2 = GPT2Model.from_pretrained("/Data/model/gpt2")
xlnet = XLNetModel.from_pretrained("/Data/model/xlnet-base-cased")

Some weights of the model checkpoint at /datas/zyq/personal/D6791/xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
import torch
import torch.nn as nn
from torchtext.vocab import GloVe, Vectors
import fasttext
import os

cbow = fasttext.load_model('/Data/model/cbow/cc.en.300.bin')
skip_gram = fasttext.load_model("/Data/model/skipgram/wiki.en.bin")
glove = GloVe(name='6B', dim=300)



In [12]:
class CustomModel(nn.Module):
    def __init__(self, num_classes: int):
        super(CustomModel, self).__init__()
        self.gpt_embedding = gpt2.wte.half()
        self.xlnet_embedding = xlnet.word_embedding.half()

        # glove = GloVe(name='6B', dim=300)
        self.glove_embedding = nn.Embedding.from_pretrained(glove.vectors, freeze=False).half()
        
        self.cbow_embedding = nn.Embedding.from_pretrained(torch.from_numpy(cbow.get_input_matrix()), freeze=False).half()

        self.skipgram_embedding = nn.Embedding.from_pretrained(torch.from_numpy(skip_gram.get_input_matrix()), freeze=False).half()

        self.linear = nn.Linear(
            self.xlnet_embedding.embedding_dim + self.gpt_embedding.embedding_dim + self.glove_embedding.embedding_dim + self.cbow_embedding.embedding_dim + self.skipgram_embedding.embedding_dim, 
            num_classes
        ).half()

    def forward(self, x_gpt, x_xlnet, x_glove, x_cbow):
        gpt_emb = self.gpt_embedding(x_gpt)
        xlnet_emb = self.xlnet_embedding(x_xlnet)
        glove_emb = self.glove_embedding(x_glove)
        cbow_emb = self.cbow_embedding(x_cbow)
        skipgram_emb = self.skipgram_embedding(x_cbow)

        gpt_emb_mean = torch.mean(gpt_emb, dim=1)
        xlnet_emb_mean = torch.mean(xlnet_emb, dim=1)
        glove_emb_mean = torch.mean(glove_emb, dim = 1)
        cbow_emb_mean = torch.mean(cbow_emb, dim = 1)
        skipgram_emb_mean = torch.mean(skipgram_emb, dim = 1)

        merge_emb = torch.concat([gpt_emb_mean, xlnet_emb_mean, glove_emb_mean, cbow_emb_mean, skipgram_emb_mean], dim=1)
        
        output = self.linear(merge_emb)
        
        # output = nn.Softmax(output)

        return output
    
model = CustomModel(3)

In [22]:
from torchtext.data import get_tokenizer
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def tokenizer(input, stoi):
    output = []
    for data in input:
        if data in stoi.keys():
            output.append(stoi[data])
        else:
            output.append(0)
            
    return torch.tensor(output)

def cbow_tokenize(ft, input):
    input = fasttext.tokenize(input)
    output = []
    for data in input:
        output.append(
            ft.get_word_id(data)
        )
    output = [0 if d == -1 else d for d in output]
    return torch.tensor(output)

def collate_fn(example):
    input = [data['input'] for data in example]
    label = [data['label'] for data in example]

    encode1 = gpt2_tokenizer(input, return_tensors='pt', padding=True)
    
    encode2 = xlnet_tokenizer(input, return_tensors='pt', padding=True)

    encode3 = [tokenizer(glove_tokenizer(data), glove.stoi) for data in input]
    encode3 = pad_sequence(encode3, batch_first=True)

    encode4 = [cbow_tokenize(cbow, data) for data in input]
    encode4 = pad_sequence(encode4, batch_first=True)
    
    return encode1, encode2, encode3, encode4, torch.LongTensor(label)


glove_tokenizer = get_tokenizer('basic_english')
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
glove_tokenizer = get_tokenizer('basic_english')

# ft

train_loader = DataLoader(train_set, batch_size=2, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(test_set, batch_size=2, shuffle=False, collate_fn=collate_fn, drop_last=False)

In [53]:
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score

class AvgMetric:
    def __init__(self):
        self.total = 0.0
        self.count = 0
    
    def update(self, value):
        self.total += value
        self.count += 1
    
    def reset(self):
        self.total = 0.0
        self.count = 0
    
    def compute(self):
        if self.count == 0:
            return None
        return self.total / self.count
    
roc = AvgMetric()
f1 = AvgMetric()
recall = AvgMetric()
precision = AvgMetric()
loss_train = AvgMetric()
loss_test = AvgMetric()

In [118]:
def softmax(x):
    exps = np.exp(x) 
    return exps / np.sum(exps, axis=0)

from transformers import AdamW
from tqdm.auto import tqdm
import numpy as np
import warnings

warnings.filterwarnings("ignore")

optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

device = "cuda:5"
model.to(device)

epochs = 10
count = 0
pbar = tqdm(total = epochs * len(train_loader))
for epoch in range(epochs):
    pbar.set_description(f"epoch: [{epoch+1}/{epochs}]")
    optimizer.zero_grad()
    loss_train.reset()
    for data in train_loader:
        gpt_token, xlnet_token, glove_token, cbow_token, label = data
        output = model(gpt_token['input_ids'].to(device), xlnet_token['input_ids'].to(device), glove_token.to(device), cbow_token.to(device))
        loss = criterion(output, label.to(device))
        loss_train.update(loss.cpu())

        loss.backward()
        optimizer.step()
        pbar.update(1)
        pbar.set_postfix({"train loss": loss_train.compute()})
        count += 1

        if count == 100:
            break
    break

model.eval()
with torch.no_grad():
    loss_test.reset()
    y_true = []
    y_test = []
    y_score = []
    for data in valid_loader:
        gpt_token, xlnet_token, glove_token, cbow_token, label = data
        output = model(gpt_token['input_ids'].to(device), xlnet_token['input_ids'].to(device), glove_token.to(device), cbow_token.to(device))
        sm = nn.Softmax(dim=-1)
        output = sm(output).detach().cpu()
        output = np.array(output)
        y_test.extend(np.argmax(output, axis=1))
        y_score.extend(output.astype('float64'))
        y_true.extend(np.array(label))

    print(y_test)
    print(y_true)
    print(y_score)
    print(np.array(y_true).shape, np.array(y_score).shape)

    f1 = f1_score(y_true=y_true, y_pred=y_test, average='macro')
    recall = recall_score(y_true=y_true, y_pred=y_test, average='macro') 
    precision = precision_score(y_true=y_true, y_pred=y_test, average='macro') 
    y_score_normalized = np.array(y_score) / np.sum(y_score, axis=1, keepdims=True)
    roc = roc_auc_score(y_true=y_true, y_score=y_score_normalized, average='macro', multi_class='ovr')

f1, recall, precision, roc

  0%|          | 0/8500 [00:00<?, ?it/s]

[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]
[2, 2, 0, 0, 0, 0, 2, 0, 0, 1, 0, 2, 0, 0, 0, 1, 1, 0, 2, 1, 2, 2, 2, 1, 0, 2, 0, 0, 0, 1, 2, 0, 0,

(0.3704815433072474,
 0.39866255144032925,
 0.3626094265740001,
 0.6335519279613879)

In [115]:
# from sklearn.metrics import roc_auc_score

# y_true = [0, 1, 2, 0, 1, 2]
# y_score = [
#     [0.9, 0.1, 0.0],
#     [0.2, 0.7, 0.1],
#     [0.1, 0.3, 0.6],
#     [0.8, 0.1, 0.1],
#     [0.3, 0.4, 0.3],
#     [0.2, 0.5, 0.3]
# ]

# auc = roc_auc_score(y_true, y_score, multi_class='ovr')
# print(auc)

0.9375
