In [1]:
import os
import nltk
import numpy as np
import torch
import json
import math
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
from sklearn.cluster import DBSCAN
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Run on the first CUDA GPU when one is visible to PyTorch; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=0)

In [3]:
class dataset(Dataset):
    """Map-style dataset over numbered problem files in one directory.

    Problem ``k`` (1-based) is read from ``problem-k.txt`` (one paragraph per line)
    and ``truth-problem-k.json`` (its label dictionary), both located in ``data_root``.

    This is a ``Dataset`` rather than a ``DataLoader`` subclass: the class only
    implements ``__len__``/``__getitem__`` and is itself wrapped in a ``DataLoader``
    by the caller.

    Args:
        data_root: Directory containing the problem and truth files.
        setlen: Number of consecutively numbered problems in the split.
    """

    def __init__(self, data_root, setlen):
        self.text_path = []
        self.label_path = []
        for i in range(setlen):
            # The 900-problem split is loaded without problems 71 and 260
            # (0-based indices 70 and 259).
            # NOTE(review): presumably those two files are missing or unusable - confirm.
            if setlen == 900 and i in (70, 259):
                continue
            problem_id = i + 1
            # os.path.join yields the same path as plain concatenation when data_root
            # ends with a separator, and also works when it does not.
            self.text_path.append(os.path.join(data_root, 'problem-{}.txt'.format(problem_id)))
            self.label_path.append(os.path.join(data_root, 'truth-problem-{}.json'.format(problem_id)))

    def __len__(self) -> int:
        return len(self.text_path)

    def __getitem__(self, item):
        """Return ``(paragraphs, truth)`` for the problem at position ``item``.

        ``paragraphs`` is the list of raw lines of the text file (line endings kept);
        ``truth`` is the parsed JSON label object.
        """
        # Context managers close both files deterministically; the files are decoded
        # as UTF-8 instead of the platform's locale encoding.
        with open(self.text_path[item], encoding='utf-8') as text_file:
            paragraphs = list(text_file)

        with open(self.label_path[item], encoding='utf-8') as json_file:
            truth = json.load(json_file)

        return (paragraphs, truth)
                



In [6]:
class BERT_MLP_Model(nn.Module):
    """Embed a text from the sequence of its part-of-speech tags.

    ``forward`` tokenises the text with NLTK, POS-tags the tokens, runs BERT over the
    tag sequence, averages the hidden states of the non-special positions and projects
    the result through a two-layer MLP followed by LayerNorm.

    Args:
        num_classes: Size of the output embedding.
    """

    def __init__(self, num_classes=512):
        super(BERT_MLP_Model, self).__init__()
        self.nltktokenizer = nltk.word_tokenize
        self.pos_tagger = nltk.pos_tag
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.mlp = nn.Sequential(
            nn.Linear(768, 1024),  # BERT's output (hidden) dimension is 768
            nn.ReLU(),
            nn.Linear(1024, num_classes),
            nn.LayerNorm(num_classes)
        )

    def forward(self, text):
        """Return a ``(1, num_classes)`` embedding for the string ``text``."""
        tokens = self.nltktokenizer(text)
        pos_tags = [tag for _, tag in self.pos_tagger(tokens)]

        # Encode the tags as ONE string so the BERT tokenizer lower-cases and
        # word-piece-splits them. A list of strings is treated by `encode` as already
        # tokenised input and every entry is looked up verbatim in the vocabulary;
        # upper-case tags are not in the uncased vocabulary and would all map to [UNK].
        # NOTE(review): this changes the token ids the network sees compared with
        # checkpoints trained on the list-based encoding.
        input_ids = self.berttokenizer.encode(' '.join(pos_tags), max_length=256, truncation=True)

        # Build the input on whatever device the model currently lives on instead of
        # relying on a notebook-level global.
        model_device = next(self.parameters()).device
        input_ids_tensor = torch.tensor([input_ids], device=model_device)

        hidden = self.bert(input_ids_tensor).last_hidden_state
        # Drop the first and last positions ([CLS] / [SEP]) before pooling.
        inner = hidden[:, 1:-1, :]
        if inner.shape[1] == 0:
            # Empty / whitespace-only text: nothing is left between the special tokens,
            # and pooling over a zero-length sequence would fail, so pool over the
            # special tokens instead.
            inner = hidden

        # (1, seq, 768) -> (1, 768, seq) -> average over seq -> (1, 768)
        pooled = self.pooling(inner.permute(0, 2, 1)).squeeze(-1)
        return self.mlp(pooled)

In [7]:
# Instantiate the network, move it to the selected device, and push one short
# sentence through it as a smoke test; the cell displays the output shape.
model = BERT_MLP_Model()
model = model.to(device)
model(text='I like apple.').shape

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([1, 512])

In [8]:
# Data layout (datasets 1-3 each provide 4200 training and 900 validation problems).
#
# Label format of every truth file:
#   {
#     'authors': int       -> number of authors occurring in the file
#     'changes': list[int] -> one entry per pair of consecutive paragraphs
#                             (length == number of paragraphs - 1);
#                             0 = author unchanged, 1 = author changed
#   }


def _pan23_split_dir(dataset_id, split):
    """Return the directory (with trailing slash) of one split of one dataset."""
    dataset_name = "pan23-multi-author-analysis-dataset{}".format(dataset_id)
    return "./release/{0}/{0}-{1}/".format(dataset_name, split)


training_path1 = _pan23_split_dir(1, "train")
val_path1 = _pan23_split_dir(1, "validation")
training_path2 = _pan23_split_dir(2, "train")
val_path2 = _pan23_split_dir(2, "validation")
training_path3 = _pan23_split_dir(3, "train")
val_path3 = _pan23_split_dir(3, "validation")

In [9]:
# Dataset 2: one document per batch. The training and evaluation code reads element
# [0] of every collated paragraph, i.e. it expects batch_size=1.
Training_set2 = dataset(data_root=training_path2, setlen=4200)
trainingloader2 = DataLoader(dataset=Training_set2, batch_size=1, shuffle=True)

Val_set2 = dataset(data_root=val_path2, setlen=900)
# The validation metrics are aggregated over all paragraph pairs, so document order
# does not affect them; iterate in a fixed order instead of reshuffling on every pass.
valloader2 = DataLoader(dataset=Val_set2, batch_size=1, shuffle=False)

In [12]:
def loss_fn(truth, para_embeddings, threshold=4):
    """Contrastive loss over consecutive paragraph embeddings.

    For every adjacent pair ``(i, i + 1)`` with pairwise Euclidean distance ``D``
    (``torch.cdist``) and change label ``w``::

        (1 - w) * D + w * max(threshold - D, 0)

    so unchanged pairs (``w == 0``) are pulled together and changed pairs
    (``w == 1``) are pushed at least ``threshold`` apart.

    A cluster-count penalty against ``truth['authors']`` was prototyped here but is
    not part of the returned loss, so that key is no longer read.

    Args:
        truth: Mapping with key ``'changes'``: one label per adjacent paragraph pair.
            Labels may be tensors (as produced by a DataLoader) or plain numbers.
        para_embeddings: Sequence of 2-D tensors, one per paragraph; needs at least
            ``len(truth['changes']) + 1`` entries.
        threshold: Margin for pairs labelled as an author change.

    Returns:
        ``10 *`` the mean per-pair loss, with the shape produced by ``torch.cdist``
        (``(1, 1)`` for ``(1, dim)`` embeddings).

    Raises:
        ValueError: If ``truth['changes']`` is empty (no pair to average over).
    """
    weights = truth['changes']
    if len(weights) == 0:
        raise ValueError("loss_fn needs at least one paragraph pair: truth['changes'] is empty")

    d_loss = 0.0
    for i, weight in enumerate(weights):
        D2 = torch.cdist(para_embeddings[i], para_embeddings[i + 1])
        # Put the label on the same device/dtype as the distances; this accepts both
        # tensors and plain ints and avoids relying on a global `device`.
        weight = torch.as_tensor(weight, dtype=D2.dtype, device=D2.device)
        hinge = torch.clamp(threshold - D2, min=0.0)
        d_loss = d_loss + (1 - weight) * D2 + weight * hinge

    d_loss = d_loss / len(weights)
    return 10 * d_loss

In [13]:
def evaluate(model, loader, threshold=4):
    """Score author-change prediction on every adjacent paragraph pair of ``loader``.

    Each paragraph is embedded with ``model`` and L2-normalised; a pair is predicted
    as an author change (label 1) when the cosine similarity of its two embeddings is
    negative. Predictions are compared with ``truth['changes']``.

    The function runs under ``torch.no_grad()`` and does not change the model's
    train/eval mode.

    Args:
        model: Callable mapping a paragraph string to a ``(1, dim)`` tensor.
        loader: Iterable with ``len()`` yielding ``(paragraphs, truth)`` per document,
            where the text of a paragraph is ``paragraphs[k][0]`` and
            ``truth['changes']`` holds one single-element tensor per adjacent pair.
        threshold: Unused; kept so existing callers keep working.

    Returns:
        ``(F1, acc_para)``: binary F1 and accuracy over all paragraph pairs, or
        ``(0.0, 0.0)`` when the loader yields no pairs.
    """
    cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)
    para_results = []  # predicted labels, one per paragraph pair
    true_label = []    # ground-truth labels, aligned with para_results

    with torch.no_grad():
        for paragraphs, truth in tqdm(loader, total=len(loader), leave=False):
            para_embeddings = [
                F.normalize(model(para[0]), p=2, dim=1) for para in paragraphs
            ]

            for i, weight in enumerate(truth['changes']):
                score = cos_sim(para_embeddings[i], para_embeddings[i + 1])
                para_results.append(int(score < 0))
                true_label.append(weight.item())

    if not para_results:
        # Nothing to score; avoid dividing by zero below.
        F1, acc_para = 0.0, 0.0
    else:
        # scikit-learn's signature is f1_score(y_true, y_pred).
        F1 = f1_score(true_label, para_results)
        correct = sum(pred == true for pred, true in zip(para_results, true_label))
        acc_para = correct / len(para_results)

    print(f"F1 score: {F1:.4f}, acc_para: {acc_para:.4f}")

    return F1, acc_para



In [14]:
def train(model, trainloader, valloader, epochs, optimizer, threshold, save_freq, loss_fn=loss_fn,
          lr=1e-7, eval_freq=4000):
    """Train ``model`` with a pairwise cosine loss on consecutive paragraphs.

    For every adjacent paragraph pair of every document one optimizer step is taken
    on ``(1 - w) * (1 - cs) + w * (cs + 1)``, where ``cs`` is the cosine similarity of
    the two L2-normalised embeddings and ``w`` the change label (0 = same author,
    1 = author change).

    Side effects: TensorBoard events and ``logs.txt`` are written to ``./log/``; the
    best model (by validation pair accuracy) and a final state-dict checkpoint are
    written to ``./checkpoints/``. Both directories are created if missing.

    Args:
        model: Callable mapping a paragraph string to a ``(1, dim)`` tensor.
        trainloader: Loader yielding ``(paragraphs, truth)`` per document, with the
            paragraph text at ``paragraphs[k][0]`` and tensor labels in
            ``truth['changes']``.
        valloader: Loader passed to ``evaluate``.
        epochs: Number of passes over ``trainloader``.
        optimizer: Optimizer over the model parameters.
        threshold: Forwarded to ``evaluate`` only.
        save_freq: Append a line to ``logs.txt`` every ``save_freq`` global steps.
        loss_fn: Accepted for interface compatibility; the loss is computed inline
            and this argument is not used.
        lr: Learning rate forced onto the optimizer's first parameter group
            (default ``1e-7``, the value previously hard-coded in the loop).
            Pass ``None`` to keep the optimizer's own setting.
        eval_freq: Run validation every ``eval_freq`` global steps.
    """
    cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)

    log_dir = "./log/"
    checkpoint_dir = os.path.join('.', 'checkpoints')
    # torch.save does not create directories; make sure a long run cannot die on a
    # missing output folder.
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(checkpoint_dir, exist_ok=True)

    if lr is not None:
        optimizer.param_groups[0]['lr'] = lr

    writer = SummaryWriter(log_dir)
    best_acc = 0.0
    loss = None  # most recent per-pair loss; stays None until the first update
    try:
        for epoch in range(epochs):
            # `step` is a global step counter that keeps increasing across epochs.
            loop = tqdm(enumerate(trainloader, start=epoch * len(trainloader)),
                        total=len(trainloader), leave=False)
            for step, (paragraphs, truth) in loop:
                for i, weight in enumerate(truth['changes']):
                    optimizer.zero_grad()
                    emb_a = F.normalize(model(paragraphs[i][0]), p=2, dim=1)
                    emb_b = F.normalize(model(paragraphs[i + 1][0]), p=2, dim=1)
                    cs = cos_sim(emb_a, emb_b)
                    # Follow the embeddings' device rather than a global.
                    weight = weight.to(cs.device)
                    loss = (1 - weight) * (1 - cs) + weight * (cs + 1)
                    loss.backward()
                    optimizer.step()

                if loss is None:
                    # No paragraph pair has been seen yet, so there is nothing to log.
                    continue

                loss_value = loss.detach().cpu().numpy()
                # Use the global step as x-axis so points within an epoch do not
                # overwrite each other.
                writer.add_scalar("Loss/train", loss.item(), step)

                if step % int(save_freq) == 0 and step:
                    with open(os.path.join(log_dir, 'logs.txt'), 'a') as log_file:
                        log_file.write(f'Epoch: {epoch}, Step: {step}, Train loss: {loss_value} \n')

                if step % int(eval_freq) == 0 and step:
                    with torch.no_grad():
                        F1, acc_para = evaluate(model, valloader, threshold=threshold)
                    if acc_para > best_acc:
                        best_acc = acc_para
                        torch.save(model, os.path.join(checkpoint_dir, f'best_tag_{best_acc}_acc_{F1}_F1.pth'))

                loop.set_description(f'Epoch [{epoch}/{epochs}]')
                loop.set_postfix(loss=loss_value)

            if loss is not None:
                print(f'Loss for epoch {epoch} is {loss.cpu().detach().numpy()}')

        print('End of the Training. Saving final checkpoints.')
        state = dict(epoch=epochs, model=model.state_dict(),
                     optimizer=optimizer.state_dict())
        torch.save(state, os.path.join(checkpoint_dir, 'final_checkpoint.pth'))
    finally:
        # Flush and close the event file even when training is interrupted.
        writer.flush()
        writer.close()
                
                

In [15]:
# Build a fresh network for the training run and place it on the selected device.
model = BERT_MLP_Model()
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Hyper-parameters, defined before they are used.
epochs = 100
# train() forces the learning rate of the optimizer's first parameter group to 1e-7,
# so that is the rate actually in effect; configure the optimizer with the same value
# instead of a nominal 0.1 that is never used.
lr = 1e-7
batch_size = 1  # the loaders are created with batch_size=1
threshold = 4

optimizer = torch.optim.Adam(
    (p for p in model.parameters() if p.requires_grad),  # trainable parameters only
    lr=lr,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
    amsgrad=False,
)

In [None]:
# Launch training on dataset 2; a line is appended to the text log every 1000 steps.
train(
    model=model,
    trainloader=trainingloader2,
    valloader=valloader2,
    epochs=epochs,
    optimizer=optimizer,
    threshold=threshold,
    save_freq=1000,
)