In [25]:
import os
import nltk
import numpy as np
import torch
import json
import math
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
from sklearn.cluster import DBSCAN
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer
from torch.utils.tensorboard import SummaryWriter
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict, stopwords
import string
import pickle

In [None]:
# Pick the compute device once for the whole notebook: the first CUDA GPU
# when one is visible to PyTorch, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

In [None]:
class dataset(Dataset):
    """Map-style dataset over ``problem-N.txt`` / ``truth-problem-N.json`` pairs.

    Each item is one problem: the lines of its text file and the parsed
    content of its JSON truth file.

    This is a ``Dataset`` (not a ``DataLoader``): it only provides
    ``__len__`` / ``__getitem__`` and is meant to be wrapped by a
    ``DataLoader``.

    Args:
        data_root: Directory that contains the problem files. File names are
            appended to it, with or without a trailing path separator.
        setlen: Number of problems in the split; files are numbered
            ``1 .. setlen``.
    """

    def __init__(self, data_root, setlen):
        self.text_path = []
        self.label_path = []
        for i in range(setlen):
            # Problems 71 and 260 are left out of the 900-item split.
            # NOTE(review): the reason is not visible in this notebook —
            # presumably those two validation files are unusable; confirm.
            if setlen == 900 and i in (70, 259):
                continue
            problem_id = str(i + 1)
            self.text_path.append(os.path.join(data_root, 'problem-{}.txt'.format(problem_id)))
            self.label_path.append(os.path.join(data_root, 'truth-problem-{}.json'.format(problem_id)))

    def __len__(self) -> int:
        return len(self.text_path)

    def __getitem__(self, item):
        """Return ``(paragraphs, truth)`` for the problem at position ``item``.

        ``paragraphs`` is the list of lines of the text file (line endings
        kept); ``truth`` is the decoded JSON object of the label file.
        """
        # Context managers make sure both file handles are closed again.
        with open(self.text_path[item], encoding='utf-8') as text_file:
            paragraphs = list(text_file)

        with open(self.label_path[item], encoding='utf-8') as json_file:
            truth = json.load(json_file)

        return (paragraphs, truth)

In [None]:
"""
dataset1: training 4200, validation 900
dataset2: training 4200, validation 900
dataset3: training 4200, validation 900
label format: {'author': int -> number of authors that occur in this file
                'changes': int list -> length equals num_paragraphs - 1
                                        one entry each time a new paragraph appears -> 0=unchanged, 1=changed
                }
"""
# Root directories of the train / validation splits of the three datasets.
# Every path ends with '/' because the problem file names are appended
# directly to these strings when the datasets are built.

# Dataset 1
training_path1 = "./release/pan23-multi-author-analysis-dataset1/pan23-multi-author-analysis-dataset1-train/"
val_path1 = "./release/pan23-multi-author-analysis-dataset1/pan23-multi-author-analysis-dataset1-validation/"
# Dataset 2
training_path2 = "./release/pan23-multi-author-analysis-dataset2/pan23-multi-author-analysis-dataset2-train/"
val_path2 = "./release/pan23-multi-author-analysis-dataset2/pan23-multi-author-analysis-dataset2-validation/"
# Dataset 3
training_path3 = "./release/pan23-multi-author-analysis-dataset3/pan23-multi-author-analysis-dataset3-train/"
val_path3 = "./release/pan23-multi-author-analysis-dataset3/pan23-multi-author-analysis-dataset3-validation/"

In [None]:
# Dataset 2: 4200 training problems and 900 validation problems.
# batch_size=1 because every problem has its own number of paragraphs.
Training_set2 = dataset(data_root=training_path2, setlen=4200)
trainingloader2 = DataLoader(dataset=Training_set2, batch_size=1, shuffle=True)

Val_set2 = dataset(data_root=val_path2, setlen=900)
# Validation does not benefit from a random order; a fixed order keeps
# every evaluation run (and its progress output) reproducible.
valloader2 = DataLoader(dataset=Val_set2, batch_size=1, shuffle=False)

In [None]:
class StyleSpy(nn.Module):
    """Paragraph encoder: frozen BERT, average-pooled token states, MLP head.

    Args:
        n_features: Size of the returned embedding.
        hidden_size: Width of the first layer of each MLP head.
        padding: Padding strategy handed to the BERT tokenizer.
        dropout: Dropout probability used inside the MLP heads.
    """

    def __init__(self, n_features=512, hidden_size=1024,padding='max_length', dropout=0.1):
        super().__init__()
        self.padding = padding
        self.berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.pooling = nn.AdaptiveAvgPool1d(1)

        bert_dim = self.bert.config.hidden_size
        self.ffn_hidden = self._make_head(bert_dim, hidden_size, n_features, dropout)
        # Not used by forward(); kept so the module keeps the same parameters
        # (and state_dict keys) as before.
        self.ffn_cls = self._make_head(bert_dim, hidden_size, n_features, dropout)

        # Freeze the BERT part
        for param in self.bert.parameters():
            param.requires_grad = False

    @staticmethod
    def _make_head(in_features, hidden_size, n_features, dropout):
        """Build one three-layer ReLU/Dropout MLP that ends in a LayerNorm."""
        return nn.Sequential(
            nn.Linear(in_features, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, n_features),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(n_features, n_features),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.LayerNorm(normalized_shape=n_features),
        )

    def tokenize(self, text):
        """Encode ``text`` and return ``(input_ids, attention_mask)``.

        Both are integer tensors with a leading batch dimension of 1, moved
        to the notebook-level ``device``. Input is truncated to 256 tokens.
        """
        input_ids = self.berttokenizer.encode(
            text,
            add_special_tokens=True,
            padding=self.padding,
            truncation=True,
            max_length=256,
        )
        # Mask out padding by comparing with the tokenizer's own pad id
        # instead of assuming that padding is the only id equal to 0.
        pad_id = self.berttokenizer.pad_token_id
        attention_mask = [int(token_id != pad_id) for token_id in input_ids]

        input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)
        attention_mask = torch.tensor(attention_mask).unsqueeze(0).to(device)

        return input_ids, attention_mask

    def forward(self, text):
        """Return the ``(1, n_features)`` embedding of one text."""
        input_ids, attention_mask = self.tokenize(text)
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Drop the first and the last position, then average over all
        # remaining positions.
        # NOTE(review): with padding='max_length' the average also covers
        # padded positions. Left unchanged on purpose: a different pooling
        # would change the embeddings that existing checkpoints and
        # thresholds were tuned on — confirm before touching.
        hidden_state = outputs.last_hidden_state[:, 1:-1, :]
        hidden_state = self.pooling(hidden_state.permute(0, 2, 1)).permute(0, 2, 1).squeeze(1)

        return self.ffn_hidden(hidden_state)


class BERT_MLP_Model(nn.Module):
    """Encoder that feeds the POS-tag sequence of a text through BERT and an MLP.

    Args:
        num_classes: Size of the returned embedding.
    """

    def __init__(self, num_classes=512):
        super().__init__()
        self.nltktokenizer = nltk.word_tokenize
        self.pos_tagger = nltk.pos_tag
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.pooling = nn.AdaptiveAvgPool1d(1)
        mlp_layers = [
            nn.Linear(768, 1024),  # the output dimension of BERT is 768
            nn.ReLU(),
            nn.Linear(1024, num_classes),
            nn.LayerNorm(num_classes),
        ]
        self.mlp = nn.Sequential(*mlp_layers)

    def forward(self, text):
        """Return the ``(1, num_classes)`` embedding of the POS tags of ``text``."""
        # Words -> (word, tag) pairs -> tag sequence only.
        tagged_words = self.pos_tagger(self.nltktokenizer(text))
        tag_sequence = [tag for _, tag in tagged_words]

        # NOTE(review): the tag list is handed to the tokenizer as-is;
        # how a list of strings is mapped to ids depends on the tokenizer —
        # confirm the tags are not collapsed to the unknown token.
        ids = self.berttokenizer.encode(tag_sequence, max_length=256,truncation=True)
        batch = torch.tensor([ids]).to(device)

        # Skip the first and last position, then average over the rest.
        token_states = self.bert(batch).last_hidden_state
        inner_states = token_states[:,1:-1,:]
        pooled = self.pooling(inner_states.permute(0, 2, 1)).permute(0, 2, 1).squeeze(1)
        return self.mlp(pooled)

In [None]:
# Filled on first use so the (slow to load) NLTK corpora are read only once.
_STY_RESOURCE_CACHE = {}


def _sty_resources():
    """Return ``(pronouncing_dict, stopword_set)``, loading both only once."""
    if not _STY_RESOURCE_CACHE:
        pronunciations = cmudict.dict()
        stopword_set = frozenset(stopwords.words("english"))
        # Stored together so a failed load never leaves a half-filled cache.
        _STY_RESOURCE_CACHE.update(cmudict=pronunciations, stopwords=stopword_set)
    return _STY_RESOURCE_CACHE['cmudict'], _STY_RESOURCE_CACHE['stopwords']


def _ratio(numerator, denominator):
    """Divide, returning 0.0 instead of raising when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def Get_Sty_Features(text):
    """Compute 11 hand-crafted style features of ``text``.

    Returns a list, in this order: average word length, average sentence
    length (in tokens), length in characters, length in tokens, length in
    sentences, type-token ratio, average "syllables" per token, Flesch
    reading-ease score, stopword count, count of tokens tagged ``FW``,
    punctuation ratio.

    For non-empty text the values are the same as before this rewrite.
    Text without any token (e.g. an empty line) no longer raises
    ``ZeroDivisionError``: every ratio is then reported as 0.0.
    """
    cmud, stopword_set = _sty_resources()

    tokens = nltk.word_tokenize(text)
    sentences = nltk.sent_tokenize(text)
    n_tokens = len(tokens)
    n_sentences = len(sentences)

    # avg word len
    total_chars = sum(len(word) for word in tokens)
    average_word_length = _ratio(total_chars, n_tokens)
    # avg sentence len (sentences are re-tokenized one by one)
    total_words = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
    average_sentence_length = _ratio(total_words, n_sentences)
    # para len by chars / tokens / sentences
    paragraph_length_chars = len(text)
    paragraph_length_words = n_tokens
    paragraph_length_sents = n_sentences
    # Type-Token Ratio
    type_token_ratio = _ratio(len(set(tokens)), n_tokens)
    # avg syllables: length of the first dictionary entry of each token;
    # tokens missing from the dictionary count as 1.
    # NOTE(review): this is the length of the pronunciation entry, which
    # looks like a phoneme count rather than a syllable count. Left as-is
    # so the features stay comparable with previously stored ones — confirm.
    total_syllables = sum(len(cmud.get(word.lower(), [[None]])[0]) for word in tokens)
    average_syllables_per_word = _ratio(total_syllables, n_tokens)
    # Flesch reading-ease score
    flesch_reading_ease = 206.835 - (1.015 * average_sentence_length) - (84.6 * average_syllables_per_word)
    # Stopwords Count (set lookup instead of scanning a list per token)
    stopwords_count = sum(1 for token in tokens if token.lower() in stopword_set)
    # "Function words" Count
    # NOTE(review): only tags starting with "FW" are counted; in the Penn
    # Treebank tag set that tag presumably means "foreign word" — confirm.
    tagged_tokens = nltk.pos_tag(tokens)
    function_words_count = sum(1 for _, pos in tagged_tokens if pos.startswith("FW"))
    # Punctuation Marks Ratio (substring test against string.punctuation)
    punctuation_count = sum(1 for token in tokens if token in string.punctuation)
    punctuation_ratio = _ratio(punctuation_count, n_tokens)

    return [average_word_length, average_sentence_length, paragraph_length_chars,
            paragraph_length_words, paragraph_length_sents, type_token_ratio,
            average_syllables_per_word, flesch_reading_ease, stopwords_count,
            function_words_count, punctuation_ratio]

    


In [None]:
# Load the two trained encoders (whole pickled modules, so the model classes
# defined above must exist before this cell runs).
# SECURITY: torch.load unpickles arbitrary Python objects — only load
# checkpoints that come from a trusted source.
# NOTE(review): recent PyTorch releases may need weights_only=False to load
# whole-module checkpoints — confirm against the installed version.
#
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine;
# .eval() switches off dropout so inference embeddings are deterministic.
Sem = torch.load(
    "./checkpoints/best_0.7297875374304862_acc_0.6318243637070139_F1_4_thre.pth",
    map_location=device,
).to(device).eval()
Syn = torch.load(
    "./checkpoints/best_tag_0.6188507058320263_acc_0.4705882352941176_F1.pth",
    map_location=device,
).to(device).eval()

In [None]:
# Fit the style classifier on the stored feature vectors and labels.
# The file holds a mapping with 'features' and 'labels'; every entry is
# converted to plain Python values before it is handed to scikit-learn.
a = torch.load('./trainingFea.pth')
X = [feature.tolist() for feature in a['features']]
Y = [label.item() for label in a['labels']]

# A very high iteration cap so the solver is not stopped early.
Sty = LogisticRegression(max_iter=100000)
Sty.fit(X, Y)


In [None]:
# Hand-tuned linear decision rule: score > 0 means "author changed".
# Each term is weight * (score - offset).
SEM_WEIGHT, SEM_OFFSET = 0.9, 3.1    # distance between semantic embeddings
SYN_WEIGHT, SYN_OFFSET = 0.3, 0.9    # cosine similarity of syntactic embeddings
STY_WEIGHT, STY_OFFSET = 0.35, 0.5   # prediction of the style classifier

pre = []  # predicted change flags, one per consecutive paragraph pair
tar = []  # ground-truth change flags, same order as `pre`
cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)

# Inference only: switch off dropout and gradient tracking.
Sem.eval()
Syn.eval()

loop = tqdm(valloader2, total=len(valloader2), leave=False)
with torch.no_grad():
    for paragraphs, truth in loop:
        # The loader batches with batch_size=1, so every paragraph arrives
        # wrapped in a 1-element sequence: para[0] is the text itself.
        sem_embeddings = []
        syn_embeddings = []
        for para in paragraphs:
            sem_embeddings.append(Sem(para[0]))
            syn_embeddings.append(F.normalize(Syn(para[0]), p=2, dim=1))

        weights = truth['changes']
        n_pairs = len(weights)
        # Style features once per paragraph (not twice per interior one),
        # and only for paragraphs that take part in a labelled pair.
        sty_features = (
            [Get_Sty_Features(para[0]) for para in paragraphs[:n_pairs + 1]]
            if n_pairs else []
        )

        for i, weight in enumerate(weights):
            tar.append(weight.item())

            sem_score = torch.cdist(sem_embeddings[i], sem_embeddings[i + 1]).item()
            syn_score = cos_sim(syn_embeddings[i], syn_embeddings[i + 1]).item()

            sty = [abs(x - y) for x, y in zip(sty_features[i], sty_features[i + 1])]
            # predict() returns an array with one entry; take the scalar so
            # the decision below is a plain number, not a 1-element array.
            sty_score = Sty.predict([sty])[0]

            p = (SEM_WEIGHT * (sem_score - SEM_OFFSET)
                 - SYN_WEIGHT * (syn_score - SYN_OFFSET)
                 + STY_WEIGHT * (sty_score - STY_OFFSET))
            pre.append(int(p > 0))

# Report once, over all paragraph pairs of the validation split.
acc = accuracy_score(tar, pre)
f1 = f1_score(tar, pre, average='binary')
print(f"F1 score: {f1:.4f}, acc_para: {acc:.4f}")



