# Library


In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import warnings 
warnings.filterwarnings('ignore')
import re
from os.path import join
from tqdm import tqdm
from collections import defaultdict as dd
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import numpy as np
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, DebertaV2ForSequenceClassification
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from transformers.optimization import AdamW
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from tqdm import trange
from sklearn.metrics import classification_report, precision_recall_fscore_support, average_precision_score
import logging
import utils
import settings
### add 
import pandas as pd
from sklearn.model_selection import KFold 
import gc

# Log records are rendered as "<date time> - <level> - <logger name> -   <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Run configuration. These are plain module-level names; a `global` statement
# is only meaningful inside a function body, so none is needed here.
model_name = 'scibert'   # model key passed to gen_kddcup_test_submission_bert
is_clean = False         # not referenced by the cells shown in this notebook
fuzz_ratio = 80          # not referenced by the cells shown in this notebook
NFOLDS = 4               # number of per-fold checkpoints to run and average
MAX_SEQ_LENGTH = 512     # every encoded example is truncated/padded to this many ids
BATCH_SIZE = 128         # inference batch size handed to get_data_loader

2024-06-12 15:34:14,532 Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-06-12 15:34:14,532 NumExpr defaulting to 8 threads.


# Function

In [3]:
def clean_memory():
    """Free host memory and cached GPU memory.

    Runs a full Python garbage collection first, then asks PyTorch to release
    the cached CUDA memory it is holding. Returns ``None``.
    """
    # Collect first so that tensors which just became unreachable are freed
    # before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
    
def clean_text_line(line):
    """Strip markup tags from ``line`` and normalise its tab/newline runs.

    The substitutions are applied in order:

    1. anything of the form ``<...>`` is removed;
    2. runs of tabs collapse to a single tab;
    3. runs of newlines collapse to a single newline;
    4. any remaining run mixing tabs and newlines becomes one newline.

    Args:
        line: Text to clean (str).

    Returns:
        The cleaned string.
    """
    substitutions = (
        ('<[^>]*>', ''),
        (r'\t+', '\t'),
        (r'\n+', '\n'),
        (r'[\n\t]+', '\n'),
    )
    cleaned = line
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

class BertInputItem:
    """One encoded example, ready to be stacked into model input tensors.

    Attributes:
        text: The raw text the ids were produced from.
        input_ids: Token ids for the example.
        input_mask: Attention mask aligned with ``input_ids``.
        segment_ids: Segment (token type) ids aligned with ``input_ids``.
        label_id: Label attached to the example.
    """

    def __init__(self, text, input_ids, input_mask, segment_ids, label_id):
        self.text = text
        # The three per-token sequences that are fed to the model.
        (self.input_ids,
         self.input_mask,
         self.segment_ids) = input_ids, input_mask, segment_ids
        self.label_id = label_id


def convert_examples_to_inputs(example_texts, example_labels, max_seq_length, tokenizer, verbose=0):
    """Encode (text, label) pairs into fixed-length ``BertInputItem`` objects.

    Args:
        example_texts: Iterable of raw text strings.
        example_labels: Iterable of labels, paired positionally with the texts
            (pairing stops at the shorter of the two iterables).
        max_seq_length: Exact length of every produced id/mask/segment list.
        tokenizer: Object exposing ``encode(str) -> list[int]``.
        verbose: Accepted but not used by this function.

    Returns:
        A list with one ``BertInputItem`` per (text, label) pair.
    """
    input_items = []

    for text, label in zip(example_texts, example_labels):
        # NOTE(review): "[CLS]"/"[SEP]" are written into the text and `encode`
        # is called with its default arguments; if the tokenizer also adds
        # special tokens by default they end up duplicated. Confirm how the
        # fine-tuned checkpoints were trained before changing this.
        # Truncation keeps only the first `max_seq_length` ids, so trailing
        # ids (including any end marker) are dropped for long inputs.
        token_ids = tokenizer.encode(f"[CLS] {text} [SEP]")[:max_seq_length]

        real_len = len(token_ids)
        pad_len = max_seq_length - real_len

        # Real ids followed by zero padding.
        input_ids = token_ids + [0] * pad_len
        # 1 marks a real token, 0 marks padding.
        input_mask = [1] * real_len + [0] * pad_len
        # Everything belongs to the first segment (id 0), padding included.
        segment_ids = [0] * max_seq_length

        for sequence in (input_ids, input_mask, segment_ids):
            assert len(sequence) == max_seq_length

        input_items.append(
            BertInputItem(
                text=text,
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_id=label,
            )
        )

    return input_items


def get_data_loader(features, max_seq_length, batch_size, shuffle=True, num_workers=8):
    """Wrap encoded features in a ``DataLoader`` over a ``TensorDataset``.

    Args:
        features: Sequence of objects exposing ``input_ids``, ``input_mask``,
            ``segment_ids`` (equal-length int lists) and ``label_id`` (int).
        max_seq_length: Accepted for interface compatibility; not used here.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the data on each pass.
        num_workers: Number of loader worker processes. Defaults to 8, the
            value that used to be hard-coded; pass 0 to load in the calling
            process (useful for small datasets where starting workers costs
            more than the loading itself).

    Returns:
        A ``DataLoader`` whose batches are
        ``(input_ids, input_mask, segment_ids, label_ids)`` long tensors.
    """
    def _stack(attribute):
        # One long tensor per feature attribute, rows in `features` order.
        return torch.tensor([getattr(item, attribute) for item in features],
                            dtype=torch.long)

    dataset = TensorDataset(
        _stack("input_ids"),
        _stack("input_mask"),
        _stack("segment_ids"),
        _stack("label_id"),
    )
    return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size,
                      num_workers=num_workers)


# Generate test submission

In [4]:
def _resolve_pretrained_path(model_name):
    """Map a short model key to the pretrained name/path loaded for it."""
    if model_name == "bert":
        return "bert-base-uncased"
    if model_name == "scibert":
        # Local directory; the hub id "allenai/scibert_scivocab_uncased" is the
        # alternative that was previously left commented out here.
        return './bert_models/scibert_scivocab_uncased/'
    if model_name == 'roberta-base':
        return './bert_models/dsp_roberta_base_dapt_cs_tapt_sciie_3219/'
    raise NotImplementedError(f"unsupported model_name: {model_name!r}")


def _count_indexed_references(parsed_xml):
    """Return 1 + the largest numeric id suffix among usable ``biblStruct`` nodes.

    A node is usable when it carries an ``xml:id`` attribute and has an
    ``analytic`` child with a ``title``. Returns 0 when there is none.
    """
    n_refs = 0
    for ref in parsed_xml.find_all("biblStruct"):
        if "xml:id" not in ref.attrs:
            continue
        if ref.analytic is None or ref.analytic.title is None:
            continue
        # Ids are expected to look like "b<number>"; int() raises ValueError otherwise.
        n_refs = max(n_refs, int(ref.attrs["xml:id"][1:]) + 1)
    return n_refs


def gen_kddcup_test_submission_bert(fold, model_name="deberta"):
    """Score every reference of every test paper with one fold's checkpoint.

    For each paper in ``paper_source_trace_test_wo_ans.json`` the paper's XML is
    read, the citation contexts of references ``b0 .. b{n-1}`` are cleaned and
    encoded, and the fine-tuned classifier is run on them. The class-1 logit of
    each context is passed through ``utils.sigmoid`` and stored as that
    reference's score. The resulting ``{paper_id: [score, ...]}`` mapping is
    written to ``test_submission_<model_name>.json`` in the fold's output
    directory.

    Args:
        fold: Fold index; selects the ``fold_<fold>`` checkpoint directory.
        model_name: One of ``"bert"``, ``"scibert"``, ``"roberta-base"``.
            The default ``"deberta"`` has no mapping here and therefore raises.

    Returns:
        None. The result is written to disk.

    Raises:
        NotImplementedError: If ``model_name`` is not one of the supported keys.
        AssertionError: If the number of references found for a paper differs
            from the length of its entry in ``submission_example_test.json``.
    """
    print("model name", model_name)
    data_dir = join(settings.DATA_TRACE_DIR, "PST")
    papers = utils.load_json(data_dir, "paper_source_trace_test_wo_ans.json")

    BERT_MODEL = _resolve_pretrained_path(model_name)
    tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)

    sub_example_dict = utils.load_json(data_dir, "submission_example_test.json")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device", device)

    # Build the architecture from the base checkpoint, then overwrite all
    # weights with the fine-tuned state of this fold.
    model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=2)
    OUTPUT_DIR = join(settings.OUT_DIR, "kddcup", model_name, f'num_fold={NFOLDS}', f'fold_{fold}')
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
    state_dict = torch.load(join(OUTPUT_DIR, "pytorch_model.bin"), map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    xml_dir = join(data_dir, "paper-xml")
    sub_dict = {}

    for paper in tqdm(papers):
        cur_pid = paper["_id"]
        # `with` guarantees the handle is closed even if reading fails.
        with open(join(xml_dir, cur_pid + ".xml"), encoding='utf-8') as xml_file:
            xml = xml_file.read()
        n_refs = _count_indexed_references(BeautifulSoup(xml, "xml"))

        bib_to_contexts = utils.find_bib_context(xml)
        bib_sorted = ["b" + str(ii) for ii in range(n_refs)]

        assert len(sub_example_dict[cur_pid]) == n_refs

        # One cleaned text per reference: all of its citation contexts joined.
        contexts_sorted = [clean_text_line(" ".join(bib_to_contexts[bib])) for bib in bib_sorted]

        # The encoder requires one label per text; zeros are placeholders and
        # are never passed to the model.
        placeholder_labels = [0] * n_refs
        test_features = convert_examples_to_inputs(contexts_sorted, placeholder_labels, MAX_SEQ_LENGTH, tokenizer)
        test_dataloader = get_data_loader(test_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)

        predicted_scores = []
        for batch in test_dataloader:
            input_ids, input_mask, segment_ids, _ = (t.to(device) for t in batch)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=input_mask,
                                token_type_ids=segment_ids)
            # Without labels no loss is computed, so the logits are the first output.
            logits = outputs[0]
            predicted_scores.extend(logits[:, 1].to('cpu').numpy())

        # shuffle=False keeps predictions in b0..b{n-1} order, so position == reference index.
        sub_dict[cur_pid] = [float(utils.sigmoid(score)) for score in predicted_scores]

    utils.dump_json(sub_dict, OUTPUT_DIR, f"test_submission_{model_name}.json")


# Run

In [6]:
# Run inference once per fold; each call writes
# test_submission_<model_name>.json into that fold's output directory.
for fold in range(NFOLDS):
    gen_kddcup_test_submission_bert(fold, model_name=model_name)

# Average the per-fold predictions

import json  # not used by the code below; kept in case other cells rely on it

# Same directory layout that gen_kddcup_test_submission_bert writes to, so the
# files read here are the ones produced by the loop above.
ensemble_root = join(settings.OUT_DIR, "kddcup", model_name, f'num_fold={NFOLDS}')
submission_name = f"test_submission_{model_name}.json"

# Start the running sum from fold 0. Converting to arrays up front keeps the
# division below valid even when NFOLDS == 1 (a plain list cannot be divided).
fold = 0
submit_data_dir = join(ensemble_root, f'fold_{fold}')
sub_example_dict = {
    key: np.asarray(value, dtype=float)
    for key, value in utils.load_json(submit_data_dir, submission_name).items()
}

# Add the scores of the remaining folds, paper by paper.
for fold in range(1, NFOLDS):
    submit_data_dir = join(ensemble_root, f'fold_{fold}')
    sub_dict = utils.load_json(submit_data_dir, submission_name)
    for key, value in sub_dict.items():
        sub_example_dict[key] = np.add(sub_example_dict[key], value)

# Turn the sums into means and back into plain lists so they can be dumped as JSON.
sub_example_dict = {
    key: (total / NFOLDS).tolist() for key, total in sub_example_dict.items()
}

final_dir = join(ensemble_root, 'fold_final')
os.makedirs(final_dir, exist_ok=True)
utils.dump_json(sub_example_dict, final_dir, submission_name)

2024-06-12 15:36:21,480 loading paper_source_trace_test_wo_ans.json ...
2024-06-12 15:36:21,489 paper_source_trace_test_wo_ans.json loaded
2024-06-12 15:36:21,524 loading submission_example_test.json ...
2024-06-12 15:36:21,526 submission_example_test.json loaded


model name scibert
device cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert_models/scibert_scivocab_uncased/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|                                                                                          | 0/394 [00:03<?, ?it/s]
2024-06-12 15:36:26,357 dumping test_submission_scibert.json ...
2024-06-12 15:36:26,358 test_submission_scibert.json dumped.
2024-06-12 15:36:26,362 loading paper_source_trace_test_wo_ans.json ...
2024-06-12 15:36:26,370 paper_source_trace_test_wo_ans.json loaded
2024-06-12 15:36:26,398 loading submission_example_test.json ...
2024-06-12 15:36:26,401 submission_example_test.json loaded


model name scibert
device cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert_models/scibert_scivocab_uncased/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|                                                                                          | 0/394 [00:03<?, ?it/s]
2024-06-12 15:36:30,784 dumping test_submission_scibert.json ...
2024-06-12 15:36:30,785 test_submission_scibert.json dumped.
2024-06-12 15:36:30,790 loading paper_source_trace_test_wo_ans.json ...
2024-06-12 15:36:30,799 paper_source_trace_test_wo_ans.json loaded
2024-06-12 15:36:30,828 loading submission_example_test.json ...
2024-06-12 15:36:30,830 submission_example_test.json loaded


model name scibert
device cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert_models/scibert_scivocab_uncased/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|                                                                                          | 0/394 [00:03<?, ?it/s]
2024-06-12 15:36:35,411 dumping test_submission_scibert.json ...
2024-06-12 15:36:35,412 test_submission_scibert.json dumped.
2024-06-12 15:36:35,416 loading paper_source_trace_test_wo_ans.json ...
2024-06-12 15:36:35,424 paper_source_trace_test_wo_ans.json loaded
2024-06-12 15:36:35,453 loading submission_example_test.json ...
2024-06-12 15:36:35,455 submission_example_test.json loaded


model name scibert
device cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert_models/scibert_scivocab_uncased/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|                                                                                          | 0/394 [00:03<?, ?it/s]
2024-06-12 15:36:40,055 dumping test_submission_scibert.json ...
2024-06-12 15:36:40,057 test_submission_scibert.json dumped.
2024-06-12 15:36:40,061 loading test_submission_scibert.json ...
2024-06-12 15:36:40,061 test_submission_scibert.json loaded
2024-06-12 15:36:40,062 loading test_submission_scibert.json ...
2024-06-12 15:36:40,063 test_submission_scibert.json loaded
2024-06-12 15:36:40,064 loading test_submission_scibert.json ...
2024-06-12 15:36:40,065 test_submission_scibert.json loaded
2024-06-12 15:36:40,066 loading test_submission_scibert.json ...