In [3]:
# GPU selection is configured through environment variables before torch is imported
# (the torch import happens further down in this cell).
# NOTE(review): with PCI_BUS_ID ordering, "1" presumably selects the second GPU on the bus — confirm for the host.
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="1"

from abc import ABCMeta
import argparse
import datetime 
import collections
from collections import defaultdict
import json
import math
import os
import random
import pickle
import sys
import io
from tqdm import tqdm, trange
# Put the parent of the current working directory on the import path.
# NOTE(review): presumably required for the project-local packages imported below — confirm.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, WeightedRandomSampler
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss
from pathlib import Path
from collections import Counter

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
from pytorch_pretrained_bert.modeling import BertForQuestionAnsweringQC4QA

from pytorch_pretrained_bert.optimization import BertAdam
from utils.ConfigLogger import config_logger
from utils.evaluate import f1_score, exact_match_score, metric_max_over_ground_truths
from utils.BERTRandomSampler import BERTRandomSampler

# Cache directory for pretrained BERT files; the PYTORCH_PRETRAINED_BERT_CACHE environment
# variable overrides the default of ~/.pytorch_pretrained_bert.
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                               Path.home() / '.pytorch_pretrained_bert'))

# NOTE(review): the wildcard import hides the origin of names used by later cells but not defined in
# this notebook (read_features_and_examples, write_predictions, RawResult, warmup_linear) —
# presumably they come from da_data_utils; confirm and import them explicitly.
from da_data_utils import * 
############################
# NOTE(review): argparse and config_logger are already imported above; importlib and types are not
# referenced by the cells visible in this notebook.
import importlib, types, argparse
from utils.ConfigLogger import config_logger
# Sanity check: report whether torch can see a CUDA device.
print('torch: ',torch.cuda.is_available())

torch:  True


In [9]:
def prediction_stage(args, device, tokenizer, logger, debug=False):
    """Evaluate a fine-tuned checkpoint on ``args.predict_file`` and log EM / F1.

    Args:
        args: Run configuration; ``output_dir``, ``output_model_file``, ``bert_model`` and
            ``predict_file`` are read here, and the object is passed on to the helpers.
        device: Torch device the checkpoint is mapped onto and the model is moved to.
        tokenizer: Tokenizer handed to ``read_features_and_examples``.
        logger: Logger receiving progress messages and the final scores.
        debug: When true, only 100 samples are read.

    Returns:
        None. The scores are only logged.
    """
    # Load a trained model that you have fine-tuned.
    # map_location makes a checkpoint saved from a GPU loadable on whatever device is in use
    # (including the CPU fallback), instead of failing while deserializing CUDA tensors.
    output_model_file = os.path.join(args.output_dir, args.output_model_file)
    model_state_dict = torch.load(output_model_file, map_location=device)
    model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict, args=args)
    model.to(device)
    # Read prediction samples
    read_limit = None
    if debug:
        read_limit = 100  # use only 100 samples
    logger.info("***** Reading Prediction Samples *****")
    eval_features, eval_examples = read_features_and_examples(args, args.predict_file, tokenizer, logger,
            use_simple_feature=False, read_examples=True, limit=read_limit)
    acc, f1 = evaluation_stage(args, eval_examples, eval_features, device, model, logger)
    logger.info('***** Prediction Performance *****')
    logger.info('EM is %.5f, F1 is %.5f', acc, f1)


def evaluate_acc_and_f1(predictions, raw_data, logger, threshold=-1, all_probs=None):
    """Compute exact-match and F1 percentages of ``predictions`` against ``raw_data``.

    Args:
        predictions: Mapping from ``qas_id`` to the predicted answer.
        raw_data: Iterable of samples exposing ``qas_id`` and ``orig_answers``.
        logger: Logger used to warn about samples that cannot be scored.
        threshold: Minimum probability a sample needs in order to be scored. Only applied when
            ``all_probs`` is given and ``threshold`` is not None.
        all_probs: Optional mapping from ``qas_id`` to a probability.

    Returns:
        ``(exact_match, f1)`` as percentages averaged over the scored samples, or ``(0.0, 0.0)``
        when no sample was scored.
    """
    f1 = exact_match = total = 0
    # Threshold filtering needs both a threshold and the per-sample probabilities.
    eval_threshold = threshold is not None and all_probs is not None
    for sample in raw_data:
        if (sample.qas_id not in predictions) or (eval_threshold and sample.qas_id not in all_probs):
            # NOTE(review): despite the wording of the message, a skipped sample is left out of the
            # denominator rather than counted with score 0.
            message = 'Unanswered question ' + sample.qas_id + ' will receive score 0.'
            logger.warning(message)
            continue
        if not eval_threshold or all_probs[sample.qas_id] >= threshold:
            ground_truths = sample.orig_answers
            prediction = predictions[sample.qas_id]
            exact_match += metric_max_over_ground_truths(
                exact_match_score, prediction, ground_truths)
            f1 += metric_max_over_ground_truths(
                f1_score, prediction, ground_truths)
            total += 1

    # Nothing was scored (empty data, no predictions, or nothing passed the threshold):
    # report zeros instead of dividing by zero.
    if total == 0:
        return 0.0, 0.0

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return exact_match, f1


def keep_high_prob_samples(all_probs, all_features, prob_threshold, removed_feature_index, all_indices,
        keep_generated=False):
    '''
    For self-training: turn high-probability predictions into pseudo-labels.

    Args:
        all_probs: Probabilities indexed by ``feature.example_index``.
        all_features: Features to filter; the kept ones get ``start_position`` /
            ``end_position`` overwritten in place.
        prob_threshold: A feature is kept only if its probability is strictly greater.
        removed_feature_index: Set of example indices that were already pseudo-labeled. It is
            only read and updated when ``keep_generated`` is true (it may be None otherwise).
        all_indices: Predicted ``(start, end)`` pairs, indexed by position in ``all_features``.
        keep_generated: When true, skip examples already in ``removed_feature_index`` and
            record the newly kept ones there.

    Returns:
        ``(new_train_features, removed_feature_index)``.
    '''
    new_train_features = []
    for i, feature in enumerate(all_features):
        example_index = feature.example_index
        if keep_generated and example_index in removed_feature_index:
            continue
        if all_probs[example_index] > prob_threshold:
            # Tuple assignment. The previous chained form
            # (`= all_indices[i][0] = all_indices[i][1]`) tried to unpack a single index
            # into two targets and failed on the keep_generated path.
            feature.start_position, feature.end_position = all_indices[i][0], all_indices[i][1]
            new_train_features.append(feature)
            if keep_generated:
                removed_feature_index.add(example_index)
    return new_train_features, removed_feature_index


def compare_performance(args, best_acc, best_f1, acc, f1, model, logger):
    """Compare the current scores with the best ones so far and checkpoint the model on improvement.

    The comparison uses exact match only. When the current EM is strictly higher than
    ``best_acc`` the model's state dict is written to
    ``os.path.join(args.output_dir, args.output_model_file)``.

    Returns:
        The ``(best_acc, best_f1)`` pair to carry forward. When either previous best is None the
        current scores are returned and nothing is saved.
    """
    # No baseline yet: the current scores become the baseline.
    if best_f1 is None or best_acc is None:
        return acc, f1

    improved = best_acc < acc
    if not improved:
        logger.info('Current model CANNOT beat previous best model, previous best is EM = %.5F, F1 = %.5f',
            best_acc, best_f1)
        return best_acc, best_f1

    logger.info('Current model BEATS previous best model, previous best is EM = %.5F, F1 = %.5f',
        best_acc, best_f1)
    logger.info('Current best model has been saved!')
    # Unwrap a wrapper exposing `.module` so that only the model itself is saved.
    if hasattr(model, 'module'):
        unwrapped = model.module
    else:
        unwrapped = model
    checkpoint_path = os.path.join(args.output_dir, args.output_model_file)
    torch.save(unwrapped.state_dict(), checkpoint_path)
    return acc, f1


def evaluation_stage(args, eval_examples, eval_features, device, model, logger, generate_prob_th=0.6,
        removed_feature_index=None, global_step=None, best_acc=None, best_f1=None, generate_label=False):
    """Run the model over ``eval_features`` and either score the predictions or pseudo-label them.

    Args:
        args: Run configuration; reads ``predict_batch_size``, ``output_dir``, ``n_best_size``,
            ``max_answer_length``, ``do_lower_case``, ``verbose_logging``, ``output_prediction``
            and, when ``generate_label`` is true, ``keep_previous_generated``.
        eval_examples: Original examples, passed to ``write_predictions`` and to the scorer.
        eval_features: Features exposing ``input_ids``, ``input_mask``, ``segment_ids`` and
            ``unique_id``.
        device: Torch device the input batches are moved to.
        model: Model called as ``model(input_ids, segment_ids, input_mask)``; it is expected to
            return three values, of which the first two are the start and end logits.
        logger: Logger for progress messages.
        generate_prob_th: Probability threshold used when generating pseudo-labels.
        removed_feature_index: Passed through to ``keep_high_prob_samples``.
        global_step: When truthy, it is embedded in the output file names.
        best_acc, best_f1: Previous best scores. When both are given, the current scores are
            compared with them through ``compare_performance``.
        generate_label: When true, return pseudo-labeled features instead of scores.

    Returns:
        ``keep_high_prob_samples(...)`` output when ``generate_label`` is true; otherwise
        ``(best_acc, best_f1)`` if both previous bests were given, else ``(acc, f1)``.
    """
    # NOTE(review): the "Evaluation Stage" banner is logged when no global_step is given and the
    # "Predictions" banner when one is — this looks swapped relative to the callers; confirm.
    if not global_step:
        logger.info("***** Running Evaluation Stage *****")
    else:
        logger.info("***** Running Predictions *****")
    logger.info("  Num orig examples = %d", len(eval_examples))
    logger.info("  Num split examples = %d", len(eval_features))
    logger.info("  Batch size = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    # Position of each feature in eval_features, carried through the loader to map results back.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

    model.eval()
    all_results = []
    logger.info("Start evaluating")
    for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits, _ = model(input_ids, segment_ids, input_mask)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                start_logits=start_logits,
                end_logits=end_logits))

    # Take the timestamp once so that the prediction file and its n-best companion always carry
    # the same suffix (two separate now() calls can straddle a second boundary).
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H:%M:%S")
    if global_step:
        prediction_file_name = 'predictions_' + str(global_step) + '_' + timestamp + '.json'
        nbest_file_name = 'nbest_predictions_' + str(global_step) + '_' + timestamp + '.json'
    else:
        prediction_file_name = 'predictions_' + timestamp + '.json'
        nbest_file_name = 'nbest_predictions_' + timestamp + '.json'
    output_prediction_file = os.path.join(args.output_dir, prediction_file_name)
    output_nbest_file = os.path.join(args.output_dir, nbest_file_name)

    all_predictions, all_probs, all_indices = write_predictions(args, eval_examples, eval_features, all_results,
        args.n_best_size, args.max_answer_length,
        args.do_lower_case, output_prediction_file,
        output_nbest_file, args.verbose_logging, logger, args.output_prediction)
    if generate_label:
        return keep_high_prob_samples(all_probs, eval_features, generate_prob_th, removed_feature_index, all_indices,
                keep_generated=args.keep_previous_generated)
    else:
        acc, f1 = evaluate_acc_and_f1(all_predictions, eval_examples, logger)
        logger.info('Current EM is %.5f, F1 is %.5f', acc, f1)
        if not (best_f1 is None or best_acc is None):
            best_acc, best_f1 = compare_performance(args, best_acc, best_f1, acc, f1, model, logger)
            return best_acc, best_f1
        else:
            return acc, f1


def generate_self_training_samples(args, train_examples, train_features, device, model, removed_feature_index,
        new_generated_train_features, generate_prob_th, logger):
    '''
    Produce pseudo-labeled training features from the target-domain data.

    When ``args.keep_previous_generated`` is set, features whose position is already in
    ``removed_feature_index`` are left out of the prediction pass, and the newly generated
    features are appended to ``new_generated_train_features`` (in place). Otherwise all
    features are used and the newly generated list replaces the previous one.

    Returns:
        ``(features, removed_feature_index)``, or ``(None, None)`` when no new sample was generated.
    '''
    logger.info('***** Generating training data for this epoch *****')
    accumulate = args.keep_previous_generated

    # Candidate pool for this round of pseudo-labeling.
    if accumulate:
        candidates = [feature for position, feature in enumerate(train_features)
                      if position not in removed_feature_index]
    else:
        candidates = train_features

    fresh_features, removed_feature_index = evaluation_stage(
        args, train_examples, candidates, device, model, logger,
        removed_feature_index=removed_feature_index, generate_label=True, generate_prob_th=generate_prob_th)

    if len(fresh_features) == 0:
        logger.info("  No new training samples were generated, training procedure ends")
        return None, None

    if args.keep_previous_generated:
        new_generated_train_features.extend(fresh_features)
        return new_generated_train_features, removed_feature_index
    return fresh_features, removed_feature_index


def get_bert_model_parameters(model):
    '''
    Build the optimizer parameter groups for a BERT model.

    Parameters whose name contains 'pooler' are dropped. The remaining ones are split into a
    group with weight decay 0.01 and a group without weight decay (names containing 'bias',
    'LayerNorm.bias' or 'LayerNorm.weight').

    Returns: optimizer_grouped_parameters (list of two dicts with 'params' and 'weight_decay')
    '''
    decay_exempt_markers = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    with_decay = []
    without_decay = []
    for name, parameter in model.named_parameters():
        # hack to remove pooler, which is not used
        # thus it produce None grad that break apex
        if 'pooler' in name:
            continue
        is_exempt = False
        for marker in decay_exempt_markers:
            if marker in name:
                is_exempt = True
                break
        if is_exempt:
            without_decay.append(parameter)
        else:
            with_decay.append(parameter)

    return [
        {'params': with_decay, 'weight_decay': 0.01},
        {'params': without_decay, 'weight_decay': 0.0},
    ]


def comb_adversarial_training_stage(args, target_train_features, target_train_examples, source_train_features,
            eval_features, eval_examples, removed_feature_index, new_generated_train_features, model, epoch,
            device, best_acc, best_f1, logger):
    '''
    Run one epoch of combined (pseudo-labeled target + labeled source) training, then evaluate.

    1. Self-training: generate pseudo-labels on the target data.
    2. Source-target matching based on question-type probability:
        for each sample of a target batch, take the top-3 entries of q_type_prob and
        draw one source sample for each of those types (1 target -> up to 3 source).
    3. Combined training: concatenate target (pseudo) and source (labeled) data and train on it.

    Returns:
        (best_acc, best_f1, final_acc, final_f1). final_acc / final_f1 stay None unless
        ``epoch == args.num_train_epochs - 1``.
    '''

    def sample_source_batch_top3(source_data, source_q_type_dict, sample_pointer, q_type_probs):
        """
        Sample source examples from the top-3 question types of every target sample.
        Args:
            source_data: indexable dataset; items 0-5 of each entry are read as input ids, input mask,
                segment ids, start position, end position and question type.
            source_q_type_dict: question type -> list of indices into source_data.
            sample_pointer: per-type counters, advanced in place so each type's list is cycled in order.
            q_type_probs: per-sample question-type probabilities (expected to be a [batch_size, 6] tensor).
        Returns:
            A source batch of at most 3x the target batch size, as six stacked tensors.
        """
        output_idx = []

        for q_type_prob in q_type_probs:  # one target sample at a time
            # Indices of the 3 most probable question types
            top3_indices = torch.topk(q_type_prob, k=3).indices.tolist()

            for q_type in top3_indices:
                # Skip types for which there is no source sample
                if len(source_q_type_dict[q_type]) == 0:
                    continue

                next_q_idx = sample_pointer[q_type] % len(source_q_type_dict[q_type])
                output_idx.append(source_q_type_dict[q_type][next_q_idx])
                sample_pointer[q_type] += 1

        # NOTE(review): if every selected type is empty, output_idx stays empty and the stacking
        # below presumably raises on an empty list — confirm whether that can happen in practice.
        input_ids_source, input_masks_source, segment_ids_source, start_positions_source, end_positions_source, \
            q_types_source = [], [], [], [], [], []
        for idx in output_idx:
            input_ids_source.append(source_data[idx][0].unsqueeze(0))
            input_masks_source.append(source_data[idx][1].unsqueeze(0))
            segment_ids_source.append(source_data[idx][2].unsqueeze(0))
            start_positions_source.append(source_data[idx][3].unsqueeze(0))
            end_positions_source.append(source_data[idx][4].unsqueeze(0))
            q_types_source.append(source_data[idx][5].unsqueeze(0))

        return torch.vstack(input_ids_source), torch.vstack(input_masks_source), torch.vstack(segment_ids_source), \
            torch.cat(start_positions_source, -1), torch.cat(end_positions_source, -1), torch.cat(q_types_source, -1)

    # Generate self-training samples
    # 1. Pseudo-label generation
    new_generated_train_features, removed_feature_index = generate_self_training_samples(args, target_train_examples,
        target_train_features, device, model, removed_feature_index, new_generated_train_features, args.generate_prob_th,
        logger)
    # NOTE(review): sys.exit() raises SystemExit from inside a library-style function when no
    # pseudo-label was produced; in a notebook this aborts the running cell.
    if new_generated_train_features is None:
        sys.exit()
    
    logger.info('\n')
    logger.info('====================  Start Adversarial Training Stage  ====================')
    
    # Collect q_type_prob for every pseudo-labeled feature (taken from the data)
    all_q_type_probs = []
    for f in new_generated_train_features:
        # Check whether the feature carries a q_type_prob
        if hasattr(f, 'q_type_prob') and f.q_type_prob is not None:
            all_q_type_probs.append(f.q_type_prob)
        else:
            # Without q_type_prob, fall back to a one-hot encoding of q_type over 6 types
            prob = [0.0] * 6
            prob[f.q_type] = 1.0
            all_q_type_probs.append(prob)
    
    all_input_ids = torch.tensor([f.input_ids for f in new_generated_train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in new_generated_train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in new_generated_train_features], dtype=torch.long)
    all_start_positions = torch.tensor([f.start_position for f in new_generated_train_features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in new_generated_train_features], dtype=torch.long)
    all_q_types = torch.tensor([f.q_type for f in new_generated_train_features], dtype=torch.long)
    all_q_type_probs = torch.tensor(all_q_type_probs, dtype=torch.float)
    
    # Target dataset; q_type_probs is the last field so the training loop can split it off with batch[-1]
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
        all_start_positions, all_end_positions, all_q_types, all_q_type_probs)
    
    source_input_ids = torch.tensor([f.input_ids for f in source_train_features], dtype=torch.long)
    source_input_mask = torch.tensor([f.input_mask for f in source_train_features], dtype=torch.long)
    source_segment_ids = torch.tensor([f.segment_ids for f in source_train_features], dtype=torch.long)
    source_start_positions = torch.tensor([f.start_position for f in source_train_features], dtype=torch.long)
    source_end_positions = torch.tensor([f.end_position for f in source_train_features], dtype=torch.long)
    source_q_types = []
    # Question type (0-5) -> positions of the source features of that type
    source_q_type_dict = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: []
    }
    for idx, f in enumerate(source_train_features):
        source_q_types.append(f.q_type)
        source_q_type_dict[f.q_type].append(idx)
    source_q_types = torch.tensor(source_q_types, dtype=torch.long)
    # Shuffle each type's index list once; sample_pointer then walks through it in order
    for key in source_q_type_dict.keys():
        random.shuffle(source_q_type_dict[key])
    sample_pointer = [0] * 6
    source_data = TensorDataset(source_input_ids, source_input_mask, source_segment_ids, source_start_positions, 
        source_end_positions, source_q_types)

    train_sampler = BERTRandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
    
    data_len = len(new_generated_train_features)
    logger.info("  Num split examples = %d", data_len)
    logger.info("  Batch size = %d", args.train_batch_size)
    num_train_steps = math.ceil(data_len / args.train_batch_size / args.gradient_accumulation_steps)
    if num_train_steps == 0 and data_len > 0:
        num_train_steps = 1
    t_total = num_train_steps
    logger.info("  Num steps = %d", num_train_steps)

    # NOTE(review): loss_sum is initialised but not updated anywhere in this function.
    loss_sum = 0
    # A fresh optimizer is created on every call of this function
    optimizer_grouped_parameters = get_bert_model_parameters(model)
    optimizer = BertAdam(optimizer_grouped_parameters,
            lr=args.adapt_learning_rate,
            warmup=args.warmup_proportion,
            t_total=t_total)
    global_step = 0

    # Iterate over target batches (pseudo-labeled batch -> source matching -> combined training)
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        model.train()
        # Sample the source batch (top-3 question types by probability); batch[-1] holds q_type_probs
        batch_source = sample_source_batch_top3(source_data, source_q_type_dict, sample_pointer, batch[-1])
        batch = tuple(t.to(device) for t in batch[:-1])  # q_type_probs is left out
        batch_source = tuple(t.to(device) for t in batch_source)
        input_ids, input_masks, segment_ids, start_positions, end_positions, q_types = batch
        input_ids_source, input_masks_source, segment_ids_source, start_positions_source, end_positions_source, q_types_source = batch_source
        # Concatenate target and source along the batch dimension
        input_ids = torch.cat((input_ids, input_ids_source), 0)
        input_masks = torch.cat((input_masks, input_masks_source), 0)
        segment_ids = torch.cat((segment_ids, segment_ids_source), 0)
        start_positions = torch.cat((start_positions, start_positions_source), 0)
        end_positions = torch.cat((end_positions, end_positions_source), 0)
        q_types = torch.cat((q_types, q_types_source), 0)
        # Compute the QC4QA loss and backpropagate
        loss = model.forward_ours(input_ids, segment_ids, input_masks, start_positions,
                end_positions, q_types, lambda_c=args.lambda_c)
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        
        loss.backward()
        
        # NOTE(review): when the number of batches is not a multiple of gradient_accumulation_steps,
        # the gradients of the trailing batches are accumulated but never applied by this loop.
        if (step + 1) % args.gradient_accumulation_steps == 0:
            # modify learning rate with special warm up BERT uses
            # (warmup_linear is not defined in this notebook — presumably from da_data_utils; confirm)
            lr_this_step = args.adapt_learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
        
    # Last epoch: evaluate without a baseline to obtain the final scores, then compare them with the
    # running best (which may checkpoint the model). Other epochs: evaluation_stage does the comparison.
    final_acc, final_f1 = None, None
    if epoch == args.num_train_epochs - 1:
        final_acc, final_f1 = evaluation_stage(args, eval_examples, eval_features, device, model,
            global_step=global_step, best_acc=None, best_f1=None, logger=logger)
        best_acc, best_f1 = compare_performance(args, best_acc, best_f1, final_acc, final_f1, model, logger)
    else:
        best_acc, best_f1 = evaluation_stage(args, eval_examples, eval_features, device, model,
            global_step=global_step, best_acc=best_acc, best_f1=best_f1, logger=logger)
    return best_acc, best_f1, final_acc, final_f1


def prepare_model(args, device):
    """Load the input checkpoint into a QC4QA question-answering model on ``device``.

    Args:
        args: Run configuration; reads ``input_dir``, ``input_model_file`` and ``bert_model``,
            and is forwarded to ``from_pretrained``.
        device: Torch device the checkpoint is mapped onto and the model is moved to.

    Returns:
        The loaded ``BertForQuestionAnsweringQC4QA`` model.
    """
    # Load the model fine-tuned on the source domain (per the original note, run_source.py loads
    # the pretrained model instead).
    input_model_file = os.path.join(args.input_dir, args.input_model_file)
    # map_location makes a checkpoint saved from a GPU loadable on whatever device is in use
    # (including the CPU fallback), instead of failing while deserializing CUDA tensors.
    model_state_dict = torch.load(input_model_file, map_location=device)
    model = BertForQuestionAnsweringQC4QA.from_pretrained(args.bert_model, state_dict=model_state_dict, args=args)
    model.to(device)
    return model


def adaptation_stage(args, tokenizer, device, logger, debug=True):
    '''
    Main domain-adaptation loop.

    Loads the input model, reads the target training, source training and target evaluation
    data, runs ``comb_adversarial_training_stage`` once per epoch, and finally saves the model
    state to ``args.output_model_file + '.final'`` inside ``args.output_dir``.

    Args:
        args: Run configuration; reads ``target_train_file``, ``source_train_file``,
            ``target_predict_file``, ``num_train_epochs``, ``output_dir`` and
            ``output_model_file`` here, and is forwarded to the helpers.
        tokenizer: Tokenizer handed to ``read_features_and_examples``.
        device: Torch device used for the model.
        logger: Logger for progress messages and the final scores.
        debug: When true, only 50 samples are read from each file.

    Returns:
        None. The scores are only logged.
    '''
    model = prepare_model(args, device)
    best_acc, best_f1 = 0, 0

    # Debug runs read a small slice of every file.
    read_limit = None
    if debug:
        read_limit = 50

    ## Read target training examples
    logger.info("***** Reading Target Unlabeled Training Samples *****")
    train_features, train_examples = read_features_and_examples(args, args.target_train_file, tokenizer, logger,
        use_simple_feature=False, read_examples=True, limit=read_limit)

    ## Read source training examples
    logger.info("***** Reading Source Training Samples *****")
    source_train_features, _ = read_features_and_examples(args, args.source_train_file, tokenizer, logger,
        use_simple_feature=False, read_examples=True, limit=read_limit)

    # Read evaluation samples
    logger.info("***** Reading Evaluation Samples *****")
    eval_features, eval_examples = read_features_and_examples(args, args.target_predict_file, tokenizer, logger,
        use_simple_feature=False, read_examples=True, limit=read_limit)

    removed_feature_index = set()
    new_generated_train_features = []
    final_acc, final_f1 = 0.0, 0.0
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        logger.info('\n')
        logger.info(' ###########  Start Training Epoch %d  ###########', epoch + 1)
        logger.info('\n')
        best_acc, best_f1, final_acc, final_f1 = comb_adversarial_training_stage(args, train_features, train_examples,
                source_train_features, eval_features, eval_examples, removed_feature_index, new_generated_train_features,
                model, epoch, device, best_acc, best_f1, logger)
        logger.info('\n')
        logger.info(' ###########  End Training Epoch %d  ###########', epoch + 1)
        logger.info('\n')

    # Save the final trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, args.output_model_file + '.final')
    torch.save(model_to_save.state_dict(), output_model_file)
    logger.info('The final model has been save')
    logger.info('*** The Training Stage is Ended ***')
    logger.info('\n\nBest EM is %.5f. Best F1 is %.5f', best_acc, best_f1)
    # The training stage reports None for the final scores on epochs it does not treat as the
    # last one, so only format them when they are available.
    if final_acc is not None and final_f1 is not None:
        # The second value is the final F1; it was previously mislabeled as "Best F1".
        logger.info('\n\nFinal EM is %.5f. Final F1 is %.5f', final_acc, final_f1)

In [None]:
# Run configuration, built as a Namespace (stand-in for command-line argument parsing)
args = argparse.Namespace(**{
    # --- mode switches ---
    'debug': True,  # debugging with a dataset of sample_limit size
    'bert_model': 'bert-base-uncased',
    'do_adaptation': True,
    'do_predict': False,
    'do_lower_case': True,
    # --- data files ---
    'source_train_file': "../../data/squad/train-v1.1_classified_qtype_prob.jsonl",
    'target_train_file': "../../data/cnn/cnn_train_classified_qtype_prob.jsonl",
    'target_predict_file': "../../data/cnn/cnn_dev.json",
    # --- model input / output locations ---
    'input_dir': "../../model/qa/squad",
    'input_model_file': "best_model_0916.bin",
    'output_dir': "../../model/qa/squad2target",
    'output_model_file': "adaptation_20251105_1300.bin",
    'logger_path': "../../../model/qa/squad2target",
    # --- training / evaluation settings ---
    'max_seq_length': 512,
    'seed': 42,
    'gradient_accumulation_steps': 1,
    'train_batch_size': 12,
    'predict_batch_size': 12,
    'num_workers': 4,
    'evaluation_interval': 2000,
    'loss_logging_interval': 500,
    'train_learning_rate': 3e-5,
    'num_train_epochs': 2,
    'warmup_proportion': 0.1,
    'n_best_size': 20,
    'max_answer_length': 30,
    'verbose_logging': False,
    'use_simple_feature': False,
    # --- self-training / adaptation settings ---
    'generate_prob_th': 0.6,
    'keep_previous_generated': False,
    'use_BN': True,
    'output_prediction': True,
    'source_sampling_ratio': 3,
    'doc_stride': 128,
    'max_query_length': 64,
    'adapt_learning_rate': 1e-5,
    'lambda_c': 0.1,
    'sample_limit': 100,  ### dataset size for debugging
})

In [None]:
# 3) Prepare the logger, the compute device and the tokenizer
logger = config_logger(args.logger_path)
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
print("device = {}".format(device))

11/05/2025 12:25:53 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/gayeon44/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


device = cuda


In [11]:
# 4) Load the input model and run the adaptation stage
# NOTE(review): adaptation_stage() calls prepare_model() itself, so this `model` is a second copy
# of the checkpoint that the adaptation run does not use. It is kept here because other cells
# may refer to `model`; drop this line if nothing does.
model = prepare_model(args, device)
# Take the debug switch from the run configuration instead of hard-coding it.
adaptation_stage(args, tokenizer, device, logger, debug=args.debug)

11/05/2025 12:26:01 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /home/gayeon44/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
11/05/2025 12:26:01 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /home/gayeon44/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpn60klqn6
11/05/2025 12:26:03 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "max_seq_length": 512,
  "num_attention_heads": 12,
  "num_hidden_layers"