In [1]:
from pathlib import Path

from summary_abstractive.module_model_handler.ver2 import (
    FaiseqTranslationModelHandlerVer2WordEmbeddings, 
    TranslationResultContainer,
    EvaluationTargetTranslationPair)

import json
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
PATH_MODEL_BART_CNN = Path("/workdir/kmitsuzawa/Project/neurips-2025/ConstraintsFact-Dreyer-2023/abstractive-factual-tradeoff/tests/testresources/models/bart.large.cnn")
assert PATH_MODEL_BART_CNN.exists()

In [3]:
PATH_DATASET_CNN = Path("/workdir/kmitsuzawa/Project/neurips-2025/ConstraintsFact-Dreyer-2023/abstractive-factual-tradeoff/tests/testresources/datasets/constraints_fact_v1.0/cnn_dailymail/collect.json")
assert PATH_DATASET_CNN.exists()

In [4]:
PATH_CACHE_DIR_BASE = Path("/workdir/kmitsuzawa/DATA/mitsuzaw/project_UCA/MT_MMD/flagging_dreyer_2023/summary_cache")
assert PATH_CACHE_DIR_BASE.exists()

In [None]:
summary_model_handler = FaiseqTranslationModelHandlerVer2WordEmbeddings(
    path_cache_dir=PATH_CACHE_DIR_BASE,
    path_dir_fairseq_model=PATH_MODEL_BART_CNN
)

In [7]:
with PATH_DATASET_CNN.open('r') as f:
    seq_dataset_obj = [json.loads(_line) for _line in f.readlines()]
# end with

In [8]:
import logging

from summary_abstractive import logger_module
from datetime import datetime

path_log_dir = Path("/workdir/kmitsuzawa/DATA/mitsuzaw/project_UCA/MT_MMD/flagging_dreyer_2023/Dreyer_2023-constraints_fact_CNN-2025-07-10/generations") / f'{datetime.now().isoformat()}.log'

logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)
file_handler = logging.FileHandler(path_log_dir)
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(logger_module.formatter)

std_handler = logging.StreamHandler()
std_handler.setLevel(logging.DEBUG)
std_handler.setFormatter(logger_module.formatter)

logger.addHandler(file_handler)
logger.addHandler(std_handler)

# re-setting the log level.
logging.getLogger('fairseq').setLevel(logging.WARNING)

In [10]:
# -------------------------------------------
# getting git commit id
import git
repo = git.Repo(search_parent_directories=True)
sha = repo.head.object.hexsha
logger.info(f'Current Git Commit: {sha}')

2025-07-10 19:20:04,351 - main - /tmp/ipykernel_361810/3888030051.py - INFO - Current Git Commit: a352a44b446aa232472af5e3e6d87f6991dee469


In [17]:
n_sampling = 25
tau_parameters = [float(f'{_tau:.1f}') for _tau in np.arange(0.1, 1.0, 0.1)]

In [None]:
for _obj in seq_dataset_obj:
    _document_id: str = str(_obj['document_id'])

    _document_full: str = _obj['document_full']
    _document_original: str = _obj['document_original']
    
    _penalty_command: str = _obj['abstractiveness_constraint']
    
    assert _document_full == _document_original

    _input_record = EvaluationTargetTranslationPair(sentence_id=_document_id, source=_document_full, target="")

    logger.info('=' * 30)
    logger.info(f"document-id = {_document_id}")
    for _tau in tau_parameters:
        summary_model_handler.translate_sample_multiple_times(
            input_text=_input_record,
            n_sampling=n_sampling,
            temperature=_tau,
            penalty_command=_penalty_command
        )
        logger.info(f"done tau={_tau}")

2025-07-10 19:25:23,504 - main - /tmp/ipykernel_361810/469521946.py - INFO - document-id = 0
INFO:main:document-id = 0
2025-07-10 19:25:53,496 - main - /tmp/ipykernel_361810/469521946.py - INFO - done tau=0.1
INFO:main:done tau=0.1
2025-07-10 19:26:24,005 - main - /tmp/ipykernel_361810/469521946.py - INFO - done tau=0.2
INFO:main:done tau=0.2
2025-07-10 19:26:54,740 - main - /tmp/ipykernel_361810/469521946.py - INFO - done tau=0.3
INFO:main:done tau=0.3
2025-07-10 19:27:24,834 - main - /tmp/ipykernel_361810/469521946.py - INFO - done tau=0.4
INFO:main:done tau=0.4
