In [4]:
import os
import json
from typing import Dict, List
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score as bert_score
from parent import parent as parent_function

class TextInputHandler:
    """Read a UTF-8 text file either as a single string or as a list of lines."""

    def __init__(self, file_path: str):
        """
        Initialize with the path to the file.

        The path is only stored here; the file is opened on each read call.
        """
        self.file_path = file_path

    def _open(self):
        """
        Open the file for reading as UTF-8 text.

        The file is opened directly instead of being checked with
        os.path.exists first, so there is no window between the check and the
        open in which the file could disappear.

        Raises:
            FileNotFoundError: If the path does not exist; the message names the path.
        """
        try:
            return open(self.file_path, 'r', encoding='utf-8')
        except FileNotFoundError:
            # Re-raise with the handler's own message; the low-level context adds nothing.
            raise FileNotFoundError(f"File {self.file_path} not found.") from None

    def read_file(self) -> str:
        """
        Read the entire content of the file.

        Returns:
            The full file content as one string.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        with self._open() as file:
            return file.read()

    def read_lines(self) -> List[str]:
        """
        Read the file line by line into a list.

        Returns:
            One string per line; each line keeps its trailing newline, exactly
            as produced by ``readlines()``.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        with self._open() as file:
            return file.readlines()

class EvaluationMetric:
    """Common interface shared by the evaluation metrics in this file."""

    def evaluate(self, reference: str, hypothesis: str, table_data: dict = None) -> Dict:
        """
        Score a hypothesis against a reference.

        This base implementation always raises; concrete metrics supply
        their own version.

        Raises:
            NotImplementedError: Always.
        """
        message = "This method should be overridden by subclasses."
        raise NotImplementedError(message)

class ROUGEEvaluation(EvaluationMetric):
    """Metric that delegates scoring to a `Rouge` scorer instance."""

    def __init__(self):
        """
        Build the Rouge scorer once and keep it on the instance for later calls.
        """
        self.rouge = Rouge()

    def evaluate(self, reference: str, hypothesis: str, table_data: dict = None) -> Dict:
        """
        Score the hypothesis against the reference with ROUGE.

        `table_data` is accepted for signature compatibility and is not used.

        Returns:
            The result of `Rouge.get_scores` called with `avg=True`.
        """
        # The scorer takes the hypothesis first and the reference second.
        return self.rouge.get_scores(hypothesis, reference, avg=True)

class BLUEEvaluation(EvaluationMetric):
    """Metric that scores a hypothesis with `sentence_bleu`."""

    def evaluate(self, reference: str, hypothesis: str, table_data: dict = None) -> float:
        """
        Compute a BLEU score for the hypothesis.

        Both texts are tokenized by whitespace. The reference is wrapped in a
        one-element list because `sentence_bleu` takes a list of references.
        `table_data` is accepted for signature compatibility and is not used.

        Returns:
            The value returned by `sentence_bleu`.
        """
        candidate = hypothesis.split()
        reference_set = [reference.split()]
        return sentence_bleu(reference_set, candidate)

class BertScoreEvaluation(EvaluationMetric):
    """Metric that scores a hypothesis with `bert_score`."""

    def evaluate(self, reference: str, hypothesis: str, table_data: dict = None) -> Dict:
        """
        Compute BERTScore precision, recall and F1.

        The hypothesis and reference are each passed as a single-item list,
        with `lang='en'`. `table_data` is accepted for signature compatibility
        and is not used.

        Returns:
            A dict with keys "precision", "recall" and "f1", each holding the
            mean of the corresponding result reduced to a Python scalar.
        """
        precision, recall, f1 = bert_score([hypothesis], [reference], lang='en')
        summary = {}
        for label, values in (("precision", precision), ("recall", recall), ("f1", f1)):
            summary[label] = values.mean().item()
        return summary

class PARENTEvaluation(EvaluationMetric):
    """Metric that scores hypotheses against references and tables with PARENT."""

    def __init__(self, table_file: str, n_jobs: int = 32):
        """
        Initialize the PARENT evaluator with the table file.

        Args:
            table_file: Path to a file holding one JSON document per line.
            n_jobs: Value forwarded as `n_jobs` to the PARENT scorer. Defaults
                to 32, the value that was previously hard-coded.
        """
        self.table_file = table_file
        self.n_jobs = n_jobs

    def read_table_file(self) -> List[dict]:
        """
        Read the table file into a list, one parsed JSON entry per line.

        Lines that are empty or contain only whitespace are skipped.

        Returns:
            The parsed entries in file order.
        """
        with open(self.table_file, mode="r", encoding='utf8') as f:
            tables = [json.loads(line) for line in f if line.strip()]
        return tables

    def evaluate(self, references: List[str], hypotheses: List[str], tables: List[dict]) -> Dict:
        """
        Evaluate using PARENT metrics.

        Unlike the other metrics in this file, this method takes parallel
        lists rather than single strings.

        Args:
            references: Reference texts, one per example.
            hypotheses: Generated texts, one per example.
            tables: Table entries, one per example.

        Returns:
            A dict with keys "precision", "recall" and "f1" holding the
            averaged results returned by the PARENT scorer.

        Raises:
            AssertionError: If the three inputs do not have the same length.
        """
        # Raised explicitly rather than via `assert` so the check still runs
        # under `python -O`; the exception type is kept for compatibility.
        if not (len(references) == len(hypotheses) == len(tables)):
            raise AssertionError("Mismatch in the number of references, hypotheses, and tables")

        # The scorer is given token lists; texts are tokenized by whitespace.
        references_split = [ref.split() for ref in references]
        hypotheses_split = [hyp.split() for hyp in hypotheses]

        precision, recall, f_score = parent_function(
            hypotheses_split,
            references_split,
            tables,
            avg_results=True,
            n_jobs=self.n_jobs,
            use_tqdm=False
        )
        return {"precision": precision, "recall": recall, "f1": f_score}

class EvaluationPipeline:
    """Load reference and hypothesis files and score them with one metric."""

    def __init__(self, reference_file: str, hypothesis_file: str, metric: EvaluationMetric, table_file: str = None):
        """
        Set up the pipeline.

        A `TextInputHandler` is created for each of the two text files. The
        metric and the table file path are stored as given.
        """
        self.reference_handler = TextInputHandler(reference_file)
        self.hypothesis_handler = TextInputHandler(hypothesis_file)
        self.metric = metric
        self.table_file = table_file

    def run(self) -> Dict:
        """
        Read both input files, apply the metric and return its result.

        For a `PARENTEvaluation` metric the line lists are passed through
        together with the tables loaded by the metric itself. For every other
        metric the lines of each file are joined with "\\n" into one string.
        """
        reference_lines = self.reference_handler.read_lines()
        hypothesis_lines = self.hypothesis_handler.read_lines()

        if not isinstance(self.metric, PARENTEvaluation):
            joined_reference = "\n".join(reference_lines)
            joined_hypothesis = "\n".join(hypothesis_lines)
            return self.metric.evaluate(joined_reference, joined_hypothesis)

        tables = self.metric.read_table_file()
        return self.metric.evaluate(reference_lines, hypothesis_lines, tables)

def select_metric(metric_name: str, table_file: str = None) -> EvaluationMetric:
    """
    Build the metric object that corresponds to `metric_name`.

    The name is matched case-insensitively against 'rouge', 'bleu',
    'bertscore' and 'parent'.

    Raises:
        ValueError: If 'parent' is requested without a table file, or if the
            name matches none of the known metrics.
    """
    key = metric_name.lower()

    # Metrics that are constructed without any arguments.
    table_free = {
        'rouge': ROUGEEvaluation,
        'bleu': BLUEEvaluation,
        'bertscore': BertScoreEvaluation,
    }
    if key in table_free:
        return table_free[key]()

    if key != 'parent':
        raise ValueError(f"Unknown metric: {metric_name}")

    if table_file is None:
        raise ValueError("Table file must be provided for PARENT evaluation.")
    return PARENTEvaluation(table_file)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

## USE CASE FOR PARENT
if __name__ == "__main__":
    # Input files for this run: reference texts, generated texts, and the
    # tables file used by the PARENT metric.
    reference_file = 'data/wb_test_output.txt'
    hypothesis_file = 'data/wb_predictions.txt'
    table_file = 'data/wb_test_tables.jl'
    
    # Choose the evaluation metric by input string
    metric_name = 'parent'  # or 'rouge', 'bleu', 'bertscore'
    # The table file is only forwarded to select_metric when PARENT is chosen.
    metric = select_metric(metric_name, table_file=table_file if metric_name == 'parent' else None)
    
    # Run the evaluation pipeline
    # NOTE: table_file is passed to the pipeline unconditionally, unlike the
    # select_metric call above.
    pipeline = EvaluationPipeline(reference_file, hypothesis_file, metric, table_file=table_file)
    scores = pipeline.run()
    print(scores)




{'precision': 0.7974652832334177, 'recall': 0.4502892917429524, 'f1': 0.5528647763011489}


In [6]:


## USE CASE FOR OTHER METRICS

# Usage Example
if __name__ == "__main__":
    # Input files for this run: reference texts and generated texts.
    reference_file = 'data/B_ref.txt'
    hypothesis_file = 'data/B_out.txt'
    
    # Choose the evaluation metric by input string.
    # Alternatives accepted by select_metric without a table file:
    # metric_name = 'rouge'
    # metric_name = 'bleu'
    metric_name = 'bertscore'
    # No table file is passed here; select_metric defaults it to None.
    metric = select_metric(metric_name)
    
    # Run the evaluation pipeline
    pipeline = EvaluationPipeline(reference_file, hypothesis_file, metric)
    scores = pipeline.run()
    print(scores)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': 0.8495630025863647, 'recall': 0.8725723624229431, 'f1': 0.8609139919281006}
