# In [None]:
#!/usr/bin/env python3
import sys
import sacrebleu
from nltk.translate.meteor_score import meteor_score

def detokenize(tokens):
    """
    Turn a sequence of tokens back into one sentence string.

    This is the simplest possible strategy: tokens are glued together
    with a single space between each pair. Replace the body if your
    tokenization scheme needs special handling.
    """
    separator = " "
    sentence = separator.join(tokens)
    return sentence

def read_and_detokenize(filename):
    """
    Load a tokenized corpus with one sentence per line.

    Every line is stripped, split on whitespace into tokens, and handed
    to detokenize(). The resulting sentences are returned as a list in
    file order, one entry per input line.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        # Swap in a more sophisticated detokenizer here if you have one.
        return [detokenize(raw.strip().split()) for raw in handle]

def main():
    """
    Score a prediction file against a reference file, sentence by sentence.

    Command-line arguments:
        sys.argv[1]: path to the reference file (one tokenized sentence per line)
        sys.argv[2]: path to the prediction file (same layout)

    For each aligned pair of lines a sentence-level BLEU (sacreBLEU with
    'exp' smoothing) and a METEOR score (NLTK) are computed, printed, and
    written one per line to "bleu_scores.txt" and "meteor_scores.txt" in
    the current working directory. Averages are printed at the end.

    Exits with status 1 if fewer than two file arguments are supplied.
    """
    if len(sys.argv) < 3:
        print("Usage: python evaluate_scores.py <reference_file> <prediction_file>")
        sys.exit(1)

    reference_file = sys.argv[1]  # e.g. FinalTestGreedy.txt
    prediction_file = sys.argv[2]  # e.g. Finaltesting_label2.txt

    # Read and detokenize the references and predictions
    refs = read_and_detokenize(reference_file)
    preds = read_and_detokenize(prediction_file)

    if len(refs) != len(preds):
        # zip() below stops at the shorter list, so surplus lines are ignored.
        print("Warning: Number of reference sentences and predictions differ!")

    bleu_output_file = "bleu_scores.txt"
    meteor_output_file = "meteor_scores.txt"

    bleu_scores = []
    meteor_scores = []

    with open(bleu_output_file, "w", encoding='utf-8') as out_bleu, \
         open(meteor_output_file, "w", encoding='utf-8') as out_meteor:

        for ref, pred in zip(refs, preds):
            # sacreBLEU expects the prediction as a string and references as a list
            bleu = sacrebleu.sentence_bleu(pred, [ref], smooth_method='exp')
            # Current NLTK releases require pre-tokenized input for METEOR
            # (a list of token lists for the references, a token list for
            # the hypothesis) and raise TypeError on plain strings. The
            # sentences were built by joining tokens with single spaces,
            # so split() recovers the original tokens exactly.
            meteor = meteor_score([ref.split()], pred.split())
            bleu_scores.append(bleu.score)
            meteor_scores.append(meteor)

            print(f"Reference: {ref}\nPrediction: {pred}")
            print(f"BLEU: {bleu.score:.2f}, METEOR: {meteor:.2f}\n")

            out_bleu.write(f"{bleu.score}\n")
            out_meteor.write(f"{meteor}\n")

    print("Evaluation complete!")
    if bleu_scores:
        print(f"Average BLEU: {sum(bleu_scores)/len(bleu_scores):.2f}")
        print(f"Average METEOR: {sum(meteor_scores)/len(meteor_scores):.2f}")
    else:
        # Empty input would otherwise raise ZeroDivisionError in the averages.
        print("No sentence pairs were scored; averages are undefined.")
    print(f"BLEU scores saved to: {bleu_output_file}")
    print(f"METEOR scores saved to: {meteor_output_file}")

# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
