In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

In [None]:
import json
import re
import os
from rouge_score import rouge_scorer

# Folder scanned for the prediction ``*.json`` files to score.
input_folder = "/content/drive/My Drive/DatabaseTesi/TEST_RAG/afterValidationFirstLastCompl"
# Folder where the per-file ROUGE-L result files are written.
output_folder = "/content/drive/My Drive/DatabaseTesi/TEST_RAG/afterValidationFirstLastComplRouge"
# Folder holding the original (reference) files the predictions are compared with.
original_folder = "/content/drive/My Drive/DatabaseTesi/TEST_RAG/test"

def compute_rouge_l(predicted_json, actual_json):
    """Return the ROUGE-L F-measure of a prediction against its reference.

    Args:
        predicted_json: Predicted text to evaluate.
        actual_json: Reference text the prediction is compared with.

    Returns:
        The ``fmeasure`` attribute of the ``rougeL`` score.
    """
    # The scorer is called with the reference first and the prediction second.
    rouge_l = rouge_scorer.RougeScorer(['rougeL']).score(actual_json, predicted_json)['rougeL']
    return rouge_l.fmeasure

# Matches the first ```json fenced block. Tolerates trailing spaces/tabs after
# the ``json`` tag, CRLF line endings and an indented closing fence, none of
# which should prevent the fence from being stripped.
_JSON_FENCE_RE = re.compile(r'```json[ \t]*\r?\n(.*?)\r?\n[ \t]*```', re.DOTALL)


def extract_clean_json(prediction_json):
    """Return the content of the first ```json fenced block in the text.

    Args:
        prediction_json: Raw prediction text, possibly wrapping its JSON in a
            Markdown ```json code fence.

    Returns:
        The text between the opening and closing fence (without the
        surrounding line breaks) when a fence is found; otherwise the input
        string unchanged.
    """
    fenced = _JSON_FENCE_RE.search(prediction_json)
    return fenced.group(1) if fenced else prediction_json

def process_files(input_folder, output_folder, original_folder):
    """Score every prediction file in ``input_folder`` with ROUGE-L.

    For each ``*.json`` file in ``input_folder`` the matching original file is
    looked up in ``original_folder`` (its name is the prediction file name
    with the ``predictions_and_validation_results_`` prefix and ``.json``
    removed). Every predicted step is compared with the corresponding entry
    of the original's ``steps`` mapping, and the scores are written to
    ``output_folder`` as ``rouge_validation_results_<prediction file name>``.

    Prediction files without an original, steps missing from the original,
    and predictions lacking a string ``'json'`` field are reported and
    skipped rather than aborting the whole run.

    Args:
        input_folder: Folder containing the prediction JSON files.
        output_folder: Folder receiving the result files; created if needed.
        original_folder: Folder containing the original files.
    """
    # exist_ok avoids the check-then-create race and is a no-op when the
    # folder is already there.
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.endswith(".json"):
            continue

        prediction_file_path = os.path.join(input_folder, filename)
        with open(prediction_file_path, 'r', encoding='utf-8') as f:
            prediction_data = json.load(f)

        original_filename = filename.replace("predictions_and_validation_results_", "").replace(".json", "")
        original_file_path = os.path.join(original_folder, original_filename)

        if not os.path.exists(original_file_path):
            print(f"File originale non trovato: {original_file_path}")
            continue

        with open(original_file_path, 'r', encoding='utf-8') as f:
            original_data = json.load(f)

        rouge_results = {}

        for step_id, prediction in prediction_data.items():
            original_step = original_data.get('steps', {}).get(str(step_id), None)
            if not original_step:
                print(f"Passo {step_id} non trovato nel file originale per {filename}.")
                continue

            # A malformed prediction entry must not abort the whole batch:
            # report it and move on, like the other skip cases above.
            predicted_raw = prediction.get('json') if isinstance(prediction, dict) else None
            if not isinstance(predicted_raw, str):
                print(f"Campo 'json' mancante o non valido per il passo {step_id} in {filename}.")
                continue

            predicted_json_content = extract_clean_json(predicted_raw)
            # The reference is serialized with the same indentation for every
            # step so that all scores are computed against a uniform layout.
            original_json_content = json.dumps(original_step, indent=4)

            rouge_l_score = compute_rouge_l(predicted_json_content, original_json_content)

            rouge_results[str(step_id)] = {
                "step": step_id,
                "validation_score": rouge_l_score
            }

        output_file_path = os.path.join(output_folder, f"rouge_validation_results_{filename}")
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(rouge_results, f, ensure_ascii=False, indent=4)

        print(f"File di risultato scritto: {output_file_path}")

# Run the ROUGE-L evaluation over the folders configured above.
process_files(input_folder, output_folder, original_folder)