In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; every path configured in this script
# lives under that mount point, so this must run before any file access.
drive.mount('/content/drive')

In [None]:
import os
import re
import json
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from huggingface_hub import InferenceClient

# InferenceClient settings: chat completions are sent through the
# "hf-inference" provider.
# NOTE(review): the api_key value looks like a placeholder and must be replaced
# with a real Hugging Face token before running; avoid committing the real one.
client = InferenceClient(
    provider="hf-inference",
    api_key="your_HF_token"
)

# Directory of workflow files; each one is parsed as JSON and processed.
workflow_directory = "/content/drive/My Drive/DatabaseTesi/TEST_RAG/test"
# Directory where one predictions/validation JSON file per workflow is written.
save_directory = "/content/drive/My Drive/DatabaseTesi/TEST_RAG/afterValidationHalf"
# Directory of text files used as the retrieval corpus for TF-IDF similarity.
directory = "/content/drive/My Drive/DatabaseTesi/TEST_RAG/noAnnotation"
# JSON file recording which workflow files have already been processed.
progress_file = "/content/drive/My Drive/DatabaseTesi/TEST_RAG/progressHalf.json" 

def read_files_from_directory(directory):
    """Read every regular file directly inside *directory* as UTF-8 text.

    Sub-directories are skipped. Entries are visited in sorted name order so
    the returned lists are deterministic across runs and file systems
    (``os.listdir`` makes no ordering guarantee), which keeps downstream
    similarity rankings reproducible when scores tie.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A ``(files, filenames)`` tuple of two parallel lists: the text content
        of each file and the corresponding file name.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    files = []
    filenames = []
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Append only after a successful read so both lists stay aligned.
        files.append(content)
        filenames.append(filename)
    return files, filenames

def get_top_similar_files(workflow_steps, files, filenames, top_n=3):
    """Rank corpus files by TF-IDF cosine similarity to a query text.

    A TF-IDF model is fitted on the query together with all corpus texts, and
    the query row is compared against every corpus row.

    Args:
        workflow_steps: Query text (e.g. a JSON-serialised workflow step).
        files: Sequence of corpus texts.
        filenames: Names parallel to *files*.
        top_n: Maximum number of results to return.

    Returns:
        A list of ``(filename, content, score)`` tuples, highest score first,
        with at most *top_n* entries. Empty when there is no corpus or
        ``top_n`` is not positive.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer`` when no document
            contains any token (empty vocabulary).
    """
    # Without this guard an empty corpus crashes inside cosine_similarity,
    # and top_n == 0 would return *every* file because `[-0:]` is the full
    # array.
    if not files or top_n <= 0:
        return []

    tfidf_matrix = TfidfVectorizer().fit_transform([workflow_steps, *files])
    # Row 0 is the query; the remaining rows are the corpus documents.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    top_indices = similarity_scores.argsort()[-top_n:][::-1]
    return [(filenames[i], files[i], similarity_scores[i]) for i in top_indices]

def make_inference_request_with_retry(
    messages,
    max_retries=3,
    *,
    retry_delay=5,
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
):
    """Request a chat completion, retrying on any failure.

    Args:
        messages: Chat messages (list of ``{"role", "content"}`` dicts) passed
            to the module-level ``client``.
        max_retries: Total number of attempts; must be at least 1.
        retry_delay: Seconds to wait between a failed attempt and the next.
        model: Model identifier sent with the request.

    Returns:
        The content of the first choice of the completion.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1. Previously the loop
            simply did not run and ``None`` was returned silently.
        Exception: The error of the last attempt, re-raised once all attempts
            are exhausted.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(1, max_retries + 1):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
            )
            return completion.choices[0].message.content
        except Exception as e:
            # Broad catch is deliberate: this is the retry boundary, and the
            # final failure is re-raised rather than swallowed.
            print(f"Errore durante l'inferenza: {e}. Tentativo {attempt} di {max_retries}.")
            if attempt == max_retries:
                raise
            time.sleep(retry_delay)  # wait before the next attempt

def save_progress(progress, file_path):
    """Persist the progress mapping to *file_path* as pretty-printed JSON.

    The data is first written to a sibling temporary file and then moved over
    the destination with ``os.replace``. Opening the destination directly in
    ``'w'`` mode truncates it immediately, so an interruption or a
    serialization error would destroy the previously saved progress.

    Args:
        progress: JSON-serialisable mapping to store.
        file_path: Destination path of the progress file.

    Raises:
        TypeError: If *progress* contains values that JSON cannot encode; the
            existing progress file is left untouched in that case.
        OSError: If the file cannot be written or replaced.
    """
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(progress, f, ensure_ascii=False, indent=4)
        os.replace(tmp_path, file_path)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up the partial file left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"Progressi salvati in {file_path}")

def load_progress(file_path):
    """Return the progress mapping stored at *file_path*.

    When nothing exists at that path yet (first run), an empty dict is
    returned so the caller can start from scratch. Otherwise the file is
    decoded as UTF-8 JSON and its content returned as-is.
    """
    if not os.path.exists(file_path):
        return {}

    with open(file_path, 'r', encoding='utf-8') as handle:
        stored = json.load(handle)
    return stored

# Restore the record of workflows handled by earlier runs (empty on first run).
progress = load_progress(progress_file)

# Parallel lists: parsed workflow JSON and the file name it came from.
workflow_files = []
workflow_filenames = []

for entry_name in os.listdir(workflow_directory):
    entry_path = os.path.join(workflow_directory, entry_name)
    if not os.path.isfile(entry_path):
        continue  # skip sub-directories and other non-file entries

    with open(entry_path, 'r', encoding='utf-8') as handle:
        try:
            parsed_workflow = json.load(handle)
        except json.JSONDecodeError:
            # A file that is not valid JSON is reported and left out.
            print(f"Errore nel caricare il file {entry_name} come JSON.")
            continue

    workflow_files.append(parsed_workflow)
    workflow_filenames.append(entry_name)

# Selects which steps of each workflow are hidden and predicted by the model.
mode = "half"  # options: "half", "first_last", "last", "first_last_compl"
steps_to_predict = []
annotations = {}

# The model is expected to answer with a fenced ```json ... ``` block.
_JSON_FENCE_RE = re.compile(r'```json\n(.*?)\n```', re.DOTALL)


def _extract_json_block(text):
    """Return the content of the first ```json fenced block in *text*, or None."""
    match = _JSON_FENCE_RE.search(text)
    return match.group(1) if match else None


def _select_steps_to_predict(mode, num_steps):
    """Return the step indices to predict for a workflow with *num_steps* steps.

    Raises:
        ValueError: If *mode* is not one of the supported options. Previously
            an unknown mode silently reused the step list of the preceding
            workflow (or an empty list).
    """
    if num_steps <= 0:
        return []
    if mode == "half":
        return list(range(num_steps // 2, num_steps))
    if mode in ("first_last", "first_last_compl"):
        # First and last steps are given; everything in between is predicted.
        return list(range(1, num_steps - 1))
    if mode == "last":
        return [num_steps - 1]
    raise ValueError(f"Unknown mode: {mode!r}")


def validate_prediction(predicted_step, actual_step):
    """Score a model answer against the real step with TF-IDF cosine similarity.

    Args:
        predicted_step: Raw model answer; only its ```json fenced block is used.
        actual_step: JSON text of the real workflow step.

    Returns:
        Similarity as a float; 0.0 when the answer has no JSON block, when
        either text is empty, or when neither text contains a usable token.
    """
    predicted_step_clean = _extract_json_block(predicted_step)
    if not predicted_step_clean or not actual_step:
        return 0.0

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([predicted_step_clean, actual_step])
    except ValueError:
        # TfidfVectorizer raises on an empty vocabulary (no token in either text).
        return 0.0

    return float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0, 0])


# The retrieval corpus is identical for every workflow, so it is read once,
# and only when there is at least one workflow left to process.
files, filenames = None, None

for workflow_name, workflow in zip(workflow_filenames, workflow_files):
    print(f"Elaborazione del workflow file: {workflow_name}")

    if workflow_name in progress:
        print(f"File {workflow_name} già elaborato, saltando...")
        continue

    steps_to_predict = _select_steps_to_predict(mode, len(workflow["steps"]))

    # Reset per workflow: step ids repeat across workflows, so reusing the
    # dict would leak annotations from a previous workflow into this one.
    annotations = {}
    for step_id in steps_to_predict:
        step = workflow["steps"].get(str(step_id), {})
        if "annotation" in step:
            annotations[step_id] = step["annotation"]

    if files is None:
        files, filenames = read_files_from_directory(directory)

    predictions = {}
    validation_results = {}
    failed_steps = []

    for step_id in steps_to_predict:
        # The preceding step is the context; step 0 has no predecessor.
        step_context = json.dumps(workflow["steps"].get(str(step_id - 1), {})) if step_id > 0 else ""
        top_similar_files = get_top_similar_files(step_context, files, filenames)
        # Only the first 500 characters of each similar file go into the prompt.
        context = "\n\n".join(
            f"File: {filename}\nContent: {content[:500]}..."
            for filename, content, _ in top_similar_files
        )
        file_names = "\n".join(f"{filename}" for filename, _, _ in top_similar_files)
        annotation_text = annotations.get(step_id, "")

        user_prompt = (
            f"Given the following workflow step context: {step_context}\n\n"
            f"The following files are similar:\n{context}\n\n"
            f"These are the top 3 similar file names:\n{file_names}\n\n"
            f"Annotation: {annotation_text}\n\n"
            f"Predict the full JSON for step {step_id}. Ensure all fields are included."
        )
        messages = [
            {"role": "system", "content": "You are an AI that generates JSON structures based on given workflow steps."},
            {"role": "user", "content": user_prompt},
        ]

        try:
            predicted_step = make_inference_request_with_retry(messages)
            actual_step = json.dumps(workflow["steps"].get(str(step_id), {}))
            validation_score = validate_prediction(predicted_step, actual_step)
        except Exception as e:
            print(f"Errore durante l'inferenza per il passo {step_id}: {e}")
            failed_steps.append(step_id)
            continue

        # Stored together so both dicts always hold the same step ids.
        predictions[step_id] = predicted_step
        validation_results[step_id] = validation_score

    # Only steps that produced a prediction are reported; iterating over
    # steps_to_predict raised KeyError as soon as one inference had failed.
    result_data = {}
    for step_id, predicted_step in predictions.items():
        json_content = _extract_json_block(predicted_step)
        result_data[step_id] = {
            "json": json_content if json_content is not None else "JSON not found",
            "validation_score": validation_results[step_id],
        }

    os.makedirs(save_directory, exist_ok=True)

    output_file_path = os.path.join(save_directory, f"predictions_and_validation_results_{workflow_name}.json")
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(result_data, f, ensure_ascii=False, indent=4)

    if failed_steps:
        # Not marked as processed, so the next run retries this workflow
        # instead of silently keeping incomplete results.
        print(f"Passi non predetti per {workflow_name}: {failed_steps}. Il file verrà rielaborato alla prossima esecuzione.")
    else:
        progress[workflow_name] = "processed"
        save_progress(progress, progress_file)

    print(f"Risultati salvati in: {output_file_path}")