In [1]:
### Cargamos el modelo phi-4
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_id = "microsoft/phi-4-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model_lm = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype="auto")
generator = pipeline("text-generation", model=model_lm, tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.01s/it]
Device set to use cuda:0


In [2]:
prompt = "<|user|>\n¿Qué indica un valor alto del índice SDI en el agua de alimentación de RO?\n<|assistant|>"
respuesta = generator(prompt, max_new_tokens=256, do_sample=True, temperature=0.5)[0]["generated_text"]
respuesta

'<|user|>\n¿Qué indica un valor alto del índice SDI en el agua de alimentación de RO?\n<|assistant|>El Índice de Dispersión de Inorgánicos (SDI) es una medida utilizada para evaluar la capacidad de un sistema de agua de rociada (RO) para eliminar inorgánicos disueltos, como metales pesados y compuestos de sílice, de la agua. Un valor alto del SDI indica una mayor presencia de estos contaminantes en el agua de alimentación del sistema de RO.\n\nSi el SDI de tu agua de alimentación es alto, podría indicar que el sistema de RO necesita una mayor cantidad de intercambio de iones o que el proceso de filtración no está funcionando eficientemente para remover estos contaminantes. Esto podría deberse a varias razones, como:\n\n1. **Contaminación de Fuentes de Agua**: La fuente de agua de alimentación puede estar contaminada con altos niveles de metales pesados o compuestos de sílice.\n2. **Necesidad de Mejora del Sistema de RO**: El sistema de RO podría necesitar una mejora, como un filtro de 

### RAG System

In [3]:
import pickle
#Usaremos los embedding y metadatos usados anteriormente
with open("../outputs/embeddings_y_metadatos.pkl", "rb") as f:
    data = pickle.load(f)

embeddings = data["embeddings"]
metadatos = data["metadatos"]

In [4]:
#Uso FAISS para indexar
#FAISS es una librería desarrollada por Meta (Facebook) para hacer búsqueda rápida de vectores por similitud, ideal cuando tienes muchos embeddings (como en RAG).
import faiss
import numpy as np
embedding_matrix = np.array(embeddings)

# Crear índice FAISS (búsqueda por similitud L2 o Euclidiana)
dim = embedding_matrix.shape[1] 
index = faiss.IndexFlatL2(dim)  

# Agregar los vectores al índice
index.add(embedding_matrix)

# Guardar el índice en disco
faiss.write_index(index, "../outputs/faiss_index.index")

In [5]:
# Obtengo las chunks mas relevantes y genero la pregunta al modelo 
def responder_con_phi4_con_contexto(pregunta, modelo_embedding, k=5, inEnglish= False):
    # Embeddear la pregunta
    pregunta_vec = modelo_embedding.encode([pregunta])

    # Buscar k chunks relevantes en FAISS
    D, I = index.search(np.array(pregunta_vec), k)

    # Recuperar los chunks y sus títulos
    chunks_usados = []
    contexto = ""

    for idx in I[0]:
        doc = metadatos[idx]
        chunk_text = doc["chunk"].strip()
        titulo = doc.get("id_doc", "Sin título")

        chunks_usados.append({
            "titulo": titulo,
            "chunk": chunk_text
        })

        contexto += f"- {chunk_text}\n"
    idioma = "La respuesta tiene que ser en ingles" if inEnglish else "" 
    # Construir prompt para Phi-4
    prompt = f"<|user|>\nUsa el siguiente contexto para responder la pregunta de manera clara y precisa.\n\nContexto:\n{contexto}\nPregunta: {pregunta} {idioma}\n<|assistant|>"
    
    # Generar respuesta
    output = generator(
        prompt,
        max_new_tokens=300,
        temperature=0.5,
        do_sample=True
    )[0]["generated_text"]

    respuesta = output[len(prompt):].strip()

    return {
        "pregunta": pregunta,
        "respuesta": respuesta,
        "chunks_usados": chunks_usados
    }

In [6]:
from sentence_transformers import SentenceTransformer
# Uso el modelo que use en el sentence embedding para codificar mi pregunta
modelo_embedding = SentenceTransformer("distiluse-base-multilingual-cased-v1")

#Uso la misma pregunta que el caso anterior
resultado = responder_con_phi4_con_contexto(
    "¿Qué indica un valor alto del índice SDI en el agua de alimentación de RO?",
    modelo_embedding
)

print("🔹 Pregunta:", resultado["pregunta"])
print("📣 Respuesta generada:\n", resultado["respuesta"])
print("\n📚 Chunks utilizados:")
for i, chunk in enumerate(resultado["chunks_usados"], 1):
    print(f"{i}. 📝 Documento: {chunk['titulo']}\n   📄 Texto: {chunk['chunk'][:200]}...")

🔹 Pregunta: ¿Qué indica un valor alto del índice SDI en el agua de alimentación de RO?
📣 Respuesta generada:
 Un valor alto del índice SDI (Scalability Index) en el agua de alimentación de RO (Reverse Osmosis) indica un alto potencial de escorrentía de carbonato de calcio (CaCO3). Esto significa que el agua de alimentación es propensa a formar depósitos de carbonato de calcio en las membranas RO, lo que puede llevar a una reducción en la eficiencia del proceso de filtrado y potencialmente a fallos en el sistema. Por lo tanto, un valor alto del SDI requiere una atención especial en el tratamiento del agua de alimentación para reducir el potencial de escorrentía de carbonato de calcio. (Fuente: Contexto proporcionado)

📚 Chunks utilizados:
1. 📝 Documento: 7.5 RO FOULING substance (anaysis solution).pdf
   📄 Texto: . the sdi is calculated from the rate of plugging of a 0.45 m membrane filter when water is passed through at a constant applied gauge pressure . the method is described below 

### Evaluaremos Metrica

In [8]:

import pandas as pd
# Cargar el archivo CSV
df_qa = pd.read_csv("../dataQA/qa.txt")
df_qa.head()

Unnamed: 0,chunk,question,answer
0,seawater seawater tds mgl considered standard ...,What types of water are classified based on To...,Water is classified into categories like seawa...
1,recovery limit salinity andor boron concentrat...,Why is it important to limit product recovery ...,Limiting product recovery is important to ensu...
2,design boynton beach fl membrane softening wat...,How is the maximum recovery value determined f...,The maximum recovery value is determined by co...
3,range rather absolute value temperature variat...,Why is average temperature used for performanc...,Average temperature is used because membrane p...
4,risk scaling due water scarcity environmental ...,Why must scaling substances be removed from tr...,"Even after secondary treatment, wastewater can..."


In [9]:
df_qa.describe()

Unnamed: 0,chunk,question,answer
count,260,260,260
unique,260,260,260
top,seawater seawater tds mgl considered standard ...,What types of water are classified based on To...,Water is classified into categories like seawa...
freq,1,1,1


### GENERACION RAG

In [10]:
import json
# Generaremos las respuestas con el modelo con RAG y lo almacenaremos
for i, row in df_qa.iterrows():
    pregunta = row["question"]
    resultado = responder_con_phi4_con_contexto(pregunta, modelo_embedding,5,True)
    df_qa.at[i, "answer_modelo_rag"] = resultado["respuesta"]
    df_qa.at[i, "retrieved"] = json.dumps(resultado["chunks_usados"])

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


### GENERACION MODELO SIN RAG

In [None]:
import json
# Generaremos las respuestas con el modelo con RAG y lo almacenaremos
for i, row in df_qa.iterrows():
    pregunta = row["question"]
    prompt = f"<|user|>\n{pregunta}\n<|assistant|>"
    resultado =  generator(prompt, max_new_tokens=300, do_sample=True, temperature=0.5)[0]["generated_text"]
    respuesta = resultado[len(prompt):].strip()
    df_qa.at[i, "answer_modelo"] = respuesta

In [None]:
df_qa.head()

Unnamed: 0,chunk,question,answer,answer_modelo_rag,retrieved
0,seawater seawater tds mgl considered standard ...,What types of water are classified based on To...,Water is classified into categories like seawa...,Water can be classified based on Total Dissolv...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis..."
1,recovery limit salinity andor boron concentrat...,Why is it important to limit product recovery ...,Limiting product recovery is important to ensu...,Limiting product recovery in reverse osmosis (...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis..."
2,design boynton beach fl membrane softening wat...,How is the maximum recovery value determined f...,The maximum recovery value is determined by co...,The maximum recovery value for membrane soften...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis..."
3,range rather absolute value temperature variat...,Why is average temperature used for performanc...,Average temperature is used because membrane p...,El texto proporcionado no menciona explícitame...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis..."
4,risk scaling due water scarcity environmental ...,Why must scaling substances be removed from tr...,"Even after secondary treatment, wastewater can...",Scaling substances must be removed from treate...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis..."


In [None]:
df_qa.iloc[0]["retrieved"]

'[{"titulo": "7.5 RO FOULING substance (anaysis solution).pdf", "chunk": "achieve electroneutrality is recommended . page 24 of 182 trademark of the dow chemical company form no . 609000710705 213 of 865 table 2.5 water analysis for ronf sample identification ....................................................................................................... ..................................... feed source ................................................................................................................. ........................................ conductivit y ................................................... ph ............... temperature c ...................... ..... feed water analysis please give units mgl as ion or ppm as caco3 or meql na ..................... nh4 k ..................... ..................... mg2 ..................... ca2 ..................... ba2 ..................... sr2 ..................... fe2 ..................... fe tot ..................

## ROUGUE SCORE

In [None]:
import os
output_dir = os.path.join("..", "resultados")
# El ROUGE score (Recall-Oriented Understudy for Gisting Evaluation) es una métrica ampliamente usada para evaluar la calidad de textos generados automáticamente
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
## Definimos funcion para hallar ROUGE
def calculate_rouge_score(column_name,data_name):
    # Evaluar ROUGE para cada par respuesta_modelo - respuesta_referencia
    rouge_scores = []
    
    for i, row in df_qa.iterrows():
        ref = row["answer"]  # respuesta de referencia
        gen = row[column_name]  # respuesta generada
        score = scorer.score(ref, gen)
        rouge_scores.append({
            "ROUGE-1": score["rouge1"].fmeasure,
            "ROUGE-2": score["rouge2"].fmeasure,
            "ROUGE-L": score["rougeL"].fmeasure
        })
        
    #Resultados
    # Convertir a DataFrame y unirlo al original
    df_rouge = pd.DataFrame(rouge_scores)
    df_resultado = pd.concat([df_qa.reset_index(drop=True), df_rouge], axis=1)

    # Mostrar puntajes promedio
    promedios = df_rouge.mean()
    print("🔍 Promedios ROUGE:")
    print(promedios.round(4))
    
    # Ver resultados por pregunta
    print("\n📌 Ejemplos con ROUGE:")
    print(df_resultado[["question", "ROUGE-1", "ROUGE-2", "ROUGE-L"]])

    # Guardar
    output_path = os.path.join(output_dir, data_name)
    df_resultado.to_csv(output_path, index=False)
    print(f"\n✅ Guardado en {data_name}")

### Evaluaremos ROUGUE SCORE sin RAG

In [None]:

#ROUGE-1
#¿Qué mide? Coincidencias de unigramas (palabras individuales).
#ROUGE-2
#¿Qué mide? Coincidencias de bigramas (pares de palabras consecutivas).
#ROUGE-L
#¿Qué mide? La subsecuencia común más larga (LCS) entre el texto generado y la referencia.

#| Métrica     | Bueno  | Muy bueno | Excelente |
#| ----------- | ------ | --------- | --------- |
#| **ROUGE-1** | > 0.40 | > 0.50    | > 0.60    |
#| **ROUGE-2** | > 0.20 | > 0.30    | > 0.40    |
#| **ROUGE-L** | > 0.30 | > 0.40    | > 0.50    |
calculate_rouge_score("answer_modelo","evaluacion_con_rouge.csv")

🔍 Promedios ROUGE:
ROUGE-1    0.0989
ROUGE-2    0.0263
ROUGE-L    0.0763
dtype: float64

📌 Ejemplos con ROUGE:
                                              question   ROUGE-1   ROUGE-2  \
0    What types of water are classified based on To...  0.177606  0.038911   
1    Why is it important to limit product recovery ...  0.152091  0.038314   
2    How is the maximum recovery value determined f...  0.143426  0.048193   
3    Why is average temperature used for performanc...  0.159091  0.030534   
4    Why must scaling substances be removed from tr...  0.103704  0.029851   
..                                                 ...       ...       ...   
255  What are common techniques for chlorine remova...  0.073394  0.009259   
256  How is residual chlorine detected in water tre...  0.067114  0.013605   
257  Why is dechlorination critical before operatin...  0.064516  0.008130   
258  What is required after chlorination before res...  0.042194  0.000000   
259  What ORP reading confirms 

### Evaluaremos ROUGUE SCORE con RAG

In [None]:
calculate_rouge_score("answer_modelo_rag","evaluacion_rag_con_rouge.csv")

🔍 Promedios ROUGE:
ROUGE-1    0.1391
ROUGE-2    0.0363
ROUGE-L    0.1069
dtype: float64

📌 Ejemplos con ROUGE:
                                              question   ROUGE-1   ROUGE-2  \
0    What types of water are classified based on To...  0.197309  0.054299   
1    Why is it important to limit product recovery ...  0.205479  0.055556   
2    How is the maximum recovery value determined f...  0.220690  0.083916   
3    Why is average temperature used for performanc...  0.186667  0.071749   
4    Why must scaling substances be removed from tr...  0.265487  0.054054   
..                                                 ...       ...       ...   
255  What are common techniques for chlorine remova...  0.084507  0.018957   
256  How is residual chlorine detected in water tre...  0.049587  0.000000   
257  Why is dechlorination critical before operatin...  0.105882  0.023810   
258  What is required after chlorination before res...  0.060150  0.000000   
259  What ORP reading confirms 

In [None]:
df_qa["question"][1]

'Why is it important to limit product recovery in RO systems?'

In [None]:
df_qa["answer"][1]

'Limiting product recovery is important to ensure the salinity and boron levels in the product water meet required standards, as exceeding recovery limits may compromise water quality depending on site-specific conditions.'

In [None]:
df_qa["answer_modelo_rag"][1]

'Limiting product recovery in RO (Reverse Osmosis) systems is important for several reasons:\n\n1. Caso Scaling: High recovery rates can lead to increased concentrations of calcium, magnesium, and silica (Ca, Mg, and Si) ions in the permeate. When these ions exceed the solubility product (Ksp) of calcium sulfate (CaSO4), barium sulfate (BaSO4), or other scaling salts, precipitation occurs. This scaling can clog the RO membranes and reduce system efficiency.\n\n2. Feedwater Quality: Higher recovery rates can concentrate contaminants in the permeate, which may exceed the quality requirements for the intended use of the treated water. This could necessitate additional post-treatment steps or even lead to the rejection of the permeate if it does not meet the necessary standards.\n\n3. System Longevity: Operating at high recovery rates can increase the stress on the RO membranes and other system components due to the higher volume of water being processed. This can lead to more frequent fou

Aunque semanticamente el texto generado es parcialmente similar y tecnicamente correcto, el ROUGE bajo es justificado ya que muchas palabras no coincide y la estructura no coincide(gramaticalmente incorrecta), se evaluara modificar el prompt, ademas de implementar tecnicas de evaluacion del contexto extraido por el Retriever como el DSLR y finalmente el finetunning.

### BERT SCORE

In [None]:
### Definimos funcion que me calcula y muestra BERTSCORE
from bert_score import score
def calculate_bert_score(name_column,name_data):
    # Evaluar BERT SCORE para cada par respuesta_modelo - respuesta_referencia
    bert_scores = []
    for i, row in df_qa.iterrows():
        ref = row["answer"]  # respuesta de referencia
        gen = row[name_column]  # respuesta generada
        P, R, F1 = score([gen], [ref], lang="en",verbose=False)
        bert_scores.append({
            "PRECISION": P.item(),
            "RECALL": R.item(),
            "F1": F1.item()
        })
        

    #Resultados
    # Convertir a DataFrame y unirlo al original
    df_bert = pd.DataFrame(bert_scores)
    df_resultado_bert = pd.concat([df_qa.reset_index(drop=True), df_bert], axis=1)

    # Mostrar puntajes promedio
    promedios = df_bert.mean()
    print("🔍 Promedios BERTSCORE:")
    print(promedios.round(4))
    # Ver resultados por pregunta
    print("\n📌 Ejemplos con BERT:")
    print(df_resultado_bert[["question", "PRECISION", "RECALL", "F1"]])
    # Guardar
    output_path = os.path.join(output_dir, name_data)
    df_resultado_bert.to_csv(output_path, index=False)
    

### Calculamos BERT SCORE sin RAG

In [None]:
calculate_bert_score("answer_modelo","evaluacion_con_bert.csv")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

🔍 Promedios BERTSCORE:
PRECISION    0.7995
RECALL       0.8712
F1           0.8337
dtype: float64

📌 Ejemplos con BERT:
                                              question  PRECISION    RECALL  \
0    What types of water are classified based on To...   0.783743  0.850091   
1    Why is it important to limit product recovery ...   0.813159  0.871822   
2    How is the maximum recovery value determined f...   0.801137  0.873413   
3    Why is average temperature used for performanc...   0.805194  0.873872   
4    Why must scaling substances be removed from tr...   0.812015  0.873517   
..                                                 ...        ...       ...   
255  What are common techniques for chlorine remova...   0.787643  0.856531   
256  How is residual chlorine detected in water tre...   0.809292  0.856270   
257  Why is dechlorination critical before operatin...   0.797607  0.861866   
258  What is required after chlorination before res...   0.796293  0.872093   
259  What O

### Calculamos BERT SCORE RAG

In [None]:
calculate_bert_score("answer_modelo_rag","evaluacion_rag_con_bert.csv")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

🔍 Promedios BERTSCORE:
PRECISION    0.8198
RECALL       0.8829
F1           0.8500
dtype: float64

📌 Ejemplos con BERT:
                                              question  PRECISION    RECALL  \
0    What types of water are classified based on To...   0.786532  0.877145   
1    Why is it important to limit product recovery ...   0.855727  0.889647   
2    How is the maximum recovery value determined f...   0.846424  0.896047   
3    Why is average temperature used for performanc...   0.834434  0.897821   
4    Why must scaling substances be removed from tr...   0.873621  0.903980   
..                                                 ...        ...       ...   
255  What are common techniques for chlorine remova...   0.797148  0.868725   
256  How is residual chlorine detected in water tre...   0.790489  0.848710   
257  Why is dechlorination critical before operatin...   0.821544  0.873289   
258  What is required after chlorination before res...   0.810875  0.871679   
259  What O

### FRANQ 

In [None]:
import re
#Genero el prompt para generar los claims
def generar_prompt_atomic_claims(texto: str) -> str:
    prompt = f"""Your task is to extract atomic factual claims from the input text.

Each claim must:
1. **Atomicity**: Break down each statement into the smallest possible unit of factual information. Avoid grouping multiple facts in one claim.
2. **Context-Independent**: Each claim must be understandable and verifiable on its own without requiring additional context.
3. **Precise and Unambiguous**: Ensure the claims are specific and avoid combining related ideas.
4. **No Formatting**: The response must be a Python list of strings without any extra formatting, code blocks, or labels like "python".

### Example:
If the input text is:
"Mary is a five-year-old girl. She likes playing piano and doesn’t like cookies."

The output should be:
["Mary is a five-year-old girl.", "Mary likes playing piano.", "Mary doesn’t like cookies."]

Note that your response will be passed to the python interpreter, SO NO OTHER WORDS!

### Input:
{texto}

### Output:"""
    return prompt.strip()



def extraer_claims_con_phi4(respuesta_rag, generator, max_new_tokens=256):
    prompt = generar_prompt_atomic_claims(respuesta_rag)

    output = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        truncation=True
    )[0]["generated_text"]

    try:
        listas = re.findall(r"\[[^\[\]]+\]", output, re.DOTALL)
        if listas:
            claims = eval(listas[-1])  
        else:
            print("⚠️ No se encontro una lista:")
            
            claims = []
    except Exception as e:
        print("⚠️ Error procesando claims. Output:")
        print(output)
        claims = []

    return claims

In [None]:
df_qa["claims_phi4"] = None  
# Usa la funcion generar los claims y almacenarlos en el Data Frame
for i, row in df_qa.iterrows():
    respuesta = row["answer_modelo_rag"]
    claims = extraer_claims_con_phi4(respuesta, generator)
    df_qa.at[i, "claims_phi4"] = claims

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore

In [None]:
df_qa.iloc[1]["claims_phi4"]

['Limiting product recovery in RO systems is important.',
 'Prevents precipitation of dissolved salts.',
 'Prevention of scaling is achieved by keeping salt concentration below solubility limit.',
 'Solubility of salts depends on temperature and pH.',
 'Reducing system recovery lowers salt concentration in concentrate.',
 'Scaling can cause operational issues.',
 'Scaling reduces efficiency of RO system.',
 'Scaling leads to increased maintenance costs.',
 'Adjusting operating variables controls scaling.',
 'Pretreatment or chemical dosing are less desirable methods.',
 'Adjusting operating variables has economic drawbacks.',
 'Increased energy consumption is a drawback of adjusting operating variables.',
 'Balance is crucial to minimize scaling and consider efficiency and cost-effectiveness.']

In [None]:
df_qa

Unnamed: 0,chunk,question,answer,answer_modelo_rag,retrieved,claims_phi4
0,seawater seawater tds mgl considered standard ...,What types of water are classified based on To...,Water is classified into categories like seawa...,Water can be classified based on Total Dissolv...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...",[Water can be classified based on Total Dissol...
1,recovery limit salinity andor boron concentrat...,Why is it important to limit product recovery ...,Limiting product recovery is important to ensu...,Limiting product recovery in RO (Reverse Osmos...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...",[Limiting product recovery in RO systems is im...
2,design boynton beach fl membrane softening wat...,How is the maximum recovery value determined f...,The maximum recovery value is determined by co...,The maximum recovery value for membrane soften...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...",[The maximum recovery value for membrane softe...
3,range rather absolute value temperature variat...,Why is average temperature used for performanc...,Average temperature is used because membrane p...,The average temperature is used for performanc...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...",[The performance of RO membranes is temperatur...
4,risk scaling due water scarcity environmental ...,Why must scaling substances be removed from tr...,"Even after secondary treatment, wastewater can...",Scaling substances must be removed from treate...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...",[Scaling substances must be removed from treat...
...,...,...,...,...,...,...
255,dechlorination methods include activated carbo...,What are common techniques for chlorine remova...,Activated carbon and sodium bisulfite effectiv...,Common techniques for chlorine removal before ...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...",[Sodium bisulfite reacts with chlorine to form...
256,chlorine presence is monitored using ORP meter...,How is residual chlorine detected in water tre...,By using oxidation-reduction potential (ORP) m...,Residual chlorine in water treatment is typica...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...",[Residual chlorine in water treatment is detec...
257,chlorine must be fully removed before startup ...,Why is dechlorination critical before operatin...,To prevent membrane exposure to residual chlor...,Dechlorination is critical before operating RO...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...",[Dechlorination is critical before operating R...
258,periodic flushing and verification of chlorine...,What is required after chlorination before res...,Flushing the system and verifying chlorine-fre...,"After chlorination, it is recommended to perfo...","[{""titulo"": ""7.5 RO FOULING substance (anaysis...","[After chlorination, it is recommended to perf..."


In [None]:
# Guardar
df_qa.to_csv("../dataQA/qa.csv", index=False)

### Probabilidad Fidelidad: Usamos los claims para verificar si son fieles al contexto

In [None]:
from sentence_transformers import CrossEncoder

# Modelo general entrenado en Natural Language Inference (NLI)
model_align = CrossEncoder("cross-encoder/nli-roberta-base", max_length=512)

def evaluar_fidelidad_alignscore(claims, retrieved_text, model):
    pairs = [(claim, retrieved_text) for claim in claims]
    
    # Devuelve logit scores (o softmax de 3 clases si usamos `predict(probs=True)`)
    logits = model.predict(pairs, apply_softmax=True)
    
    # Extraemos la probabilidad de entailment (índice 2)
    entailment_scores = [score[2] for score in logits]
    return entailment_scores

In [None]:
import ast
import json

# Paso 1: Concatenar todos los chunks recuperados
retrieved_text = ""
for retrieved in json.loads(df_qa.iloc[242]["retrieved"]):
    retrieved_text += retrieved["chunk"] + "\n"

# Paso 2: Asegurarse de que los claims estén en formato lista
claims_raw = df_qa.iloc[242]["claims_phi4"]
claims = ast.literal_eval(claims_raw) if isinstance(claims_raw, str) else claims_raw

# Paso 3: Evaluar AlignScore
scores = evaluar_fidelidad_alignscore(claims, retrieved_text, model_align)

# Paso 4: Mostrar resultados
for c, s in zip(claims, scores):
    print(retrieved_text)
    print(f"Claim: {c}\nAlignScore: {s}\n")

to determine the optimum chlorine dosage , best point of injection , ph , and contact time to prevent biofouling , astm d 1291 33 should be applied to a representative water sample . for further details , the handbook of chlorination 34 is recommended . seawater the major difference between the chlorination chemistry of seawater and that of brackish water is the presence of bromide in seawater in concentrations of typically 65 mgl . bromide reacts rapidly with hypochlorous acid to form hypobromous acid br hocl hobr cl thus , in chlorinated seawater the biocide is predominantly hobr rather than hocl . hypobromous acid then dissociates to hypobromite ion as follows hobr obr h hobr dissociation is less than hocl dissociation . at ph 8 , where 72 of hocl is dissociated , about 17 of hobr is dissociated . in other words , effective treatment can be performed at a higher ph than in brackish water , where no bromide is present . both hypobromous acid and hypobromite ions interfere with free r

### Probabilidad condicional: Dado que no es fiel si es factual

In [None]:
import torch
import torch.nn.functional as F

def log_prob_claim(claim: str, prompt: str, model, tokenizer) -> float:
    # Pongo en modo evaluacion el modelo
    model.eval()
    device = next(model.parameters()).device
    # Quiero ver que tan probable es que a partir de mi pregunta se genere mis claim
    input_text = prompt + claim
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
    input_ids = inputs["input_ids"].to(device)
    # Para modo inferencia
    with torch.no_grad():
        outputs = model(input_ids=input_ids, return_dict=True)

    logits = outputs.logits  # (1, seq_len, g)
    # Ajuste para alinear inputs y targets
    logits = logits[:, :-1, :]
    labels = input_ids[:, 1:]

    log_probs = F.log_softmax(logits, dim=-1)
    selected_log_probs = log_probs.gather(2, labels.unsqueeze(-1)).squeeze(-1)
    # Extraer la parte del claim
    prompt_len = len(tokenizer(prompt)["input_ids"])
    claim_log_probs = selected_log_probs[:, prompt_len:]
    print(claim_log_probs)

    if claim_log_probs.numel() == 0:
        return float("-inf")

    avg_log_prob = claim_log_probs.mean().item()
    return avg_log_prob


In [None]:
for claim in claims:
        avg_log_p = log_prob_claim(claim, df_qa.iloc[242]["question"], model_lm, tokenizer)
        joint_prob = torch.exp(torch.tensor(avg_log_p))
        print(f"→ joint p(c | x): {joint_prob:.10f}\n")


tensor([[-1.3250e+01, -2.8906e-01, -1.0781e+00, -3.2812e-01, -8.2031e-01,
         -1.7871e-01, -1.5723e-01, -1.2812e+00, -1.4551e-01, -1.3281e-01,
         -1.1475e-02, -1.0681e-03, -1.1484e+00, -1.0742e-02, -1.6308e-04,
         -7.6953e-01]], device='cuda:0', dtype=torch.bfloat16)
→ joint p(c | x): 0.2932990491



### Probabilidad condicional: Dado que es fiel si es factual

In [None]:
nli_model = CrossEncoder("cross-encoder/nli-roberta-base", device="cuda" if torch.cuda.is_available() else "cpu")

def calcular_max_nli(claim: str, retrieved_chunks: list, model_nli) -> float:
    # Preparar pares (premisa, hipótesis)
    pairs = [(chunk["chunk"], claim) for chunk in retrieved_chunks]
    
    # Predecir probabilidades
    probs = model_nli.predict(pairs, apply_softmax=True)  # shape: (k, 3)

    max_score = 0.0
    for prob in probs:
        entail = prob[2]         # índice 2: entailment
        contradict = prob[0]     # índice 0: contradiction

        denom = entail + contradict
        if denom > 0:
            ratio = entail / denom
            max_score = max(max_score, ratio)

    return max_score

In [None]:
retrieved_chunks = []
for retrieved in json.loads(df_qa.iloc[0]["retrieved"]):
    retrieved_chunks.append({"chunk": retrieved["chunk"]})
for claim in claims:
    max_nli_score = calcular_max_nli(claim, retrieved_chunks, nli_model)
    print(f"✅ Claim: {claim}")
    print(f"→ MaxNLI Score: {max_nli_score:.4f}\n")

✅ Claim: The speed of light in a vacuum is approximately 299,792 kilometers per second.
→ MaxNLI Score: 0.5175



### FRANQ A NUESTRA DATA

In [None]:
def calcular_FRANQ_respuesta(claims, retrieved_chunks, question, model_align, model_nli, model_lm, tokenizer):
    franq_claims=[]
    # Concatenamos los chunks recuperados
    retrieved_text = "\n".join(chunk["chunk"] for chunk in retrieved_chunks)

    for claim in claims:
        # AlignScore
        align = evaluar_fidelidad_alignscore([claim], retrieved_text, model_align)[0]

        # MaxNLI
        maxnli = calcular_max_nli(claim, retrieved_chunks, model_nli)

        # p(c | x)
        logp = log_prob_claim(claim, question, model_lm, tokenizer)

        # Normalizamos logp de [-10, 0] a [0, 1]
        logp_norm = max(min((logp + 10) / 10, 1), 0)

        # FRANQ individual para el claim
        fr_c = align * maxnli + (1 - align) * logp_norm
        franq_claims.append(fr_c)

    fr_score = np.mean(franq_claims)
    return fr_score

In [None]:
franq_scores = []

for i, row in df_qa.iterrows():
    try:
        # Parsear claims y chunks si están como strings
        claims_raw = row["claims_phi4"]
        claims = ast.literal_eval(claims_raw) if isinstance(claims_raw, str) else claims_raw

        retrieved_chunks_raw = row["retrieved"]
        retrieved_chunks = json.loads(retrieved_chunks_raw) if isinstance(retrieved_chunks_raw, str) else retrieved_chunks_raw

        # Calcular el score FRANQ para esa respuesta
        score = calcular_FRANQ_respuesta(
            claims=claims,
            retrieved_chunks=retrieved_chunks,
            question=row["question"],
            model_align=model_align,
            model_nli=nli_model,
            model_lm=model_lm,
            tokenizer=tokenizer
        )
    except Exception as e:
        print(f"⚠️ Error en fila {i}: {e}")
        score = None

    franq_scores.append(score)
# Convertir a DataFrame y unirlo al original
df_franq = pd.DataFrame(franq_scores)
df_resultado_franq = pd.concat([df_qa.reset_index(drop=True), df_franq], axis=1)

# Ver resultados por pregunta
print("\n📌 Ejemplos con FRANQ:")
print(df_resultado_franq)
# Guardar
output_path = os.path.join(output_dir, "evaluacion_rag_con_franq.csv")
df_resultado_franq.to_csv(output_path, index=False)


tensor([[-1.5391e+00, -1.3977e-02, -5.2002e-02, -3.3398e-01, -2.6941e-05,
         -2.2070e-01, -2.1815e-05, -5.0735e-04, -8.3923e-05, -2.0790e-04,
         -1.3828e-04, -7.6294e-06, -3.1471e-05, -9.6512e-04, -9.0820e-02,
         -7.1562e+00]], device='cuda:0', dtype=torch.bfloat16)
tensor([[-8.5547e-01, -3.2500e+00, -2.0625e+00, -5.4169e-04, -9.4727e-02,
         -2.7656e+00, -6.0938e-01, -1.6708e-03, -5.1953e-01, -5.8289e-03,
         -1.9360e-04, -1.0469e+00, -1.6809e-05, -2.9297e-03, -2.4109e-03,
         -3.9795e-02, -5.8899e-03, -5.3101e-03, -2.1406e+00, -4.7266e-01,
         -6.1328e-01, -5.7602e-04, -2.6245e-03, -1.5182e-03, -7.1716e-03,
         -1.0312e+00]], device='cuda:0', dtype=torch.bfloat16)
tensor([[-3.0518e-02, -5.4550e-04, -1.7578e-01, -2.6719e+00, -1.9297e+00,
         -4.9210e-04, -3.1641e-01, -1.9844e+00, -1.3379e-01, -6.2012e-02,
         -5.6250e-01, -7.1106e-03, -2.0752e-03, -8.0078e-02, -6.9885e-03,
         -2.0752e-03, -7.1716e-04, -1.8597e-04, -1.2734e+00,

In [None]:
df_franq

Unnamed: 0,0
0,0.923167
1,0.908828
2,0.980917
3,0.966632
4,0.960459
...,...
255,0.884149
256,0.937102
257,0.936589
258,0.888172


In [None]:
promedios = df_franq.mean()

promedios

0    0.873961
dtype: float32

## REFINAMIENTO DOCUMENTOS EXTRAIDOS: DSLR

### Separamos los documentso recuperados en oraciones

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

def dividir_oraciones(texto):
    doc = nlp(texto)
    return [sent.text.strip() for sent in doc.sents]

In [None]:
import json
## Dividiremos en oraciones un documento recuperado
lista_chunks = json.loads(df_qa["retrieved"][2])
lista_chunks[0]["chunk"]

'more effective than long cleaning times e.g. , 30 seconds every 30 minutes . cleaning can also be carried out with cleaning chemicals as described in section 6. in batch processes like waste water treatment , cleaning the membranes after every batch is common practice . the cleaning procedure , cleaning chemicals , and frequency of cleaning need to be determined and optimized case by case . special care has to be taken not to allow a scaling layer to develop over time . 2.3.8 adjustment of operating variables when other scalecontrol methods do not work , the operating variables of the plant have to be adjusted in such a way that scaling will not occur . the precipitation of dissolved salts can be avoided by keeping their concentration below the solubility limit . this is accomplished by reducing the system recovery until the concentrate concentration is low enough . solubility depends also on temperature and ph . in the case of silica , increasing temperature and ph increases its solu

In [None]:
tokenized_chunk = dividir_oraciones(lista_chunks[0]["chunk"])
tokenized_chunk

['more effective than long cleaning times e.g. , 30 seconds every 30 minutes .',
 'cleaning can also be carried out with cleaning chemicals as described in section 6.',
 'in batch processes like waste water treatment , cleaning the membranes after every batch is common practice .',
 'the cleaning procedure , cleaning chemicals , and frequency of cleaning need to be determined and optimized case by case .',
 'special care has to be taken not to allow a scaling layer to develop over time .',
 '2.3.8 adjustment of operating variables when other scalecontrol methods do not work , the operating variables of the plant have to be adjusted in such a way that scaling will not occur .',
 'the precipitation of dissolved salts can be avoided by keeping their concentration below the solubility limit .',
 'this is accomplished by reducing the system recovery until the concentrate concentration is low enough .',
 'solubility depends also on temperature and ph .',
 'in the case of silica , increasing 

### Usamos un modelo para el re ranking por similaridad

In [None]:
question = df_qa["question"][2]
df_qa["question"][2]

'How is the maximum recovery value determined for membrane softening systems?'

In [None]:
from sentence_transformers import CrossEncoder

# Puedes usar un modelo como este:

def re_rank_oraciones(question, oraciones):
    reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    pairs = [(question, sent) for sent in oraciones]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(oraciones, scores), key=lambda x: x[1], reverse=True)
    return ranked

In [None]:
ranked_sentences=re_rank_oraciones(question,tokenized_chunk)
ranked_sentences

[('this is accomplished by reducing the system recovery until the concentrate concentration is low enough .',
  np.float32(-8.315925)),
 ('in batch processes like waste water treatment , cleaning the membranes after every batch is common practice .',
  np.float32(-10.723798)),
 ('solubility depends also on temperature and ph .', np.float32(-11.01448)),
 ('the precipitation of dissolved salts can be avoided by keeping their concentration below the solubility limit .',
  np.float32(-11.182241)),
 ('more effective than long cleaning times e.g. , 30 seconds every 30 minutes .',
  np.float32(-11.265989)),
 ('the cleaning procedure , cleaning chemicals , and frequency of cleaning need to be determined and optimized case by case .',
  np.float32(-11.274893)),
 ('in the case of silica , increasing temperature and ph increases its solubility see section 2.4.7.',
  np.float32(-11.306295)),
 ('2.3.8 adjustment of operating variables when other scalecontrol methods do not work , the operating vari

### Filtramos las oracion con treshold adaptativo(90% percentil)



In [None]:
import numpy as np

def filtrar_por_umbral(oraciones_ranked, percentil=90):
    scores = [score for _, score in oraciones_ranked]
    umbral = np.percentile(scores, percentil)
    oraciones_filtradas = [(sent, score) for sent, score in oraciones_ranked if score >= umbral]
    return oraciones_filtradas, umbral

In [None]:
filtered_sentences,treshhold=filtrar_por_umbral(ranked_sentences)
filtered_sentences

[('this is accomplished by reducing the system recovery until the concentrate concentration is low enough .',
  np.float32(-8.315925)),
 ('in batch processes like waste water treatment , cleaning the membranes after every batch is common practice .',
  np.float32(-10.723798))]

### Reconstruccion orden original

In [None]:
def reconstruir_contexto(oraciones_filtradas, oraciones_originales):
    oraciones_validas = set([sent for sent, _ in oraciones_filtradas])
    reconstruido = [sent for sent in oraciones_originales if sent in oraciones_validas]
    return reconstruido

In [None]:
refined_chunks=reconstruir_contexto(filtered_sentences,tokenized_chunk)
refined_chunks

['in batch processes like waste water treatment , cleaning the membranes after every batch is common practice .',
 'this is accomplished by reducing the system recovery until the concentrate concentration is low enough .']

## Aplicamos DSLR a nuestro Pipeline

In [None]:
## Funcion para implementar los 3 pasos de DSLR 
def refine_documents(sentence,pregunta):
    # Paso 1: Separar oraciones
    oraciones = dividir_oraciones(sentence)

    # Paso 2: Rankear oraciones
    oraciones_ranked = re_rank_oraciones(pregunta, oraciones)

    # Paso 3: Filtrar con percentil 90
    oraciones_filtradas, umbral = filtrar_por_umbral(oraciones_ranked, percentil=90)

    # Paso 4: Reconstruir en orden original
    oraciones_reconstruidas = reconstruir_contexto(oraciones_filtradas, oraciones)

    # Resultado final para pasar al LLM
    documento_refinado = " ".join(oraciones_reconstruidas)
    return documento_refinado
         
    

In [None]:
## Implementamos DSLR en nuestro pipeline
def responder_con_phi4_con_contexto_refinado(pregunta, modelo_embedding, k=5, inEnglish= False):
     # Embeddear la pregunta
    pregunta_vec = modelo_embedding.encode([pregunta])

    # Buscar k chunks relevantes en FAISS
    D, I = index.search(np.array(pregunta_vec), k)

    # Recuperar los chunks y sus títulos
    chunks_usados = []
    contexto = ""
    for idx in I[0]:
        doc = metadatos[idx]
        chunk_text = doc["chunk"].strip()
        titulo = doc.get("id_doc", "Sin título")

        chunks_usados.append({
            "titulo": titulo,
            "chunk": chunk_text
        })
    chunk_filtrado = [] 
    for document in chunks_usados:
        texto_refinado=refine_documents(document["chunk"],pregunta)
        chunk_filtrado.append({
            "titulo": titulo,
            "chunk": texto_refinado
        })
        contexto += f"- {texto_refinado}\n"
    idioma = "La respuesta tiene que ser obligatoriamente en ingles" if inEnglish else "" 
    # Construir prompt para Phi-4
    prompt = f"<|user|>\nUsa el siguiente contexto para responder la pregunta de manera clara y precisa.\n\nContexto:\n{contexto}\nPregunta: {pregunta} {idioma}\n<|assistant|>"
    
    # Generar respuesta
    output = generator(
        prompt,
        max_new_tokens=300,
        temperature=0.5,
        do_sample=True
    )[0]["generated_text"]

    respuesta = output[len(prompt):].strip()

    return {
        "pregunta": pregunta,
        "respuesta": respuesta,
        "chunks_usados": chunk_filtrado
    }

In [None]:
import json
# Generaremos las respuestas con el modelo con RAG y lo almacenaremos
for i, row in df_qa.iterrows():
    pregunta = row["question"]
    resultado = responder_con_phi4_con_contexto_refinado(pregunta, modelo_embedding,5,True)
    df_qa.at[i, "answer_modelo_rag_dslr"] = resultado["respuesta"]
    df_qa.at[i, "retrieved_dslr"] = json.dumps(resultado["chunks_usados"])

In [None]:
df_qa.head()

Unnamed: 0,chunk,question,answer,answer_modelo_rag,retrieved,answer_modelo,answer_modelo_rag_dslr,retrieved_dslr
0,seawater seawater tds mgl considered standard ...,What types of water are classified based on To...,Water is classified into categories like seawa...,Water can be classified based on Total Dissolv...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...",Total Dissolved Solids (TDS) levels are used t...,"Based on Total Dissolved Solids (TDS) levels, ...","[{""titulo"": ""7.5 RO FOULING substance (anaysis..."
1,recovery limit salinity andor boron concentrat...,Why is it important to limit product recovery ...,Limiting product recovery is important to ensu...,Limiting product recovery in RO (Reverse Osmos...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...",Limiting product recovery in reverse osmosis (...,Limiting product recovery in RO (Reverse Osmos...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis..."
2,design boynton beach fl membrane softening wat...,How is the maximum recovery value determined f...,The maximum recovery value is determined by co...,The maximum recovery value for membrane soften...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...","Membrane softening systems, such as those used...",The maximum recovery value for membrane soften...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis..."
3,range rather absolute value temperature variat...,Why is average temperature used for performanc...,Average temperature is used because membrane p...,The average temperature is used for performanc...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...","The term ""average temperature"" in the context ...",The average temperature is used for performanc...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis..."
4,risk scaling due water scarcity environmental ...,Why must scaling substances be removed from tr...,"Even after secondary treatment, wastewater can...",Scaling substances must be removed from treate...,"[{""titulo"": ""7.5 RO FOULING substance (anaysis...",Membrane reverse osmosis (RO) is a water purif...,"Scaling substances, such as calcium (Ca²⁺) and...","[{""titulo"": ""7.5 RO FOULING substance (anaysis..."


In [None]:
# Guardar
df_qa.to_csv("../dataQA/qa.csv", index=False)

In [None]:
index=0

In [None]:
df_qa["question"][index]

'What types of water are classified based on Total Dissolved Solids (TDS) levels?'

In [None]:
df_qa["answer"][index]

'Water is classified into categories like seawater, brackish water, slightly saline water, estuarine water, and salt lake water based on its TDS levels, which are estimated using conductivity and a conversion factor.'

In [None]:
df_qa["answer_modelo_rag_dslr"][index]

'Based on Total Dissolved Solids (TDS) levels, water can be classified into different types, such as:\n\n1. Freshwater: TDS levels below 1,000 mg/L (milligrams per liter).\n2. Brackish water: TDS levels between 1,000 and 10,000 mg/L.\n3. Saline water: TDS levels between 10,000 and 35,000 mg/L.\n4. Brine: TDS levels above 35,000 mg/L.\n\nIn the provided context, seawater with TDS levels of 35,000 mg/L is considered standard seawater, which is the most common type of saline water. Additionally, there is a mention of seawater with TDS levels as high as 45,000 mg/L, which would be classified as brine. The context also refers to the Baltic Sea with TDS levels as low as 7,000 mg/L, which falls into the brackish water category.'

In [None]:
df_qa["answer_modelo_rag"][index]

'Water can be classified based on Total Dissolved Solids (TDS) levels into different categories such as:\n\n1. Freshwater: TDS levels less than 1,000 mg/L (parts per million, ppm).\n2. Slightly Saline Water: TDS levels between 1,000 and 3,000 mg/L.\n3. Moderately Saline Water: TDS levels between 3,000 and 10,000 mg/L.\n4. Highly Saline Water: TDS levels between 10,000 and 35,000 mg/L.\n5. Brackish Water: TDS levels between 10,000 and 30,000 mg/L.\n6. Seawater: TDS levels around 35,000 mg/L, as mentioned in the context.\n\nThese classifications help in understanding the salinity and potential impact on water systems, including the performance of reverse osmosis (RO) or nanofiltration (NF) systems.'

In [None]:
df_qa["answer_modelo"][index]

'Total Dissolved Solids (TDS) levels are used to classify water into different categories based on its purity and suitability for various uses. The TDS levels are typically measured in milligrams per liter (mg/L) or parts per million (ppm). The classification based on TDS levels is as follows:\n\n1. Freshwater: TDS levels less than 1,000 mg/L (ppm)\n   - This type of water is generally considered safe for drinking and most other household uses. It is often referred to as "soft water."\n\n2. Slightly Hard Water: TDS levels between 1,000 mg/L and 3,000 mg/L (ppm)\n   - Water in this category is still suitable for most domestic purposes, but it may have a slight taste and may require minimal treatment for certain uses.\n\n3. Hard Water: TDS levels between 3,000 mg/L and 10,000 mg/L (ppm)\n   - Hard water has a higher concentration of dissolved minerals, which can lead to scaling and reduced efficiency in appliances like water heaters and boilers. It may require treatment to soften the wat

## Calculamos metricas RAG + DSLR

### ROUGE SCORE

In [None]:
calculate_rouge_score("answer_modelo_rag_dslr","evaluacion_rag_dslr_con_rouge.csv")

🔍 Promedios ROUGE:
ROUGE-1    0.1459
ROUGE-2    0.0380
ROUGE-L    0.1124
dtype: float64

📌 Ejemplos con ROUGE:
                                              question   ROUGE-1   ROUGE-2  \
0    What types of water are classified based on To...  0.236686  0.059880   
1    Why is it important to limit product recovery ...  0.201550  0.062992   
2    How is the maximum recovery value determined f...  0.201258  0.089172   
3    Why is average temperature used for performanc...  0.251497  0.060606   
4    Why must scaling substances be removed from tr...  0.256410  0.052174   
..                                                 ...       ...       ...   
255  What are common techniques for chlorine remova...  0.083721  0.018779   
256  How is residual chlorine detected in water tre...  0.202020  0.082474   
257  Why is dechlorination critical before operatin...  0.148148  0.037500   
258  What is required after chlorination before res...  0.070175  0.000000   
259  What ORP reading confirms 

### BERT SCORE

In [None]:
calculate_bert_score("answer_modelo_rag_dslr","evaluacion_rag_dslr_con_bert.csv")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

🔍 Promedios BERTSCORE:
PRECISION    0.8221
RECALL       0.8849
F1           0.8522
dtype: float64

📌 Ejemplos con BERT:
                                              question  PRECISION    RECALL  \
0    What types of water are classified based on To...   0.804627  0.867760   
1    Why is it important to limit product recovery ...   0.855789  0.888052   
2    How is the maximum recovery value determined f...   0.845124  0.890191   
3    Why is average temperature used for performanc...   0.850769  0.905027   
4    Why must scaling substances be removed from tr...   0.843867  0.895583   
..                                                 ...        ...       ...   
255  What are common techniques for chlorine remova...   0.782025  0.863368   
256  How is residual chlorine detected in water tre...   0.834602  0.902513   
257  Why is dechlorination critical before operatin...   0.826730  0.890720   
258  What is required after chlorination before res...   0.823923  0.867447   
259  What O

In [None]:
from sentence_transformers import SentenceTransformer
# Uso el modelo que use en el sentence embedding para codificar mi pregunta
modelo_embedding = SentenceTransformer("distiluse-base-multilingual-cased-v1")
# Generacion 
pregunta = "¿Qué indica un valor alto del índice SDI en el agua de alimentación de RO?"
resultado = responder_con_phi4_con_contexto_refinado(pregunta, modelo_embedding,5,True)
print("🔹 Pregunta:", resultado["pregunta"])
print("📣 Respuesta generada:\n", resultado["respuesta"])
print("\n📚 Chunks utilizados:")
for i, chunk in enumerate(resultado["chunks_usados"], 1):
    print(f"{i}. 📝 Documento: {chunk['titulo']}\n   📄 Texto: {chunk['chunk'][:200]}...")

🔹 Pregunta: ¿Qué indica un valor alto del índice SDI en el agua de alimentación de RO?
📣 Respuesta generada:
 Un valor alto del índice SDI (Silt Density Index) en el agua de alimentación de RO (Reversa Osmosis) indica una alta concentración de partículas finas, como silicio y bicarbonato. Esto puede afectar negativamente el rendimiento y la longevidad del sistema de RO, ya que puede llevar a un mayor engrasado y obstrucción de los membranas. Es esencial mantener un SDI bajo para asegurar un funcionamiento óptimo del sistema de RO.

📚 Chunks utilizados:
1. 📝 Documento: 7.5 RO FOULING substance (anaysis solution).pdf
   📄 Texto: . 23....
2. 📝 Documento: 7.5 RO FOULING substance (anaysis solution).pdf
   📄 Texto: temperature variation can impact the scaling potential of an ro system , especially when silica and bicarbonate levels in the feed water are high ....
3. 📝 Documento: 7.5 RO FOULING substance (anaysis solution).pdf
   📄 Texto: d 515 d 516...
4. 📝 Documento: 7.5 RO FOULING substan