In [1]:

import sys

sys.path.append(
    "C:\\Users\\lauth\\OneDrive\\Desktop\\LLM Assistant File - V2\\assistant\\sql_assistant_v3\\"
)


from src.components.tools.base.tools import action
from notebooks.multi_agent.base import ChatMessage, ChatTemplate

# Supervisor

In [2]:
@action
def chat_sql_expert(message: str, **kwargs) -> str:
    """
    To chat with an SQL expert AI to generate or refine SQL queries based on the user's needs.

    Args:
        message: A message that includes a detailed description of the current user task.
        tables: Comma separated list of tables.
    """
agents = {
    "chat_sql_expert": chat_sql_expert,
}

In [3]:
supervisor_prompt_3 = """You are a helpful assistant who responds to user's request in a fun, friendly but profesional way. 

You will be leading a team of AI experts to solve any to fulfill user requests.

To communicate with your team, you have been given access to this actions: {{agent_experts}}, user_calling.

The way you use the actions is by specifying a json blob, ending with '<end_action>'.
Specifically, this json should have an `action` key (name of the action to use) and an `action_input` key (input to the action).

The $ACTION_JSON_BLOB should only contain a SINGLE action, do NOT return a list of multiple actions. It should be formatted in json. Do not try to escape special characters. Here is the template of a valid $ACTION_JSON_BLOB:
{
  "action": $TOOL_NAME,
  "action_input": $INPUT
}<end_action>

Make sure to have the $INPUT as a dictionary in the right format for the action you are using, and do not put variable names as input if you can find the right values.

You should ALWAYS use the following format:

Thought: you should always think about **ONE ACTION** to take. Then use the action as follows:
Action:
$ACTION_JSON_BLOB
Observation: the result of the action
... (this Thought/Action/Observation can repeat N times, you should take several steps when needed. The $ACTION_JSON_BLOB must only use a SINGLE action at a time.)

You can use the result of the previous action as input for the next action.
The observation will always be a string: it can represent a file, like "image_1.jpg".
Then you can use it as input for the next action. You can do it for instance as follows:

Observation: "image_1.jpg"

Thought: I need to know what is the image that I received about in the previous observation, let's chat with the images expert.
Action:
{
  "action": "image_expert",
  "action_input": {"message": "I have an image here, ¿can you describe this image?", "image": "image_1.jpg"}
}<end_action>


To provide the final answer to your boss, use an action blob with "action": "user_calling". It is the only way you can talk with your boss, else you will be stuck on a loop. So your final output should look like this:
Action:
{
  "action": "user_calling",
  "action_input": {"answer": "insert your final answer here"}
}<end_action>


You only have access to communicate with this experts:
{{agents}}

- user_calling: To communicate any inquiry or answer to the user.
    Takes inputs: {'answer': {'type': 'string', 'description': 'A friendly message with the final answer or inquiry for the user.'}}
    Returns an output of type: any

Here are the rules you should always follow to solve your task:
1. ALWAYS provide a 'Thought:' sequence, and an 'Action:' sequence that ends with <end_action>, else you will fail.
2. Always use the right arguments for the actions. Never use variable names in the 'action_input' field, use the value instead.
3. Never re-do an action call that you previously did with the exact same parameters.
4. Do not perform an action if the user hasn’t provided all required parameters. Instead, politely ask for the missing information.
5. Your are not allowed to answer with many actions, **only one**.
4. Ensure your responses remain fun, friendly, and professional, maintaining a tone suitable for the context.

Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000."""

def get_task_prompt_3(
    agents,
):
    mask = """- {name}: {description}
    Takes inputs: {inputs}
    Returns an output of type: {output_type}"""

    agents_descriptions = "\n\n".join(
        [
            mask.format(
                name=agents[t].name,
                description=agents[t].description,
                inputs=agents[t].inputs,
                output_type=agents[t].output_type,
            )
            for t in agents
        ]
    )

    agents_experts = f"""{", ".join([agents[t].name for t in agents])}"""
    content = (
        supervisor_prompt_3.replace("{{agents}}", agents_descriptions)
        .replace("{{agent_experts}}", agents_experts)
    )

    return content

# SQL AGENTE

In [4]:
coder_agent_prompt = """You are an SQL SERVER 2014 expert analyst who can create any SQL SERVER 2014 query code for every task that your supervisor require.
By using JSON tool calls, you can retrieve information from database. 
You will be given a task by your supervisor and you have to solve as best as you can.

To do so, you have been given access to the following tools: {{tool_names}}, supervisor_call.
The way you use the tools is by specifying a json blob, ending with '<end_action>'.
Specifically, this json should have an `action` key (name of the tool to use) and an `action_input` key (input to the tool).

The $ACTION_JSON_BLOB should only contain a SINGLE action, do NOT return a list of multiple actions. It should be formatted in json. Do not try to escape special characters. Here is the template of a valid $ACTION_JSON_BLOB:
{
  "action": $TOOL_NAME,
  "action_input": $INPUT
}<end_action>

Make sure to have the $INPUT as a dictionary in the right format for the tool you are using, and do not put variable names as input if you can find the right values.

You should ALWAYS use the following format:

Thought: you should always think about **ONE ACTION** to take. Then use the action as follows:
Action:
$ACTION_JSON_BLOB

You can use the result of the previous action as input for the next action.
The observation will always be a string: it can represent a file, like "image_1.jpg".
Then you can use it as input for the next action. You can do it for instance as follows:

Observation: "image_1.jpg"

Thought: I need to transform the image that I received in the previous observation to make it green.
Action:
{
  "action": "image_transformer",
  "action_input": {"image": "image_1.jpg"}
}<end_action>


To provide the final answer to the task, use an action blob with "action": "supervisor_call". It is the only way to complete the task and call your supervisor, else you will be stuck on a loop. So your final output should look like this:
Action:
{
  "action": "supervisor_call",
  "action_input": {"answer": "insert your final answer here"}
}<end_action>


You only have acces to those tools:

{{tools}}

- supervisor_call: To communicate any inquiry or answer to your supervisor.
    Takes inputs: {'answer': {'type': 'any', 'description': 'A message to your supervisor, inquiry or final answer.'}}
    Returns an output of type: any

Here are the rules you should always follow to solve your task:
1. ALWAYS provide a 'Thought:' sequence, and an 'Action:' sequence that ends with <end_action>, else you will fail.
2. Always use the right arguments for the tools. Never use variable names in the 'action_input' field, use the value instead.
3. Never re-do a tool call that you previously did with the exact same parameters.
4. Your are not allowed to answer with many action calls, **ONLY ONE**.

Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000."""

def get_coder_agent_prompt(
    ts,
):
    mask = """- {name}: {description}
    Takes inputs: {inputs}
    Returns an output of type: {output_type}"""

    tools_descriptions = "\n\n".join(
        [
            mask.format(
                name=ts[t].name,
                description=ts[t].description,
                inputs=ts[t].inputs,
                output_type=ts[t].output_type,
            )
            for t in ts
        ]
    )

    tool_names = f"""{", ".join([ts[t].name for t in ts])}"""
    content = (
        coder_agent_prompt.replace("{{tool_names}}", tool_names)
        .replace("{{tools}}", tools_descriptions)
    )

    return content


In [5]:
@action
def retrieve_db_info(sub_queries: list) -> str:
    """
    Analize the provided sub-queries and searches semantically for relevant contextual real information to know how the data is structured and used within the business context.
    This information gives an idea of ​​how the data in the database is actually used.

    Args:
        sub_queries: A list of 7 smaller, focused queries derived from the main task. These queries have to be direct, short and focused on specific topics, This are not SQL code.
    """

tools_3 = {
    "retrieve_db_info": retrieve_db_info,
}

system = get_coder_agent_prompt(tools_3)

# Generando sub queries

In [6]:
from openpyxl import Workbook, load_workbook

# Cargar el archivo Excel existente o crear uno nuevo si no existe
def _save_excel(archivo_excel: str, data: list, sheet_name:str, column=1):    
    try:
        # Intenta cargar el archivo existente
        libro = load_workbook(archivo_excel)
    except FileNotFoundError:
        # Si el archivo no existe, crea uno nuevo
        libro = Workbook()

    # Selecciona la hoja llamada "nombres" o créala si no existe
    if sheet_name in libro.sheetnames:
        hoja = libro[sheet_name]
    else:
        hoja = libro.create_sheet(sheet_name)

    # Encuentra la primera fila vacía en la columna A
    fila_vacia = hoja.max_row + 1

    # Agrega los nuevos nombres a la hoja
    for i, nombre in enumerate(data, start=fila_vacia):
        hoja.cell(row=i, column=column, value=nombre)

    # Guarda los cambios en el archivo Excel
    libro.save(archivo_excel)

    print(f"Se han agregado {len(data)} {sheet_name} al archivo.")

In [9]:
from notebooks.multi_agent.base import (
    chat_stream_response,
    handle_agent_response,
)
import time

supervisor_system_prompt = get_task_prompt_3(agents)
sql_expert_system_prompt = get_coder_agent_prompt(tools_3)

def generar_preguntas(
    batch_requests,
    supervisor_llm,
    sql_llm,
    *,
    state="sup",
    current_chat: list[ChatMessage] = None,
    current_request_index=0,
):
    archivo_excel = r"C:\Users\lauth\OneDrive\Desktop\LLM Assistant File - V2\fine_tuning\embeddings\sheets\novo.xlsx"
    # Agente inicial (para generar el request al sql agent)
    current_request = batch_requests[current_request_index]

    current_chat = (
        [
            ChatMessage(role="system", content=supervisor_system_prompt),
            ChatMessage(
                role="user",
                content=f"""User:\n{current_request}""",
            ),
        ]
        if not current_chat
        else current_chat
    )

    if state == "sup":
        r, _, u = chat_stream_response(
            llm=supervisor_llm,
            chat_messages=ChatTemplate(current_chat),
            has_stream=True,
        )
        try:
            parsed_response = handle_agent_response(r, u)
        except Exception as e:
            current_chat.extend(
                [
                    ChatMessage(
                        role="assistant",
                        content=r,
                    ),
                    ChatMessage(
                        role="user",
                        content="You have to respect the output format please.",
                    ),
                ]
            )
            generar_preguntas(
                batch_requests=batch_requests,
                supervisor_llm=supervisor_llm,
                sql_llm=sql_llm,
                state="sup",
                current_chat=current_chat,
                current_request_index=current_request_index,
            )

        message_generated_by_llm = parsed_response.parameters[
            "message"
        ].strip()  # Mensage para el sql agent

        current_chat = [
            ChatMessage(
                role="system",
                content=sql_expert_system_prompt,
            ),
            ChatMessage(
                role="user",
                content=f"""Supervisor: {message_generated_by_llm}
            
Make sure you to follow this plan.
1. Use keywords and subquerys to find contextual info and some tables.
2. Create the code.""",
            ),
        ]
        state = "sql"

    # Agente de SQL (para generar subqueries)
    if state == "sql":
        res, _, u = chat_stream_response(
            llm=sql_llm, chat_messages=ChatTemplate(current_chat), has_stream=True
        )
        try:
            parsed_response_ = handle_agent_response(res, u)
        except Exception as e:
            current_chat.extend(
                [
                    ChatMessage(
                        role="assistant",
                        content=res,
                    ),
                    ChatMessage(
                        role="user",
                        content="You have to respect the output format please.",
                    ),
                ]
            )
            generar_preguntas(
                batch_requests=batch_requests,
                supervisor_llm=supervisor_llm,
                sql_llm=sql_llm,
                state="sql",
                current_chat=current_chat,
                current_request_index=current_request_index,
            )
            
            
        sub_queries = (
            parsed_response_.parameters["sub_queries"]
            if parsed_response_.parameters.get("sub_queries", None)
            else ""
        )
        
        _save_excel(archivo_excel, sub_queries, "sub_queries")

    if len(batch_requests)-1 > current_request_index:
        time.sleep(10)
        print(f"****************= {current_request_index} =****************")
        generar_preguntas(
                batch_requests=batch_requests,
                supervisor_llm=supervisor_llm,
                sql_llm=sql_llm,
                state="sup",
                current_chat=None,
                current_request_index=current_request_index+1,
            )
    else:
        return        
        


In [None]:
from notebooks.multi_agent.base import (
    llama_70b_hf,
    qwen_coder,
    openai_llm,
    qwen_instruct,
)

batch_requests = [
    "list the measurement system tag, flow computer with tag from fc-302 computers that are associated with a measurement measurement system", #1
    "the calibration points for the calibration with number: LMH 3305/2021",#2
    "the last uncertainty report for temperature in the measurement system with tag EMED-3138.12-050", #3
    "list the secondary equipments class, serials, equipment tag and the name of the equipment type linked to the measurement system with tag: EMED-3138.11-128", #4
    "the last uncertainty report for temperature in the equipment with tag EMED-3138.12-050-TT",# 5
    "when is the next calibration for the equipment with tag EMED-3138.12-015-DPT",# 6
    "list of the equipments serials associated to the measurement system with tag EMED-3138.11-128. Also with the variable of each equipment can read",# 7
    "the lastest uncertainty reports for the temperature and differential pressure measured in the measurement system with tag EMED-3138.12-050",# 8
    "the last uncertainty report for differential pressure in the equipment with tag EMED-3138.12-050-DPT",# 9
    "list the tags of the flow computers and the quantity of meter streams for each one",# 10
    "latest uncertainty value registered for the differential pressure in the measurement system with **tag** EMED-3138.12-050",# 11
    "list the measurement system tag, flow computer with tag from flow computers that are associated with a measurement measurement system and are flowboss",# 12
    "the calibration points for the equipment with tag: TE-EMED-3138.12-050",# 13
    "the most recent uncertainty result for the specific measurement system with tag EMED-3102-02-010",# 14
    "The user wants a daily Report for the flow computer with tag: FQI-EMED-3138.12-050 at august 2023",  # 15
    "list the measurement system tag, flow computer with tag and the computer type of flow computers that are associated with a measurement measurement system",# 16
    "The user wants a daily Report for the flow computer with tag: FQI-EMED-3138.12-050 at august 10th 2023",  # 17
    "Report for the flow computer with tag: FQI-EMED-3138.12-050 at august 10th",  # 18
    "The user wants a daily Report for the flow computer with tag: FQI-EMED-3138.12-050 at 2023-08-10",# 19  
    "list the tags of the equipments with its serials and the name of the equipment type associated and also its equipment classification to the measurement system with tag: EMED-3138.11-128",# 20
    "the uncertainty result for the specific measurement measurement system tagged as EMED-3102-02-010 at July 28th in 2023",# 21
    "the flow rate uncertainty fingerprint points for the curve of uncertainty with certificate number EMED-010-1",# 22
    "the calibration points for the calibration with number: LMH 5643/2023",# 23
    "The user wants the the date of the lastest report for the flow computer with tag: FQI-EMED-3138.12-050",  # 24
    "the initial conditions of static pressure, differential pressure, temperature and flow rate volume for uncertainty of the measurement measurement system with tag EMED-3102-02-010 at July 28th in 2023"# 25
]

generar_preguntas(batch_requests, llama_70b_hf, qwen_instruct)

# Generando scores a esas subqueries

In [2]:
from openpyxl import Workbook, load_workbook

# Cargar el archivo Excel existente o crear uno nuevo si no existe

def _save_scores_excel(data: list[dict], sheet_name:str):    
    archivo_excel = r"C:\Users\lauth\OneDrive\Desktop\LLM Assistant File - V2\fine_tuning\embeddings\sheets\preguntas.xlsx"
    try:
        # Intenta cargar el archivo existente
        libro = load_workbook(archivo_excel)
    except FileNotFoundError:
        # Si el archivo no existe, crea uno nuevo
        libro = Workbook()

    # Selecciona la hoja llamada "nombres" o créala si no existe
    if sheet_name in libro.sheetnames:
        hoja = libro[sheet_name]
    else:
        hoja = libro.create_sheet(sheet_name)

    # Encuentra la primera fila vacía en la columna A
    fila_vacia = hoja.max_row + 1

    # Agrega los nuevos nombres a la hoja
    for i, object_data in enumerate(data, start=fila_vacia):
        hoja.cell(row=i, column=1, value=object_data["user_input"])
        hoja.cell(row=i, column=2, value=object_data["text_id"])
        hoja.cell(row=i, column=3, value=object_data["score"])

    # Guarda los cambios en el archivo Excel
    libro.save(archivo_excel)

    print(f"Se han agregado {len(data)} {sheet_name} al archivo.")

In [3]:
import pandas as pd

subqueries_excel_path = r"C:\Users\lauth\OneDrive\Desktop\LLM Assistant File - V2\fine_tuning\embeddings\sheets\novo.xlsx"
subqueries_df = pd.read_excel(io=subqueries_excel_path, sheet_name="sub_queries")

subqueries_list = subqueries_df["question"].tolist()


In [53]:
import json
import time
from assistant.sql_assistant_v3.src.app.notebooks.multi_agent.base import (
    chat_stream_response
)

nlp_expert_system = """You are an expert in natural language processing and measuring semantic similarity between texts. Your task is to assign a similarity score between 0 and 1 between an input text and a list of summary target texts. The score should reflect how similar the content of each target text is to the input text.

The score range should be between:
0: Completely different (no significant relationship).
1: Identical or almost identical (same meaning, same words).
Any intermediate value: Represents a partial similarity, where a higher value indicates greater similarity.


This is the list of summaries for target texts:
c0: This text talks about flow computers, devices used in measurement engineering to collect, process, and transmit data (e.g., temperature, pressure, and fluid volume) from sensors and flow meters, aiding engineers in analysis and measurement tasks.

c1: This text talks about how flow computers generate and store organized reports containing operational data, stored in tables (main and detail) linked to a Modbus table for easier data interpretation.

c2: This text talks about the relationship between flow computers and measurement systems, where multiple systems can be assigned to a flow computer, but each system links to only one flow computer, with this relationship stored in a database table using terms like meter streams and runs.

c3: This text talks about measurement systems, tools for monitoring fluids in industrial processes, classified by process stage (e.g., fiscal, custody) and identified by a unique TAG. They measure various fluids using differential or linear technologies, depending on the type, with gas and oil systems employing distinct methodologies.

c4: This text talks about measuring equipment, devices within a measurement system identified by serial numbers and Equipment Tags. Tags indicate usage status (in use or spare), with equipment assigned based on technology (e.g., differential or linear). A database table tracks these assignments, linking equipment to measurement systems.

c5: This text talks about measurement equipment, classified into primary, secondary, and tertiary meters based on the type of variable they measure. Equipment types (e.g., transmitters, orifice plates) are stored in a database, with meteorological checks (calibration or inspection) to ensure accuracy and proper functioning.

c6: This text talks about the data stored by equipment, such as direct measurements of pressure, temperature, and volume, stored in a database table. These values are raw data, not calculated values, and users access them through the measurement system, with variable names linked to another database table for reference.

c7: This text talks about equipment calibration, a process that ensures measurement accuracy by correcting deviations. Calibration is performed periodically with two main cycles: as-found (pre-adjustment) and as-left (post-adjustment), and includes uncertainty, which affects the margin of error in measurements.

c8: This text talks about uncertainty, which measures the potential error or margin of error in measurements. It explains two types: uncertainty of magnitudes (specific variables like temperature or pressure) and uncertainty of the measurement system (overall flow measurement), with both types stored separately in the database.


You have to compare each summary with the input text given by the user.

To answer you will use the $RESPONSE_JSON_BLOB. It should be formatted in json. Do not try to escape special characters. Here is the template of a valid $RESPONSE_JSON_BLOB:
[
    {
        "user_input": $USER_INPUT,
        "text_id": c1,
        "score": $SCORE
    },
    {
        "user_input": $USER_INPUT,
        "text_id": c2,
        "score": $SCORE
    }
    ,...,
    {
        "user_input": $USER_INPUT,
        "text_id": c8,
        "score": $SCORE
    }
]<end_action>

Example of your final answer:

Response:
[
    {
        "user_input": "Which columns are related to flow rate uncertainty?",
        "text_id": "c1",
        "score": 0.15
    },...,
    {
        "user_input": "Which columns are related to flow rate uncertainty?",
        "text_id": "c8",
        "score": 0.7
    }
]<end_action>

Here are the rules you should always follow to solve your task:
1. ALWAYS provide a 'Response:' sequence that ends with <end_action>, else you will fail.
2. ALWAYS respect the answer format, else you will fail.
3. Be flexible when analyzing similarities, especially when you are provided with code or pseudocode. Make sure to handle these cases carefully and accurately.

Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000."""


def generate_scores(subqueries_list: list[str], nlp_expert_llm, current_chat = None):
    for index, sbquery in enumerate(subqueries_list):
        nlp_chat = [
            ChatMessage(
                role="system",
                content=nlp_expert_system,
            ),
            ChatMessage(
                role="user",
                content=f"""User: {sbquery}
                
Make sure you to follow this plan.
1. Read the input text and each of the target texts carefully.
2. Assign a similarity score between 0 and 1 to each pair of input text and target text.
3. Return the similarity scores in the correct $RESPONSE_JSON_BLOB format.
4. After your thoughts you have to give only the answer, anything else beyond the $RESPONSE_JSON_BLOB format.
Begin!""",
            ),
        ] if not current_chat else current_chat

        res, _, _ = chat_stream_response(
            llm=nlp_expert_llm, chat_messages=ChatTemplate(nlp_chat), has_stream=True
        )

        try:
            # lst = res.split("Response:")[1].strip().split("<end_action>")[0]
            # lst = res.split("</think>")[1].strip().split("<end_action>")[0]
            lst = res.split("</think>")[1].strip().split("Response:")[1].strip().split("<end_action>")[0]
            lst_json_ = json.loads(lst)
        except Exception as e:
            nlp_chat.extend(
                [
                    ChatMessage(
                        role="assistant",
                        content=res,
                    ),
                    ChatMessage(
                        role="user",
                        content=f"""User: You must use correct output $RESPONSE_JSON_BLOB format.""",
                    ),
                ]
            )
            return generate_scores(subqueries_list[index + 1:], nlp_expert_llm, nlp_chat)
            
            
        try:
            _save_scores_excel(lst_json_, sheet_name="scores")
            print(f"**********= {index+601} =**********")
            time.sleep(10)
            current_chat=None
        except Exception as e:
            print(f"Error al guardar en el excel en el item: {index}. Error: {e}")
            
            

In [None]:
# Experto para generar data para el modelo de embeddings

from assistant.sql_assistant_v3.src.app.notebooks.multi_agent.base import (
    llama_70b_hf,
    qwen_coder,
    openai_llm,
    qwen_instruct,
    llama_70b,
    GenericLLM
)

deepseek = GenericLLM.from_groq_deepseek_llama70_distill()
res = generate_scores(subqueries_list[:1], deepseek)


In [65]:
import pandas as pd


def read_sheet(sheet_name: str  = "", cols: list[str] = None, *, path = None):
    """To read an excel that contains views definitions"""
    if cols is not None:
        ex_df = pd.read_excel(io=path, sheet_name=sheet_name, usecols=cols)
    else: 
        ex_df = pd.read_excel(io=path, sheet_name=sheet_name)
    return ex_df



In [73]:
path = r"C:\Users\lauth\OneDrive\Desktop\LLM Assistant File - V2\fine_tuning\embeddings\sheets\datasets.xlsx"

chunks_vals = read_sheet(sheet_name="chunks", cols=["id", "content", "topic"], path=path)

path = r"C:\Users\lauth\OneDrive\Desktop\LLM Assistant File - V2\fine_tuning\embeddings\sheets\datasheet.xlsx"
scores = read_sheet(
    sheet_name="scores", cols=["chunk_id", "topic", "text_objective"], path=path
)


In [74]:
for index, item in scores.iterrows():
    for i, chunk in chunks_vals.iterrows():
        if item["chunk_id"] == chunk["id"]:
            scores.at[index, "topic"] = chunk["topic"]
            scores.at[index, "text_objective"] = chunk["content"]

  scores.at[index, "topic"] = chunk["topic"]
A flow computer is a device used in measurement engineering. It collects analog and digital data from flow meters and other sensors.

Key features of a flow computer:
- It has a unique name, firmware version, and manufacturer information.
- It is designed to record and process data such as temperature, pressure, and fluid volume (for gases or oils).

Main function:
The flow computer sends the collected data to a measurement system. This allows measurement engineers to analyze the data and perform their tasks effectively.' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  scores.at[index, "text_objective"] = chunk["content"]


In [76]:
p = r"C:\Users\lauth\OneDrive\Desktop\LLM Assistant File - V2\fine_tuning\embeddings\sheets\novo.xlsx"
scores.to_excel(p, index=False)

# Crear el dataset

In [77]:
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import login

# 1. Autenticarse en Hugging Face Hub
login(token="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")


p = r"C:\Users\lauth\OneDrive\Desktop\LLM Assistant File - V2\fine_tuning\embeddings\sheets\datasheet.xlsx"
df = read_sheet(
    sheet_name="scores", cols=["sentence1", "sentence2", "score"], path=p
)


df

Unnamed: 0,sentence1,sentence2,score
0,What are the columns in the pressure_data table?,What is a flow computer?\nA flow computer is a...,0.10
1,What are the columns in the pressure_data table?,How does a flow computer generate and store re...,0.15
2,What are the columns in the pressure_data table?,How are flow computers and measurement systems...,0.05
3,What are the columns in the pressure_data table?,What is a measurement system?\nA measurement s...,0.05
4,What are the columns in the pressure_data table?,What is measuring equipment?\nMeasuring equipm...,0.20
...,...,...,...
6520,Check if there are any specific conditions or ...,What is measuring equipment?\nMeasuring equipm...,0.05
6521,Check if there are any specific conditions or ...,What do measurement equipment measure?\nEach e...,0.05
6522,Check if there are any specific conditions or ...,What kind of data store an equipment?\nEquipme...,0.05
6523,Check if there are any specific conditions or ...,What is equipment calibration?\nCalibration is...,0.05


In [86]:
# 3. Convertir el DataFrame de pandas a un Dataset de Hugging Face
dataset = Dataset.from_pandas(df)

# 4. Dividir el dataset en entrenamiento (80%) y evaluación (20%)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
split_dataset_ = split_dataset["test"].train_test_split(test_size=0.5, seed=40)

train_dataset = split_dataset["train"]
test_dataset = split_dataset_["train"]
validation_dataset = split_dataset_["test"]


# 5. Crear un DatasetDict con ambos conjuntos
dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "validation": validation_dataset,
})

# # 6. Subir el dataset al Hub
dataset_dict.push_to_hub("embeddings-train-semantic")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Lauther/embeddings-train-semantic/commit/ce90f531bc39037053d223b27868ad178852f330', commit_message='Upload dataset', commit_description='', oid='ce90f531bc39037053d223b27868ad178852f330', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Lauther/embeddings-train-semantic', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Lauther/embeddings-train-semantic'), pr_revision=None, pr_num=None)

In [None]:
from datasets import load_dataset

# Cargar el dataset desde el Hub
dataset = load_dataset("Lauther/embeddings-train-semantic")

# Acceder a los conjuntos de entrenamiento y evaluación
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]