In [1]:
import os
import sys

# Make the project root importable so the `src.*` packages resolve.
# The location can be overridden with the OPEN_AI_ASSISTANT_ROOT environment
# variable; when it is unset the original machine-specific path is used, so
# existing setups keep working unchanged.
PROJECT_ROOT = os.environ.get(
    "OPEN_AI_ASSISTANT_ROOT",
    "C:\\Users\\lauth\\OneDrive\\Desktop\\open_ai_assistant_v2",
)
# Guard against duplicate entries when this cell is re-run in the same kernel.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from src.db.training.training_config import TrainingAssistant
from src.utils.excel_data_reader import (
    read_excel_dictionary,
    read_keywords_dictionary,
    read_summaries_dictionary,
    read_classify_dictionary,
)

# Shared TrainingAssistant instance: the cells below use it to compute
# embeddings and to load data into the collections.
tr_assistant = TrainingAssistant()

#### Obteniendo datos para la colección de Context

In [2]:
# Workbook and columns that feed the context collection.
archivo_excel = "../../assets/Diccionario_datos.xlsx"
columnas = ["Tabla", "Palabras clave", "Codigo"]

# Read the data dictionary from the Excel workbook.
# The records are later accessed by the keys "keywords", "table_name" and "ddl".
excel_data = read_excel_dictionary(archivo_excel, columnas)


def process_excel_data(rows=None, embed=None) -> list[list[list[float]]]:
    """Compute one embedding per keyword for every dictionary record.

    Args:
        rows: Iterable of mappings, each exposing an iterable under the
            "keywords" key. When omitted, the module-level ``excel_data`` is
            read at call time, which is the original zero-argument behaviour.
            Pass it explicitly to avoid depending on whichever cell last
            rebound ``excel_data``.
        embed: Callable mapping a single keyword to its embedding. When
            omitted, ``tr_assistant.get_embeddings`` is used.

    Returns:
        A list with one entry per record; each entry is the list of
        embeddings for that record's keywords, in their original order.
    """
    if rows is None:
        rows = excel_data
    if embed is None:
        embed = tr_assistant.get_embeddings
    # The previous nested loops shadowed an unused `index` variable; a nested
    # comprehension yields the same structure without it.
    return [[embed(kw) for kw in row["keywords"]] for row in rows]

# TODO: fix the structure of the metadatas and also add a description of the
# table so it can be sent in the complex_classifier_chain_template prompt.

# One combined embedding per record, built from its per-keyword embeddings.
ctx_embeddings = list(map(tr_assistant.sum_embeddings, process_excel_data()))
# Metadata stored alongside each embedding: the table name and its DDL.
ctx_metadatas = [
    {field: entry[field] for field in ("table_name", "ddl")} for entry in excel_data
]
# Positional ids, one per record.
ctx_ids = [f"id_ctx_col_{position}" for position, _ in enumerate(excel_data)]

In [10]:
# Inspect the first context metadata record (its table name and DDL).
ctx_metadatas[0]

{'table_name': 'pla_plataforma',
 'ddl': 'CREATE TABLE IF NOT EXISTS [dbo_v2].[pla_plataforma](\n\t[Id] [INT] IDENTITY(1,1) NOT NULL,\n\t[IdTipoPlataforma_fk] [INT] NOT NULL,\n\t[IdCliente_fk] [INT] NOT NULL,\n\t[Nombre] [VARCHAR](MAX) NOT NULL,\n\t[Codigo] [VARCHAR](10) NULL,\n\t[CNPJ] [VARCHAR](MAX) NULL,\n\t[Abreviatura] [VARCHAR](50) NULL,\n\t[EsVisible] [BIT] NOT NULL,\n\t[Estado] [VARCHAR](10) NOT NULL)'}

#### Obteniendo datos para la colección de Keywords

In [3]:
# Workbook and columns that feed the keywords collection.
archivo_excel = "../../assets/Keywords_datos.xlsx"
columnas = ["Request", "Keywords"]

# Load the keyword records from the Excel workbook.
excel_data = read_keywords_dictionary(archivo_excel, columnas)

# Each record's request is the text; its keywords travel as metadata.
kw_texts = [entry["request"] for entry in excel_data]
kw_metadatas = [dict(keywords=entry["keywords"]) for entry in excel_data]
# Positional ids, one per record.
kw_ids = [f"id_kw_col_{position}" for position, _ in enumerate(excel_data)]

#### Obteniendo datos para la colección de Summaries

In [4]:
# Workbook and columns that feed the summaries collection.
archivo_excel = "../../assets/Summary_datos.xlsx"
columnas = ["summary", "request"]

# Load the summary records from the Excel workbook.
excel_data = read_summaries_dictionary(archivo_excel, columnas)

# Each record's summary is the text; the originating request travels as metadata.
sum_texts = [entry["summary"] for entry in excel_data]
sum_metadatas = [dict(request=entry["request"]) for entry in excel_data]
# Positional ids, one per record.
sum_ids = [f"id_sum_col_{position}" for position, _ in enumerate(excel_data)]

#### Obteniendo datos para la colección de clasificación

In [5]:
# Workbook and columns that feed the classifier collection.
archivo_excel = "../../assets/classifier_context.xlsx"
columnas = ["input", "analysis", "response"]

# Load the classifier examples from the Excel workbook.
excel_data = read_classify_dictionary(archivo_excel, columnas)

# cls_texts holds each example's input; cls_metadatas carries the full example
# (input, analysis and response).
cls_texts = [entry["input"] for entry in excel_data]
cls_metadatas = [
    {field: entry[field] for field in ("input", "analysis", "response")}
    for entry in excel_data
]
# Positional ids, one per example. These previously reused the summaries
# prefix ("id_sum_col_"), which made them collide with `sum_ids`; they now
# carry their own classifier prefix.
cls_ids = [f"id_cls_col_{position}" for position, _ in enumerate(excel_data)]

#### Alimentando las colecciones

In [7]:
# Prepare the collections before loading any data.
# NOTE(review): `_init_collections` is a private helper of TrainingAssistant;
# presumably it creates or resets the collections — confirm in training_config.
tr_assistant._init_collections()

# Resolve the configured names of the context, keywords, summary and
# classifier collections (in that order).
(
    ctx_collection_name,
    kw_collection_name,
    sum_collection_name,
    cls_collection_name,
) = (
    tr_assistant.collection_names[key]
    for key in (
        "CONTEXT_COLLECTION",
        "KEYWORDS_COLLECTION",
        "SUMMARY_COLLECTION",
        "CLASSIFIER_COLLECTION",
    )
)

# Context collection: its embeddings were computed earlier, so they are handed
# over directly together with their ids and metadata.
tr_assistant.train_with_chroma(
    collection_name=ctx_collection_name,
    embeddings=ctx_embeddings,
    ids=ctx_ids,
    metadatas=ctx_metadatas,
)

# Keywords, summary and classifier collections are fed from texts plus
# metadata, in that order.
for lc_name, lc_texts, lc_metadatas in (
    (kw_collection_name, kw_texts, kw_metadatas),
    (sum_collection_name, sum_texts, sum_metadatas),
    (cls_collection_name, cls_texts, cls_metadatas),
):
    tr_assistant.train_collection_with_langchain(
        collection_name=lc_name,
        texts=lc_texts,
        metadatas=lc_metadatas,
    )

In [8]:
from langchain_community.vectorstores.chroma import Chroma


def counting_vectors(collection_names: list[str]):
    """Print the number of vectors held by each named Chroma collection.

    For every name, a Chroma store is opened on the assistant's persist
    directory with the assistant's embedding function, and one line of the
    form "<name>: <count> vectores" is printed.

    Args:
        collection_names: Names of the collections to inspect, in print order.

    Returns:
        None. The counts are only printed.
    """
    for collection in collection_names:
        store = Chroma(
            embedding_function=tr_assistant.chromadb_embeddings_function,
            persist_directory=tr_assistant.chromadb_directory,
            collection_name=collection,
        )
        # The count is read from the store's underlying (private) collection.
        print(f"{collection}: {store._collection.count()} vectores")


# Report the vector count of each collection: classifier, summary, keywords
# and context, in that order.
counting_vectors(
    [cls_collection_name, sum_collection_name, kw_collection_name, ctx_collection_name]
)

classifier_collection: 12 vectores
summary_collection: 8 vectores
keywords_collection: 6 vectores
context_collection: 13 vectores
