## Importar librerías e instancia de modelo de chat

In [1]:
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate,ChatPromptTemplate, HumanMessagePromptTemplate
import langchain
from langchain.schema import SystemMessage, HumanMessage
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
import getpass
import os

f = open('/home/iabd/huggingface_token.txt')
api_key = f.read().strip()
os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key

llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    repetition_penalty=1.03,
)

chat = ChatHuggingFace(llm=llm)

  from .autonotebook import tqdm as notebook_tqdm


#  Incrustación de texto (embedding)

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModel
import torch

# Definir el modelo para embeddings (puedes usar otro modelo adecuado)
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Configurar HuggingFaceEmbeddings con el nombre del modelo
huggingface_embeddings = HuggingFaceEmbeddings(model_name=model_name)

# Texto a incrustar
texto = "Esto es un texto enviado a Hugging Face para ser incrustado en un vector n-dimensional"

# Obtener el embedding
embedded_text = huggingface_embeddings.embed_query(texto)

# Mostrar tipo y el embedding
print(type(embedded_text))  # Tipo del resultado
print(embedded_text)  # El vector de embedding

<class 'list'>
[-0.0500510074198246, 0.09061892330646515, 0.009858348406851292, -0.0304912980645895, -0.026037145406007767, 0.006927767302840948, -0.010471208021044731, -0.02341090328991413, -0.005756305996328592, -0.012510586529970169, 0.08684329688549042, -0.012611267156898975, -0.03217066824436188, -0.018649114295840263, 0.028287256136536598, -0.019649356603622437, -0.045020636171102524, 0.10268954187631607, 0.04824957251548767, 0.14515598118305206, 0.025744810700416565, 0.0021284297108650208, -0.0015124608762562275, 0.01425545196980238, -0.062492430210113525, 0.03228756785392761, 0.08128292858600616, 0.004222435876727104, 0.05072677507996559, -0.048368170857429504, -0.009142984636127949, -0.034501444548368454, 0.04576485976576805, 0.020945994183421135, -0.06450504064559937, 0.056654322892427444, 0.014244354330003262, 0.07654222100973129, -0.10096269845962524, 0.03885374218225479, -0.12376817315816879, 0.003584380494430661, -0.035015396773815155, -0.020096993073821068, 0.10351290553

## Incrustación de documentos

In [4]:
from langchain.document_loaders import CSVLoader

In [5]:
loader = CSVLoader('Fuentes datos/datos_ventas_small.csv',csv_args={'delimiter': ';'})

In [6]:
data = loader.load()

In [7]:
type(data)

list

In [8]:
type(data[0])

langchain_core.documents.base.Document

In [10]:
#Creamos una comprensión de listas concatenando el campo "page_content" de todos los documentos existentes en la lista "data"
[elemento.page_content for elemento in data]

['\ufeffID: 10145\nCantidad: 45\nPrecio unitario: 83,26\nVenta total: 3746,7\nFecha compra: 25/08/2023\nEstado: Shipped\nLínea Producto: Motorcycles\nCódigo Producto: S10_1678\nNombre cliente: Toys4GrownUps,com\nCiudad: Pasadena\nPaís: USA\nTerritorio: NA\nTamaño pedido: Medium',
 '\ufeffID: 10159\nCantidad: 0\nPrecio unitario: 100\nVenta total: 0\nFecha compra: 10/10/2023\nEstado: Shipped\nLínea Producto: Motorcycles\nCódigo Producto: S10_1678\nNombre cliente: Corporate Gift Ideas Co,\nCiudad: San Francisco\nPaís: USA\nTerritorio: NA\nTamaño pedido: Medium',
 '\ufeffID: 10168\nCantidad: 36\nPrecio unitario: 96,66\nVenta total: 3479,76\nFecha compra: 28/10/2023\nEstado: Shipped\nLínea Producto: Motorcycles\nCódigo Producto: S10_1678\nNombre cliente: Technics Stores Inc,\nCiudad: Burlingame\nPaís: USA\nTerritorio: NA\nTamaño pedido: Medium',
 '\ufeffID: 10180\nCantidad: 29\nPrecio unitario: 86,13\nVenta total: 2497,.77\nFecha compra: 11/11/2023\nEstado: Shipped\nLínea Producto: Motorcyc

In [11]:
embedded_docs = huggingface_embeddings.embed_documents([elemento.page_content for elemento in data])

In [12]:
#Verificamos cuántos vectores a creado (1 por cada registro del fichero CSV con datos)
len(embedded_docs)

22

In [13]:
#Vemos un ejemplo del vector creado para el primer registro
embedded_docs[1]

[-0.02650550752878189,
 0.06909177452325821,
 -0.05019831284880638,
 -0.0456511452794075,
 -0.06327751278877258,
 0.03473866730928421,
 0.02447187341749668,
 0.007831514813005924,
 -0.00046186885447241366,
 -0.043370310217142105,
 0.0998377650976181,
 -0.08499389886856079,
 -0.010166746564209461,
 -0.015034924261271954,
 -0.06961734592914581,
 -0.0063690440729260445,
 0.002275320002809167,
 -0.03481382504105568,
 0.0050624157302081585,
 0.07258853316307068,
 0.0624530203640461,
 0.054548103362321854,
 -0.010189497843384743,
 0.02887420728802681,
 -0.0751989483833313,
 -0.018050149083137512,
 -0.010250052437186241,
 0.07069660723209381,
 -0.07602445781230927,
 -0.06397788226604462,
 -0.009569458663463593,
 0.121170274913311,
 0.04358922690153122,
 0.010859924368560314,
 0.07357130944728851,
 -0.03580419346690178,
 0.007880646735429764,
 0.010332643054425716,
 0.0057146064937114716,
 0.01277348306030035,
 -0.034330058842897415,
 -0.07176141440868378,
 -0.023203540593385696,
 0.0154372621