In [1]:
from langchain.document_loaders import HuggingFaceDatasetLoader #carga de datos de Hugging Face
from langchain.text_splitter import RecursiveCharacterTextSplitter #Division de textos manteniendo contexto
from langchain.embeddings import HuggingFaceEmbeddings #generación de embebings para los textos
from langchain.vectorstores import FAISS #busquedas de similitud
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
import torch
from langchain import PromptTemplate, LLMChain

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline,
    AutoModelForCausalLM
    ) #transformers construcción de pipelines personalizados con recuperacion de información


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

def print_lines(text, max_ch = 50):
  """Print text word-wrapped into short lines to make long chunks readable.

  Words are appended (each followed by a single space) to the current line;
  the line is flushed as soon as its length exceeds max_ch, so a printed
  line may overshoot max_ch by up to one word and keeps its trailing space.

  Args:
    text: string to print; it is split on any whitespace.
    max_ch: length threshold, in characters, after which a line is flushed.
  """
  current_line = ""
  for word in text.split():
    # Flush before adding the next word once the threshold has been passed.
    # The emptiness check keeps a negative max_ch from printing blank lines.
    if current_line and len(current_line) > max_ch:
      print(current_line)
      current_line = ""
    current_line += f"{word} "
  # Emit whatever is left after the last word.
  if current_line:
    print(current_line)

cuda


# Datasets

In [3]:
# Download the book archive from Google Drive and extract it into ./libros.
# NOTE(review): --no-check-certificate disables TLS certificate verification — confirm it is really needed.
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1YcNlffQl6E09Erst__EZvlC0iJRgy6LF' -O libros.zip
!unzip libros.zip -d ./libros

--2025-09-26 07:57:34--  https://docs.google.com/uc?export=download&id=1YcNlffQl6E09Erst__EZvlC0iJRgy6LF
Resolving docs.google.com (docs.google.com)... 172.217.162.110, 2800:3f0:4005:41c::200e
Connecting to docs.google.com (docs.google.com)|172.217.162.110|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1YcNlffQl6E09Erst__EZvlC0iJRgy6LF&export=download [following]
--2025-09-26 07:57:34--  https://drive.usercontent.google.com/download?id=1YcNlffQl6E09Erst__EZvlC0iJRgy6LF&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.251.133.97, 2800:3f0:4005:41c::2001
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.251.133.97|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20809441 (20M) [application/octet-stream]
Saving to: ‘libros.zip’


2025-09-26 07:57:42 (22.5 MB/s) - ‘libros.zip’ saved [20809441/20809441]

Archive

## Leer documentos

In [3]:
import os
# Directory the archive was extracted into.
path_docs = "./libros"

# List the directory contents to check what was extracted.
os.listdir(path_docs)

['Harry Potter - Book 1 - The Sorcerers Stone.pdf',
 'Harry Potter - Book 7 - The Deathly Hallows.pdf',
 'Harry Potter - Book 6 - The Half-Blood Prince.pdf',
 'Harry Potter - Book 4 - The Goblet of Fire.pdf',
 'Harry Potter - Book 3 - The Prisoner of Azkaban.pdf',
 'Harry Potter - Book 5 - The Order of the Phoenix.pdf',
 'Harry Potter - Book 2 - The Chamber of Secrets.pdf']

In [4]:
# Keep only the PDF files. os.listdir returns entries in arbitrary order, so
# sort them: the processing order (and therefore the position of every chunk
# in the resulting list) is then the same on every machine and re-run.
list_pdf = sorted(x for x in os.listdir(path_docs) if x.endswith(".pdf"))

In [5]:
# Document Transformers
from tqdm import tqdm
from langchain.document_loaders import PyPDFLoader  # loads and reads PDF files

chunk_size = 1000  # maximum length of each text chunk, in characters
chunk_overlap = 200  # overlap setting handed to the splitter

# Splitter that breaks the loaded text into chunks of the configured size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

all_docs = []
for name in tqdm(list_pdf):  # walk over every listed file
  if not name.endswith(".pdf"):  # extra guard: only handle names ending in .pdf
    continue
  # Build the full path, load the PDF, split it and collect the chunks.
  pdf_documents = PyPDFLoader(os.path.join(path_docs, name)).load()
  all_docs.extend(text_splitter.split_documents(pdf_documents))


100%|██████████| 7/7 [01:06<00:00,  9.47s/it]


In [6]:
from pprint import pprint
# Pick one chunk to sanity-check the result of the split.
chunk = all_docs[100]

# Chunk text, wrapped into short lines for readability.
print_lines(chunk.page_content)

# Metadata attached to the chunk.
pprint(chunk.metadata)

lot like yer dad, but yeh’ve got yer mom’s eyes.” Uncle 
Vernon made a funny rasping noise. “I demand that you 
leave at once, sir!” he said. “You are breaking and 
entering!” “Ah, shut up, Dursley, yeh great prune,” 
said the giant; he reached over 
{'author': 'J.K.Rowling',
 'creationdate': '2019-07-26T17:37:04+00:00',
 'creator': 'calibre 3.42.0 [https://calibre-ebook.com]',
 'page': 36,
 'page_label': '37',
 'producer': 'calibre 3.42.0 [https://calibre-ebook.com]',
 'source': './libros/Harry Potter - Book 1 - The Sorcerers Stone.pdf',
 'title': "Harry Potter 1 - Harry Potter and the Sorcerer's Stone",
 'total_pages': 221}


# Database

## Embeddings

In [7]:
def get_embeddings_model(model_path=None):
  """Create the HuggingFaceEmbeddings object used to embed text.

  Args:
    model_path: optional model name or path. When it is falsy the default
      "sentence-transformers/all-MiniLM-l6-v2" is used.

  Returns:
    A HuggingFaceEmbeddings instance configured to run on CUDA when it is
    available (CPU otherwise) and with embedding normalization turned off.
    The selected device is also printed.
  """
  model_name = model_path if model_path else "sentence-transformers/all-MiniLM-l6-v2"
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  embeddings = HuggingFaceEmbeddings(
      model_name=model_name,                           # pre-trained model to load
      model_kwargs={'device': device},                 # device the model runs on
      encode_kwargs={'normalize_embeddings': False},   # embeddings are not normalized
  )
  print(f"device: {device}")
  return embeddings


In [8]:
# Instantiate the embeddings model with the default checkpoint.
embeddings = get_embeddings_model()

  embeddings = HuggingFaceEmbeddings(


device: cuda


In [9]:
# Smoke test: embed one short sentence and display the resulting vector.
text = "This is a test document."
query_result = embeddings.embed_query(text)
query_result

[-0.03833852708339691,
 0.12346471101045609,
 -0.028642920777201653,
 0.05365273728966713,
 0.008845358155667782,
 -0.03983933478593826,
 -0.07300589978694916,
 0.04777122661471367,
 -0.03046245314180851,
 0.054979775100946426,
 0.08505292236804962,
 0.036656659096479416,
 -0.0053200190886855125,
 -0.0022332260850816965,
 -0.06071093678474426,
 -0.027237894013524055,
 -0.01135163102298975,
 -0.04243776574730873,
 0.009129976853728294,
 0.10081557184457779,
 0.0757872685790062,
 0.06911721080541611,
 0.00985750462859869,
 -0.0018377507803961635,
 0.026249080896377563,
 0.032902371138334274,
 -0.07177433371543884,
 0.028384288772940636,
 0.06170952320098877,
 -0.05252953618764877,
 0.03366170823574066,
 0.07446818053722382,
 0.07536028325557709,
 0.03538399189710617,
 0.06713411957025528,
 0.010798045434057713,
 0.08167027682065964,
 0.016562918201088905,
 0.03283063322305679,
 0.03632568195462227,
 0.0021728535648435354,
 -0.0989573746919632,
 0.0050467499531805515,
 0.05089651793241501

## Vector Store

In [10]:
# Total number of chunks collected in all_docs.
print(len(all_docs))

8912


In [11]:
# Embed the documents with the embeddings model and build a FAISS index over
# them so that similarity searches can be run efficiently.
db = FAISS.from_documents(all_docs, embeddings)

In [12]:
question = "Who is Mr. Dursley?"
# Run a similarity search for the question against the index.
searchDocs = db.similarity_search(question)
# searchDocs holds the documents whose content is similar to the question;
# print the first one (presumably the most relevant — confirm the ordering).
print_lines(searchDocs[0].page_content)

CHAPTER ONE THE BOY WHO LIVED M r. and Mrs. Dursley, 
of number four, Privet Drive, were proud to say that 
they were perfectly normal, thank you very much. They 
were the last people you’d expect to be involved in 
anything strange or mysterious, because they just didn’t 
hold with such nonsense. Mr. Dursley was the director 
of a firm called Grunnings, which made drills. He was 
a big, beefy man with hardly any neck, although he 
did have a very large mustache. Mrs. Dursley was thin 
and blonde and had nearly twice the usual amount of 
neck, which came in very useful as she spent so much 
of her time craning over garden fences, spying on the 
neighbors. The Dursleys had a small son called Dudley 
and in their opinion there was no finer boy anywhere. 
The Dursleys had everything they wanted, but they also 
had a secret, and their greatest fear was that somebody 
would discover it. They didn’t think they could bear 
it if anyone found out about the Potters. Mrs. Potter 
was Mrs. 


In [13]:
import re
def get_context(db, question, top_k = 2):
  """Build a context string from the top_k search results for question.

  The page contents of the documents returned by db.similarity_search are
  joined with newlines, and every run of tab characters is replaced by a
  single space.
  """
  hits = db.similarity_search(question, k=top_k)
  merged = "\n".join(doc.page_content for doc in hits)
  return re.sub(r"\t+", " ", merged)

In [14]:
get_context(db, question)

'CHAPTER ONE\n \nTHE BOY WHO LIVED\n \n \nM \nr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last people\nyou’d expect to be involved in anything strange or mysterious, because they just\ndidn’t hold with such nonsense.\n Mr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did have a\nvery large mustache. Mrs. Dursley was thin and blonde and had nearly twice the\nusual amount of neck, which came in very useful as she spent so much of her\ntime craning over garden fences, spying on the neighbors. The Dursleys had a\nsmall son called Dudley and in their opinion there was no finer boy anywhere.\n The Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn’t think they\ncould bear it if anyone found out about the Potters. Mrs. Potter was M

# Large Language Model (LLM)

## Modelo a usar

In [17]:
import torch
from transformers import AutoModelForQuestionAnswering
from transformers import TFAutoModelForQuestionAnswering
models_hf = ["deepset/minilm-uncased-squad2"]  # question-answering model checkpoint(s)
model_ckpt = models_hf[0]  # position 0 holds the checkpoint used below
model_ckpt

'deepset/minilm-uncased-squad2'

In [18]:
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)  # load the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)  # tokenizer that puts text into the format the model expects


Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
question = "Who is Mr. Dursley?"
context = get_context(db, question)
# Tokenize the (question, context) pair as PyTorch tensors. The pipeline call
# below receives the raw strings, not these tensors.
inputs = tokenizer(question, context, return_tensors="pt")
# Question-answering pipeline built from the loaded model and tokenizer.
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
# Ask for the three best answers. `top_k` is the current keyword; the
# previously used `topk` spelling is deprecated.
pipe(question=question, context=context, top_k=3)

Device set to use cuda:0


[{'score': 0.3200396001338959,
  'start': 318,
  'end': 357,
  'answer': 'the director of a firm called Grunnings'},
 {'score': 0.13510501384735107,
  'start': 322,
  'end': 357,
  'answer': 'director of a firm called Grunnings'},
 {'score': 0.059819500893354416,
  'start': 318,
  'end': 376,
  'answer': 'the director of a firm called Grunnings, which made\ndrills'}]