In [1]:
import json

with open("drogas_texto.json", "r", encoding="utf-8") as f:
    drogas = json.load(f)

# Tomamos las primeras 10 con texto no vacío
documentos_drogas = [d["text"] for d in drogas if d.get("text", "").strip()][:10]


In [45]:
#Necesitamos limpieza. Pruebo algo antes de aplicarselo al JSON
import re
import wordninja

def clean_text_basic(raw_text):
    # Reemplaza \n y \r explícitamente por espacio
    text = raw_text.replace("\\n", " ").replace("\n", " ").replace("\r", " ")
    
    # Reemplaza caracteres no imprimibles por espacio
    text = re.sub(r'[\x00-\x1F\x7F]', ' ', text)
    
    # Reemplaza múltiples espacios o tabs por uno solo
    text = re.sub(r'[ \t]+', ' ', text)
    
    # Añade espacio después de ';' si falta
    text = re.sub(r';([^\s])', r'; \1', text)
    
    # Añade puntos después de palabras clave para mejorar estructura
    keywords = ['Uses', 'Side effects', 'Warnings', 'Dosage', 'Interactions', 'What is', 'Introduction', 'Stop using']
    for kw in keywords:
        text = re.sub(rf'({kw})([A-Z])', rf'\1. \2', text)
    
    # Limpieza final de espacios
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


def split_concatenated_words(text):
    words = text.split()
    split_words = []
    for w in words:
        # Si palabra larga (>12 caracteres) y no todo mayúsculas, intenta separarla
        if len(w) > 12 and not w.isupper():
            split_words.extend(wordninja.split(w))
        else:
            split_words.append(w)
    return ' '.join(split_words)

def clean_text_full(raw_text):
    text = clean_text_basic(raw_text)
    text = split_concatenated_words(text)
    return text


In [46]:
documentos_drogas_clean = [clean_text_full(doc) for doc in documentos_drogas]

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(documentos_drogas_clean)


In [24]:
#pip install transformers
#!pip install torch
#!pip install wordninja
#import torch
#print(torch.__version__)
#print(torch.cuda.is_available())

#x = torch.rand(3, 3)
#print(x)


In [44]:
from transformers import pipeline

generator = pipeline("text-generation",model="distilgpt2")



Device set to use cpu


In [48]:
def retrieve_document(query, vectorizer, doc_vectors, documentos):
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, doc_vectors)
    most_similar_idx = similarities.argmax()
    return documentos[most_similar_idx]

# Ejemplo con un generador de HuggingFace (cambiar por el tuyo si usás otro)
def rag_example(query):
    document = retrieve_document(query, vectorizer, doc_vectors, documentos_drogas_clean)
    prompt = (
        f"Context: {document}\n\n"
        f"Question: {query}\n"
        f"Answer concisely in one or two sentences.\n"
        f"Answer:"
    )
    answer = generator(
    prompt,
    max_length=60,  # acortar la salida
    num_return_sequences=1,
    temperature=0.7,  # bajar temperatura para menos aleatoriedad
    top_k=30,
    top_p=0.85,
    repetition_penalty=1.2  # penaliza repetir frases
)

    # Eliminar el prompt de la salida (que a veces aparece al principio)
    generated_text = answer[0]["generated_text"]
    # Quitar el prompt para quedarse solo con la respuesta
    response = generated_text.split("Answer:")[-1].strip()
    return response



In [52]:
query = "What is Abacavir?"
rag_example(query)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


''

In [50]:
query = "Is Abacavir OK for pregnant people?"
rag_example(query)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


''

In [57]:
#Pruebo a ver qué pasó
def retrieve_document(query, vectorizer, doc_vectors, documents):
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, doc_vectors)
    idx = similarities.argmax()
    return documents[idx]

# RAG: generación
generator = pipeline("text-generation", model="distilgpt2")

def rag_example(query):
    doc = retrieve_document(query, vectorizer, doc_vectors, documentos_drogas_clean)
    prompt = f"Context: {doc}\n\nQuestion: {query}\nAnswer:"
    answer = generator(prompt, max_length=100, do_sample=True, temperature=0.8)[0]["generated_text"]
    response = answer.split("Answer:")[-1].strip()
    return response

# Test
query = "What is Abacavir?"
respuesta = rag_example(query)
print("🧠 Respuesta:\n", respuesta)

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


🧠 Respuesta:
 Drug is a safe and affordable drug that is effective for treatment of a serious condition with severe side effects that include nausea, vomiting, and nausea. This new drug is expected to be available for the first time in the United States. In the United States, the U.S. Food and Drug Administration has approved a Phase III trial of NRTIs that is designed to improve safety and effectiveness of the drug. The FDA has approved Phase III clinical trials in several countries, including the United States.The US Food and Drug Administration's approval of a Phase III trial for NRTIs is in the works, and this clinical trial is expected to be of the highest quality in the United States. The NRTIs have demonstrated long-term safety and benefits in these clinical trials. In addition, the NRTIs are safe, safe and effective for treatment of a major group of the conditions known as chronic conditions such as cardiovascular disease, stroke, cardiovascular disease, and multiple sclerosis.

In [58]:
rag_example("Is Abacavir good when pregnant?")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'Injector:Oxycycline (Oxycycline) is a low-fat, no-vitamin (Oxycycline) drug that is high in calcium. It contains a low-fat, no-vitamin (Oxycycline) (Oxycycline) (Oxycycline) (Oxycycline) (Oxycycline) (Oxycycline) (Oxycycline) (Oxycycline) It is available for non-medical use by the person with whom it should be used for a prescription, if necessary, and in small doses for non-medical uses in the United States. The drug is formulated to work with known side effects such as kidney stones, kidney stones, and kidney stones.[11] In its most recent form, Oxycycline is manufactured in a blend of C, A, B, C, and C, and is available in the United States (as a prescription only).[12] Oxycycline is used in many other forms, including birth control, prescription drugs, and prescription drugs.[13] It is also formulated with a high-butyric acid content and has a low-protein component, and is less than 3 times the amount of the plant'

In [59]:
#Conclusion: Hay que laburar el texto, para separarle las partes (Ver los títulos, y generar cortes con eso).
#Imagino hacer splits en el json por what is ..., Warnings..., side effects..., Uses..., Before taking this medicine..., How should I use...., dosing..., What happens if..., 