In [14]:
import os

import torch
from google import genai
from langchain.document_loaders import DirectoryLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModel

class HuggingfaceEmbeddings:
    """Embed text with a Hugging Face transformer model.

    Provides ``embed_documents`` and ``embed_query``. Each text is tokenized
    on its own, run through the model, and reduced to a single vector by
    averaging ``last_hidden_state`` over dimension 1.
    """

    def __init__(self, model_name):
        """Load the tokenizer and the model identified by *model_name*."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def _embed(self, text):
        """Return the pooled embedding of a single text as a 1-D numpy array."""
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        # Inference only: disabling autograd avoids building a gradient graph
        # that would be thrown away immediately afterwards.
        with torch.no_grad():
            outputs = self.model(**inputs)
        pooled = outputs.last_hidden_state.mean(dim=1)
        # The batch holds exactly one text, so take row 0.
        return pooled.detach().numpy()[0]

    def embed_documents(self, texts):
        """Return a list with one embedding per entry of *texts*, in order.

        An empty *texts* yields an empty list.
        """
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding of a single query string.

        Uses the same pipeline as ``embed_documents`` so that query and
        document vectors are directly comparable.
        """
        return self._embed(text)

# Load the documents from a folder
loader = DirectoryLoader('Teoria/')
documents = loader.load()

# Split the documents into manageable chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Create the Hugging Face embedder used for both documents and queries
embeddings = HuggingfaceEmbeddings('sentence-transformers/all-MiniLM-L6-v2')

# Embed every chunk exactly once and build the FAISS index from those
# precomputed vectors. Previously the vectors were computed here and then
# discarded, while FAISS.from_documents embedded every chunk a second time.
texts = [doc.page_content for doc in docs]
text_embeddings = embeddings.embed_documents(texts)
db = FAISS.from_embeddings(list(zip(texts, text_embeddings)), embeddings)

# Configure the Gemini API client.
# SECURITY: credentials must not be hard-coded in source. The key is read from
# the environment instead; the key that was previously embedded here should be
# treated as leaked and revoked.
_gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Gemini API key not found: set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
        "environment variable."
    )
client = genai.Client(api_key=_gemini_api_key)

def query_gemini(question, docs, *, model="gemini-2.0-flash"):
    """Ask Gemini a question, using the given documents as context.

    Args:
        question: Question text; appended after the context, prefixed with
            "Pregunta: ".
        docs: Iterable of mappings, each with a 'content' key holding text.
            The contents are joined with newlines to form the context.
        model: Name of the Gemini model to call. Keyword-only; defaults to
            the model that was previously hard-coded.

    Returns:
        The ``text`` attribute of the generation response.
    """
    context = "\n".join(doc['content'] for doc in docs)
    response = client.models.generate_content(
        model=model,
        contents=f"{context}\n\nPregunta: {question}",
    )
    return response.text

# Ask a question and print the answer.
query = "Completa la siguiente frase según el contexto que se te da: Scikit-Learn utiliza el algoritmo Classification and Regression Tree (CART)..."

# Embed the question with the same embedder used for the index, then
# retrieve the stored chunks closest to that vector.
query_embedding = embeddings.embed_query(query)
similar_docs = db.similarity_search_by_vector(query_embedding)

# query_gemini expects a list of mappings with a 'content' key, so wrap the
# text of each retrieved document accordingly.
formatted_docs = []
for retrieved in similar_docs:
    formatted_docs.append({"content": retrieved.page_content})

answer = query_gemini(query, formatted_docs)
print(answer)



Scikit-Learn utiliza el algoritmo Classification and Regression Tree (CART) **para entrenar árboles de decisión.**

