In [26]:
import requests
from bs4 import BeautifulSoup

def extract_lines_from_web(url, timeout=10, tag='div', css_class='content'):
    """
    Download a web page and return the text of its main container as lines.

    Args:
        url (str): URL of the page to fetch.
        timeout (float | tuple): Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled server would block
            the notebook cell indefinitely.
        tag (str): Name of the HTML tag that wraps the main content.
        css_class (str): CSS class of that tag. Adjust ``tag`` / ``css_class``
            to match the structure of the target page.

    Returns:
        list: Non-empty, whitespace-stripped lines of text. An empty list is
        returned (after printing a message) when the request fails, the
        status code is not 200, or the container cannot be found.
    """
    # Perform the HTTP request; network failures are reported the same way as
    # HTTP errors (message + empty list) instead of raising a traceback.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error al acceder a la página: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error al acceder a la página: {response.status_code}")
        return []

    # Parse the HTML of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the container holding the main text (by default <div class="content">).
    content = soup.find(tag, {'class': css_class})

    if not content:
        print("No se pudo encontrar el contenido principal de la página.")
        return []

    # Extract all text inside the container, one text node per line.
    # NOTE: the "\n" separator is inserted between *every* text node, so text
    # wrapped in inline tags (e.g. an italic book title in the middle of a
    # sentence) ends up on its own line and the sentence is split in pieces.
    full_text = content.get_text("\n", strip=True)

    # Split into lines, dropping the ones that are empty after stripping
    stripped_lines = (line.strip() for line in full_text.splitlines())
    return [line for line in stripped_lines if line]

# Example usage: scrape the book review that feeds the vector store below

url = "https://oyister.oyis.org/articles/book-review-the-stranger-by-albert-camus"

# Extract the lines of text from the web page (empty list on failure)
web_lines = extract_lines_from_web(url)

# Print the extracted lines
print(web_lines)


['Book Review: The Stranger By Albert Camus', 'Review', 'Dec 16', 'Written By', 'Inhyuk K.', '“Maman died today. Or yesterday maybe, I don’t know. I got a telegram from the home: ‘Mother deceased. Funeral tomorrow. Faithfully yours.’ That doesn’t mean anything. Maybe it was yesterday.”', 'What an opening line.', 'The Stranger', 'written by French writer Albert Camus is a novel that is centered around the philosophical idea of existentialism.', 'It would be absurd to define existentialism in a paragraph, but here is the oversimplified summary of existentialism: the world is full of meaninglessness. There is no innate purpose or rules in our lives, and all of us are running towards the same ending: death. Existentialism says that in a meaningless world, we must make our own choices, bear responsibility for those choices, and find meaning and purpose on our own.', 'The main character of', 'The Stranger', ', Meursault, personifies existentialism. The opening line spoken by Meursault clearl

In [27]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by every cell that builds or queries the vector store.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [29]:
from langchain_chroma import Chroma


# Index the scraped lines in the "book_review" collection.
# Explicit, deterministic ids make this cell idempotent: re-running it
# overwrites the same entries instead of appending a second copy of every
# line (which would make similarity searches return duplicated hits).
vector_store = Chroma.from_texts(
    texts=web_lines,
    ids=[f"line-{i}" for i in range(len(web_lines))],
    collection_name="book_review",
    embedding=embeddings,
)

In [30]:
from langchain_chroma import Chroma

# Rebinds `vector_store` to a new handle on the "book_review" collection.
# NOTE(review): no persist directory is given, so this presumably resolves to
# the same in-memory collection populated by Chroma.from_texts in the previous
# cell -- confirm. If so, this cell is redundant; run on its own in a fresh
# kernel it would yield an empty collection.
vector_store = Chroma(
    collection_name="book_review",
    embedding_function=embeddings,
)

In [31]:
# Wrap the vector store in a retriever (default search settings) so it can be
# plugged into the chain as the source of the prompt's context.
retriever = vector_store.as_retriever()

In [32]:
# Sanity check: query the vector store directly (bypassing the retriever)
# and count how many documents come back for a sample question.
question = "Name of the protagonist"
docs = vector_store.similarity_search(question)
len(docs)

4

In [33]:
# Inspect the first document returned by the similarity search above.
docs[0]

Document(metadata={}, page_content='The main character of')

In [34]:
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

# Prompt: the model is instructed to answer using only the retrieved context.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Local LLM served through Ollama
ollama_llm = "llama3.2"
model_local = ChatOllama(model=ollama_llm)


def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    The retriever yields Document objects; placing that list directly in the
    prompt would insert its Python repr (``[Document(metadata=..., ...)]``)
    rather than the plain text, so only ``page_content`` is kept here.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Chain: the input question is sent to the retriever (whose hits are flattened
# to text) and, unchanged, to the prompt; the filled prompt goes to the local
# model and the reply is parsed into a plain string.
chain = (
    {
        "context": retriever | RunnableLambda(format_docs),
        "question": RunnablePassthrough(),
    }
    | prompt
    | model_local
    | StrOutputParser()
)

In [40]:
# Run the chain end to end: the string is used both as the retrieval query
# and as the question placed in the prompt.
chain.invoke("Tell me about the author of the book")

'The author of the book is Albert Camus, a French writer.'