## installs

In [None]:
%pip install -qU langchain-ollama langchain-community beautifulsoup4

## imports

In [2]:
from langchain.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import re
from bs4 import BeautifulSoup
from typing import List
import requests
import tiktoken
from langchain_community.document_loaders import RecursiveUrlLoader


In [4]:
# LLM handle used by the cells below: the "llama3.1" model with temperature 0.5.
llm = OllamaLLM(model="llama3.1", temperature=0.5)

## ollama.com tags

In [None]:

# Build a RecursiveUrlLoader rooted at the llama3.2 tags page on ollama.com.
# The commented-out keyword arguments are kept only as a reminder of options
# that are not being set here.
loader = RecursiveUrlLoader(
    "https://ollama.com/library/llama3.2/tags",
    # max_depth=2,
    # use_async=False,
    # extractor=None,
    # metadata_extractor=None,
    # exclude_dirs=(),
    # timeout=10,
    # check_response_status=True,
    # continue_on_failure=True,
    # prevent_outside=True,
    # base_url=None,
    # ...
)

docs = loader.load()
# Trailing expression: the first document's metadata becomes the cell output.
docs[0].metadata


In [None]:
# Dump the full content of the first loaded document for inspection.
print(docs[0].page_content)

In [None]:

def bs4_extractor(html: str) -> str:
    """Return the text of *html* with blank-line runs collapsed.

    The markup is parsed with BeautifulSoup using the "lxml" parser, every run
    of two or more newlines in the resulting text is replaced by exactly two,
    and leading/trailing whitespace is stripped.
    """
    page_text = BeautifulSoup(html, "lxml").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()


# Load the same tags page again, this time passing bs4_extractor as the
# loader's `extractor`.
loader = RecursiveUrlLoader("https://ollama.com/library/llama3.2/tags", extractor=bs4_extractor)
docs = loader.load()
# Preview only the first 1200 characters of the first document.
print(docs[0].page_content[:1200])

In [None]:

def analyze_tags(tags_and_descriptions: str, model):
    """Ask *model* to summarise the given tag text, focusing on small models.

    Args:
        tags_and_descriptions: Text appended verbatim after the fixed
            instruction header.
        model: Either an object exposing ``invoke(prompt)`` (e.g. a LangChain
            LLM) or a plain callable taking the prompt string.

    Returns:
        Whatever the model returns for the assembled prompt.
    """
    prompt = (
        "The following is content extracted from a document. Analyze and summarize the details "
        "specifically focusing on models optimized for small sizes:\n\n"
    ) + tags_and_descriptions
    # Calling a LangChain LLM object directly is deprecated in favour of
    # ``invoke``; plain callables are still supported as a fallback.
    invoke = getattr(model, "invoke", None)
    if callable(invoke):
        return invoke(prompt)
    return model(prompt)

def analyze_tags_in_docs(docs: List, llm, max_tokens: int = 1800):
    """Run ``analyze_tags`` on every document and join the answers.

    Note that ``max_tokens`` is applied as a plain slice of ``page_content``,
    i.e. it limits the number of *characters* sent to the model, not tokens.

    Args:
        docs: Objects exposing a ``page_content`` string.
        llm: Model handle forwarded to ``analyze_tags``.
        max_tokens: Length of the ``page_content`` prefix that is analysed.

    Returns:
        The per-document answers joined with newlines.
    """
    summaries = [
        # Only the leading slice of each document is sent to the model.
        analyze_tags(item.page_content[:max_tokens], llm)
        for item in docs
    ]
    return "\n".join(summaries)

# Example call: analyse the documents loaded above and print the joined answer.
final_response = analyze_tags_in_docs(docs, llm)
print(final_response)




### count_tokens

In [None]:


def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    """Return how many tokens *text* encodes to under the named tiktoken encoding."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(text)
    return len(token_ids)

def truncate_text_to_tokens(text: str, max_tokens: int, encoding_name: str = "cl100k_base") -> str:
    """Return the prefix of *text* that fits in at most *max_tokens* tokens.

    Args:
        text: Text to truncate.
        max_tokens: Token budget. Zero or negative values yield an empty string.
        encoding_name: Name of the tiktoken encoding used to tokenize.

    Returns:
        The decoded text of the first ``max_tokens`` tokens.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # A negative budget (e.g. when the fixed prompt already exceeds the window)
    # would otherwise slice from the END and keep almost the whole text.
    budget = max(max_tokens, 0)
    return encoding.decode(encoding.encode(text)[:budget])

def analyze_tags(tags_and_descriptions: str, model):
    """Ask *model* to summarise the given tag text, focusing on small models.

    Args:
        tags_and_descriptions: Text appended verbatim after the fixed
            instruction header.
        model: Either an object exposing ``invoke(prompt)`` (e.g. a LangChain
            LLM) or a plain callable taking the prompt string.

    Returns:
        Whatever the model returns for the assembled prompt.
    """
    prompt = (
        "The following is content extracted from a document. Analyze and summarize the details "
        "specifically focusing on models optimized for small sizes:\n\n"
    ) + tags_and_descriptions
    # Calling a LangChain LLM object directly is deprecated in favour of
    # ``invoke``; plain callables are still supported as a fallback.
    invoke = getattr(model, "invoke", None)
    if callable(invoke):
        return invoke(prompt)
    return model(prompt)

def analyze_tags_in_docs(docs: List, llm, max_tokens: int = 4096, encoding_name: str = "cl100k_base"):
    """Analyse each document's tags while keeping the prompt within a token budget.

    The fixed instruction header is counted first and subtracted from
    ``max_tokens``; each document's ``page_content`` is then truncated to the
    remaining budget before being passed to ``analyze_tags``.

    Args:
        docs: Objects exposing a ``page_content`` string.
        llm: Model handle forwarded to ``analyze_tags``.
        max_tokens: Total token budget for header plus document text.
        encoding_name: tiktoken encoding used for counting and truncating.

    Returns:
        The per-document answers joined with newlines.
    """
    instruction_header = (
        "The following is content extracted from a document. Analyze and summarize the details "
        "specifically focusing on models optimized for small sizes:\n\n"
    )
    # Tokens left for document text once the fixed header is accounted for.
    allowed_tokens = max_tokens - count_tokens(instruction_header, encoding_name)

    answers = []
    for item in docs:
        clipped = truncate_text_to_tokens(item.page_content, allowed_tokens, encoding_name)
        answers.append(analyze_tags(clipped, llm))
    return "\n".join(answers)

# Example call.
# `docs` must be a list of objects exposing a `page_content` attribute.
# `llm` must be a configured LangChain LLM object.
final_response = analyze_tags_in_docs(docs, llm)
print(final_response)


## get movies

In [37]:

def bs4_extractor(html: str) -> str:
    """Return the text of *html* with blank-line runs collapsed.

    The markup is parsed with BeautifulSoup using the "lxml" parser, every run
    of two or more newlines in the resulting text is replaced by exactly two,
    and leading/trailing whitespace is stripped.
    """
    page_text = BeautifulSoup(html, "lxml").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()


# Load the movie site's home page, passing bs4_extractor as the loader's `extractor`.
loader = RecursiveUrlLoader("https://bflixhd.lol/home/", extractor=bs4_extractor)
docs = loader.load()

In [None]:
# Trailing expression: the first document's content becomes the cell output.
docs[0].page_content


In [None]:


def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    """Return how many tokens *text* encodes to under the named tiktoken encoding."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(text)
    return len(token_ids)


def truncate_text_to_tokens(text: str, max_tokens: int, encoding_name: str = "cl100k_base") -> str:
    """Return the prefix of *text* that fits in at most *max_tokens* tokens.

    Args:
        text: Text to truncate.
        max_tokens: Token budget. Zero or negative values yield an empty string.
        encoding_name: Name of the tiktoken encoding used to tokenize.

    Returns:
        The decoded text of the first ``max_tokens`` tokens.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # A negative budget (e.g. when the fixed prompt already exceeds the window)
    # would otherwise slice from the END and keep almost the whole text.
    budget = max(max_tokens, 0)
    return encoding.decode(encoding.encode(text)[:budget])


def analyze_tags(tags_and_descriptions: str, model, base_prompt: str) -> str:
    """Send ``base_prompt`` followed by the given text to *model*.

    Args:
        tags_and_descriptions: Text appended verbatim after ``base_prompt``.
        model: Either an object exposing ``invoke(prompt)`` (e.g. a LangChain
            LLM) or a plain callable taking the prompt string.
        base_prompt: Instruction text placed before the data.

    Returns:
        Whatever the model returns for the assembled prompt.
    """
    prompt = base_prompt + tags_and_descriptions
    # Calling a LangChain LLM object directly is deprecated in favour of
    # ``invoke``; plain callables are still supported as a fallback.
    invoke = getattr(model, "invoke", None)
    if callable(invoke):
        return invoke(prompt)
    return model(prompt)


def analyze_tags_in_docs(docs: List, llm, max_tokens: int = 4096, encoding_name: str = "cl100k_base") -> str:
    """Summarise movie data from each document within a token budget.

    The fixed Portuguese instruction prompt is counted first and subtracted
    from ``max_tokens``; each document's ``page_content`` is truncated to the
    remaining budget and sent to ``analyze_tags`` together with the prompt.

    Args:
        docs: Objects exposing a ``page_content`` string.
        llm: Model handle forwarded to ``analyze_tags``.
        max_tokens: Total token budget for prompt plus document text.
        encoding_name: tiktoken encoding used for counting and truncating.

    Returns:
        The per-document answers joined with newlines.

    Raises:
        ValueError: If any item in ``docs`` lacks a ``page_content`` attribute.
    """
    # Each instruction sits on its own line; previously consecutive sentences
    # were concatenated without any separator and one word was garbled.
    base_prompt = (
        "Você é especialista em análise de filmes.\n"
        "Analise o conteúdo abaixo, entenda e extraia detalhes relevantes conforme o exemplo abaixo.\n"
        "ex: nome: Moana 2  nota: 7.2 ano: 2024 tempo: 100 min\n"
        "Por favor, me dê apenas as informações principais sobre o site, sem avisos sobre segurança ou legalidade.\n"
        "Gostaria de uma resposta curta e objetiva, sem explicações adicionais.\n"
        "Entenda os dados e me dê um resumo bem pequeno dos dados conforme exemplo.\n"
        "Dados:\n\n"
    )

    # Validate everything before the first (slow) model call so a bad item
    # does not waste the work already done on earlier documents.
    docs = list(docs)
    for doc in docs:
        if not hasattr(doc, 'page_content'):
            raise ValueError(
                "Cada item em 'docs' deve conter o atributo 'page_content'.")

    # Never let the budget go negative: a negative slice bound would keep
    # almost the whole document instead of none of it.
    allowed_tokens = max(0, max_tokens - count_tokens(base_prompt, encoding_name))

    responses = []
    for doc in docs:
        limited_content = truncate_text_to_tokens(
            doc.page_content, allowed_tokens, encoding_name)
        responses.append(analyze_tags(limited_content, llm, base_prompt))

    return "\n".join(responses)


class Document:
    """Minimal container exposing its text under a ``page_content`` attribute."""

    def __init__(self, content) -> None:
        # Stored under the attribute name that analyze_tags_in_docs reads.
        self.page_content = content


# Analyse the documents loaded from the movie site and print the joined answer.
result = analyze_tags_in_docs(docs, llm)
print(result)

In [5]:
def load_data_from_url(url, timeout=10):
    """Download *url* and return the text content of the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The text extracted from the HTML by BeautifulSoup's ``get_text()``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.get_text()

def split_into_chunks(data, max_tokens, tokenizer):
    """Split *data* into consecutive pieces of at most *max_tokens* tokens.

    Args:
        data: Text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        tokenizer: Object providing ``encode(text)`` and ``decode(tokens)``.

    Returns:
        A list of decoded chunk strings, in order. Empty when *data* encodes
        to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive. A negative step used to
            produce an empty list silently, discarding all of the data.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens deve ser um inteiro positivo.")
    tokens = tokenizer.encode(data)
    return [
        tokenizer.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]


In [None]:

# Settings
url = "https://bflixhd.lol/the-judge-from-hell/"  # Replace with the desired URL
question = "nome dos atores, origem do filme, ano de lançamento, sinopse" 
llm = OllamaLLM(model="llama3.1", temperature=0.9)
MAX_CHUNK_TOKENS = 4048  # Maximum number of tokens allowed per chunk

# Load the page and split its text into chunks.
# NOTE(review): chunk sizes are measured with tiktoken's "gpt2" encoding, which
# is presumably not the tokenizer llama3.1 uses, so counts are approximate.
try:
    raw_data = load_data_from_url(url)
    tokenizer = tiktoken.get_encoding("gpt2")
    chunks = split_into_chunks(raw_data, MAX_CHUNK_TOKENS, tokenizer)
except Exception as e:
    # Best effort: report the failure and continue with no chunks.
    print(f"Erro ao carregar ou processar os dados: {e}")
    chunks = []
# Trailing expression: the chunk list becomes the cell output.
chunks

In [None]:

# Build the prompt template: the question followed by the chunk text.
template = """
{question}
{data}
"""
prompt = ChatPromptTemplate.from_template(template)
# The chain does not depend on the chunk, so build it once outside the loop.
chain = prompt | llm

# Process every chunk and accumulate the answers.
responses = []
token_counts = []
# This prints how many chunks there are (the old label said "tokens por chunk").
print("numero de chunks", len(chunks))
for chunk in chunks:
    token_count = len(tokenizer.encode(chunk))
    print("token_count", token_count)
    token_counts.append(token_count)
    print("token_counts.append", token_counts)
    response = chain.invoke({"question": question, "data": chunk})
    responses.append(response)

# Combine the per-chunk answers into one final answer.
final_answer = " ".join(responses)

# Show the final answer and the token count of each chunk.
print("LLM Final Answer:")
print(final_answer)
print("\nToken Counts per Chunk:")
print(token_counts)


In [None]:
# Scores

# Settings
url = "https://bflixhd.lol/filters/"  # Replace with the desired URL
question = "nome do filme, ano de lançamento, e Scores, votes ou numero de views e tambem a sinopse" 
# Text substituted for {few_shot} in the prompt template: an instruction
# describing the JSON fields to extract, followed by sample raw listing rows.
# NOTE(review): the rows come without expected JSON output, so this is an
# instruction plus sample input rather than worked input/output examples.
few_shot = '''
Considere os seguintes dados sobre filmes e séries. Extraia as informações de cada item e estruture-as de forma organizada como um JSON. Cada item deve conter os seguintes campos:
- type: Tipo do conteúdo (ex: "Movie", "Series").
- title: Título do conteúdo.
- season: Número da temporada (se aplicável).
- episode: Número do episódio (se aplicável).
- year: Ano de lançamento.
- rating: Avaliação do conteúdo (nota).
- country: País de origem (se aplicável).
- genre: Gêneros associados ao conteúdo.
- duration: Duração do filme ou episódio (se aplicável).
- score: A média das avaliações (se disponível).
- reviews_count: Número de avaliações.
- description: Descrição ou sinopse do conteúdo.

Aqui estão os dados:

HD    2024 SS 1 EP 15 Secret LevelSecret LevelHDTV-142024 7.9Country: USAGenre:Action, AnimationScores:7.9 by 14 reviews Adult animated series of original short stories which are set within the worlds of beloved video games. Each episode serves as a gateway to a new adventure, unlocking exciting worlds ...
HD    2024 Movie 102 min Ebenezer the TravelerEbenezer the TravelerHDNR2024 7102 minGenre:FantasyScores:7 by 1 reviews Ebenezer the Traveler continues the journey of the famous Ebenezer Scrooge following the events of "A Christmas Carol". In this untold twist, Ebenezer Scrooge is now enlisted as one of ...
HD    2024 Movie 91 min That ChristmasThat ChristmasHDPG2024 7.591 minCountry: UKGenre:Adventure, AnimationScores:7.5 by 54 reviews It\'s an unforgettable Christmas for the townsfolk of Wellington-on-Sea when the worst snowstorm in history alters everyone\'s plans — including Santa\'s.
HD    2024 Movie 110 min MaryMaryHDPG-132024 6.3110 minCountry: USAGenre:Action, DramaScores:6.3 by 27 reviews A miraculous conception. A merciless king. A murderous pursuit. Mary\'s journey to give birth to Jesus unfolds in this biblical coming-of-age epic.
HD    2024 SS 1 EP 2 Star Wars: Skeleton CrewStar Wars: Skeleton CrewHDTV-PG2024 7.2Country: USAGenre:Action, Science FictionScores:7.2 by 40 reviews Four ordinary kids search for their home planet after getting lost in the Star Wars galaxy.
HD    2024 SS 1 EP 4 Tomorrow and ITomorrow and IHDTV-MA2024 8.2Country: ThailandGenre:Drama, Science FictionScores:8.2 by 6 reviews This series reimagines Thailand in a dystopian future where technology scrapes at the surface of old customs, exposing rips in the fabric of culture.
HD    2024 SS 1 EP 6 Black DovesBlack DovesHDTV-MA2024 8Country: UKGenre:Action, CrimeScores:8 by 14 reviews When a spy posing as a politician\'s wife learns her lover has been murdered, an old assassin friend joins her on a quest for truth — and vengeance.
HD    2024 SS 1 EP 6 SennaSennaHDTV-MA2024 8.2Country: BrazilGenre:DramaScores:8.2 by 45 reviews Fascinated by cars since childhood, Brazilian racer Ayrton Senna became a sports legend — until tragedy struck, changing Formula 1 forever.
CAM    2024 Movie 100 min Moana 2Moana 2CAMPG2024 7.2100 minCountry: CanadaGenre:Adventure, AnimationScores:7.2 by 94 reviews After receiving an unexpected call from her wayfinding ancestors, Moana journeys alongside Maui and a new crew to the far seas of Oceania and into dangerous, long-lost waters for an ...
CAM    2024 Movie 161 min WickedWickedCAMPG2024 7.7161 minCountry: USAGenre:Drama, FantasyScores:7.7 by 255 reviews When ostracized and misunderstood green-skinned Elphaba is forced to share a room with the popular aristocrat Glinda.

'''

llm = OllamaLLM(model="llama3.1", temperature=0.9)
MAX_CHUNK_TOKENS = 4048  # Maximum number of tokens allowed per chunk

# Load the page and split its text into chunks.
# NOTE(review): chunk sizes are measured with tiktoken's "gpt2" encoding, which
# is presumably not the tokenizer llama3.1 uses, so counts are approximate.
try:
    raw_data = load_data_from_url(url)
    tokenizer = tiktoken.get_encoding("gpt2")
    chunks = split_into_chunks(raw_data, MAX_CHUNK_TOKENS, tokenizer)
except Exception as e:
    # Best effort: report the failure and continue with no chunks.
    print(f"Erro ao carregar ou processar os dados: {e}")
    chunks = []
# Trailing expression: the chunk list becomes the cell output.
chunks


In [None]:

# Build the prompt template: chunk text, then the extraction instructions,
# then the question.
template = """
{data}
{few_shot}
{question}
"""
prompt = ChatPromptTemplate.from_template(template)
# The chain does not depend on the chunk, so build it once outside the loop.
chain = prompt | llm

# Process every chunk and accumulate the answers.
responses = []
token_counts = []
# This prints how many chunks there are (the old label said "tokens por chunk").
print("numero de chunks", len(chunks))
for chunk in chunks:
    token_count = len(tokenizer.encode(chunk))
    print("token_count", token_count)
    token_counts.append(token_count)
    print("token_counts.append", token_counts)
    response = chain.invoke(
        {
            "question": question,
            "few_shot": few_shot,
            "data": chunk,
        }
    )
    responses.append(response)

# Combine the per-chunk answers into one final answer.
final_answer = " ".join(responses)

# Show the final answer and the token count of each chunk.
print("LLM Final Answer:")
print(final_answer)
print("\nToken Counts per Chunk:")
print(token_counts)

tempo de execução
ollama
7m

llm studio
