Clonando o repositório

In [1]:
!git clone https://github.com/Mintplex-Labs/anything-llm.git

Cloning into 'anything-llm'...
remote: Enumerating objects: 35473, done.[K
remote: Counting objects: 100% (353/353), done.[K
remote: Compressing objects: 100% (204/204), done.[K
remote: Total 35473 (delta 241), reused 149 (delta 149), pack-reused 35120 (from 2)[K
Receiving objects: 100% (35473/35473), 48.95 MiB | 33.58 MiB/s, done.
Resolving deltas: 100% (22037/22037), done.


Execução do avalia_md.py

In [2]:
import os
import re
import json
import unicodedata
import time
import torch
from transformers import pipeline
import subprocess

print("GPU disponível?", torch.cuda.is_available())

if not torch.cuda.is_available():
    print("Ative a GPU em Runtime -> Change runtime type -> T4 GPU.")

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# =========================
# CONFIGURAÇÕES
# =========================

CAMINHO_REPO = "anything-llm"  # Substitua pelo caminho do repositório
LIMITE_CARACTERES = 3000  # Limite máximo por análise
ARQUIVO_SAIDA = "avalia_md.json"
ARQUIVO_SAIDA_TXT = "avalia_md.txt"

PADROES = [
    "Client-Server (a centralized server provides resources or services to multiple clients over a network)",
    "Blackboard (components work cooperatively by reading and writing shared data on a common knowledge base)",
    "Shared-Data (components communicate indirectly through shared data repositories or databases)",
    "Data-Model (the architecture centers around structured data schemas and access layers)",
    "Publish-Subscribe (components communicate asynchronously through message topics or events)",
    "Service-Oriented Architecture (system organized into reusable services communicating via standardized interfaces)",
    "Peer-to-Peer (decentralized network where each node can act as both client and server)",
    "Pipe-Filter (data flows through a sequence of processing steps, each transforming the input into output)",
    "Layers (system organized into hierarchical layers like presentation, logic, and data access)",
    "Microservices (independently deployable small services communicating via APIs or messaging)",
    "Blockchain (distributed ledger storing transactions in cryptographically linked blocks)"
]

# =========================
# MODELOS
# =========================

print("🧠 Carregando modelos...")

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# =========================
# FUNÇÕES AUXILIARES
# =========================

def limpar_markdown(texto):
    """Remove partes irrelevantes de arquivos .md"""
    texto = texto.encode("utf-8", "ignore").decode("utf-8")
    texto = ''.join(c for c in texto if c.isprintable())
    texto = unicodedata.normalize("NFKD", texto)
    texto = re.sub(r"```.*?```", "", texto, flags=re.DOTALL)  # blocos de código
    texto = re.sub(r"!\[.*?\]\(.*?\)", "", texto)  # imagens
    texto = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", texto)  # links [texto](url)
    texto = re.sub(r"http\S+", "", texto)  # urls soltas
    texto = re.sub(r"(^|\n)[#>\-\*\+]+\s*", "\n", texto)  # títulos, listas
    texto = re.sub(r"\n\s*\n+", "\n", texto)  # espaços múltiplos
    texto = re.sub(r" +", " ", texto)  # espaços duplos
    return texto.strip()

def dividir_texto(texto, limite):
    """Divide texto em partes menores respeitando o limite de caracteres"""
    partes = []
    while len(texto) > limite:
        corte = texto[:limite].rfind(".")
        if corte == -1:
            corte = limite
        partes.append(texto[:corte])
        texto = texto[corte:]
    if texto.strip():
        partes.append(texto.strip())
    return partes

# =========================
# PROCESSAMENTO
# =========================

resultados = []

tempo_execucao = time.perf_counter()

for raiz, dirs, arquivos in os.walk(CAMINHO_REPO):

     dirs[:] = [d for d in dirs if d.lower() != "locales"]

     for nome_arquivo in arquivos:
        if nome_arquivo.endswith(".md"):
            caminho = os.path.join(raiz, nome_arquivo)
            print(f"\n📄 Lendo {caminho}")

            with open(caminho, "r", encoding="utf-8", errors="ignore") as f:
                conteudo = f.read()

            conteudo_limpo = limpar_markdown(conteudo)
            if not conteudo_limpo.strip():
                print("⚪ Ignorado (sem conteúdo relevante)")
                continue

            partes = dividir_texto(conteudo_limpo, LIMITE_CARACTERES)
            resumo_final = ""

            for i, parte in enumerate(partes):

                input_length = len(parte.split())
                # Ajusta automaticamente os limites de resumo com base no tamanho da entrada
                max_len = max(40, int(input_length * 0.8))  # 80% do tamanho original, mínimo 40 tokens
                min_len = max(20, int(input_length * 0.3))  # 30% do tamanho original, mínimo 20 tokens

                try:
                    resumo = summarizer(parte,max_length=max_len,min_length=min_len, do_sample=False)[0]["summary_text"]
                    resumo_final += resumo + " "
                except Exception as e:
                    print(f"⚠️ Erro ao resumir parte {i+1}: {e}")

            if resumo_final.strip():
                try:
                    classificacao = classifier(
                        resumo_final,
                        candidate_labels=PADROES,
                        hypothesis_template="This project follows the following software architecture pattern: {}."
                    )
                    padrao_predito = classificacao["labels"][0]
                    confianca = classificacao["scores"][0]

                    print(f"🔹 {nome_arquivo}: {padrao_predito} ({confianca:.1%})")

                    resultados.append({
                        "arquivo": caminho, #nome_arquivo,
                        "resumo": resumo_final.strip(),
                        "padrao_arquitetural": padrao_predito,
                        "confianca": round(confianca, 3)
                    })
                except Exception as e:
                    print(f"⚠️ Erro ao classificar {nome_arquivo}: {e}")

# =========================
# SALVAR RESULTADOS
# =========================

with open(ARQUIVO_SAIDA, "w", encoding="utf-8") as f:
    json.dump(resultados, f, ensure_ascii=False, indent=2)

#try:
#    subprocess.run(["python", "json_to_txt.py"], check=True)
#    print("✅ Conversão concluída com sucesso!")
#except subprocess.CalledProcessError as e:
#    print(f"⚠️ Erro ao converter para txt: {e}")

tempo_execucao = time.perf_counter() - tempo_execucao

print("\n✅ Análise concluída!")
print(f"📁 Resultados salvos em: {ARQUIVO_SAIDA_TXT} {ARQUIVO_SAIDA}")

print(f"⏱️ Tempo de execução {tempo_execucao}")

GPU disponível? True
🧠 Carregando modelos...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0



📄 Lendo anything-llm/CONTRIBUTING.md
🔹 CONTRIBUTING.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (19.6%)

📄 Lendo anything-llm/README.md
🔹 README.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (30.3%)

📄 Lendo anything-llm/pull_request_template.md
🔹 pull_request_template.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (24.8%)

📄 Lendo anything-llm/SECURITY.md
🔹 SECURITY.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (24.4%)

📄 Lendo anything-llm/BARE_METAL.md


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


🔹 BARE_METAL.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (19.9%)

📄 Lendo anything-llm/server/utils/prisma/PRISMA.md
🔹 PRISMA.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (29.8%)

📄 Lendo anything-llm/server/utils/vectorDbProviders/astra/ASTRA_SETUP.md
🔹 ASTRA_SETUP.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (61.3%)

📄 Lendo anything-llm/server/utils/vectorDbProviders/pgvector/SETUP.md
🔹 SETUP.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (49.5%)

📄 Lendo anything-llm/server/utils/vectorDbProviders/milvus/MILVUS_SETUP.md
🔹 MILVUS_SETUP.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (55.0%)

📄 Lendo anything-llm/server/utils/vectorDbProviders/chroma/CHROMA_SETUP.md
🔹 CHROMA_SETUP.md: Shared-Data (components communicate indirec

Your max_length is set to 40, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)


🔹 DOCUMENTS.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (38.0%)

📄 Lendo anything-llm/collector/hotdir/__HOTDIR__.md
🔹 __HOTDIR__.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (40.2%)

📄 Lendo anything-llm/docker/HOW_TO_USE_DOCKER.md
🔹 HOW_TO_USE_DOCKER.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (17.4%)

📄 Lendo anything-llm/cloud-deployments/digitalocean/terraform/DEPLOY.md
🔹 DEPLOY.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (14.7%)

📄 Lendo anything-llm/cloud-deployments/helm/charts/anythingllm/README.md
🔹 README.md: Shared-Data (components communicate indirectly through shared data repositories or databases) (25.1%)

📄 Lendo anything-llm/cloud-deployments/gcp/deployment/DEPLOY.md
🔹 DEPLOY.md: Shared-Data (components communicate indirectly through shared data repositories

Conversão para formato .txt

In [6]:
import json
from collections import Counter, defaultdict

# =========================
# CONFIGURAÇÕES
# =========================
ARQUIVO_JSON = "avalia_md.json"   # JSON de entrada
ARQUIVO_TXT_SAIDA = "avalia_md.txt"  # TXT de saída

# =========================
# LEITURA DO JSON
# =========================
try:
    with open(ARQUIVO_JSON, "r", encoding="utf-8") as f:
        dados = json.load(f)
except FileNotFoundError:
    print(f"Arquivo {ARQUIVO_JSON} não encontrado.")
    exit()
except json.JSONDecodeError as e:
    print(f"Erro ao ler JSON: {e}")
    exit()

if not dados:
    print("Nenhum dado encontrado no arquivo JSON.")
    exit()

# =========================
# CÁLCULOS ESTATÍSTICOS
# =========================
contagem_padroes = Counter()
soma_confiancas = defaultdict(float)

for item in dados:
    padrao = item.get("padrao_arquitetural", "Desconhecido")
    confianca = item.get("confianca", 0)
    contagem_padroes[padrao] += 1
    soma_confiancas[padrao] += confianca

total_arquivos = len(dados)
padrao_mais_comum = contagem_padroes.most_common(1)[0]
padrao_nome = padrao_mais_comum[0]
padrao_qtd = padrao_mais_comum[1]
padrao_confianca_media = soma_confiancas[padrao_nome] / padrao_qtd if padrao_qtd > 0 else 0

# =========================
# GRAVAÇÃO DO TXT
# =========================
with open(ARQUIVO_TXT_SAIDA, "w", encoding="utf-8") as f:
    f.write("=== RESULTADOS DA CLASSIFICAÇÃO DE PADRÕES ARQUITETURAIS ===\n\n")

    for item in dados:
        f.write(f"Arquivo: {item.get('arquivo', 'N/A')}\n")
        f.write(f"Padrão Arquitetural: {item.get('padrao_arquitetural', 'N/A')}\n")
        f.write(f"Confiança: {item.get('confianca', 0):.2%}\n")
        resumo = item.get('resumo', '').strip().replace("\n", " ")
        f.write(f"Resumo: {resumo}\n")
        f.write("-" * 60 + "\n")

    f.write("\n=== ESTATÍSTICAS GERAIS ===\n")
    f.write(f"Total de arquivos analisados: {total_arquivos}\n\n")
    f.write("Distribuição de padrões detectados:\n")

    for padrao, qtd in contagem_padroes.most_common():
        confianca_media = soma_confiancas[padrao] / qtd
        f.write(f" - {padrao}: {qtd} ocorrências (média {confianca_media:.1%})\n")

    f.write("\n=== PADRÃO MAIS PROVÁVEL ===\n")
    f.write(f"Padrão predominante: {padrao_nome}\n")
    f.write(f"Ocorrências: {padrao_qtd}\n")
    f.write(f"Confiança média: {padrao_confianca_media:.1%}\n")

print(f"✅ Conversão concluída!")
print(f"📁 Resultados salvos em: {ARQUIVO_TXT_SAIDA}")


✅ Conversão concluída!
📁 Resultados salvos em: avalia_md.txt


Download local dos resultados

In [9]:
from google.colab import files
files.download("avalia_md.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
from google.colab import files
files.download("avalia_md.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>