In [3]:
import os
import tempfile
from git import Repo
import os
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# Base URL that "owner/name" repository identifiers are appended to.
GITHUB_BASE_URL = "https://github.com/"
# Original (misspelled) name, kept as an alias because process_repo references it.
GITHIB_BASE_URL = GITHUB_BASE_URL

In [4]:
# Sentence-embedding model shared by the notebook; embed_repo calls
# model.encode() on the text of each collected file.
model = SentenceTransformer("BAAI/bge-large-en")

In [9]:
def collect_files(root_dir, extensions={".py"}):
    collected = []

    for dirpath, _, filenames in os.walk(root_dir):
        if any(excluded in dirpath for excluded in [".git", "node_modules", ".venv", "__pycache__"]):
            continue
        for fname in filenames:
            if any(fname.endswith(ext) for ext in extensions):
                collected.append(os.path.join(dirpath, fname))
    return collected

def read_file(file_path):
    """Return the contents of ``file_path`` decoded as UTF-8.

    Bytes that cannot be decoded are silently dropped. If anything goes wrong
    while opening or reading, the error is printed and an empty string is
    returned instead of raising.
    """
    try:
        with open(file_path, mode="r", encoding="utf-8", errors="ignore") as source:
            text = source.read()
    except Exception as exc:
        # Best-effort read: report the problem and fall back to an empty result.
        print(f"Error leyendo {file_path}: {exc}")
        text = ""
    return text

def embed_repo(repo_path):
    """Embed every non-blank file that ``collect_files`` finds under ``repo_path``.

    Each file is read with ``read_file``, prefixed with ``"[CLS] "`` and encoded
    individually with the module-level ``model``. Files whose content is empty
    or whitespace-only are skipped.

    Args:
        repo_path: Root directory of the checked-out repository.

    Returns:
        Three parallel lists ``(embeddings, metadata, docs)``:
        the value returned by ``model.encode`` for each file, a dict with
        ``name`` / ``extension`` / ``path`` keys, and the prefixed text that
        was encoded.
    """
    embeddings = []
    metadata = []
    docs = []

    for path in collect_files(repo_path):
        content = read_file(path)
        if not content.strip():
            continue

        # Paths come from os.walk/os.path.join, so they use os.sep. Normalise to
        # "/" before splitting; otherwise on Windows the whole path would end up
        # in both "name" and "path".
        path_parts = path.replace(os.sep, "/").split("/")
        name = path_parts[-1]

        # NOTE(review): the literal "[CLS] " prefix is embedded AND stored in
        # docs. Presumably the model's tokenizer already adds its own special
        # tokens -- confirm whether this prefix is intentional.
        content = "[CLS] " + content
        embeddings.append(model.encode(content))
        metadata.append({
            "name": name,
            "extension": "py",
            # Only the parent directory and file name are kept.
            "path": "/".join(path_parts[-2:]),
        })
        docs.append(content)

    return embeddings, metadata, docs

In [10]:
def process_repo(repo: str):
    """Clone a GitHub repository into a temporary directory and embed its files.

    Args:
        repo: Repository identifier appended to ``GITHIB_BASE_URL``
            (e.g. ``"owner/name"``); the part after the last ``/`` is used as
            the checkout directory name.

    Returns:
        The ``(embeddings, metadata, docs)`` tuple produced by ``embed_repo``,
        or ``None`` when cloning fails (the error is printed, not raised).
    """
    repo_url = GITHIB_BASE_URL + repo
    repo_name = repo.split("/")[-1]

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Build the checkout path once with os.path.join instead of
        # concatenating "/" by hand in two places.
        clone_path = os.path.join(tmp_dir, repo_name)
        print(f"Clonando {repo_url} en {tmp_dir}/{repo_name}")
        try:
            Repo.clone_from(repo_url, clone_path)
        except Exception as e:
            print(f"Error al clonar {repo_url}: {e}")
            return None

        result = embed_repo(clone_path)

    # The temporary checkout is deleted when the with-block exits, so the
    # "eliminado" message is only accurate from this point on.
    print(f"Repo {repo_url} procesado y eliminado.")
    return result

In [11]:
# HTTP client pointed at localhost:8005 (assumes a Chroma server is listening
# there); used later to get or create the collection that receives the embeddings.
chroma_client = chromadb.HttpClient(host="localhost", port=8005, settings=Settings())

In [12]:
# Clone and embed the test repository. process_repo returns None when the clone
# fails, in which case this three-way unpacking raises TypeError.
embeddings, metadata, docs = process_repo("C3RetoAI/Testing-Github-Bot")

Clonando https://github.com/C3RetoAI/Testing-Github-Bot en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmpvq6k5x6f/Testing-Github-Bot
Repo https://github.com/C3RetoAI/Testing-Github-Bot procesado y eliminado.


In [13]:
import uuid


def _stable_ids(namespace, metadatas):
    """Return one deterministic UUID string per metadata entry.

    The id is a UUIDv5 of ``namespace``, the entry's ``"path"`` and an
    occurrence counter. The counter keeps ids unique when two entries share
    the same ``"path"`` value within one batch.
    """
    seen = {}
    ids = []
    for meta in metadatas:
        path = meta["path"]
        occurrence = seen.get(path, 0)
        seen[path] = occurrence + 1
        ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, f"{namespace}/{path}#{occurrence}")))
    return ids


collection_name = "C3RetoAI_Testing-Github-Bot"

collection = chroma_client.get_or_create_collection(name=collection_name)

# Random uuid4 ids made every re-run of this cell insert a fresh copy of each
# document. Deterministic ids let upsert overwrite the previous entry instead.
if docs:
    collection.upsert(
        ids=_stable_ids(collection_name, metadata),
        documents=docs,
        embeddings=embeddings,
        metadatas=metadata,
    )
else:
    # Nothing was embedded (e.g. the repository has no non-empty .py files).
    print(f"No hay documentos que insertar en {collection_name}.")