In [62]:
import json
import os
import tempfile
from git import Repo
import os
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid

In [51]:
def get_unique_repo_set(jsonl_path):
    """Collect the distinct ``repo`` values found in a JSON Lines file.

    Each line is parsed independently as JSON. Lines that are not valid
    JSON, that do not decode to an object, or whose ``repo`` entry is missing,
    not a string, or blank are skipped instead of aborting the scan.

    Args:
        jsonl_path: Path of the ``.jsonl`` file, read as UTF-8.

    Returns:
        A ``set`` of repo strings with surrounding whitespace removed.
    """
    repos = set()
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                sample = json.loads(line)
            except json.JSONDecodeError:
                # Blank or malformed line: ignore it and keep scanning.
                continue
            # A line such as `[1, 2]` or `42` is valid JSON but has no `.get`.
            if not isinstance(sample, dict):
                continue
            repo = sample.get("repo")
            if not isinstance(repo, str):
                continue
            repo = repo.strip()
            if repo:
                repos.add(repo)
    return repos

In [52]:
# Distinct repository slugs referenced by the SWE-bench Lite test split file.
lite_repos = get_unique_repo_set("./swe_bench/swe_bench_lite_test.jsonl")

In [53]:
# Sanity check: number of distinct repositories found in the file.
len(lite_repos)

12

In [54]:
# Alphabetically ordered list of the repo slugs, so indexing and slicing are stable.
list_repos: list[str] = sorted(lite_repos)

In [55]:
# Display the sorted repo slugs.
list_repos

['astropy/astropy',
 'django/django',
 'matplotlib/matplotlib',
 'mwaskom/seaborn',
 'pallets/flask',
 'psf/requests',
 'pydata/xarray',
 'pylint-dev/pylint',
 'pytest-dev/pytest',
 'scikit-learn/scikit-learn',
 'sphinx-doc/sphinx',
 'sympy/sympy']

In [56]:
# Base URL that an "owner/name" slug is appended to in order to build a clone URL.
# NOTE(review): the name is misspelled ("GITHIB"); it is left as-is because
# process_repo refers to the constant by this exact name.
GITHIB_BASE_URL = "https://github.com/"

In [57]:
# Sentence-embedding model; embed_repo calls model.encode() on each file's text.
model = SentenceTransformer("BAAI/bge-large-en")

In [66]:
def collect_files(root_dir, extensions={".py"}):
    """Recursively list files under ``root_dir`` whose names end with an extension.

    Directories named ``.git``, ``node_modules``, ``.venv`` or ``__pycache__``
    are pruned from the walk, so nothing beneath them is visited. Matching is
    done on the exact directory name (not on a substring of the path), which
    means look-alikes such as ``.github`` are still searched, and a
    ``root_dir`` that itself lives under an excluded name is still searched.

    Args:
        root_dir: Directory to search.
        extensions: Iterable of filename suffixes to keep (default ``{".py"}``).
            The default set is only read, never mutated.

    Returns:
        A list of matching file paths (each ``root_dir``-prefixed), in a
        deterministic order: directories and files are visited sorted by name.
    """
    excluded_dirs = {".git", "node_modules", ".venv", "__pycache__"}
    # str.endswith accepts a tuple of suffixes; an empty tuple matches nothing.
    suffixes = tuple(extensions)
    collected = []

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Editing `dirnames` in place tells os.walk not to descend into the
        # removed entries; sorting makes the traversal order reproducible.
        dirnames[:] = sorted(d for d in dirnames if d not in excluded_dirs)
        for fname in sorted(filenames):
            if fname.endswith(suffixes):
                collected.append(os.path.join(dirpath, fname))
    return collected

def read_file(file_path):
    """Return the text of ``file_path`` decoded as UTF-8, or ``""`` on failure.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``). Any
    exception raised while opening or reading is printed and swallowed, so the
    caller always receives a string.
    """
    text = ""
    try:
        with open(file_path, mode="r", encoding="utf-8", errors="ignore") as handle:
            text = handle.read()
    except Exception as err:
        # Best effort: report the problem and fall through with the empty string.
        print(f"Error leyendo {file_path}: {err}")
    return text

def embed_repo(repo_path):
    """Embed every collected source file under ``repo_path``.

    Files come from ``collect_files(repo_path)`` and are read with
    ``read_file``; files whose content is empty or whitespace-only are skipped.

    Args:
        repo_path: Root directory of the checked-out repository.

    Returns:
        A tuple ``(embeddings, metadata, docs)`` of three parallel lists:
        ``embeddings`` holds whatever ``model.encode`` returns for each file,
        ``metadata`` holds dicts with the keys ``name`` (file name),
        ``extension`` (text after the last dot of the file name, without the
        dot) and ``path`` (the last two path components joined with ``/``),
        and ``docs`` holds the exact text that was embedded.
    """
    embeddings = []
    metadata = []
    docs = []

    for path in collect_files(repo_path):
        content = read_file(path)
        if not content.strip():
            continue

        # Paths are built with os.path.join, so normalise the platform
        # separator before splitting; on POSIX this is a no-op.
        parts = path.replace(os.sep, "/").split("/")
        name = parts[-1]
        # Derive the extension from the file name rather than assuming "py".
        extension = name.rpartition(".")[2] if "." in name else ""

        # NOTE(review): the literal "[CLS] " prefix becomes part of both the
        # embedded text and the stored document; presumably the model's
        # tokenizer already adds its own start token - confirm this is intended.
        content = "[CLS] " + content
        embeddings.append(model.encode(content))
        # NOTE(review): "path" keeps only the parent directory and file name,
        # so files in same-named directories share a value - verify consumers.
        metadata.append({"name": name, "extension": extension, "path": "/".join(parts[-2:])})
        docs.append(content)

    return embeddings, metadata, docs

In [59]:
def process_repo(repo):
    """Clone a repository into a temporary directory and embed its files.

    Args:
        repo: Repository slug in ``owner/name`` form; it is appended to
            ``GITHIB_BASE_URL`` to build the clone URL.

    Returns:
        The ``(embeddings, metadata, docs)`` tuple produced by ``embed_repo``.
        If cloning fails, the error is printed and three empty lists are
        returned, so callers can always unpack the result.
    """
    repo_url = GITHIB_BASE_URL + repo

    # The temporary directory (and the clone inside it) is removed when the
    # `with` block exits, including on the early return below.
    with tempfile.TemporaryDirectory() as tmp_dir:
        print(f"Clonando {repo_url} en {tmp_dir}")
        try:
            # Only the checked-out files are read afterwards, so a shallow
            # clone avoids downloading the full history.
            Repo.clone_from(repo_url, tmp_dir, depth=1)
        except Exception as e:
            print(f"Error al clonar {repo_url}: {e}")
            return [], [], []

        embeddings, metadata, docs = embed_repo(tmp_dir)

        # Storing the vectors is left to the caller.
        print(f"Repo {repo_url} procesado y eliminado.")
        return embeddings, metadata, docs

In [60]:
# HTTP client for the Chroma server; expects one to be listening on localhost:8005.
chroma_client = chromadb.HttpClient(host="localhost", port=8005, settings=Settings())

In [None]:
# Embed each repository and store the result in one Chroma collection per repo.
# NOTE(review): the [1:] slice skips the first sorted entry ('astropy/astropy');
# presumably it was indexed in an earlier run - confirm before re-running.
current_repos = list_repos[1:]
# Number of documents sent to Chroma per upsert call.
BATCH_SIZE = 50

for idx, repo in enumerate(current_repos):
    print(f"({idx}/{len(current_repos)}) Repo: {repo} ------------------------ \n")
    result = process_repo(repo)
    # process_repo yields nothing usable when the clone fails (or when no file
    # could be embedded); skip the repo instead of aborting the whole run.
    if not result or not result[2]:
        print(f"Sin documentos para {repo}; se omite.")
        continue
    embeddings, metadata, docs = result

    # Collection names are derived from the slug with "/" replaced by "_".
    collection_name = repo.replace("/", "_")
    collection = chroma_client.get_or_create_collection(name=collection_name)

    for i in range(0, len(docs), BATCH_SIZE):
        batch_docs = docs[i:i+BATCH_SIZE]
        batch_embeddings = embeddings[i:i+BATCH_SIZE]
        batch_metadata = metadata[i:i+BATCH_SIZE]
        # NOTE(review): ids are random, so re-running this cell adds a second
        # copy of every document rather than updating the existing ones.
        batch_ids = [str(uuid.uuid4()) for _ in batch_docs]

        collection.upsert(
            ids=batch_ids,
            documents=batch_docs,
            embeddings=batch_embeddings,
            metadatas=batch_metadata
        )

(0/11) Repo: django/django ------------------------ 

Clonando https://github.com/django/django en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmp4ndzme2t


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/django/django procesado y eliminado.
(1/11) Repo: matplotlib/matplotlib ------------------------ 

Clonando https://github.com/matplotlib/matplotlib en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmpnioxiccf


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/matplotlib/matplotlib procesado y eliminado.
(2/11) Repo: mwaskom/seaborn ------------------------ 

Clonando https://github.com/mwaskom/seaborn en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmpvqzdnlbg


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/mwaskom/seaborn procesado y eliminado.
(3/11) Repo: pallets/flask ------------------------ 

Clonando https://github.com/pallets/flask en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmpqjq4hyoh


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/pallets/flask procesado y eliminado.
(4/11) Repo: psf/requests ------------------------ 

Clonando https://github.com/psf/requests en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmpv7eysq24


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/psf/requests procesado y eliminado.
(5/11) Repo: pydata/xarray ------------------------ 

Clonando https://github.com/pydata/xarray en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmp8lhxjtru


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/pydata/xarray procesado y eliminado.
(6/11) Repo: pylint-dev/pylint ------------------------ 

Clonando https://github.com/pylint-dev/pylint en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmp_bdw16n_


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/pylint-dev/pylint procesado y eliminado.
(7/11) Repo: pytest-dev/pytest ------------------------ 

Clonando https://github.com/pytest-dev/pytest en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmp_cnwlfrp


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/pytest-dev/pytest procesado y eliminado.
(8/11) Repo: scikit-learn/scikit-learn ------------------------ 

Clonando https://github.com/scikit-learn/scikit-learn en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmp72xj3l87


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/scikit-learn/scikit-learn procesado y eliminado.
(9/11) Repo: sphinx-doc/sphinx ------------------------ 

Clonando https://github.com/sphinx-doc/sphinx en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmp4qf6lxjw


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/sphinx-doc/sphinx procesado y eliminado.
(10/11) Repo: sympy/sympy ------------------------ 

Clonando https://github.com/sympy/sympy en /var/folders/c1/rzk6gvfs68l9xj63x3bl24nr0000gn/T/tmp28gx_vxc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Repo https://github.com/sympy/sympy procesado y eliminado.
