Sélectionner un dépôt GitHub test

In [9]:
from git import Repo
import os

# Dossier où on stocke les dépôts clonés
os.makedirs("repos", exist_ok=True)

# Exemple : cloner un repo public
repo_url = "https://github.com/psf/requests.git"  # tu peux changer l’URL
local_path = "repos/requests"

if not os.path.exists(local_path):
    Repo.clone_from(repo_url, local_path)
    print(f"✅ Repo cloné : {repo_url}")
else:
    print("⚡ Repo déjà cloné")


⚡ Repo déjà cloné


Télécharger CodeSearchNet

In [10]:
from datasets import load_dataset

# Charger une partie de CodeSearchNet (ex: Python)
dataset = load_dataset("Nan-Do/code-search-net-python")

print(dataset)
print(dataset["train"][0])  # exemple d’un échantillon


DatasetDict({
    train: Dataset({
        features: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition', 'summary'],
        num_rows: 455243
    })
})
{'repo': 'ageitgey/face_recognition', 'path': 'examples/face_recognition_knn.py', 'func_name': 'train', 'original_string': 'def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo=\'ball_tree\', verbose=False):\n    """\n    Trains a k-nearest neighbors classifier for face recognition.\n\n    :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n     (View in source code to see train_dir example tree structure)\n\n     Structure:\n        <train_dir>/\n        ├── <person1>/\n        │   ├── <somename1>.jpeg\n        │   ├── <somename2>.jpeg\n        │   ├── ...\n        ├── <person2>/\n        │   ├── <somename1>.jpeg\n        │   └── <somename2>.jpeg\n        └── ...\n\n    :param model

Début de l’ingestion (lecture de fichiers du repo)


In [11]:
import glob

files = glob.glob(local_path + "/**/*.py", recursive=True)

print(f"📂 {len(files)} fichiers trouvés")
print(files[:5])  # afficher les 5 premiers


📂 36 fichiers trouvés
['repos/requests\\setup.py', 'repos/requests\\docs\\conf.py', 'repos/requests\\docs\\_themes\\flask_theme_support.py', 'repos/requests\\src\\requests\\adapters.py', 'repos/requests\\src\\requests\\api.py']


Extraire du code d’un dépôt GitHub

In [12]:
import os

def load_repo_files(path, extensions=[".py"]):
    code_files = []
    for root, _, files in os.walk(path):
        for f in files:
            if any(f.endswith(ext) for ext in extensions):
                try:
                    with open(os.path.join(root, f), "r", encoding="utf-8") as fp:
                        code_files.append({
                            "file": os.path.relpath(os.path.join(root, f), path),
                            "content": fp.read()
                        })
                except Exception as e:
                    print(f"⚠️ Erreur lecture {f}: {e}")
    return code_files

repo_code = load_repo_files("repos/requests")
print(f"📂 {len(repo_code)} fichiers chargés depuis le repo")
print(repo_code[0]["file"])
print(repo_code[0]["content"][:300])  # aperçu


📂 36 fichiers chargés depuis le repo
setup.py
#!/usr/bin/env python
import os
import sys
from codecs import open

from setuptools import setup

CURRENT_PYTHON = sys.version_info[:2]
REQUIRED_PYTHON = (3, 9)

if CURRENT_PYTHON < REQUIRED_PYTHON:
    sys.stderr.write(
        """
Unsupported Python version


Extraire quelques exemples de CodeSearchNet

In [13]:
from datasets import load_dataset

# Charger uniquement une petite portion pour tester (100 exemples)
dataset = load_dataset("Nan-Do/code-search-net-python", split="train[:100]")

codesearch_samples = []
for i, ex in enumerate(dataset):
    code = ex["code"]
    docstring = ex.get("docstring", "")
    codesearch_samples.append({
        "file": f"codesearchnet_{i}.py",
        "content": code,
        "doc": docstring
    })

print(f"📚 {len(codesearch_samples)} extraits de CodeSearchNet chargés")
print("Exemple fichier:", codesearch_samples[0]["file"])
print("Code:", codesearch_samples[0]["content"][:200])
print("Docstring:", codesearch_samples[0]["doc"])


📚 100 extraits de CodeSearchNet chargés
Exemple fichier: codesearchnet_0.py
Code: def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):
    """
    Trains a k-nearest neighbors classifier for face recognition.

    :param train_dir: dire
Docstring: Trains a k-nearest neighbors classifier for face recognition.

    :param train_dir: directory that contains a sub-directory for each known person, with its name.

     (View in source code to see train_dir example tree structure)

     Structure:
        <train_dir>/
        ├── <person1>/
        │   ├── <somename1>.jpeg
        │   ├── <somename2>.jpeg
        │   ├── ...
        ├── <person2>/
        │   ├── <somename1>.jpeg
        │   └── <somename2>.jpeg
        └── ...

    :param model_save_path: (optional) path to save model on disk
    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified
    :param knn_algo: (optional) und

In [15]:
from langchain.document_loaders import GitLoader

# Charger les fichiers du dépôt cloné
loader = GitLoader(repo_path="repos/requests")
repos = loader.load()

Error reading file repos/requests\tests/certs/valid/ca: [Errno 13] Permission denied: 'repos/requests\\tests/certs/valid/ca'
Error reading file repos/requests\tests/certs/mtls/client/ca: [Errno 13] Permission denied: 'repos/requests\\tests/certs/mtls/client/ca'


Indexer dans Chroma

In [19]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Embeddings locaux (léger et rapide)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Préparer tous les documents (repo + CodeSearchNet)
documents = repos + codesearch_samples

texts = []
metadatas = []

for doc in documents:
    # Si c’est un Document LangChain
    if hasattr(doc, "page_content"):
        content = doc.page_content
        metadata = doc.metadata
    else:
        # Si c’est un dict (CodeSearchNet)
        content = doc["content"]
        metadata = {"doc": doc.get("doc", ""), "file": doc.get("file", "inconnu")}

        # Ajouter la docstring si elle existe
        if metadata["doc"]:
            content = metadata["doc"] + "\n\n" + content

    texts.append(content)
    metadatas.append({"source": metadata.get("file", "inconnu")})


# Construire l’index vectoriel
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma.from_texts(texts, embeddings, metadatas=metadatas)

print("✅ Index créé avec", len(texts), "documents")


✅ Index créé avec 219 documents


Recherche avec un modèle local Ollama (Mistral)


In [21]:
from langchain.llms import Ollama

# Mistral via Ollama
llm = Ollama(model="mistral")

# Exemple de recherche
query = "How does the code handle HTTP requests?"

# 1. Récupérer les passages similaires
results = vectorstore.similarity_search(query, k=3)

# 2. Construire un prompt avec le contexte
context = "\n\n".join([r.page_content for r in results])
prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

# 3. Lancer la génération locale
answer = llm(prompt)
print("🤖 Réponse Mistral:\n", answer)


🤖 Réponse Mistral:
  The provided code is a Python library called Requests that allows you to send and receive HTTP responses from various servers using a simple and elegant API. It supports multiple HTTP methods such as GET, POST, PUT, DELETE, etc., and it handles tasks like adding query strings, form-encoding data, handling cookies, and more, making it easy for developers to interact with HTTP resources.

It also comes with features like connection pooling, TLS/SSL verification, basic & digest authentication, and automatic content decompression and decoding. Requests is one of the most popular Python packages, and it's widely used in building robust and reliable HTTP-speaking applications.


In [24]:
from langchain.llms import Ollama
from IPython.display import display, Markdown

llm = Ollama(model="mistral")

def ask_code_agent(question: str, k: int = 3):
    # 1. Recherche contexte
    results = vectorstore.similarity_search(question, k=k)
    context = "\n\n".join([r.page_content for r in results])

    # 2. Prompt
    prompt = f"""You are an AI code assistant.
Answer the question based only on the context below.

Context:
{context}

Question: {question}
Answer in Markdown with code blocks if needed.
"""

    # 3. Stream Ollama (texte brut)
    stream = llm.stream(prompt)

    # Affichage progressif
    display_handle = display(Markdown(""), display_id=True)
    full_output = ""

    for token in stream:  # Ollama envoie du texte
        full_output += token
        display_handle.update(Markdown(full_output))

    return full_output


In [25]:
response = ask_code_agent("How does the code handle HTTP requests?")


 The code handles HTTP requests using theRequests library in Python. It allows you to send GET and POST requests easily, as shown in the example provided:

```python
>>> import requests
>>> r = requests.get('https://www.python.org')
>>> r.status_code
200
>>> b'Python is a programming language' in r.content
True
```

The `requests.get()` function sends a GET request to the specified URL, and the response is stored in the `r` variable. The response status code can be accessed with `r.status_code`, and the content of the response can be checked using `r.content`.

For more advanced HTTP requests like POST, PUT, DELETE etc., you can use the corresponding functions provided by the Requests library:

```python
>>> payload = dict(key1='value1', key2='value2')
>>> r = requests.post('https://httpbin.org/post', data=payload)
>>> print(r.text)
...
"form": {
  "key1": "value1",
  "key2": "value2"
}
...
```

You can find more details and supported features in the API Reference and User Guide available at <https://requests.readthedocs.io>.

In [26]:
response = ask_code_agent("Salut mistral")

 The context provided does not contain any information related to "Salut Mistral". It seems like it is a documentation for the Python HTTP library called Requests, which includes its license, features, and usage instructions. There is no apparent connection between Requests and "Salut Mistral".