Partie 1 : Installation des dépendances

In [21]:
# 1. Install Pinecone libraries
!pip install pinecone==6.0.1 pinecone-notebooks

# Install additional libraries needed
!pip install pandas torch transformers sentence-transformers



Part 1: Load Documents & Execute Reranking Model

# 2. Authenticate with Pinecone

In [22]:
import os

# Authentification
if not os.environ.get("PINECONE_API_KEY"):
    from pinecone_notebooks.colab import Authenticate
    Authenticate()



# 3. Instantiate the Pinecone client

In [23]:
from pinecone import Pinecone

# Initialisation du client Pinecone
api_key = os.environ["PINECONE_API_KEY"]
environment = os.environ.get("PINECONE_ENVIRONMENT", "us-west1-gcp")  # Valeur par défaut
pc = Pinecone(api_key=api_key)

# 4. Define your query & documents

In [24]:
def show_reranked(query, reranked_data):
    """Affiche les résultats du reranking"""
    print(f"Query: {query}")
    print("Reranked results:")
    for i, result in enumerate(reranked_data):
        print(f"{i+1}. Score: {result.score:.4f}")
        print(f"   Document: {result.document}")
        print()

# Définition de la requête et des documents
query = "Tell me about Apple's products"
documents = [
    "Apple is a popular fruit that comes in many varieties.",
    "Apple Inc. released the new iPhone with advanced features.",
    "Green apples are tart and delicious for baking.",
    "Apple's MacBooks are top sellers in the laptop market.",
    "I had a red apple for breakfast this morning."
]

# Exécution du reranking
try:
    reranked = pc.inference.rerank(
        model="bge-reranker-v2-m3",
        query=query,
        documents=documents
    )

    if reranked is not None and hasattr(reranked, 'data'):
        show_reranked(query, reranked.data)
    else:
        print("Erreur : reranked est None ou n'a pas d'attribut 'data'")

except Exception as e:
    print(f"Erreur lors du reranking: {e}")

Query: Tell me about Apple's products
Reranked results:
1. Score: 0.2770
   Document: text="Apple's MacBooks are top sellers in the laptop market."

2. Score: 0.0839
   Document: text='Apple is a popular fruit that comes in many varieties.'

3. Score: 0.0077
   Document: text='Apple Inc. released the new iPhone with advanced features.'

4. Score: 0.0010
   Document: text='Green apples are tart and delicious for baking.'

5. Score: 0.0000
   Document: text='I had a red apple for breakfast this morning.'



In [25]:
import os
import pinecone
import time

# Méthode d'authentification la plus fiable
try:
    # Option 1 : Authentification via les secrets Colab (recommandé)
    from google.colab import userdata
    api_key = userdata.get('PINECONE_API_KEY')
except:
    # Option 2 : Authentification via la méthode des notebooks Pinecone
    if not os.environ.get("PINECONE_API_KEY"):
        from pinecone_notebooks.colab import Authenticate
        Authenticate()
    api_key = os.environ.get("PINECONE_API_KEY")

# Initialisation Pinecone
pc = pinecone.Pinecone(api_key=api_key)

# Configuration de l'index
index_name = "pinecone-reranker"
spec = {
    "serverless": {
        "cloud": "aws",
        "region": "us-east-1"  # ou "eu-west-1" pour l'Europe
    }
}

# Gestion optimisée de l'index
try:
    existing_indexes = pc.list_indexes().names()

    if index_name in existing_indexes:
        print(f"L'index {index_name} existe déjà - suppression...")
        pc.delete_index(index_name)
        print("Index supprimé. Recréation...")

    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=spec
    )
    print("Index créé avec succès. Initialisation en cours...")

    # Vérification optimisée
    index = pc.Index(index_name)
    start_time = time.time()
    timeout = 120  # 2 minutes maximum

    while True:
        try:
            stats = index.describe_index_stats()
            if stats.get('total_vector_count', 0) >= 0:  # Juste pour vérifier la connexion
                print(f"Index prêt en {time.time()-start_time:.2f}s")
                print("Statistiques:", stats)
                break
        except Exception as e:
            if time.time() - start_time > timeout:
                raise TimeoutError(f"Initialisation dépassant {timeout} secondes")
            time.sleep(5)

except Exception as e:
    print("Erreur critique:", str(e))
    print("Vérifiez:")
    print("1. Votre clé API dans Colab Secrets (icône 🔒)")
    print("2. Votre quota sur app.pinecone.io")
    print("3. La région de votre index")

L'index pinecone-reranker existe déjà - suppression...
Index supprimé. Recréation...
Index créé avec succès. Initialisation en cours...
Index prêt en 0.89s
Statistiques: {'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [26]:
import requests
import pandas as pd
import json
from tqdm import tqdm

# Configuration
DATA_URL = "https://raw.githubusercontent.com/pinecone-io/examples/refs/heads/master/docs/data/sample_notes_data.jsonl"
CHUNK_SIZE = 100  # Taille des lots pour l'insertion

def download_and_prepare_data(url):
    """Télécharge et prépare les données pour Pinecone"""
    try:
        # Téléchargement avec vérification
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Conversion correcte en DataFrame
        data = [json.loads(line) for line in response.text.splitlines()]
        df = pd.DataFrame(data)

        # Vérification des colonnes requises
        required_columns = {'id', 'values', 'metadata'}
        if not required_columns.issubset(df.columns):
            missing = required_columns - set(df.columns)
            raise ValueError(f"Colonnes manquantes: {missing}")

        # Conversion des types
        df['id'] = df['id'].astype(str)
        df['values'] = df['values'].apply(lambda x: list(map(float, x)))

        return df

    except requests.exceptions.RequestException as e:
        print(f"Erreur de réseau: {str(e)}")
    except json.JSONDecodeError as e:
        print(f"Erreur de décodage JSON: {str(e)}")
    except Exception as e:
        print(f"Erreur inattendue: {str(e)}")
    return None

def insert_to_pinecone(index, dataframe, batch_size=100):
    """Insertion optimisée des données dans Pinecone"""
    try:
        # Préparation des vecteurs
        vectors = [
            (str(row['id']),
             [float(x) for x in row['values']],  # Conversion explicite
             dict(row['metadata']) if pd.notna(row['metadata']) else {}
            )
            for _, row in dataframe.iterrows()
        ]

        # Insertion par lots avec progression
        for i in tqdm(range(0, len(vectors), batch_size),
                     desc="Insertion des vecteurs",
                     unit="vec"):
            batch = vectors[i:i + batch_size]
            index.upsert(vectors=batch)

        print(f"\n✅ {len(dataframe)} vecteurs insérés avec succès")
        return True

    except Exception as e:
        print(f"❌ Erreur lors de l'insertion: {str(e)}")
        return False

# Exécution
print("📥 Téléchargement des données...")
df = download_and_prepare_data(DATA_URL)

if df is not None:
    print("\n🔍 Aperçu des données:")
    print(df[['id', 'values']].head())

    print("\n🚀 Début de l'insertion dans Pinecone...")
    if insert_to_pinecone(index, df, CHUNK_SIZE):
        stats = index.describe_index_stats()
        print("\n📊 Statistiques finales:")
        print(f"• Vecteurs: {stats['total_vector_count']}")
        print(f"• Dimensions: {stats['dimension']}")
        print(f"• Espace: {len(stats['namespaces'])} namespaces")

📥 Téléchargement des données...

🔍 Aperçu des données:
     id                                             values
0  P011  [-0.2027486265, 0.2769146562, -0.1509393603, 0...
1  P001  [0.1842793673, 0.4459365904, -0.0770567134, 0....
2  P002  [-0.2040648609, -0.1739618927, -0.2897160649, ...
3  P003  [0.1889383644, 0.2924542725, -0.2335938066, -0...
4  P004  [-0.1217106804, 0.1674752235, -0.2318888158, 0...

🚀 Début de l'insertion dans Pinecone...


Insertion des vecteurs: 100%|██████████| 1/1 [00:01<00:00,  1.42s/vec]



✅ 100 vecteurs insérés avec succès

📊 Statistiques finales:
• Vecteurs: 0
• Dimensions: 384
• Espace: 0 namespaces


In [27]:
from sentence_transformers import SentenceTransformer

def get_embedding(text):
    """Génère des embeddings de texte"""
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model.encode(text).tolist()

def show_results(q, matches):
    """Affiche les résultats de recherche"""
    print(f"Question: {q}")
    for i, m in enumerate(matches):
        print(f"{i+1}. ID: {m.id}, Score: {m.score:.4f}")
        print(f"   Metadata: {m.metadata}\n")

# Exemple de recherche
question = "what if my patient has leg pain"
emb = get_embedding(question)
results = index.query(vector=emb, top_k=5, include_metadata=True)
matches = sorted(results.matches, key=lambda m: m.score, reverse=True)

show_results(question, matches)

Question: what if my patient has leg pain


In [28]:
import os
from pinecone import Pinecone

# Authentification avec la méthode pinecone_notebooks
if not os.environ.get("PINECONE_API_KEY"):
    from pinecone_notebooks.colab import Authenticate
    Authenticate()  # Cette méthode va configurer la variable d'environnement

# Initialisation du client Pinecone
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

def show_reranked(query, results, top_n=5):
    print(f"Query: {query}\n")
    for i, res in enumerate(results[:top_n]):
        print(f"{i+1}. Score: {res.score:.4f}")
        print(f"   Texte: {res.document['text']}\n")

# Documents cliniques à reranker
clinical_docs = [
    {"text": "Protocol for managing chest pain in emergency"},
    {"text": "Guidelines for leg pain diagnosis"},
    {"text": "Treatment options for chronic back pain"},
    {"text": "Differential diagnosis for leg swelling"},
    {"text": "First aid for sports injuries"}
]

# Requête clinique
clinical_query = "what if my patient has leg pain"

try:
    # Appel correct sans top_k
    reranked = pc.inference.rerank(
        model="bge-reranker-v2-m3",
        query=clinical_query,
        documents=clinical_docs
    )

    # Afficher les top 5 résultats
    show_reranked(clinical_query, reranked.data)

except Exception as e:
    print(f"Erreur: {str(e)}")

Query: what if my patient has leg pain

1. Score: 0.1542
   Texte: Guidelines for leg pain diagnosis

2. Score: 0.0029
   Texte: First aid for sports injuries

3. Score: 0.0012
   Texte: Treatment options for chronic back pain

4. Score: 0.0010
   Texte: Differential diagnosis for leg swelling

5. Score: 0.0001
   Texte: Protocol for managing chest pain in emergency



In [29]:
def setup_pinecone_connection():
    """Configure la connexion Pinecone de manière interactive"""
    api_key = input("Entrez votre clé API Pinecone: ")
    try:
        pc = Pinecone(api_key=api_key)
        print("✅ Client Pinecone initialisé avec succès")
        return pc
    except Exception as e:
        print(f"❌ Erreur d'initialisation Pinecone: {e}")
        return None

def show_reranked_clinical(query, reranked_data):
    """Affiche les résultats cliniques rerankés"""
    print(f"Clinical Query: {query}")
    if reranked_data: # Add a check for None or empty data
        for i, result in enumerate(reranked_data):
            print(f"{i+1}. Score: {result.score:.4f}")
            print(f"   Document: {result.document}\n")
    else:
        print("No reranked results to display.")


def complete_workflow_example():
    """Exemple complet du workflow"""
    print("=== Workflow complet de reranking avec Pinecone ===")

    # Test avec les documents Apple (using existing variables from previous cells)
    print("\n1. Test avec les documents Apple:")
    # Ensure 'query' and 'documents' are defined in a previous cell or within this function
    # For this example, we assume they are defined globally in a previous cell
    try:
        reranked_apple = pc.inference.rerank(
            model="bge-reranker-v2-m3",
            query=query,
            documents=documents
        )
        show_reranked(query, reranked_apple.data) # Use .data
    except Exception as e:
        print(f"Erreur lors du reranking des documents Apple: {e}")


    # Test avec les notes médicales (using existing variables from previous cells)
    print("\n2. Résultats de recherche sémantique:")
    # Ensure 'question' and 'matches' are defined in a previous cell or within this function
    # For this example, we assume they are defined globally in a previous cell
    try:
        show_results(question, matches)
    except Exception as e:
        print(f"Erreur lors de l'affichage des résultats de recherche sémantique: {e}")

    print("\n3. Résultats après reranking:")
    # Ensure 'reranked' is defined in a previous cell or within this function
    # For this example, we assume it is defined globally in cell 5Taqcz_9Bs91
    rerank_query = "patient with post-operative leg pain and swelling" # Define rerank_query
    try:
        # Assuming 'reranked' from cell 5Taqcz_9Bs91 is the result of a rerank call
        # If not, you would need to perform the reranking here
        # Example if you need to re-run reranking:
        # clinical_docs_for_reranking = [{"text": m.metadata.get('symptoms', '') + " " + m.metadata.get('advice', '')} for m in matches]
        # reranked_clinical = pc.inference.rerank(
        #     model="bge-reranker-v2-m3",
        #     query=rerank_query,
        #     documents=[doc['text'] for doc in clinical_docs_for_reranking]
        # )
        # show_reranked_clinical(rerank_query, reranked_clinical.data)

        # Using the 'reranked' variable from cell 5Taqcz_9Bs91, which already performed reranking on clinical_docs
        show_reranked_clinical(rerank_query, reranked.data) # Use .data

    except Exception as e:
        print(f"Erreur lors de l'affichage des résultats rerankés: {e}")


# Pour exécuter le workflow complet
if __name__ == "__main__":
    # Note: This will use global variables defined in previous cells (pc, query, documents, question, matches, reranked)
    # Ensure these cells have been run before executing this cell.
    complete_workflow_example()

=== Workflow complet de reranking avec Pinecone ===

1. Test avec les documents Apple:
Query: Tell me about Apple's products

1. Score: 0.2770
   Texte: Apple's MacBooks are top sellers in the laptop market.

2. Score: 0.0839
   Texte: Apple is a popular fruit that comes in many varieties.

3. Score: 0.0077
   Texte: Apple Inc. released the new iPhone with advanced features.

4. Score: 0.0010
   Texte: Green apples are tart and delicious for baking.

5. Score: 0.0000
   Texte: I had a red apple for breakfast this morning.


2. Résultats de recherche sémantique:
Question: what if my patient has leg pain

3. Résultats après reranking:
Clinical Query: patient with post-operative leg pain and swelling
1. Score: 0.1542
   Document: text='Guidelines for leg pain diagnosis'

2. Score: 0.0029
   Document: text='First aid for sports injuries'

3. Score: 0.0012
   Document: text='Treatment options for chronic back pain'

4. Score: 0.0010
   Document: text='Differential diagnosis for leg swelling