In [2]:
import polars as pl
import numpy as np
from tqdm import tqdm
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from sentence_transformers import SentenceTransformer
import torch
import re
from typing import List, Dict, Tuple
from urllib.parse import unquote

print("üì¶ Imports loaded successfully")

üì¶ Imports loaded successfully


In [3]:
# Connect to the Qdrant server on localhost:6333, preferring the gRPC transport.
client = QdrantClient(host="localhost", port=6333, prefer_grpc=True)

# Use the GPU when PyTorch reports one, otherwise the CPU, and load the
# sentence-embedding model from the local "models/" directory onto it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = SentenceTransformer("models/multilingual-e5-large", device=device)

print("‚úÖ Qdrant client connected")
print(f"‚úÖ Model loaded on {device}")

# Report the exact number of points stored in the article collection.
collection_name = "wikipedia_fr"
total = client.count(
    collection_name=collection_name,
    exact=True,
).count
print(f"üìä Existing collection '{collection_name}': {total:,} articles")

‚úÖ Qdrant client connected
‚úÖ Model loaded on cuda
üìä Existing collection 'wikipedia_fr': 30,208 articles


Load the dataset as a Polars DataFrame, keeping only the articles that contain at least one link.

In [4]:
# Load the merged dataset through a lazy scan and keep only the articles that
# contain at least one link; the filter is part of the lazy query, so it is
# applied before the frame is materialised by collect().
df = (
    pl.scan_parquet("articles_fr_merged.parquet")
    .filter(pl.col('link_count') > 0)
    .collect()
)

print(f"üìö Dataset loaded:")
print(f"   Total articles: {len(df):,}")
# `df` is already restricted to link_count > 0 by the query above, so its
# height IS the number of articles with links; filtering the whole frame a
# second time would only repeat the same work and print the same number.
print(f"   Articles with links: {df.height:,}")
print(f"\n   Link structure example:")
# First three link structs of the third article in the frame.
print(f"   {df.select('links').to_series()[2][:3]}")

üìö Dataset loaded:
   Total articles: 2,556,402
   Articles with links: 2,556,402

   Link structure example:
   shape: (3,)
Series: '' [struct[5]]
[
	{"https://fr.wikipedia.org/wiki/alg%C3%A8bre",2,"alg√®bre","alg%C3%A8bre","alg√®bre"}
	{"https://fr.wikipedia.org/wiki/math%C3%A9matiques",100,"math√©matiques","math%C3%A9matiques","math√©matiques"}
	{"https://fr.wikipedia.org/wiki/structure%20alg%C3%A9brique",200,"structures alg√©briques","structure%20alg%C3%A9brique","structure alg√©brique"}
]


In [None]:
# Preview the first 100 articles. `df` was already restricted to
# link_count > 0 when it was loaded, so no further filtering is needed here.
df.head(100)


id,title,text,links,link_count,text_withoutHref
i64,str,str,list[struct[5]],u32,str
3,"""Antoine Meillet""","""Antoine Meillet, n√© le √† &lt;a‚Ä¶","[{""https://fr.wikipedia.org/wiki/Moulins%20%28Allier%29"",25,""Moulins"",""Moulins%20%28Allier%29"",""Moulins (Allier)""}, {""https://fr.wikipedia.org/wiki/Allier%20%28d%C3%A9partement%29"",83,""Allier"",""Allier%20%28d%C3%A9partement%29"",""Allier (d√©partement)""}, ‚Ä¶ {""https://fr.wikipedia.org/wiki/Albert%20Lord"",6955,""Albert Lord"",""Albert%20Lord"",""Albert Lord""}]",65,"""Antoine Meillet, n√© le √† Mouli‚Ä¶"
7,"""Alg√®bre lin√©aire""","""L‚Äôalg√®bre lin√©aire est la bran‚Ä¶","[{""https://fr.wikipedia.org/wiki/math%C3%A9matiques"",38,""math√©matiques"",""math%C3%A9matiques"",""math√©matiques""}, {""https://fr.wikipedia.org/wiki/Espace%20vectoriel"",117,""espaces vectoriels"",""Espace%20vectoriel"",""Espace vectoriel""}, ‚Ä¶ {""https://fr.wikipedia.org/wiki/Diagonalisation"",17018,""diagonalisables"",""Diagonalisation"",""Diagonalisation""}]",111,"""L‚Äôalg√®bre lin√©aire est la bran‚Ä¶"
9,"""Alg√®bre g√©n√©rale""","""L'&lt;a href=""alg%C3%A8bre""&gt‚Ä¶","[{""https://fr.wikipedia.org/wiki/alg%C3%A8bre"",2,""alg√®bre"",""alg%C3%A8bre"",""alg√®bre""}, {""https://fr.wikipedia.org/wiki/math%C3%A9matiques"",100,""math√©matiques"",""math%C3%A9matiques"",""math√©matiques""}, ‚Ä¶ {""https://fr.wikipedia.org/wiki/dernier%20th%C3%A9or%C3%A8me%20de%20Fermat"",2686,""dernier th√©or√®me de Fermat"",""dernier%20th%C3%A9or%C3%A8me%20de%20Fermat"",""dernier th√©or√®me de Fermat""}]",15,"""L'alg√®bre g√©n√©rale, ou alg√®bre‚Ä¶"
10,"""Algorithmique""","""L'algorithmique est l'√©tude et‚Ä¶","[{""https://fr.wikipedia.org/wiki/algorithme"",127,""algorithme"",""algorithme"",""algorithme""}, {""https://fr.wikipedia.org/wiki/probl%C3%A8me%20algorithmique"",307,""probl√®me algorithmique"",""probl%C3%A8me%20algorithmique"",""probl√®me algorithmique""}, ‚Ä¶ {""https://fr.wikipedia.org/wiki/liste%20des%20algorithmes"",18655,""liste des algorithmes"",""liste%20des%20algorithmes"",""liste des algorithmes""}]",101,"""L'algorithmique est l'√©tude et‚Ä¶"
11,"""Politique en Argentine""","""L'&lt;a href=""Argentine""&gt;Ar‚Ä¶","[{""https://fr.wikipedia.org/wiki/Argentine"",2,""Argentine"",""Argentine"",""Argentine""}, {""https://fr.wikipedia.org/wiki/r%C3%A9publique"",56,""r√©publique"",""r%C3%A9publique"",""r√©publique""}, ‚Ä¶ {""https://fr.wikipedia.org/wiki/%C3%A9lections%20g%C3%A9n%C3%A9rales%20argentines%20de%202007"",12114,""cette derni√®re"",""%C3%A9lections%20g%C3%A9n%C3%A9rales%20argentines%20de%202007"",""√©lections g√©n√©rales argentines de 2007""}]",78,"""L'Argentine est une r√©publique‚Ä¶"
…,…,…,…,…,…
201,"""Arthur John Evans""","""Arthur John Evans (n√© le √† &lt‚Ä¶","[{""https://fr.wikipedia.org/wiki/Nash%20Mills"",27,""Nash Mills"",""Nash%20Mills"",""Nash Mills""}, {""https://fr.wikipedia.org/wiki/Hertfordshire"",85,""Hertfordshire"",""Hertfordshire"",""Hertfordshire""}, ‚Ä¶ {""https://fr.wikipedia.org/wiki/Knight%20Bachelor"",3130,""chevalier"",""Knight%20Bachelor"",""Knight Bachelor""}]",26,"""Arthur John Evans (n√© le √† Nas‚Ä¶"
206,"""Alfred Nobel""","""Alfred Bernhard Nobel /'alfr…ôd‚Ä¶","[{""https://fr.wikipedia.org/wiki/Stockholm"",59,""Stockholm"",""Stockholm"",""Stockholm""}, {""https://fr.wikipedia.org/wiki/Su%C3%A8de"",108,""Su√®de"",""Su%C3%A8de"",""Su√®de""}, ‚Ä¶ {""https://fr.wikipedia.org/wiki/Cimeti%C3%A8re%20du%20Nord%20%28Solna%29"",10712,""cimeti√®re du Nord"",""Cimeti%C3%A8re%20du%20Nord%20%28Solna%29"",""Cimeti√®re du Nord (Solna)""}]",87,"""Alfred Bernhard Nobel /'alfr…ôd‚Ä¶"
208,"""Alc√®ne""","""Les alc√®nes sont des &lt;a hre‚Ä¶","[{""https://fr.wikipedia.org/wiki/hydrocarbure"",21,""hydrocarbure"",""hydrocarbure"",""hydrocarbure""}, {""https://fr.wikipedia.org/wiki/Compos%C3%A9%20insatur%C3%A9"",74,""insatur√©s"",""Compos%C3%A9%20insatur%C3%A9"",""Compos√© insatur√©""}, ‚Ä¶ {""https://fr.wikipedia.org/wiki/cyclohex%C3%A8ne"",9887,""cyclohex√®ne"",""cyclohex%C3%A8ne"",""cyclohex√®ne""}]",63,"""Les alc√®nes sont des hydrocarb‚Ä¶"
210,"""Ange (homonymie)""","""&lt;templatestyles src=""Autres‚Ä¶","[{""https://fr.wikipedia.org/wiki/ange"",95,""ange"",""ange"",""ange""}, {""https://fr.wikipedia.org/wiki/Dieu"",165,""Dieu"",""Dieu"",""Dieu""}, ‚Ä¶ {""https://fr.wikipedia.org/wiki/grec%20ancien"",339,""grec"",""grec%20ancien"",""grec ancien""}]",5,"""&lt;templatestyles src=""Autres‚Ä¶"


In [5]:
# Fetch the link-free text of the article titled "Antoine Meillet".
result = df.filter(pl.col("title") == "Antoine Meillet").select("text_withoutHref")

print(result)

shape: (1, 1)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ text_withoutHref                ‚îÇ
‚îÇ ---                             ‚îÇ
‚îÇ str                             ‚îÇ
‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï°
‚îÇ Antoine Meillet, n√© le √† Mouli‚Ä¶ ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò


Use the positions of the href links stored in the DataFrame to find the sentences that contain them.

In [6]:
def extract_sentences_with_links_from_positions(
    text: str, 
    links: list
) -> List[Dict]:
    """
    Return the sentences of ``text`` that contain at least one hyperlink.

    ``text`` is split on runs of ``.``, ``!`` or ``?`` followed by whitespace,
    so the returned sentences do not include their terminating punctuation.
    A link is attributed to a sentence when its ``start_idx`` lies inside the
    character span that sentence occupies in ``text``.

    Args:
        text: The text the link offsets refer to.
        links: Link dicts with the keys ``full_url``, ``start_idx``,
            ``anchor``, ``href_raw`` and ``href_decoded``. Links whose
            ``start_idx`` is missing or ``None`` cannot be located and are
            ignored.

    Returns:
        One dict per sentence that contains links, in text order, with keys
        ``sentence`` (stripped text), ``links_in_sentence`` (list of dicts
        with ``anchor``, ``href_decoded``, ``href_raw``, ``position``,
        ``full_url``), ``start_pos`` (offset of the sentence in ``text``) and
        ``num_links``. Empty list when ``text`` or ``links`` is empty.
    """
    # Use an explicit length test for links so that containers without a
    # usable truth value are still accepted.
    if not text:
        return []
    if links is None or len(links) == 0:
        return []

    # Keep only links that carry an offset, ordered by that offset. Dropping
    # the others up front avoids comparing None with int while sorting.
    positioned_links = sorted(
        (link for link in links if link.get("start_idx") is not None),
        key=lambda link: link["start_idx"],
    )
    link_count = len(positioned_links)

    # Simple sentence splitting: punctuation run followed by whitespace.
    sentences = re.split(r'[.!?]+\s+', text)

    results = []
    search_from = 0   # offset in text where the next sentence search begins
    next_link = 0     # index of the first link that has not been passed yet

    for sentence in sentences:
        if not sentence.strip():
            continue

        # Locate the sentence in the original text to get its character span.
        sent_start = text.find(sentence, search_from)
        if sent_start == -1:
            continue
        sent_end = sent_start + len(sentence)

        # Sentence spans are visited in increasing order and the links are
        # sorted, so a single forward pass over the links is sufficient.
        # First skip links located before this sentence (e.g. in the
        # separator that was removed by the split).
        while next_link < link_count and positioned_links[next_link]["start_idx"] < sent_start:
            next_link += 1

        # Then collect every link that starts inside [sent_start, sent_end).
        links_in_sent = []
        while next_link < link_count and positioned_links[next_link]["start_idx"] < sent_end:
            link = positioned_links[next_link]
            links_in_sent.append({
                'anchor': link.get('anchor', ''),
                'href_decoded': link.get('href_decoded', ''),
                'href_raw': link.get('href_raw', ''),
                'position': link["start_idx"],
                'full_url': link.get('full_url', '')
            })
            next_link += 1

        if links_in_sent:  # Only keep sentences with links
            results.append({
                'sentence': sentence.strip(),
                'links_in_sentence': links_in_sent,
                'start_pos': sent_start,
                'num_links': len(links_in_sent)
            })

        search_from = sent_end

    return results

# Smoke-test the extractor on a single article (id 7, "Algèbre linéaire").
test_article = df.filter(pl.col("id") == 7)

# Article text with the href markup removed.
test_text = test_article.select("text_withoutHref").to_series()[0]

# The article's links cell, turned into a plain Python list (empty when the
# cell is null or is not iterable).
links_value = test_article.select("links").to_series()[0]
test_links = (
    list(links_value)
    if links_value is not None and hasattr(links_value, '__iter__')
    else []
)

print(f"üîç Debug info:")
print(f"   Type of test_text: {type(test_text)}")
print(f"   Type of test_links: {type(test_links)}")
print(f"   Number of links: {len(test_links)}")

sample_sentences = extract_sentences_with_links_from_positions(test_text, test_links)
print(f"\nüìù Sample extraction from 'Alg√®bre lin√©aire' (ID: 7):")
print(f"   Total links in article: {len(test_links)}")
print(f"   Sentences with links: {len(sample_sentences)}")
if len(sample_sentences) > 0:
    # Show the first extracted sentence and up to three of its anchors.
    first_hit = sample_sentences[0]
    first_anchors = [entry['anchor'] for entry in first_hit['links_in_sentence'][:3]]
    print(f"\n   Example sentence:")
    print(f"   '{first_hit['sentence'][:120]}...'")
    print(f"   Links ({first_hit['num_links']}): {first_anchors}")

üîç Debug info:
   Type of test_text: <class 'str'>
   Type of test_links: <class 'list'>
   Number of links: 111

üìù Sample extraction from 'Alg√®bre lin√©aire' (ID: 7):
   Total links in article: 111
   Sentences with links: 45

   Example sentence:
   'L‚Äôalg√®bre lin√©aire est la branche des math√©matiques qui s'int√©resse aux espaces vectoriels et aux transformations lin√©ai...'
   Links (3): ['math√©matiques', 'espaces vectoriels', 'transformations lin√©aires']


    Build the URL → ID mapping from the articles already stored in Qdrant.
    This is faster than iterating through the full DataFrame.
    It requires the Qdrant database to have been populated with articles beforehand (the more, the better).

In [7]:
def create_url_to_id_mapping_from_qdrant(client, collection_name: str = "wikipedia_fr") -> Dict[str, int]:
    """
    Build a title-variant -> article-ID lookup from the points stored in Qdrant.

    Every point of ``collection_name`` is read page by page with
    ``client.scroll`` (payload only, no vectors). For each point whose payload
    has a truthy ``id`` and ``title``, the following keys are mapped to the id:
    the title itself, the title with spaces replaced by underscores, the
    percent-encoded underscore form, the lowercased title, and the lowercased
    underscore form.

    An exact title always wins over a derived variant of another article, so
    a lowercased or underscored spelling can never hide an article that is
    really stored under that exact title. Among derived variants, the point
    read last wins.

    Args:
        client: Object exposing a Qdrant-style ``scroll`` method.
        collection_name: Collection to read the points from.

    Returns:
        Dict mapping each title spelling to the ``id`` found in the payload.
    """
    from urllib.parse import quote

    exact_titles = {}     # title exactly as stored in the payload -> id
    title_variants = {}   # derived spellings of a title -> id

    print(f"üó∫Ô∏è  Building URL ‚Üí ID mapping from Qdrant collection '{collection_name}'...")

    # Scroll through all points in Qdrant, one page at a time.
    offset = None
    batch_size = 1000
    total_processed = 0

    while True:
        points, offset = client.scroll(
            collection_name=collection_name,
            limit=batch_size,
            offset=offset,
            with_payload=True,
            with_vectors=False
        )

        if not points:
            break

        for point in points:
            # A point may have been stored without any payload at all.
            payload = point.payload or {}
            article_id = payload.get("id")
            title = payload.get("title", "")

            if not article_id or not title:
                continue

            exact_titles[title] = article_id

            underscored = title.replace(" ", "_")
            lowered = title.lower()
            for variant in (
                underscored,
                quote(underscored, safe=""),
                lowered,
                lowered.replace(" ", "_"),
            ):
                title_variants[variant] = article_id

        # Counts every point read, including the ones skipped above.
        total_processed += len(points)

        # A None offset marks the last page.
        if offset is None:
            break

    # Exact titles are merged last so they take precedence over variants.
    url_to_id = {**title_variants, **exact_titles}

    print(f"‚úÖ Created {len(url_to_id):,} URL mappings from {total_processed:,} articles")
    return url_to_id

# Build the title-variant -> article-ID lookup from the points stored in the
# "wikipedia_fr" Qdrant collection.
url_to_id = create_url_to_id_mapping_from_qdrant(
    client,
    collection_name="wikipedia_fr",
)

üó∫Ô∏è  Building URL ‚Üí ID mapping from Qdrant collection 'wikipedia_fr'...
‚úÖ Created 100,088 URL mappings from 30,208 articles


Use the link structs, which record each href's position in the text, to map sentences to their target articles.

In [8]:
def create_linkable_phrases_dataset_optimized(
    df: pl.DataFrame, 
    url_to_id: Dict[str, int],
    max_articles: int = None,
    min_links_per_sentence: int = 1
) -> pl.DataFrame:
    """
    Build a DataFrame of sentences together with the articles they link to.

    For every article with ``link_count > 0`` the sentences containing links
    are extracted with ``extract_sentences_with_links_from_positions``. Each
    link's ``href_decoded`` is resolved to an article ID through ``url_to_id``
    (trying a few spelling variations); links that cannot be resolved, or that
    point back to the article itself, are dropped. A sentence is kept when at
    least ``min_links_per_sentence`` of its links were resolved.

    Args:
        df: Articles with the columns ``id``, ``title``, ``text_withoutHref``,
            ``links`` and ``link_count``.
        url_to_id: Lookup from title spelling to article ID.
        max_articles: If truthy, only the first ``max_articles`` articles with
            links are processed; ``None`` (or 0) processes all of them.
        min_links_per_sentence: Minimum number of resolved links required to
            keep a sentence.

    Returns:
        DataFrame with the columns ``source_article_id``,
        ``source_article_title``, ``sentence``, ``target_ids``, ``anchors``,
        ``href_decodeds`` and ``num_links``. When nothing could be extracted
        the frame is empty but still has these columns.
    """
    print("üîó Extracting linkable phrases from articles...")

    # Filter to articles with links
    df_with_links = df.filter(pl.col("link_count") > 0)

    if max_articles:
        df_with_links = df_with_links.head(max_articles)

    print(f"   Processing {len(df_with_links):,} articles with links...")

    linkable_phrases = []

    for row in tqdm(df_with_links.iter_rows(named=True), total=len(df_with_links), desc="Extracting phrases"):
        article_id = row["id"]
        article_title = row["title"]
        text = row.get("text_withoutHref", "")
        links_raw = row.get("links", [])

        # Normalise the links cell to a Python list (null -> empty list).
        if links_raw is None:
            links = []
        else:
            links = list(links_raw) if hasattr(links_raw, '__iter__') else []

        if not text or len(links) == 0:
            continue

        for sent_data in extract_sentences_with_links_from_positions(text, links):
            target_ids = []
            anchors = []
            href_decodeds = []

            for link in sent_data["links_in_sentence"]:
                href_decoded = link["href_decoded"]
                target_id = _lookup_target_id(url_to_id, href_decoded)

                # Drop unresolved links and links to the article itself.
                if target_id and target_id != article_id:
                    target_ids.append(target_id)
                    anchors.append(link["anchor"])
                    href_decodeds.append(href_decoded)

            # Only keep the sentence if enough of its links were resolved.
            if len(target_ids) >= min_links_per_sentence:
                linkable_phrases.append({
                    "source_article_id": article_id,
                    "source_article_title": article_title,
                    "sentence": sent_data["sentence"],
                    "target_ids": target_ids,
                    "anchors": anchors,
                    "href_decodeds": href_decodeds,
                    "num_links": len(target_ids)
                })

    if linkable_phrases:
        linkable_df = pl.DataFrame(linkable_phrases)
    else:
        # Without rows Polars cannot infer any column, so spell the schema out;
        # otherwise the summary below (and any caller selecting these columns)
        # would fail on a column-less frame.
        linkable_df = pl.DataFrame(schema={
            "source_article_id": pl.Int64,
            "source_article_title": pl.Utf8,
            "sentence": pl.Utf8,
            "target_ids": pl.List(pl.Int64),
            "anchors": pl.List(pl.Utf8),
            "href_decodeds": pl.List(pl.Utf8),
            "num_links": pl.Int64,
        })

    print(f"\n‚úÖ Extracted linkable phrases:")
    print(f"   Total phrases: {len(linkable_df):,}")
    print(f"   From {linkable_df.select('source_article_id').n_unique():,} articles")

    if linkable_phrases:
        total_links = linkable_df.select(pl.col('num_links').sum()).item()
        avg_links = linkable_df.select(pl.col('num_links').mean()).item()
    else:
        # The mean of an empty column is null and cannot be formatted.
        total_links = 0
        avg_links = 0.0

    print(f"   Total links: {total_links:,}")
    print(f"   Avg links per phrase: {avg_links:.2f}")

    return linkable_df


def _lookup_target_id(url_to_id, href_decoded):
    """Return the ID mapped to ``href_decoded`` or to one of its spelling variations, else None."""
    # A null/empty href cannot be resolved (and has no string methods).
    if not href_decoded:
        return None

    target_id = url_to_id.get(href_decoded)
    if target_id:
        return target_id

    lowered = href_decoded.lower()
    for variation in (
        href_decoded.replace("_", " "),
        href_decoded.replace("%20", " "),
        lowered,
        lowered.replace("_", " "),
    ):
        target_id = url_to_id.get(variation)
        if target_id:
            return target_id
    return None

# Build the phrase dataset from the first 5,000 articles with links, keeping
# every sentence that has at least one link resolved through `url_to_id`.
linkable_phrases_df = create_linkable_phrases_dataset_optimized(
    df,
    url_to_id,
    max_articles=5000,
    min_links_per_sentence=1,
)

print("\nüìä Sample linkable phrases:")
# Show three columns of the first three phrases.
print(linkable_phrases_df.select(["sentence", "anchors", "num_links"]).head(3))

üîó Extracting linkable phrases from articles...
   Processing 5,000 articles with links...


Extracting phrases: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5000/5000 [00:14<00:00, 356.13it/s] 



‚úÖ Extracted linkable phrases:
   Total phrases: 111,162
   From 4,739 articles
   Total links: 140,198
   Avg links per phrase: 1.26

üìä Sample linkable phrases:
shape: (3, 3)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ sentence                        ‚îÜ anchors               ‚îÜ num_links ‚îÇ
‚îÇ ---                             ‚îÜ ---                   ‚îÜ ---       ‚îÇ
‚îÇ str                             ‚îÜ list[str]             ‚îÜ i64       ‚îÇ
‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï°
‚îÇ Antoine Meillet, n√© le √† Mouli‚Ä¶ ‚îÜ ["Moulins", "Allier"] ‚îÜ 2         ‚îÇ
‚îÇ Paul Jules Antoine Meillet est‚Ä¶ ‚îÜ ["Cher"]              ‚îÜ

In [9]:
def create_linkable_phrases_collection(client, collection_name: str = "linkable_phrases", force_recreate: bool = False):
    """
    Create the Qdrant collection that stores linkable phrase embeddings.

    The vector size is taken from the notebook-level ``model``
    (``model.get_sentence_embedding_dimension()``) and the distance metric is
    cosine.

    Args:
        client: Qdrant client used for all collection operations.
        collection_name: Name of the collection to create.
        force_recreate: When the collection already exists, ``False`` leaves
            it untouched and returns; ``True`` deletes it and creates it anew.

    Returns:
        None.
    """
    # Probe for the collection. Any failure of get_collection is treated as
    # "collection does not exist"; a genuine connectivity problem will then
    # surface from create_collection below.
    try:
        collection_info = client.get_collection(collection_name)
    except Exception:
        collection_info = None

    if collection_info is None:
        print(f"üì¶ Collection '{collection_name}' doesn't exist, creating new one...")
    elif not force_recreate:
        existing_count = collection_info.points_count
        print(f"‚úÖ Collection '{collection_name}' already exists with {existing_count:,} points")
        print(f"   Skipping creation (use force_recreate=True to rebuild)")
        return
    else:
        # User explicitly wants to recreate. The deletion is deliberately
        # outside the try above so a failed delete is not misreported as a
        # missing collection.
        client.delete_collection(collection_name)
        print(f"üóëÔ∏è  Deleted existing collection: {collection_name}")

    vector_size = model.get_sentence_embedding_dimension()
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=vector_size,
            distance=Distance.COSINE
        )
    )

    print(f"‚úÖ Created collection: {collection_name}")
    print(f"   Vector dimension: {vector_size}")
    print(f"   Distance metric: COSINE")


# Make sure the collection exists. This call belongs at cell level: placed
# inside the function body it would only make the function call itself and
# would never run when the cell is executed.
create_linkable_phrases_collection(client, "linkable_phrases", force_recreate=False)


Embed the sentences that contain links and upload the embeddings to the Qdrant DB.

In [10]:
import torch
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import gc

def upload_linkable_phrases_to_qdrant(
    client,
    linkable_df: pl.DataFrame,
    model,
    collection_name: str = "linkable_phrases",
    batch_size: int = 512,
    force_recreate: bool = False
):
    """
    Encode the phrases of ``linkable_df`` and upsert them into Qdrant.

    All pending sentences are encoded first (in batches sized from the
    detected GPU memory), then the points are uploaded by a thread pool.
    Point IDs are the row positions in ``linkable_df``.

    Collection handling:
      * missing collection -> it is created (cosine distance, vector size
        from ``model.get_sentence_embedding_dimension()``);
      * ``force_recreate=True`` -> an existing collection is deleted and
        created again, and every row is uploaded;
      * existing collection with exactly ``len(linkable_df)`` points ->
        nothing is done;
      * existing collection with some points -> the upload resumes at row
        ``points_count``. NOTE(review): this assumes the stored points are
        exactly rows ``0 .. points_count-1`` of this same frame; a collection
        filled from a different frame yields a size mismatch in the final
        report.

    Args:
        client: Qdrant client.
        linkable_df: Frame with the columns ``sentence``,
            ``source_article_id``, ``source_article_title``, ``target_ids``,
            ``anchors``, ``href_decodeds`` and ``num_links``.
        model: Sentence-embedding model exposing ``encode`` and ``device``.
        collection_name: Target collection.
        batch_size: Accepted for backward compatibility; not used. The
            encoding batch size is derived from the hardware and uploads use
            a fixed batch of 256 points.
        force_recreate: Rebuild the collection from scratch (see above).

    Returns:
        None.
    """
    total_rows = len(linkable_df)
    max_upload_workers = 8    # threads used for the upload phase
    upload_batch_size = 256   # points per upsert request

    def _create_collection():
        # Same configuration as create_linkable_phrases_collection.
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=model.get_sentence_embedding_dimension(),
                distance=Distance.COSINE
            )
        )

    # Probe for the collection; a failing get_collection is taken to mean
    # that it does not exist yet.
    try:
        collection_info = client.get_collection(collection_name)
    except Exception:
        collection_info = None

    if collection_info is None:
        print(f"üì¶ Creating new collection '{collection_name}'")
        _create_collection()
        start_from = 0
    else:
        existing_count = collection_info.points_count or 0
        if force_recreate:
            # Rebuild from scratch so no stale point survives.
            print(f"Recreating collection '{collection_name}'")
            client.delete_collection(collection_name)
            _create_collection()
            start_from = 0
        elif existing_count == total_rows:
            print(f"‚úÖ Collection '{collection_name}' already exists with {existing_count:,} phrases")
            return
        elif existing_count > 0:
            print(f"‚ö†Ô∏è  Resuming from phrase {existing_count:,}")
            start_from = existing_count
        else:
            start_from = 0

    # Also reached when the collection holds MORE points than the frame.
    if start_from >= total_rows:
        print(f"‚úÖ Collection already complete")
        return

    print(f"\nüöÄ Processing {total_rows - start_from:,} phrases...")

    # Choose the encoding batch size from the available GPU memory.
    if torch.cuda.is_available():
        gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"üéÆ GPU: {torch.cuda.get_device_name(0)} ({gpu_mem_gb:.1f} GB)")

        if gpu_mem_gb < 6:
            encode_batch_size = 32  # small batches to avoid running out of GPU memory
            print(f"   ‚ö†Ô∏è  Small GPU detected, using batch_size={encode_batch_size}")
        elif gpu_mem_gb < 12:
            encode_batch_size = 128
        else:
            encode_batch_size = 512
    else:
        encode_batch_size = 64
        print(f"üíª Using CPU with batch_size={encode_batch_size}")

    # Rows that still have to be uploaded.
    remaining_df = linkable_df.slice(start_from, total_rows - start_from)
    all_sentences = remaining_df.select("sentence").to_series().to_list()

    print(f"\nüß† Encoding {len(all_sentences):,} sentences in batches of {encode_batch_size}...")

    embedding_chunks = []

    for i in tqdm(range(0, len(all_sentences), encode_batch_size), desc="Encoding"):
        batch_sentences = all_sentences[i:i + encode_batch_size]

        embeddings = model.encode(
            batch_sentences,
            normalize_embeddings=True,
            show_progress_bar=False,
            convert_to_numpy=True,
            batch_size=encode_batch_size,
            device=str(model.device)
        )

        embedding_chunks.append(embeddings)

        # Release cached GPU memory every 100 encoding batches.
        if torch.cuda.is_available() and i % (100 * encode_batch_size) == 0:
            torch.cuda.empty_cache()
            gc.collect()

    # One (n_sentences, dim) array, aligned with the rows of remaining_df.
    all_embeddings = np.vstack(embedding_chunks)

    print(f"‚úÖ Encoded {len(all_embeddings):,} sentences")

    # Clear GPU memory before the upload phase.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()

    print(f"\nüì§ Uploading to Qdrant with {max_upload_workers} parallel threads...")

    # Convert the frame to a list of row dicts once, up front.
    all_data = remaining_df.to_dicts()

    num_batches = (len(remaining_df) + upload_batch_size - 1) // upload_batch_size

    print(f"   Split into {num_batches} batches for parallel upload")

    def upload_batch(batch_idx):
        """Build and upsert the points of one upload batch; return their count."""
        start_idx = batch_idx * upload_batch_size
        end_idx = min(start_idx + upload_batch_size, len(remaining_df))

        batch_embeddings = all_embeddings[start_idx:end_idx]
        batch_data = all_data[start_idx:end_idx]

        # The point ID is the row's position in the full linkable_df.
        points = [
            PointStruct(
                id=start_from + start_idx + j,
                vector=batch_embeddings[j].tolist(),
                payload={
                    "phrase_id": start_from + start_idx + j,
                    "sentence": row["sentence"],
                    "source_article_id": row["source_article_id"],
                    "source_article_title": row["source_article_title"],
                    "target_ids": row["target_ids"],
                    "anchors": row["anchors"],
                    "href_decodeds": row["href_decodeds"],
                    "num_links": row["num_links"]
                }
            )
            for j, row in enumerate(batch_data)
        ]

        # NOTE(review): assumes the client may be shared across threads - confirm.
        client.upsert(
            collection_name=collection_name,
            points=points,
            wait=True
        )

        return len(points)

    with ThreadPoolExecutor(max_workers=max_upload_workers) as executor:
        futures = [executor.submit(upload_batch, i) for i in range(num_batches)]

        # result() re-raises any exception raised inside a worker thread.
        for future in tqdm(futures, desc=f"Uploading ({max_upload_workers} threads)"):
            future.result()

    print(f"\n‚úÖ Upload complete!")

    # Verify the final collection size against the frame.
    collection_info = client.get_collection(collection_name)
    print(f"   Collection size: {collection_info.points_count:,} phrases")
    print(f"   Expected: {total_rows:,} phrases")

    if collection_info.points_count != total_rows:
        print(f"   ‚ö†Ô∏è  Mismatch: {collection_info.points_count:,} != {total_rows:,}")

# Encode the extracted phrases and upload them to the "linkable_phrases"
# collection without forcing a rebuild.
upload_linkable_phrases_to_qdrant(
    client,
    linkable_phrases_df,
    model,
    collection_name="linkable_phrases",
    batch_size=512,
    force_recreate=False,
)

‚ö†Ô∏è  Resuming from phrase 105,879

üöÄ Processing 5,283 phrases...
üéÆ GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU (4.3 GB)
   ‚ö†Ô∏è  Small GPU detected, using batch_size=32

üß† Encoding 5,283 sentences in batches of 32...


Encoding: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 166/166 [02:29<00:00,  1.11it/s]


‚úÖ Encoded 5,283 sentences

üì§ Uploading to Qdrant with 8 parallel threads...
   Split into 21 batches for parallel upload


Uploading (8 threads): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21/21 [00:04<00:00,  4.68it/s]


‚úÖ Upload complete!
   Collection size: 112,861 phrases
   Expected: 111,162 phrases
   ‚ö†Ô∏è  Mismatch: 112,861 != 111,162





Run the actual prediction by querying the Qdrant DB.

In [11]:
def predict_links_for_article_hybrid(
    client,
    linkable_df: pl.DataFrame,
    model,
    collection_name: str = "linkable_phrases",
    batch_size: int = 128,
    force_recreate: bool = False
):
    """
    Embed linkable phrases and upload them to Qdrant, one batch at a time.

    NOTE: despite its name this function does not predict anything; it is a
    sequential uploader (encode a batch, upsert it, repeat) that asks on the
    console before resuming a partial upload. See
    ``upload_linkable_phrases_to_qdrant`` for the thread-pool variant.

    Collection handling:
      * missing collection -> it is created (cosine distance, vector size
        from ``model.get_sentence_embedding_dimension()``);
      * ``force_recreate=True`` -> an existing collection is deleted and
        created again, and every row is uploaded;
      * existing collection with exactly ``len(linkable_df)`` points ->
        nothing is done;
      * existing collection with some points -> the user is asked whether to
        resume at row ``points_count``; any answer other than ``y`` aborts.

    Args:
        client: Qdrant client.
        linkable_df: Frame with the columns ``sentence``,
            ``source_article_id``, ``source_article_title``, ``target_ids``,
            ``anchors``, ``href_decodeds`` and ``num_links``.
        model: Sentence-embedding model exposing ``encode`` and ``device``.
        collection_name: Target collection.
        batch_size: Number of rows encoded and upserted per iteration.
        force_recreate: Rebuild the collection from scratch (see above).

    Returns:
        None.
    """
    total_rows = len(linkable_df)

    def _create_collection():
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=model.get_sentence_embedding_dimension(),
                distance=Distance.COSINE
            )
        )

    # Probe for the collection; a failing get_collection is taken to mean
    # that it does not exist yet.
    try:
        collection_info = client.get_collection(collection_name)
    except Exception:
        collection_info = None

    if collection_info is None:
        print(f"üì¶ Collection '{collection_name}' doesn't exist, will create it")
        _create_collection()
        start_from = 0
    else:
        existing_count = collection_info.points_count or 0
        if force_recreate:
            # Rebuild from scratch so no stale point survives.
            client.delete_collection(collection_name)
            _create_collection()
            start_from = 0
        elif existing_count == total_rows:
            print(f"‚úÖ Collection '{collection_name}' already exists with {existing_count:,} phrases")
            print(f"   Skipping upload (use force_recreate=True to rebuild)")
            return
        elif existing_count > 0:
            print(f"‚ö†Ô∏è  Collection exists with {existing_count:,} phrases (expected {total_rows:,})")
            user_input = input("   Continue from where it left off? (y/n): ")
            if user_input.lower() == 'y':
                start_from = existing_count
                print(f"   Resuming from phrase {start_from:,}")
            else:
                print("   Aborting. Use force_recreate=True to rebuild from scratch.")
                return
        else:
            start_from = 0

    print(f"üöÄ Uploading {total_rows - start_from:,} linkable phrases to Qdrant...")
    print(f"   Using parallel encoding with batch_size={batch_size}")

    # Encode and upload one slice of the frame per iteration.
    for i in tqdm(range(start_from, total_rows, batch_size), desc="Uploading batches"):
        batch = linkable_df.slice(i, min(batch_size, total_rows - i))

        sentences = batch.select("sentence").to_series().to_list()

        # Encode on the device the passed-in model lives on.
        embeddings = model.encode(
            sentences,
            normalize_embeddings=True,
            show_progress_bar=False,
            convert_to_numpy=True,
            batch_size=64,  # Internal batch size for encoding
            device=str(model.device),
            convert_to_tensor=False
        )

        # The point ID is the row's position in linkable_df.
        batch_rows = list(batch.iter_rows(named=True))
        points = [
            PointStruct(
                id=i + j,
                vector=embeddings[j].tolist(),
                payload={
                    "phrase_id": i + j,
                    "sentence": row["sentence"],
                    "source_article_id": row["source_article_id"],
                    "source_article_title": row["source_article_title"],
                    "target_ids": row["target_ids"],
                    "anchors": row["anchors"],
                    "href_decodeds": row["href_decodeds"],
                    "num_links": row["num_links"]
                }
            )
            for j, row in enumerate(batch_rows)
        ]

        # wait=True is passed so the call returns only once the upsert is done.
        client.upsert(
            collection_name=collection_name,
            points=points,
            wait=True
        )

    print(f"‚úÖ Upload complete!")

    # Report the resulting collection sizes.
    collection_info = client.get_collection(collection_name)
    print(f"   Collection '{collection_name}': {collection_info.points_count:,} phrases")
    print(f"   Existing 'wikipedia_fr': {client.count('wikipedia_fr', exact=True).count:,} articles")

# Upload the phrases with the thread-pool uploader; nothing is uploaded when
# the collection already holds exactly as many points as the frame has rows.
# Pass force_recreate=True to force a rebuild.
upload_linkable_phrases_to_qdrant(
    client,
    linkable_phrases_df,
    model,
    collection_name="linkable_phrases",
    batch_size=256,
    force_recreate=False,
)

‚úÖ Collection 'linkable_phrases' already exists with 105,879 phrases


In [21]:
def predict_links_for_sentences(
    client,
    model,
    test_sentences: List[str],
    source_article_id: "int | None" = None,
    collection_name: str = "linkable_phrases",
    top_k: int = 5,
    min_similarity: float = 0.7
) -> List[Dict]:
    """
    Predict links for new sentences by finding similar linkable phrases.

    Every sentence is embedded with `model` and used as a query vector against
    `collection_name`; the payload of each hit carries the links that the
    similar phrase contains.

    Args:
        client: Qdrant client used for the similarity search.
        model: Encoder exposing `encode(...)`; embeddings are L2-normalized.
        test_sentences: Sentences to find link suggestions for.
        source_article_id: ID of the article the sentences come from. Hits whose
            payload has this `source_article_id` are dropped so an article cannot
            match its own phrases. Pass None (the default) to keep every hit,
            e.g. when the sentences come from several articles that are not in
            the collection.
        collection_name: Qdrant collection holding the linkable phrases.
        top_k: Maximum number of hits requested per sentence. Same-article hits
            are removed after the search, so fewer than `top_k` may remain.
        min_similarity: Score threshold passed to the search.

    Returns:
        One dict per input sentence (input order preserved) with the keys
        `sentence`, `num_matches` and `similar_linkable_phrases`.

    Note:
        Reads the module-level `device` to choose where the encoder runs.
    """
    print(f"üîç Predicting links for {len(test_sentences)} sentences...")

    # Nothing to encode or search; avoid calling the encoder with an empty batch.
    if not test_sentences:
        return []

    # Encode test sentences
    test_embeddings = model.encode(
        test_sentences,
        normalize_embeddings=True,
        show_progress_bar=True,
        convert_to_numpy=True,
        device=device
    )

    predictions = []

    # Query Qdrant for each sentence
    for sentence, embedding in tqdm(zip(test_sentences, test_embeddings), total=len(test_sentences)):
        # Search for similar phrases in the collection
        search_results = client.search(
            collection_name=collection_name,
            query_vector=embedding.tolist(),
            limit=top_k,
            score_threshold=min_similarity
        )

        similar_phrases = []
        for result in search_results:
            payload = result.payload

            # Skip matches that come from the article being linked (only when
            # the caller told us which article that is).
            if source_article_id is not None and payload['source_article_id'] == source_article_id:
                continue

            similar_phrases.append({
                'similarity_score': result.score,
                'similar_sentence': payload['sentence'],
                'source_article': payload['source_article_title'],
                'source_article_id': payload['source_article_id'],
                'target_ids': payload['target_ids'],
                'anchors': payload['anchors'],
                'href_decodeds': payload['href_decodeds'],
                'num_links': payload['num_links']
            })

        predictions.append({
            'sentence': sentence,
            'num_matches': len(similar_phrases),
            'similar_linkable_phrases': similar_phrases
        })

    return predictions


# Usage: predict links for the first sentences of one known article (id 7)
test_article = df.filter(pl.col("id") == 7)

# Pull the scalar fields of the single matching row
test_article_id = test_article.get_column("id")[0]
test_text = test_article.get_column("text_withoutHref")[0]
test_links = list(test_article.get_column("links")[0])

# Split the article into sentences that contain links and keep the first five
test_sentences_data = extract_sentences_with_links_from_positions(test_text, test_links)
test_sentences = [entry['sentence'] for entry in test_sentences_data[:5]]

# Generate predictions; hits from the article itself are excluded via its ID
predictions = predict_links_for_sentences(
    client=client,
    model=model,
    test_sentences=test_sentences,
    source_article_id=test_article_id,
    collection_name="linkable_phrases",
    top_k=5,  # a few extra candidates so some remain after self-matches are dropped
    min_similarity=0.7,
)

üîç Predicting links for 5 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  search_results = client.search(
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 39.27it/s]


Show results

In [22]:
def display_predictions_detailed(predictions: List[Dict], df: pl.DataFrame, max_sentences: int = 5):
    """Display predicted links with full article information.

    Args:
        predictions: Output of `predict_links_for_sentences`; each entry needs
            the keys `sentence`, `num_matches` and `similar_linkable_phrases`.
        df: Article frame with `id` and `title` columns, used to turn target
            IDs into titles.
        max_sentences: Number of leading predictions to print.

    Prints only; returns None. For every similar phrase at most the first three
    suggested links are shown. A target ID that is not in `df` is printed as
    "ID <id>".
    """
    shown = predictions[:max_sentences]

    # Resolve all titles with a single pass over `df` instead of one full-frame
    # filter per printed link (the frame holds millions of rows).
    wanted_ids = {
        target_id
        for pred in shown
        for similar in pred['similar_linkable_phrases']
        for target_id in similar['target_ids'][:3]
        if target_id is not None
    }
    titles = {}
    if wanted_ids:
        matched = df.filter(pl.col("id").is_in(list(wanted_ids))).select(["id", "title"])
        for article_id, title in matched.iter_rows():
            # Keep the first title if an id occurs more than once.
            titles.setdefault(article_id, title)

    print("\n" + "="*80)
    print("üéØ PREDICTED LINKS (Phrase-Level Similarity)")
    print("="*80)

    for i, pred in enumerate(shown):
        print(f"\n{'‚îÄ'*80}")
        print(f"Sentence {i+1}:")
        print(f"'{pred['sentence'][:200]}...'")
        print(f"\nüí° {pred['num_matches']} similar linkable phrase(s) found:")

        for j, similar in enumerate(pred['similar_linkable_phrases']):
            print(f"\n   Match {j+1} | Similarity: {similar['similarity_score']:.3f} | Links: {similar['num_links']}")
            print(f"   Similar phrase: '{similar['similar_sentence'][:150]}...'")
            print(f"   From article: '{similar['source_article']}'")
            print(f"   Suggested links:")

            for target_id, anchor in zip(similar['target_ids'][:3], similar['anchors'][:3]):
                target_name = titles[target_id] if target_id in titles else f"ID {target_id}"
                print(f"      üîó '{anchor}' ‚Üí {target_name}")

# Display predictions: print the suggestions for the sample sentences; `df` is
# passed so target article IDs can be shown as titles.
display_predictions_detailed(predictions, df, max_sentences=5)


üéØ PREDICTED LINKS (Phrase-Level Similarity)

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Sentence 1:
'L‚Äôalg√®bre lin√©aire est la branche des math√©matiques qui s'int√©resse aux espaces vectoriels et aux transformations lin√©aires, formalisation g√©n√©rale des th√©ories des syst√®mes d'√©quations lin√©aires...'

üí° 4 similar linkable phrase(s) found:

   Match 1 | Similarity: 0.909 | Links: 1
   Similar phrase: 'L'alg√®bre g√©n√©rale, ou alg√®bre abstraite, est la branche des math√©matiques qui porte principalement sur l'√©tude des structures alg√©briques et de leurs...'
   From article: 'Alg√®bre g√©n√©rale'
   Suggested links:
      üîó 'alg√®bre' ‚Üí Alg√®bre

   Match 2 | Similarity: 0.892 | Links: 2
   Similar phrase: 'En math√©matiques, plus pr√©cis√©ment en alg√®bre lin√©aire,

Create a test set from articles that are not already embedded in the Qdrant database, i.e. articles that contributed no phrases to the `linkable_phrases` collection.

In [None]:
# Cell: Create Test Set with Valid Links Only

def _resolve_target_id(href_decoded, url_to_id):
    """Return the article ID that `href_decoded` maps to, or None if unknown.

    The exact href is tried first, then a few spelling variants (underscores or
    "%20" as spaces, lower-cased). IDs are compared with `is not None`, so a
    valid ID of 0 is not mistaken for "not found".
    """
    if not href_decoded:
        return None

    candidates = (
        href_decoded,
        href_decoded.replace("_", " "),
        href_decoded.replace("%20", " "),
        href_decoded.lower(),
        href_decoded.lower().replace("_", " "),
    )
    for candidate in candidates:
        target_id = url_to_id.get(candidate)
        if target_id is not None:
            return target_id
    return None


print("üß™ Creating test set from unseen articles (checking link validity)...")

# IDs of the articles whose phrases were used to build the training collection
training_article_ids = set(linkable_phrases_df.select("source_article_id").to_series().unique().to_list())
print(f"üìö Training set: {len(training_article_ids):,} unique articles")

# All article IDs in the dataset (reported for context)
all_article_ids = set(df.select("id").to_series().to_list())
print(f"üìñ Total articles in database: {len(all_article_ids):,}")

# Article IDs known to `url_to_id`; only these are accepted as link targets
qdrant_article_ids = set(url_to_id.values())
print(f"üóÑÔ∏è  Articles in Qdrant (valid link targets): {len(qdrant_article_ids):,}")

# Keep articles that are NOT in the training set and that contain links
unseen_articles = df.filter(
    ~pl.col("id").is_in(list(training_article_ids)) &
    (pl.col("link_count") > 0)
)

print(f"üîç Available unseen articles: {len(unseen_articles):,}")

# Take the first N unseen articles (deterministic, not a random sample)
test_sample_size = 500  # Increased to get more valid sentences
test_articles = unseen_articles.head(test_sample_size)

print(f"‚úÖ Selected {len(test_articles)} unseen articles for testing")

# Extract test sentences from these unseen articles
test_data = []
stats = {
    'articles_processed': 0,
    'sentences_extracted': 0,
    'sentences_with_empty_anchors': 0,
    'sentences_with_unmappable_links': 0,
    'sentences_with_valid_links': 0
}

for row in tqdm(test_articles.iter_rows(named=True), total=len(test_articles), desc="Extracting test sentences"):
    stats['articles_processed'] += 1
    article_id = row["id"]
    article_title = row["title"]
    text = row.get("text_withoutHref", "")
    links_raw = row.get("links", [])

    # Normalize the links column to a plain list
    if links_raw is None:
        links = []
    else:
        links = list(links_raw) if hasattr(links_raw, '__iter__') else []

    if not text or len(links) == 0:
        continue

    # Extract sentences with links
    sentences_with_links = extract_sentences_with_links_from_positions(text, links)
    # Counts every extracted sentence, although only the first 5 are examined below
    stats['sentences_extracted'] += len(sentences_with_links)

    # Take first 5 sentences from each article
    for sent_data in sentences_with_links[:5]:
        sentence = sent_data["sentence"]
        links_in_sent = sent_data["links_in_sentence"]

        # Map href_decoded to article IDs (ground truth)
        ground_truth_links = []
        has_empty_anchor = False

        for link in links_in_sent:
            href_decoded = link["href_decoded"]
            anchor = link["anchor"]

            # Skip links with empty anchors
            if not anchor or anchor.strip() == "":
                has_empty_anchor = True
                continue

            target_id = _resolve_target_id(href_decoded, url_to_id)

            # Only include if the target is known, valid, and not a self-link
            if target_id is not None and target_id != article_id and target_id in qdrant_article_ids:
                ground_truth_links.append({
                    'anchor': anchor,
                    'target_id': target_id,
                    'href_decoded': href_decoded
                })

        # Track statistics
        if has_empty_anchor:
            stats['sentences_with_empty_anchors'] += 1

        if ground_truth_links:
            stats['sentences_with_valid_links'] += 1
            test_data.append({
                'article_id': article_id,
                'article_title': article_title,
                'sentence': sentence,
                'ground_truth_links': ground_truth_links,
                'num_ground_truth': len(ground_truth_links)
            })
        else:
            # No usable ground truth: unknown targets, self-links, or only empty anchors
            stats['sentences_with_unmappable_links'] += 1

total_ground_truth_links = sum(t['num_ground_truth'] for t in test_data)

print(f"\n‚úÖ Created test set:")
print(f"   Total test sentences: {len(test_data):,}")
print(f"   From {len(set(t['article_id'] for t in test_data)):,} unseen articles")
print(f"   Total ground truth links: {total_ground_truth_links:,}")
print(f"   Avg links per sentence: {total_ground_truth_links / len(test_data):.2f}" if test_data else "   Avg links per sentence: 0")

print(f"\nüìä Statistics:")
print(f"   Articles processed: {stats['articles_processed']:,}")
print(f"   Sentences extracted: {stats['sentences_extracted']:,}")
print(f"   Sentences with empty anchors: {stats['sentences_with_empty_anchors']:,}")
print(f"   Sentences with unmappable links: {stats['sentences_with_unmappable_links']:,}")
print(f"   Sentences with valid links: {stats['sentences_with_valid_links']:,}")

if test_data:
    print(f"\nüìù Sample test sentences:")
    for i, sample in enumerate(test_data[:3]):
        print(f"\n   {i+1}. '{sample['sentence'][:80]}...'")
        print(f"      From: {sample['article_title']}")
        print(f"      Ground truth: {[gt['anchor'] for gt in sample['ground_truth_links']]}")

üß™ Creating test set from unseen articles (checking link validity)...
üìö Training set: 4,739 unique articles
üìñ Total articles in database: 2,556,402
üóÑÔ∏è  Articles in Qdrant (valid link targets): 30,208
üîç Available unseen articles: 2,551,663
‚úÖ Selected 500 unseen articles for testing


Extracting test sentences: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 1140.43it/s]


‚úÖ Created test set:
   Total test sentences: 508
   From 199 unseen articles
   Total ground truth links: 695
   Avg links per sentence: 1.37

üìä Statistics:
   Articles processed: 500
   Sentences extracted: 7,589
   Sentences with empty anchors: 14
   Sentences with unmappable links: 823
   Sentences with valid links: 508

üìù Sample test sentences:

   1. 'Messenger proposait des environnements (d√©cors), un grand nombre d'√©motic√¥nes, l...'
      From: Yahoo! Messenger
      Ground truth: ['Yahoo!']

   2. 'Il donnait √©galement acc√®s aux services Yahoo (m√©t√©o, bourse, information, r√©sul...'
      From: Yahoo! Messenger
      Ground truth: ['Windows']

   3. '"La Dame √† l'hermine" (Portrait de Cecilia Gallerani), de L√©onard de Vinci, pein...'
      From: 1488
      Ground truth: ['L√©onard de Vinci']





Run link predictions for the unseen test sentences by querying the `linkable_phrases` collection in Qdrant

In [17]:
# Cell: Run Predictions on Unseen Test Set

# Extract just the sentences for prediction
test_sentences = [t['sentence'] for t in test_data]

print(f"üîÆ Running predictions on {len(test_sentences)} unseen sentences...")

# Generate predictions.
# The test sentences come from many different articles, none of which
# contributed phrases to the collection, so there are no self-matches to
# exclude. `source_article_id` must still be supplied; None never equals a
# stored article ID, so every hit is kept.
test_predictions = predict_links_for_sentences(
    client,
    model,
    test_sentences,
    source_article_id=None,
    collection_name="linkable_phrases",
    top_k=5,  # Get more candidates
    min_similarity=0.65  # Lower threshold to see more results
)

print(f"‚úÖ Generated {len(test_predictions)} predictions")

# Combine predictions with ground truth. Predictions are returned in input
# order, so they line up one-to-one with `test_data`.
assert len(test_predictions) == len(test_data), "predictions and test data are misaligned"
for pred, truth in zip(test_predictions, test_data):
    pred['article_id'] = truth['article_id']
    pred['article_title'] = truth['article_title']
    pred['ground_truth_links'] = truth['ground_truth_links']
    pred['num_ground_truth'] = truth['num_ground_truth']

üîÆ Running predictions on 508 unseen sentences...
üîç Predicting links for 508 sentences...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

  search_results = client.search(
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 508/508 [00:02<00:00, 226.79it/s]

‚úÖ Generated 508 predictions





Evaluate the links predicted for sentences from unseen articles against their ground-truth links.
Results are terrible, as expected (precision 0.092, recall 0.305, F1 0.141).

In [18]:
# Cell: Evaluate Predictions vs Ground Truth

def evaluate_predictions(predictions, df):
    """
    Evaluate predicted links against ground truth.
    Computes precision, recall, and F1 score.

    Args:
        predictions: Prediction dicts that also carry `article_title` and
            `ground_truth_links` (each link with a `target_id`).
        df: Unused; kept so existing callers do not break.

    Returns:
        (results_df, metrics): `results_df` has one row per sentence sorted by
        F1 (best first); `metrics` holds the overall `precision`, `recall` and
        `f1`, micro-averaged over all sentences.

    The predicted set for a sentence is the union of the `target_ids` of all
    its similar phrases. An empty `predictions` list yields zero metrics and an
    empty results frame instead of raising.
    """
    def ratio(numerator, denominator):
        # Always a float, so every metric column has one consistent dtype.
        return numerator / denominator if denominator else 0.0

    def harmonic_mean(precision, recall):
        total = precision + recall
        return 2 * precision * recall / total if total > 0 else 0.0

    total_predicted = 0
    total_ground_truth = 0
    total_correct = 0

    results = []

    for pred in predictions:
        ground_truth_ids = set(link['target_id'] for link in pred['ground_truth_links'])

        # Collect all predicted target IDs from similar phrases
        predicted_ids = set()
        for similar in pred['similar_linkable_phrases']:
            predicted_ids.update(similar['target_ids'])

        # Calculate matches
        correct = ground_truth_ids & predicted_ids

        total_ground_truth += len(ground_truth_ids)
        total_predicted += len(predicted_ids)
        total_correct += len(correct)

        # Per-sentence metrics
        precision = ratio(len(correct), len(predicted_ids))
        recall = ratio(len(correct), len(ground_truth_ids))

        results.append({
            'sentence': pred['sentence'][:100],
            'article': pred['article_title'],
            'precision': precision,
            'recall': recall,
            'f1': harmonic_mean(precision, recall),
            'num_predicted': len(predicted_ids),
            'num_ground_truth': len(ground_truth_ids),
            'num_correct': len(correct)
        })

    # Overall metrics
    overall_precision = ratio(total_correct, total_predicted)
    overall_recall = ratio(total_correct, total_ground_truth)
    overall_f1 = harmonic_mean(overall_precision, overall_recall)

    num_sentences = len(predictions)

    print("\n" + "="*80)
    print("üìä EVALUATION RESULTS (Unseen Test Set)")
    print("="*80)
    print(f"\nüìà Overall Metrics:")
    print(f"   Precision: {overall_precision:.3f} ({total_correct}/{total_predicted})")
    print(f"   Recall:    {overall_recall:.3f} ({total_correct}/{total_ground_truth})")
    print(f"   F1 Score:  {overall_f1:.3f}")
    print(f"\nüìù Test Set Size:")
    print(f"   Sentences: {num_sentences:,}")
    print(f"   Avg ground truth links per sentence: {ratio(total_ground_truth, num_sentences):.2f}")
    print(f"   Avg predicted links per sentence: {ratio(total_predicted, num_sentences):.2f}")

    # Show best and worst examples
    if results:
        results_df = pl.DataFrame(results).sort("f1", descending=True)
    else:
        # An empty list gives a frame without columns; spell the schema out so
        # the selects below (and callers) still find every column.
        results_df = pl.DataFrame(schema={
            'sentence': pl.Utf8,
            'article': pl.Utf8,
            'precision': pl.Float64,
            'recall': pl.Float64,
            'f1': pl.Float64,
            'num_predicted': pl.Int64,
            'num_ground_truth': pl.Int64,
            'num_correct': pl.Int64
        })

    print(f"\n‚úÖ Top 5 Best Predictions (by F1):")
    print(results_df.head(5).select(['sentence', 'article', 'precision', 'recall', 'f1']))

    print(f"\n‚ùå Top 5 Worst Predictions (by F1):")
    print(results_df.tail(5).select(['sentence', 'article', 'precision', 'recall', 'f1']))

    return results_df, {
        'precision': overall_precision,
        'recall': overall_recall,
        'f1': overall_f1
    }

# Run evaluation: score the unseen-test-set predictions; returns the per-sentence
# results frame and a dict of overall precision / recall / F1.
results_df, metrics = evaluate_predictions(test_predictions, df)


üìä EVALUATION RESULTS (Unseen Test Set)

üìà Overall Metrics:
   Precision: 0.092 (212/2307)
   Recall:    0.305 (212/694)
   F1 Score:  0.141

üìù Test Set Size:
   Sentences: 508
   Avg ground truth links per sentence: 1.37
   Avg predicted links per sentence: 4.54

‚úÖ Top 5 Best Predictions (by F1):
shape: (5, 5)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ sentence                        ‚îÜ article                ‚îÜ precision ‚îÜ recall ‚îÜ f1       ‚îÇ
‚îÇ ---                             ‚îÜ ---                    ‚îÜ ---       ‚îÜ ---    ‚îÜ ---      ‚îÇ
‚îÇ str                             ‚îÜ str                    ‚îÜ f64       ‚îÜ f64    ‚îÜ f64      ‚îÇ
‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

In [19]:
# Cell: Detailed Analysis of Sample Predictions

def display_detailed_comparison(predictions, df, num_samples=3):
    """Show detailed comparison of predictions vs ground truth.

    Args:
        predictions: Prediction dicts enriched with `article_title`,
            `ground_truth_links` and `num_ground_truth`.
        df: Article frame with `id` and `title` columns, used to turn target
            IDs into titles.
        num_samples: Number of leading predictions to print.

    Prints only; returns None. Per example, all ground-truth links are listed,
    followed by at most three similar phrases with at most three suggested
    links each. A suggestion is marked correct when its target ID is one of the
    sentence's ground-truth targets. IDs missing from `df` print as "ID <id>".
    """
    shown = predictions[:num_samples]

    # Resolve all titles with a single pass over `df` instead of one full-frame
    # filter per printed link (the frame holds millions of rows).
    wanted_ids = set()
    for pred in shown:
        wanted_ids.update(gt['target_id'] for gt in pred['ground_truth_links'])
        for similar in pred['similar_linkable_phrases'][:3]:
            wanted_ids.update(similar['target_ids'][:3])
    wanted_ids.discard(None)

    titles = {}
    if wanted_ids:
        matched = df.filter(pl.col("id").is_in(list(wanted_ids))).select(["id", "title"])
        for article_id, title in matched.iter_rows():
            # Keep the first title if an id occurs more than once.
            titles.setdefault(article_id, title)

    def title_of(target_id):
        return titles[target_id] if target_id in titles else f"ID {target_id}"

    print("\n" + "="*80)
    print("üîç DETAILED PREDICTION ANALYSIS (Unseen Articles)")
    print("="*80)

    for i, pred in enumerate(shown):
        print(f"\n{'‚îÄ'*80}")
        print(f"Example {i+1}: Article '{pred['article_title']}'")
        print(f"Sentence: '{pred['sentence'][:150]}...'")

        # Ground truth
        print(f"\n‚úÖ Ground Truth Links ({pred['num_ground_truth']}):")
        for gt_link in pred['ground_truth_links']:
            target_name = title_of(gt_link['target_id'])
            print(f"   üîó '{gt_link['anchor']}' ‚Üí {target_name}")

        # Built once per sentence rather than once per suggested link
        ground_truth_ids = {gt['target_id'] for gt in pred['ground_truth_links']}

        # Predictions
        print(f"\nüîÆ Predicted Links ({pred['num_matches']} similar phrases found):")
        if pred['num_matches'] == 0:
            print("   ‚ö†Ô∏è  No predictions (no similar phrases above threshold)")
            continue

        for j, similar in enumerate(pred['similar_linkable_phrases'][:3]):
            print(f"\n   Match {j+1} | Similarity: {similar['similarity_score']:.3f}")
            print(f"   From: '{similar['source_article']}'")
            print(f"   Similar: '{similar['similar_sentence'][:100]}...'")
            print(f"   Suggests:")
            for target_id, anchor in zip(similar['target_ids'][:3], similar['anchors'][:3]):
                target_name = title_of(target_id)

                # Check if this matches ground truth
                marker = "‚úì" if target_id in ground_truth_ids else "‚úó"
                print(f"      {marker} '{anchor}' ‚Üí {target_name}")

# Display detailed analysis: ground truth vs. predicted links for the first 5 test sentences
display_detailed_comparison(test_predictions, df, num_samples=5)


üîç DETAILED PREDICTION ANALYSIS (Unseen Articles)

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Example 1: Article 'Yahoo! Messenger'
Sentence: 'Messenger proposait des environnements (d√©cors), un grand nombre d'√©motic√¥nes, la possibilit√© de communiquer oralement avec un microphone et de pouvoi...'

‚úÖ Ground Truth Links (1):
   üîó 'Yahoo!' ‚Üí Yahoo!

üîÆ Predicted Links (5 similar phrases found):

   Match 1 | Similarity: 0.874
   From: 'Liste de logiciels libres'
   Similar: 'Les mod√®les les plus perfectionn√©s permettent de jouer tout en discutant par oral en se voyant gr√¢ce...'
   Suggests:
      ‚úó 'bases de donn√©es' ‚Üí Base de donn√©es

   Match 2 | Similarity: 0.872
   From: 'Webcam'
   Similar: 'L'utilisation de la webcam pour la visiophonie se diffuse pour les communi

In [20]:
# Cell: Optional - Save Test Results for Later Analysis

# One summary record per test sentence: where it came from, how many matches
# were found, and its ground-truth anchors / target IDs.
test_results = [
    {
        'article_id': prediction['article_id'],
        'article_title': prediction['article_title'],
        'sentence': prediction['sentence'],
        'num_ground_truth': prediction['num_ground_truth'],
        'num_predictions': prediction['num_matches'],
        'ground_truth_anchors': [link['anchor'] for link in prediction['ground_truth_links']],
        'ground_truth_targets': [link['target_id'] for link in prediction['ground_truth_links']]
    }
    for prediction in test_predictions
]

# Persist the summary as parquet for later analysis
test_results_df = pl.DataFrame(test_results)
test_results_df.write_parquet("test_results_unseen_articles.parquet")
print(f"üíæ Saved test results to 'test_results_unseen_articles.parquet'")

üíæ Saved test results to 'test_results_unseen_articles.parquet'
