In [5]:
# Fetch the NLTK tokenizer resources before any cell tokenizes text.
# NOTE(review): the experiment cell further down only downloads 'punkt' itself,
# so it presumably relies on this cell for 'punkt_tab' — confirm which resource
# the installed NLTK version actually loads for sent_tokenize.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [1]:
!git clone https://github.com/GwenTsang/Information-Retrieval.git
!mkdir -p /content/folder

for i in range(1, 6):
    file_name = f"Trad_experiment/{i}.txt"
    source_path = f"/content/Information-Retrieval/{file_name}"
    target_path = f"/content/folder/{i}.txt"
    !mv {source_path} {target_path}

Cloning into 'Information-Retrieval'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 18 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (18/18), 8.69 KiB | 8.69 MiB/s, done.
Resolving deltas: 100% (2/2), done.
Files downloaded and moved to /content/folder


In [8]:
#@title Test de détection de paraphrase

!pip -q install -U sentence-transformers nltk

import os
from pathlib import Path
import torch
from sentence_transformers import SentenceTransformer, util
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize

# --- Inputs -----------------------------------------------------------------
# The five staged text files, 1.txt through 5.txt.
file_paths = [f"/content/folder/{i}.txt" for i in range(1, 6)]

# --- Tunables ---------------------------------------------------------------
MODEL_NAME = "Lajavaness/sentence-camembert-large"
SOURCE_FILE = 3  # 1-based file number; converted to a list index before use
TOP_K = 2        # forwarded as `top_k` to run_experiment

# --- Runtime ----------------------------------------------------------------
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

model = SentenceTransformer(MODEL_NAME, device=device)

# ---------------------------
# Load and split sentences
# ---------------------------
def load_and_split(file_path: str, language: str = "french") -> list:
    """Read a text file and split it into stripped, non-empty sentences.

    Args:
        file_path: Path of the UTF-8 text file to read.
        language: Punkt model name forwarded to ``sent_tokenize``. Defaults to
            "french" because the texts handled by this notebook are French;
            pass "english" to get NLTK's own default.

    Returns:
        The sentences in document order, each stripped of surrounding
        whitespace, with empty entries dropped.
    """
    # "utf-8-sig" reads plain UTF-8 as well, but also discards a leading
    # byte-order mark. With plain "utf-8" the BOM (U+FEFF) survives, and
    # str.strip() does not remove it, so it would end up glued to the first
    # sentence that later gets embedded.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        text = f.read().strip()
    sents = [s.strip() for s in sent_tokenize(text, language=language) if s.strip()]
    return sents

# Parallel lists, one entry per input file: the file's base name and the
# sentences read from it. A missing file aborts the run immediately.
doc_names, doc_sents = [], []
for p in file_paths:
    path_obj = Path(p)
    if not path_obj.exists():
        raise FileNotFoundError(f"Missing file: {p}")
    sents = load_and_split(p)
    doc_names.append(path_obj.name)
    doc_sents.append(sents)
    print(f"Loaded {path_obj.name}: {len(sents)} sentences")


# Encode every document's sentences once, up front; entry i of doc_embs holds
# the embeddings for doc_sents[i] in the same sentence order.
doc_embs = [
    model.encode(sents, convert_to_tensor=True, device=device, show_progress_bar=False)
    for sents in doc_sents
]

def run_experiment(source_idx: int, which: str = "first", top_k: int = 3):
    """Match the first or last sentence of one document against every document.

    The query sentence is taken from ``doc_sents[source_idx]`` and embedded with
    the module-level ``model``. For each document (the source document itself
    is included in the loop) the sentence with the highest cosine similarity is
    reported, and it counts as correct when its index equals the corresponding
    position in that document (index 0 for "first", last index for "last").

    Reads the module-level ``doc_names``, ``doc_sents``, ``doc_embs``,
    ``model`` and ``device``; prints a report to stdout.

    Args:
        source_idx: Index into ``doc_names`` / ``doc_sents`` of the document
            the query sentence is taken from.
        which: "first" or "last" — which sentence of the source to query with.
        top_k: Number of best candidates listed per target document. The list
            is only printed when at least two candidates are available.

    Returns:
        Number of target documents whose best match was at the expected index.

    Raises:
        AssertionError: If ``which`` is invalid or the source has no sentences.
    """
    assert which in {"first", "last"}
    src_name = doc_names[source_idx]
    src_sents = doc_sents[source_idx]
    assert len(src_sents) > 0, f"No sentences in source file {src_name}"

    query_idx = 0 if which == "first" else len(src_sents) - 1
    query_sentence = src_sents[query_idx]
    print("\n" + "="*80)
    print(f"Experiment: match the {which} sentence from {src_name}")
    print(f"Query sentence ({src_name} [{query_idx}]): {query_sentence}")
    query_emb = model.encode([query_sentence], convert_to_tensor=True, device=device)

    correct = 0
    for name, sents_j, embs_j in zip(doc_names, doc_sents, doc_embs):
        # A target without sentences has nothing to compare against, and the
        # similarity/argmax calls below cannot handle an empty input. Report
        # it and count it as a miss instead of aborting the whole experiment.
        if len(sents_j) == 0:
            print("\n" + "-"*80)
            print(f"Target file: {name}")
            print("No sentences in target file; counted as a miss.")
            continue

        sims = util.cos_sim(query_emb, embs_j)[0]  # shape: [num_sents]
        best_idx = int(torch.argmax(sims).item())
        best_score = float(sims[best_idx].item())
        expected_idx = 0 if which == "first" else (len(sents_j) - 1)
        is_correct = (best_idx == expected_idx)
        correct += int(is_correct)

        print("\n" + "-"*80)
        print(f"Target file: {name}")
        print(f"Top match index: {best_idx} (expected: {expected_idx}) | score: {best_score:.4f}")
        print(f"Top match sentence: {sents_j[best_idx]}")

        # Never ask topk for more candidates than the document has sentences.
        k = min(top_k, len(sents_j))
        if k > 1:
            vals, idxs = torch.topk(sims, k=k)
            print("Top-k candidates:")
            for rank, (val, idx) in enumerate(zip(vals.tolist(), idxs.tolist()), start=1):
                tag = " (expected)" if idx == expected_idx else ""
                print(f"  {rank:>2}. idx={idx:>3}, score={val:.4f}{tag} | {sents_j[idx]}")

    print("\n" + "="*80)
    print(f"Accuracy across files (should be 5/5 ideally): {correct}/{len(doc_names)}")
    return correct

# SOURCE_FILE is a 1-based file number. Reject out-of-range values explicitly:
# without this check, 0 would become index -1 and silently select the last
# file instead of failing.
if not 1 <= SOURCE_FILE <= len(doc_names):
    raise ValueError(
        f"SOURCE_FILE must be between 1 and {len(doc_names)}, got {SOURCE_FILE}"
    )
src_idx = SOURCE_FILE - 1

first_acc = run_experiment(src_idx, which="first", top_k=TOP_K)
last_acc = run_experiment(src_idx, which="last", top_k=TOP_K)

# Derive the denominator from the loaded documents so the summary stays correct
# when the number of input files changes.
n_docs = len(doc_names)
print(f"\nSummary -> First sentence accuracy: {first_acc}/{n_docs}, Last sentence accuracy: {last_acc}/{n_docs}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using device: cuda
Loaded 1.txt: 3 sentences
Loaded 2.txt: 4 sentences
Loaded 3.txt: 6 sentences
Loaded 4.txt: 4 sentences
Loaded 5.txt: 6 sentences

Experiment: match the first sentence from 3.txt
Query sentence (3.txt [0]): Mais ce ne sont pas les seuls objets que renferme l'immense capacité de ma mémoire.

--------------------------------------------------------------------------------
Target file: 1.txt
Top match index: 0 (expected: 0) | score: 0.5493
Top match sentence: ﻿Mais cette vaste étendue de ma mémoire ne conserve pas seulement les espèces de toutes les choses dont je viens de parler : mais elle contient aussi tout ce que j’ai appris des sciences, et que je n’ai point encore oublié ; et elle le garde comme dans des lieux secrets et particuliers bien différents des lieux ordinaires où les corps sont renfermés ; et elle ne conserve pas seulement les images de ces connaissances, mais les connaissances mêmes 1 .
Top-k candidates:
   1. idx=  0, score=0.5493 (expected) | ﻿Mais c