In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np

# 1. Load a pre‑trained model
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

paragraph_one = """(48) Considering that the financial risks associated with the support
 to Moldova in the form of loans under the Facility
 are comparable to the financial risks associated with lending operations
 under Regulation (EU) 2021/947, provisioning for the financial liability from
 loans under this Regulation should be constituted at the rate
 of 9%, in line with Article 214 of the Financial
 Regulation and the funding of the provisioning should be sourced
 from the envelope allocated to the Neighbourhood geographic programme under
 Article [[6(2),]] point (a), of Regulation (EU) [[2021/947.]]"""

paragraph_two = """(48) Tā kā finanšu risks, kas saistīts ar atbalstu, kurš
 atbilstīgi mehānismam sniegts Moldovai aizdevumu veidā, ir līdzīgs finanšu riskam,
 kas saistīts ar Regulas (ER) 2021/947 ietvaros veiktajām aizdevumu operācijām,
 uzkrājumi to finanšu saistību segšanai, kas izriet no šīs regulas
 ietvaros sniegtajiem aizdevumiem, būtu jāveido ar likmi 9% saskaņā ar
 Finanšu regulas 214. pantu, un uzkrājumu finansējums būtu jāiegūst no
 finansējuma, kas piešķirts Kaimiņattiecību ģeogrāfiskajai programmai saskaņā ar Regulas (ES)
 [[2021/9476.]] panta 2. punkta q) apakšpunktu."""

# Use the full paragraphs (or split into sentences if needed)
sentence1 = paragraph_one
sentence2 = paragraph_two

# 2. Encode sentences as embeddings
emb1 = model.encode(sentence1)
emb2 = model.encode(sentence2)

# 3. Compute cosine similarity
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

sim_score = cosine_similarity(emb1, emb2)
print(f"Similarity score between sentences: {sim_score:.4f}")

# 4. Decide a threshold for "sufficient similarity"
threshold = 0.95
if sim_score >= threshold:
    print("✅ Sentences appear to have the same meaning (above threshold).")
else:
    print("⚠️ Sentences might differ in meaning (below threshold).")


  from .autonotebook import tqdm as notebook_tqdm


Similarity score between sentences: 0.9203
⚠️ Sentences might differ in meaning (below threshold).


In [5]:
import stanza

# Download and initialize models
stanza.download('en')  # English
stanza.download('lv')  # Latvian
nlp_en = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse')
nlp_lv = stanza.Pipeline('lv', processors='tokenize,pos,lemma,depparse')

doc1 = nlp_en(paragraph_one)
doc2 = nlp_lv(paragraph_two)

# Extract subjects
def extract_subjects_stanza(doc):
    subjects = []
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.deprel in ("nsubj", "nsubj:pass", "csubj"):
                subjects.append(word.text)
    return subjects

subjects1 = extract_subjects_stanza(doc1)
subjects2 = extract_subjects_stanza(doc2)
print(subjects1)
print(subjects2)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 48.0MB/s]                    
2025-10-29 18:37:55 INFO: Downloaded file to C:\Users\robbe\stanza_resources\resources.json
2025-10-29 18:37:55 INFO: Downloading default packages for language: en (English) ...
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.11.0/models/default.zip: 100%|██████████| 526M/526M [00:51<00:00, 10.2MB/s] 
2025-10-29 18:38:48 INFO: Downloaded file to C:\Users\robbe\stanza_resources\en\default.zip
2025-10-29 18:38:54 INFO: Finished downloading models and saved to C:\Users\robbe\stanza_resources
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 7.11MB/s]                    
2025-10-29 18:38:54 INFO: Downloaded file to C:\Users\robbe\stanza_resources\resources.json
2025-10-29 18:38:54 INFO: Downloading default packages for language: lv (Latvian) ...
Downloading ht

['risks', 'provisioning']
['kas', 'kurš', 'kas', 'uzkrājumi', 'kas', 'kas']


In [10]:
import stanza
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import Levenshtein
import numpy as np
import re

# -----------------------------
# 1. Initialize Stanza pipelines
# -----------------------------
stanza.download('en')
stanza.download('lv')

nlp_en = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse')
nlp_lv = stanza.Pipeline('lv', processors='tokenize,pos,lemma,depparse')

# -----------------------------
# 2. Paragraph examples
# -----------------------------
paragraph_en = "Article [[6(2),]] point (a), of Regulation (EU) [[2021/947.]]"
paragraph_lv = "Regulas (ES) [[2021/9476.]] panta 2. punkta q) apakšpunktu."

# -----------------------------
# 3. Extract references with Stanza
# -----------------------------
def extract_references_stanza(doc, heads):
    refs = []
    for s in doc.sentences:
        for w in s.words:
            if w.text.lower() in heads:
                # Include token itself + all children in the dependency tree
                subtree_words = [ww.text for ww in s.words if ww.head == w.id or ww.id == w.id]
                refs.append(" ".join(subtree_words))
    return refs

refs_en = extract_references_stanza(nlp_en(paragraph_en), ["article", "point", "regulation"])
refs_lv = extract_references_stanza(nlp_lv(paragraph_lv), ["panta", "punkta", "regulas"])

print("English references:", refs_en)
print("Latvian references:", refs_lv)

# -----------------------------
# 4. Normalize references (optional)
# -----------------------------
def normalize_ref(ref):
    ref = ref.lower()
    ref = re.sub(r"[^\w\s]", "", ref)  # remove punctuation
    return ref.strip()

refs_en_norm = [normalize_ref(r) for r in refs_en]
refs_lv_norm = [normalize_ref(r) for r in refs_lv]

# -----------------------------
# 5. Encode references with SentenceTransformer
# -----------------------------
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
emb_en = model.encode(refs_en_norm)
emb_lv = model.encode(refs_lv_norm)

# -----------------------------
# 6. Compute pairwise cosine similarity
# -----------------------------
similarity_matrix = cosine_similarity(emb_en, emb_lv)

# -----------------------------
# 7. Align references by highest similarity
# -----------------------------
matched_refs = []
for i, en_ref in enumerate(refs_en):
    j = np.argmax(similarity_matrix[i])
    lv_ref = refs_lv[j]
    sim_score = similarity_matrix[i][j]

    # Optional: Levenshtein distance for numeric/letter check
    lev_distance = Levenshtein.distance(normalize_ref(en_ref), normalize_ref(lv_ref))

    matched_refs.append({
        "english": en_ref,
        "latvian": lv_ref,
        "similarity": sim_score,
        "levenshtein": lev_distance,
        "flag_mismatch": sim_score < 0.85 or lev_distance > 2
    })

# -----------------------------
# 8. Display results
# -----------------------------
for m in matched_refs:
    print(f"{m['english']!r} ↔ {m['latvian']!r} | similarity={m['similarity']:.3f} "
          f"| Levenshtein={m['levenshtein']} | FLAG={m['flag_mismatch']}")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 48.0MB/s]                    
2025-10-29 18:48:17 INFO: Downloaded file to C:\Users\robbe\stanza_resources\resources.json
2025-10-29 18:48:17 INFO: Downloading default packages for language: en (English) ...
2025-10-29 18:48:18 INFO: File exists: C:\Users\robbe\stanza_resources\en\default.zip
2025-10-29 18:48:22 INFO: Finished downloading models and saved to C:\Users\robbe\stanza_resources
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 24.2MB/s]                    
2025-10-29 18:48:22 INFO: Downloaded file to C:\Users\robbe\stanza_resources\resources.json
2025-10-29 18:48:22 INFO: Downloading default packages for language: lv (Latvian) ...
2025-10-29 18:48:23 INFO: File exists: C:\Users\robbe\stanza_resources\lv\default.zip
2025-10-29 18:48:24 INFO: Finished downloading models and saved to C:\Users\rob

English references: ['Article 6 point', 'point a Regulation 2021/947', ', of Regulation EU']
Latvian references: ['Regulas ES 2021', 'Regulas panta', 'panta 2. punkta']
'Article 6 point' ↔ 'panta 2. punkta' | similarity=0.652 | Levenshtein=11 | FLAG=True
'point a Regulation 2021/947' ↔ 'Regulas ES 2021' | similarity=0.600 | Levenshtein=15 | FLAG=True
', of Regulation EU' ↔ 'Regulas ES 2021' | similarity=0.432 | Levenshtein=11 | FLAG=True
