In [3]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\FernandaOrtega\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
%pip install spacy

import re
import nltk
import spacy
from docx import Document
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Download spaCy model if not present
import spacy.cli
spacy.cli.download("en_core_web_sm")

# Load NLP tools
nltk.download('punkt')
nltk.download('punkt_tab')
nlp = spacy.load("en_core_web_sm")

# Load SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2') 

# === STEP 1: Read DOCX files ===
def read_docx(file_path):
    doc = Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])

# === STEP 2: Preprocessing ===
# def preprocess(text):
#     text = text.lower()
#     text = re.sub(r'\n+', ' ', text)
#     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
#     return text.strip()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    tokens = [token.lemma_ for token in doc if token.is_alpha] 
    return ' '.join(tokens)

# === STEP 3: Extract segments ===
def extract_action_items(text):
    return re.findall(r"\[\s*\]\s*(.*)", text)

def extract_recommendations(text):
    return re.findall(r"\d+\.\d+\.\s+(.*?)(?=(?:\d+\.\d+\.|$))", text, re.DOTALL)

def extract_recommendation_numbers(text):
    return re.findall(r"(\d+\.\d+\.)\s+", text)

# === STEP 4: Load Documents ===
doc1_text = read_docx('./Summary action points- sustainability.docx')
doc2_text = read_docx('./T20 communique.docx')
#doc2_text = read_docx('./T20 communique.docx').replace('\n', ' ')

#doc1_text = read_docx('./Summary action points- sustainability.docx')


doc1_actions_raw = extract_action_items(doc1_text)
doc2_recs_raw = extract_recommendations(doc2_text)

# === STEP 5: Sentence-level Tokenization ===
action_sentences = []
action_index_map = []  # Maps sentence index to action point

for i, action in enumerate(doc1_actions_raw):
    for sent in sent_tokenize(action):
        if sent.strip():
            action_sentences.append(sent.strip())
            action_index_map.append(i)

rec_sentences = []
rec_index_map = []  # Maps sentence index to recommendation

for j, rec in enumerate(doc2_recs_raw):
    for sent in sent_tokenize(rec):
        if sent.strip():
            rec_sentences.append(sent.strip())
            rec_index_map.append(j)

# === STEP 6: Preprocess and Encode Sentences ===
action_sentences_clean = [preprocess(s) for s in action_sentences]
rec_sentences_clean = [preprocess(s) for s in rec_sentences]

action_embeddings = model.encode(action_sentences_clean, convert_to_numpy=True)
rec_embeddings = model.encode(rec_sentences_clean, convert_to_numpy=True)







[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\FernandaOrtega\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\FernandaOrtega\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
# === STEP 7: Compute Similarity Matrix ===
sim_matrix = cosine_similarity(action_embeddings, rec_embeddings)

# === STEP 8: Find Top Sentence Matches ===
results = []

for i, row in enumerate(sim_matrix):
    top_idx = np.argmax(row)
    score = row[top_idx]
    results.append({
        "Action Sentence": action_sentences[i],
        "From Action Point #": action_index_map[i] + 1,
        "Top Matching Recommendation Sentence": rec_sentences[top_idx],
        "From Recommendation #": rec_index_map[top_idx] + 1,
        "Similarity Score": round(score, 3)
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Sort by highest similarity
results_df = results_df.sort_values(by="Similarity Score", ascending=False)

# Display
print(results_df.head(10).to_string(index=False))


                                                                                                                                                                                    Action Sentence  From Action Point #                                                                                                                                                                                                                                                                                                                                                                                                         Top Matching Recommendation Sentence  From Recommendation #  Similarity Score
         Facilitate greater coordination and cooperation between multilateral institutions, regional blocs, and major economies like China and the US to advance the global decarbonization agenda.                   10 Empower the WTO to preserve and reform the multilateral trading system: The G20 should empower t

In [6]:
#import re
import nltk
#import spacy
from docx import Document
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# === Download NLTK tokenizer once ===
nltk.download('punkt')

# === Load models ===
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer('all-MiniLM-L6-v2')

# === Read .docx file content ===
def read_docx(file_path):
    doc = Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])

# === Preprocess a sentence or text ===
def preprocess(text):
     text = text.lower()
     text = re.sub(r'\n+', ' ', text)
     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
     doc = nlp(text)
     tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
     return ' '.join(tokens)

# def preprocess(text):
#     text = text.lower()
#     text = re.sub(r'\n+', ' ', text)
#     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
#     return text.strip()

# === Extract bullet points or recommendations ===
def extract_action_items(text):
    return re.findall(r"\[\s*\]\s*(.*)", text)

def extract_recommendations(text):
    return re.findall(r"\d+\.\d+\.\s+(.*?)(?=(?:\d+\.\d+\.|$))", text, re.DOTALL)

def extract_recommendation_numbers(text):
    return re.findall(r"(\d+\.\d+\.)\s+", text)

# === Load your documents ===
doc1_text = read_docx('./Summary action points- sustainability.docx')
doc2_text = read_docx('./T20 communique.docx')

doc1_actions_raw = extract_action_items(doc1_text)
doc2_recs_raw = extract_recommendations(doc2_text)

# === Tokenize action sentences ===
action_sentences = []
action_index_map = []

for i, action in enumerate(doc1_actions_raw):
    cleaned_action = action.replace('\n', ' ')  # Clean line breaks
    for sent in sent_tokenize(cleaned_action):
        sent_clean = sent.strip()
        if sent_clean:
            action_sentences.append(sent_clean)
            action_index_map.append(i)

# === Tokenize recommendation sentences ===
rec_sentences = []
rec_index_map = []

for j, rec in enumerate(doc2_recs_raw):
    cleaned_rec = rec.replace('\n', ' ')
    for sent in sent_tokenize(cleaned_rec):
        sent_clean = sent.strip()
        if sent_clean:
            rec_sentences.append(sent_clean)
            rec_index_map.append(j)

# === Preprocess sentences ===
action_sentences_clean = [preprocess(s) for s in action_sentences]
rec_sentences_clean = [preprocess(s) for s in rec_sentences]

# === Encode with SentenceTransformer ===
action_embeddings = model.encode(action_sentences_clean, convert_to_numpy=True)
rec_embeddings = model.encode(rec_sentences_clean, convert_to_numpy=True)

# === Compute similarity matrix ===
sim_matrix = cosine_similarity(action_embeddings, rec_embeddings)




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\FernandaOrtega\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# print action points
for i, action in enumerate(doc1_actions_raw):
    print(f"Action Point {i+1}: {action}")

Action Point 1: Explore opportunities for private sector investment and blended financing models to scale up the green hydrogen project in Namibia.
Action Point 2: Investigate potential for carbon credits or other impact-based financing mechanisms to support the biomass pellet project in Madagascar.
Action Point 3: Advocate for increased and more accessible funding for community-led loss and damage initiatives, and ensure community agency is central in the design of new global funding mechanisms.
Action Point 4: Understand the new global context and the role of the private sector as a political player, as well as the limits that nature will impose on the energy transition through resource constraints.
Action Point 5: Develop a positive narrative around the energy transition that focuses on success stories and opportunities, rather than just apocalyptic scenarios.
Action Point 6: Ensure that the energy transition is approached as a systemic change, rather than just the addition of renew

In [8]:
# print recomms points
for i, action in enumerate(doc2_recs_raw):
    print(f"Recomendation Points {i+1}: {action}")

Recomendation Points 1: Empower the WTO to preserve and reform the multilateral trading system: The G20 should empower the WTO by strengthening the WTO Secretariat and making additional financial resources available, the decision-making process should also be made more flexible, this will help advance and incorporate pro-development plurilateral agreements, such as the Investment Facilitation for Development (IFD) Agreement, into the WTO Framework. The dispute settlement system needs to be reformed to strengthen deliberative processes and preventive mechanisms, in the interim building on the Multi-Party Interim Appeal Arbitration Arrangement (MPIA). The Generalised System of Preferences (GSP) should be reviewed to provide predictable, long term market access for services and flexible rules of origin for goods for Least Developed Countries (LDCs). The G20 should build consensus and momentum to the 14th Ministerial Conference of the WTO (MC14) around the GSP reform. 
Recomendation Points

In [10]:
# === Define session mapping based on action point number ===
session_mapping = {
    1: "Scaling grassroots solutions for sustainable development",
    2: "Scaling grassroots solutions for sustainable development",
    3: "Scaling grassroots solutions for sustainable development",
    4: "Energy transition and decarbonization",
    5: "Energy transition and decarbonization",
    6: "Energy transition and decarbonization",
    7: "Energy transition and decarbonization",
    8: "Geopolitical challenges to industrial decarbonization",
    9: "Geopolitical challenges to industrial decarbonization",
    10: "Geopolitical challenges to industrial decarbonization",
    11: "Geopolitical challenges to industrial decarbonization",
    12: "Africa's energy transition",
    13: "Africa's energy transition",
    14: "Africa's energy transition",
    15: "Africa's energy transition",
    16: "Africa's energy transition"
}

# === List of shared topics to map to top 13 rows ===
shared_topics = [
    "Multilateral cooperation for global decarbonization and economic reform (e.g., WTO, trade, global governance)",
    "Social protection, community-led development, inclusive funding mechanisms",
    "Green transition finance, clean technology investment",
    "Technology sharing, green transition support",
    "Systemic approaches to energy, sustainable finance, affordability",
    "Role of private sector",
    "Unlock and mobilize capital",
    "Systemic financial reforms, closing SDG financing gaps, inclusive financial access.",
    "Equitable climate financing",
    "Community engagement, just transition policies",
    "International cooperation, risk sharing, financial mechanisms",
    "Development finance, affordable capital",
    "Integrated energy transition, bioeconomy, sustainability frameworks"
]

# === Find top match for each action sentence ===
results = []
for i, row in enumerate(sim_matrix):
    top_idx = np.argmax(row)
    score = row[top_idx]
    action_point_num = action_index_map[i] + 1
    results.append({
        "GSS 2025 Session": session_mapping.get(action_point_num, "Unknown"),
        "Action Point GSS": action_sentences[i],
        "From Action Point #": action_point_num,
        "Top Matching Recommendation (T20 Communiqué)": rec_sentences[top_idx],
        "From Recommendation #": rec_index_map[top_idx] + 1,
        "Similarity Score": round(score, 3)
    })

# === Create DataFrame and sort ===
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="Similarity Score", ascending=False)

# === Keep only top 13 and add shared topics ===
top_13_df = results_df.head(13).copy()
top_13_df["Shared Topics"] = shared_topics

# === Save outputs ===
top_13_df.to_excel("top_13_similarity_results_with_shared_topics.xlsx", index=False)
top_13_df.to_html("top_13_similarity_results_with_shared_topics.html", index=False)

print("✅ Saved Excel and HTML with top 13 and shared topics.")
print(top_13_df.to_string(index=False))


✅ Saved Excel and HTML with top 13 and shared topics.
                                        GSS 2025 Session                                                                                                                                                                                    Action Point GSS  From Action Point #                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           Top Matching Recommendation (T20 Communiqué)  From Recommendation #  Similarity Score                 

In [None]:
# # === Define session mapping based on action point number ===
# session_mapping = {
#    1: "Scaling grassroots solutions for sustainable development",
#  2: "Scaling grassroots solutions for sustainable development",
#    3: "Scaling grassroots solutions for sustainable development",
#    4: "Energy transition and decarbonization",
#    5: "Energy transition and decarbonization",
#     6: "Energy transition and decarbonization",
#     7: "Energy transition and decarbonization",
#     8: "Geopolitical challenges to industrial decarbonization",
#     9: "Geopolitical challenges to industrial decarbonization",
#     10: "Geopolitical challenges to industrial decarbonization",
#     11: "Geopolitical challenges to industrial decarbonization",
#     12: "Africa's energy transition",
#     13: "Africa's energy transition",
#     14: "Africa's energy transition",
#     15: "Africa's energy transition",
#     16: "Africa's energy transition"
# }

# #=== Find top match for each action sentence ===
# results = []
# for i, row in enumerate(sim_matrix):
#    top_idx = np.argmax(row)
#    score = row[top_idx]
#    action_point_num = action_index_map[i] + 1
#    results.append({
#        "Session": session_mapping.get(action_point_num, "Unknown"),
#        "Action Sentence": action_sentences[i],
#        "From Action Point #": action_point_num,
#        "Top Matching Recommendation Sentence": rec_sentences[top_idx],
#        "From Recommendation #": rec_index_map[top_idx] + 1,
#        "Similarity Score": round(score, 3)
#    })

# #=== Create DataFrame ===
# results_df = pd.DataFrame(results)

# #=== Sort by similarity score ===
# results_df = results_df.sort_values(by="Similarity Score", ascending=False)

# #=== Save as Excel with sessions ===
# results_df.to_excel("sentence_similarity_results_with_sessions.xlsx", index=False)

# #=== Save as HTML table ===
# results_df.to_html("sentence_similarity_results_with_sessions.html", index=False)

# print("✅ Saved Excel and HTML with session info.")
# print(results_df.head(5).to_string(index=False))


✅ Saved Excel and HTML with session info.
                                                 Session                                                                                                                                                                                     Action Sentence  From Action Point #                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Top Matching Recommendation Sentence  From Recommendation #  Similarity Score
   Geopolitical challenges t

In [11]:
%pip install spacy seaborn matplotlib
!python -m spacy download en_core_web_md






[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--
     --------------------------------------- 0.0/33.5 MB 330.3 kB/s eta 0:01:42
     ---------------------------------------- 0.3/33.5 MB 2.0 MB/s eta 0:00:17
     --- ------------------------------------ 2.8/33.5 MB 16.0 MB/s eta 0:00:02
     ----- ---------------------------------- 4.8/33.5 MB 23.7 MB/s eta 0:00:02
     --------- ------------------------------ 7.6/33.5 MB 30.4 MB/s eta 0:00:01
     ----------- ---------------------------- 9.4/33.5 MB 30.2 MB/s eta 0:00:01
     -------------- ------------------------ 12.1/33.5 MB 50.4 MB/s eta 0:00:01
     ---------------- ---------------------- 14.2/33.5 MB 50.1 MB/s eta 0:00:01
     ------------------ -----------------


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
%pip install spacy

import spacy
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# Download spaCy model if not present
import spacy.cli
spacy.cli.download("en_core_web_md")

# Load spaCy model with word vectors
nlp = spacy.load("en_core_web_md")



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [15]:
# Select first row (best match)
row = results_df.iloc[0]

action_text = row["Action Point GSS"]
rec_text = row["Top Matching Recommendation (T20 Communiqué)"]


In [19]:
%pip install termcolor

from termcolor import colored
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to lemmatize and filter
def lemmatized_tokens(text):
    doc = nlp(text.lower())
    return set(token.lemma_ for token in doc if token.is_alpha and not token.is_stop)

# Function to highlight shared words in green
def highlight_text(text, shared_lemmas):
    words = text.split()
    highlighted = []
    for word in words:
        lemma = nlp(word.lower())[0].lemma_
        if lemma in shared_lemmas:
            highlighted.append(colored(word, "green", attrs=["bold"]))
        else:
            highlighted.append(word)
    return " ".join(highlighted)

# Show highlights for top N matches
top_n = 10
for _, row in results_df.head(top_n).iterrows():
    action = row["Action Point GSS"]
    recommendation = row["Top Matching Recommendation (T20 Communiqué)"]
    session = row["GSS 2025 Session"]

    action_lemmas = lemmatized_tokens(action)
    rec_lemmas = lemmatized_tokens(recommendation)
    shared = action_lemmas & rec_lemmas

    print(f"\n=== GSS Session: {session} ===")
    print("Action Point GSS:")
    print(highlight_text(action, shared))
    print("\nTop Matching Recommendation (T20 Communiqué):")
    print(highlight_text(recommendation, shared))
    print("\nShared words:", ", ".join(sorted(shared)) if shared else "None")
    print("-" * 100)



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip




=== GSS Session: Geopolitical challenges to industrial decarbonization ===
Action Point GSS:
Facilitate greater coordination and cooperation between [1m[32mmultilateral[0m institutions, regional blocs, and major economies like China and the US to [1m[32madvance[0m the global decarbonization agenda.

Top Matching Recommendation (T20 Communiqué):
Empower the WTO to preserve and reform the [1m[32mmultilateral[0m trading system: The G20 should empower the WTO by strengthening the WTO Secretariat and making additional financial resources available, the decision-making process should also be made more flexible, this will help [1m[32madvance[0m and incorporate pro-development plurilateral agreements, such as the Investment Facilitation for Development (IFD) Agreement, into the WTO Framework.

Shared words: advance, multilateral
----------------------------------------------------------------------------------------------------

=== GSS Session: Scaling grassroots solutions for s

In [22]:
import spacy
from IPython.core.display import display, HTML

# Load spaCy
nlp = spacy.load("en_core_web_sm")

# Lemmatize and filter tokens
def lemmatized_tokens(text):
    doc = nlp(text.lower())
    return set(token.lemma_ for token in doc if token.is_alpha and not token.is_stop)

# Highlight shared words in HTML
def highlight_html(text, shared_lemmas):
    words = text.split()
    highlighted = []
    for word in words:
        lemma = nlp(word.lower())[0].lemma_
        if lemma in shared_lemmas:
            highlighted.append(f"<span style='color: green; font-weight: bold'>{word}</span>")
        else:
            highlighted.append(word)
    return " ".join(highlighted)

# Show top N results with inline HTML
top_n = 15
html_output = "<div style='font-family: Arial, sans-serif'>"

for _, row in results_df.head(top_n).iterrows():
    action = row["Action Point GSS"]
    recommendation = row["Top Matching Recommendation (T20 Communiqué)"]
    session = row["GSS 2025 Session"]
    
    action_lemmas = lemmatized_tokens(action)
    rec_lemmas = lemmatized_tokens(recommendation)
    shared = action_lemmas & rec_lemmas

    html_output += f"<h3 style='color: #004d99'>GSS 2025 Session: {session}</h3>"
    html_output += f"<p><strong>Action Point GSS:</strong><br>{highlight_html(action, shared)}</p>"
    html_output += f"<p><strong>Top Matching Recommendation (T20 Communiqué):</strong><br>{highlight_html(recommendation, shared)}</p>"
    if shared:
        html_output += f"<p><strong>Shared word:</strong> {', '.join(sorted(shared))}</p>"
    else:
        html_output += "<p><strong>Shared word:</strong> None</p>"
    html_output += "<hr style='margin: 20px 0'>"

html_output += "</div>"

# Display the HTML
display(HTML(html_output))


  from IPython.core.display import display, HTML


In [23]:
from pathlib import Path

# === Save to HTML file ===
output_path = Path("top_sentence_matches_highlighted.html")
output_path.write_text(html_output, encoding="utf-8")


14986