# Installing dependencies

In [1]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.0->node2vec)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy, node2vec
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.2
    Uninstalling scipy-1.15.2:
      Successfully uninstalled scipy-1.15.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. T

# Imports

In [2]:
import os
import random

#Data
import numpy as np
import pandas as pd
import re
import json
import math

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import networkx as nx
from tqdm import tqdm

# Models
from kaggle_secrets import UserSecretsClient
from openai import AzureOpenAI
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from node2vec import Node2Vec
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import umap

#Metrics
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

#Transformations
from sklearn.preprocessing import normalize
import networkx as nx
from itertools import combinations
from sklearn.manifold import TSNE
from scipy.spatial.distance import pdist, squareform
from itertools import combinations
from collections import Counter
import ast

2025-07-12 01:29:08.276616: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752283748.455699      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752283748.504684      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Policy Embeddings
In this notebook, I vectorized each nation's political stance by:
- cleaning text
- embedding country speeches with semantic segmentation and averaging
- graph-based knowledge embedding that utilizes country information such as political bloc and voting record
- combining vectors

# Cleaning Text

In [3]:
# Function for cleaning data

# Country names and common aliases that are masked out of a speech.
_COUNTRY_NAMES = [
    "Afghanistan", "Albania", "Algeria", "Andorra", "Angola",
    "Antigua", "Argentina", "Armenia", "Australia", "Austria",
    "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados",
    "Belarus", "Belgium", "Belize", "Benin", "Bhutan",
    "Bolivia", "Bosnia", "Botswana", "Brazil", "Brunei",
    "Bulgaria", "Burkina", "Burundi", "Cambodia", "Cameroon",
    "Canada", "Cape Verde", "Central African Republic", "Chad", "Chile",
    "China", "Colombia", "Comoros", "Congo", "Costa Rica",
    "Croatia", "Cuba", "Cyprus", "Czech Republic", "Denmark",
    "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt",
    "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini",
    "Ethiopia", "Fiji", "Finland", "France", "Gabon",
    "Gambia", "Georgia", "Germany", "Ghana", "Greece",
    "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana",
    "Haiti", "Honduras", "Hungary", "Iceland", "India",
    "Indonesia", "Iran", "Iraq", "Ireland", "Israel",
    "Italy", "Ivory Coast", "Jamaica", "Japan", "Jordan",
    "Kazakhstan", "Kenya", "Kiribati", "Korea", "Kosovo",
    "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon",
    "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania",
    "Luxembourg", "Madagascar", "Malawi", "Malaysia", "Maldives",
    "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius",
    "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia",
    "Montenegro", "Morocco", "Mozambique", "Myanmar", "Namibia",
    "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua",
    "Niger", "Nigeria", "North Korea", "North Macedonia", "Norway",
    "Oman", "Pakistan", "Palau", "Panama", "Papua New Guinea",
    "Paraguay", "Peru", "Philippines", "Poland", "Portugal",
    "Qatar", "Romania", "Russia", "Rwanda", "Saint Kitts",
    "Saint Lucia", "Saint Vincent", "Samoa", "San Marino", "Sao Tome",
    "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone",
    "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia",
    "South Africa", "South Korea", "South Sudan", "Spain", "Sri Lanka",
    "Sudan", "Suriname", "Sweden", "Switzerland", "Syria",
    "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste",
    "Togo", "Tonga", "Trinidad", "Tunisia", "Turkey",
    "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates",
    "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu",
    "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe",
    # Abbreviations and alternative names
    "USA", "UK", "UAE", "PRC", "DPRK",
    "ROK", "DRC", "U.S.", "U.K.", "America",
    "Britain", "England", "Scotland", "Wales", "Northern Ireland",
    "Hong Kong", "Macau", "Palestine", "Ivory Coast", "Czechia",
    "Macedonia", "Swaziland", "Burma", "East Timor", "Vatican",
]

# Compiled once instead of on every call.
# - Names are escaped so the dots in "U.S." / "U.K." match literally.
# - Longer names come first so "Guinea-Bissau" is tried before "Guinea".
# - Lookarounds replace `\b`, which can never match after a trailing "."
#   that is followed by whitespace (as in "U.S. ").
_COUNTRY_PATTERN = re.compile(
    r'(?<!\w)(?:the\s+)?(?:delegation\s+of\s+)?(?:representative\s+of\s+)?(?:'
    + '|'.join(
        re.escape(name)
        for name in sorted(set(_COUNTRY_NAMES), key=lambda name: (-len(name), name))
    )
    + r')(?!\w)',
    flags=re.IGNORECASE,
)


def clean_text(text: str, max_chars: int = 3000) -> str:
    """Clean text by removing country names while preserving stopwords.

    Country mentions (optionally preceded by "the", "delegation of" or
    "representative of") are replaced by the placeholder ``[COUNTRY]``.
    Punctuation is then replaced by spaces, the text is lower-cased and
    whitespace is collapsed, so the placeholder ends up as the plain word
    ``country`` in the result.

    Args:
        text: Raw speech text.
        max_chars: Maximum length of the returned string; the cleaned text
            is truncated to this many characters.

    Returns:
        The cleaned, lower-cased, single-spaced text.
    """
    print("Cleaning text")
    text = _COUNTRY_PATTERN.sub('[COUNTRY]', text)

    # Punctuation (including the placeholder's brackets) becomes whitespace.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = text.lower()

    # Collapse every run of whitespace, newlines included, to a single space.
    text = ' '.join(text.split())

    return text[:max_chars]

# Semantic Segmentation using GPT 4o

In [4]:
def get_segments(text: str, max_tokens = 200) -> list[str]:
    """Split a speech into policy-themed segments using GPT-4o.

    The module-level ``client`` is asked to segment ``text`` into
    ``len(text.split()) // max_tokens + 3`` segments and to reply as JSON.

    Args:
        text: The speech text to segment.
        max_tokens: Approximate number of whitespace-separated words per
            requested segment.

    Returns:
        A flat list of strings: the model's segments, then the non-empty
        lines of ``text`` (only when there is more than one), then the full
        ``text`` itself.
    """
    paragraphs = [p for p in text.split('\n') if p.split()]

    num_seg = len(text.split()) // max_tokens + 3
    system = f"""
    You will do semantic segmentation of the following text and output the result as a JSON string.
    Segment this diplomatic text into {num_seg} coherent policy segments.
    Each segment should focus on a single policy theme (e.g., economic policy, 
    security concerns, human rights, international cooperation).
    Preserve diplomatic context and policy coherence within each segment.
    
    Return the result as JSON with this exact format:
    {{"segments": ["segment1", "segment2", "segment3"]}}
    """
    print("Doing semantic segmentation...")
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": text}
        ],
        response_format={"type": "json_object"}
    )

    payload = json.loads(response.choices[0].message.content)
    # Keep only usable strings; a reply without "segments" falls back to the
    # paragraph and full-text entries added below instead of raising KeyError.
    segments = [s for s in payload.get("segments", []) if isinstance(s, str) and s.strip()]

    # extend, not append: each paragraph must be its own string entry so the
    # result stays a flat list[str].
    if len(paragraphs) > 1:
        segments.extend(paragraphs)
    segments.append(text)
    print(f"Generated {len(segments)} segments. ")
    return segments

In [5]:
# Read the API key from Kaggle's secret store rather than hard-coding it.
user_secrets = UserSecretsClient()

# NOTE(review): the secret is named "OPENAI_API_KEY" but is handed to an Azure
# OpenAI client — confirm it holds the key for the Azure resource below.
api = user_secrets.get_secret("OPENAI_API_KEY")
# Shared client used by get_segments (chat completions) and
# generate_embeddings (embeddings).
client = AzureOpenAI(
    api_key=api,
    api_version="2024-11-01-preview",
    azure_endpoint="https://swedencentral.api.cognitive.microsoft.com"
)

# Ensemble Embeddings using text-embedding-3-large & bge-large-en-v1.5

In [6]:
# Lazily-loaded BGE model, shared across calls so the weights are loaded once.
_BGE_MODEL = None


def generate_embeddings(texts: list[str]):
    """Embed each text with an ensemble of two embedding models.

    Every text is embedded with ``text-embedding-3-large`` (through the
    module-level ``client``) and with ``BAAI/bge-large-en-v1.5``. Each vector
    is L2-normalised on its own and the two are concatenated, OpenAI part
    first.

    Args:
        texts: Strings to embed.

    Returns:
        A list with one embedding (a list of floats) per input text, in
        input order. Presumably 3072 + 1024 = 4096 values each, matching the
        4096 used elsewhere in this notebook — TODO confirm.
    """
    global _BGE_MODEL
    print("Generating embeddings for each segmentation")
    if _BGE_MODEL is None:
        _BGE_MODEL = SentenceTransformer('BAAI/bge-large-en-v1.5')

    embeddings = []
    for segment in texts:
        bge_vector = _BGE_MODEL.encode(segment)
        response = client.embeddings.create(
            model="text-embedding-3-large",
            input=segment
        )
        # Normalise each model's vector separately so neither dominates the
        # concatenation through scale alone.
        openai_unit = normalize([response.data[0].embedding])[0]
        bge_unit = normalize([bge_vector])[0]
        embeddings.append(list(np.concatenate([openai_unit, bge_unit])))

    return embeddings

In [7]:
def get_organizations(country: str):
    """Return the names of the organizations that list ``country`` as a member.

    Reads the module-level ``organizations`` mapping (name -> members).
    NOTE(review): ``organizations`` is not defined in the visible notebook —
    confirm where it is loaded.
    """
    memberships = []
    for igo, members in organizations.items():
        if country in members:
            memberships.append(igo)
    return memberships

In [8]:
# df2 is read only to borrow its column labels for the transposed embeddings.
df2 = pd.read_csv("/kaggle/input/2-hour-speech-embedding-average-5200-speeches/my_file.csv")
df = pd.read_csv("/kaggle/input/better-speech-embeddings/country_embeddings_optimized.csv").transpose()
df.columns = df2.columns

# Drop the row labelled 'Unnamed: 0' and renumber the remaining rows.
df = df.drop(index='Unnamed: 0')
df.index = range(0, 4096)

# Remove the Holy See and shorten the long-form country names.
df = (
    df
    .drop(columns=['Holy See (Vatican City State)'])
    .rename(columns={'Viet Nam': 'Vietnam',
                     'Brunei Darussalam': 'Brunei',
                     'Iran, Islamic Republic of': 'Iran',
                     "Lao People's Democratic Republic": 'Laos',
                     'Türkiye': 'Turkey',
                     'Syrian Arab Republic': 'Syria',
                     'Russian Federation': 'Russia',
                     'Palestine, State of': 'Palestine',
                     'Korea, Republic of': 'South Korea',
                     "Korea, Democratic People's Republic of": 'North Korea',
                     'Bolivia, Plurinational State of': 'Bolivia',
                     'Moldova, Republic of': 'Moldova',
                     'Tanzania, United Republic of': 'Tanzania',
                     'Micronesia, Federated States of':'Micronesia',
                     'Venezuela, Bolivarian Republic of': 'Venezuela',
                     'Cabo Verde':'Cape Verde',
                     'Timor-Leste': 'East Timor',
                     'Congo, The Democratic Republic of the': 'Democratic Republic of the Congo',
                     'Czechia': 'Czech Republic',
                     'Congo': 'Republic of the Congo'})
)

In [9]:
# Palestine is excluded; every remaining column is one country.
df = df.drop(columns=['Palestine'])
countries = list(df.columns)

# One row per country, in the same order as `countries`.
speech_data = np.array([list(df[country]) for country in countries])

# Combining speech embeddings and knowledge based embeddings

In [None]:
# NOTE(review): the CSV path is empty, so this read cannot succeed as written —
# TODO fill in the path of the embedding-pair training file.
cl_speeches = pd.read_csv("")
# One dict per CSV row, keyed by column name.
dataset = cl_speeches.to_dict(orient='records')

class EmbeddingPairDataset(Dataset):
    """Dataset of (first embedding, second embedding, target score) triples.

    All three inputs are converted to ``float32`` tensors; item ``i`` is the
    ``i``-th entry of each, and the length is the number of scores.
    """

    def __init__(self, emb1, emb2, scores):
        self.emb1, self.emb2, self.scores = (
            torch.tensor(values, dtype=torch.float32)
            for values in (emb1, emb2, scores)
        )

    def __len__(self):
        return len(self.scores)

    def __getitem__(self, idx):
        return tuple(
            tensor[idx] for tensor in (self.emb1, self.emb2, self.scores)
        )

class ProjectionNet(nn.Module):
    """Two-layer MLP that projects embeddings onto the unit sphere.

    Args:
        input_dim: Size of the incoming embedding.
        proj_dim: Size of the hidden layer and of the output projection.
    """

    def __init__(self, input_dim, proj_dim=512):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(input_dim, proj_dim),
            nn.ReLU(),
            nn.Linear(proj_dim, proj_dim)
        )

    def forward(self, x):
        """Return the projection of ``x``, L2-normalised along the last axis."""
        return F.normalize(self.proj(x), dim=-1)

def cosine_similarity_loss(x,y,target_sim):
    """Mean squared error between the row-wise cosine similarity of x and y and target_sim."""
    return F.mse_loss(F.cosine_similarity(x, y), target_sim)

# NOTE(review): EmbeddingPairDataset takes three arguments (emb1, emb2, scores)
# but receives a single list of row dicts here, so this call raises TypeError.
# The CSV schema is not visible in this notebook — TODO split the records into
# the three inputs once the column names are known.
dataset = EmbeddingPairDataset(dataset)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# 4096 is the input width; presumably the concatenated embedding size
# produced by generate_embeddings — TODO confirm.
model = ProjectionNet(4096)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
for epoch in range(10):
    total_loss = 0
    for a,b,score in dataloader:
        # Both members of a pair go through the same network.
        proj_a = model(a)
        proj_b = model(b)

        loss = cosine_similarity_loss(proj_a, proj_b, score)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-sample mean even
        # when the last batch is smaller.
        total_loss += loss.item() * len(score)
    print(f"Epoch: {epoch + 1} | Loss: {total_loss/len(dataset):.2f}")

In [10]:
# Project every country's speech embedding with the trained network.
model.eval()

with torch.no_grad():
    speech_emb = torch.tensor(speech_data, dtype=torch.float32)
    fused_output = model(speech_emb)

# Drops the leading axis only when it has size one.
transformed_speech = fused_output.squeeze(0)

NameError: name 'transformer' is not defined

In [None]:
# transformed_speech holds one row per country (the model is applied to
# speech_data, which is built country by country). The frame below wants one
# COLUMN per country, which requires a transpose: reshape() would keep the
# flat element order and mix values from different countries in each column.
# reshape(len(countries), -1) only restores the 2-D layout in case the earlier
# squeeze(0) removed it.
combined_embedding = transformed_speech.detach().reshape(len(countries), -1).T

np_array = combined_embedding.cpu().numpy()
final_country_embeddings = pd.DataFrame(np_array, columns=countries)

In [None]:
def compare_countries(country1, country2):
    """Cosine similarity between two countries' columns of final_country_embeddings.

    Returns the 1x1 array produced by sklearn's ``cosine_similarity``.
    """
    first, second = (
        np.array(list(final_country_embeddings[name])).reshape(1, -1)
        for name in (country1, country2)
    )
    return cosine_similarity(first, second)

# Visualizing and testing

In [None]:
umap_model = umap.UMAP(n_components=5, random_state=42)
# final_country_embeddings has one COLUMN per country, but UMAP treats each
# ROW as a sample, so transpose to get one reduced point per country.
speech_umap = umap_model.fit_transform(final_country_embeddings.T.to_numpy())
# speech_data already has one row per country.
or_speech_umap = umap_model.fit_transform(speech_data)

from scipy.spatial.distance import pdist, squareform

def get_distance_matrix(umap_embeddings, countries, sample_size=5):
    """Pairwise Euclidean distances for a random sample of countries.

    ``sample_size`` positions are drawn without replacement using the global
    ``random`` state and put in ascending order; the matching rows of
    ``umap_embeddings`` are compared pairwise.

    Returns:
        A square DataFrame labelled on both axes with the sampled countries.
    """
    picked = random.sample(range(len(countries)), sample_size)
    picked.sort()
    labels = [countries[position] for position in picked]
    pairwise = squareform(pdist(umap_embeddings[picked], metric='euclidean'))
    return pd.DataFrame(pairwise, index=labels, columns=labels)

In [None]:
plt.figure(figsize=(10,8))

# get_distance_matrix draws its sample from the global `random` state.
# Replaying that state before the second call makes both matrices cover the
# same ten countries, so the heatmaps can be compared cell by cell.
sampling_state = random.getstate()
speech_umap_df = get_distance_matrix(speech_umap, countries, 10)
random.setstate(sampling_state)
or_speech_umap_df = get_distance_matrix(or_speech_umap, countries, 10)

sns.heatmap(speech_umap_df, annot=True, vmin=0, vmax=None, cmap="Blues_r")
plt.title('UMAP Distance Between Country Policy Embeddings')

In [None]:
# NOTE(review): graph_umap_df is not defined anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel — TODO restore the cell that
# builds it or remove this plot.
plt.figure(figsize=(10,8))
sns.heatmap(graph_umap_df, annot=True, vmin=0, vmax=None, cmap="Blues_r")
plt.title('UMAP Distance in Knowledge Injection')

In [None]:
# Distances between the sampled countries in the original speech-embedding space.
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(or_speech_umap_df, annot=True, vmin=0, vmax=None, cmap="Blues_r", ax=ax)
ax.set_title('UMAP Distance in Original Speech Embeddings')

In [None]:
def get_umap_map(umap):
    """Plot a world choropleth coloured by each country's UMAP vector length.

    For every position in the global ``countries`` list, the Euclidean norm
    of the matching row of ``umap`` is computed and shown on the map.
    """
    distances = [
        math.sqrt(sum(component**2 for component in umap[row]))
        for row in range(len(countries))
    ]

    world_map = pd.DataFrame({'country': countries, 'value': distances})
    fig = px.choropleth(
        world_map,
        locations="country",            # column holding the location labels
        locationmode="country names",   # labels are full country names
        color="value",                  # numeric column that drives the colour
        color_continuous_scale="Cividis",
        title="World Map by Euclidian Distance of graph",
    )

    fig.show()

get_umap_map(speech_umap)

In [None]:
# Sample every country: with a fixed size below len(countries) a random subset
# is left out, and looking one of those up in full_umap raises KeyError.
full_umap = get_distance_matrix(or_speech_umap, countries, len(countries))

In [None]:
# Original speech embeddings, one column per country, labelled like the
# projected embeddings.
original_country_embeddings = pd.DataFrame(
    speech_data.T, columns=final_country_embeddings.columns
)

In [None]:
# Kept as a notebook-level object; generate_country_similarity does not use it.
umap_projector = umap.UMAP(n_components=2, random_state=42)

def generate_country_similarity(speech: str, place: str, df):
    """Score a speech against every country's policy embedding and map the result.

    The speech is cleaned, segmented and embedded; the segment embeddings are
    averaged and projected with the module-level ``model`` — the same network
    that produced ``final_country_embeddings`` — so that both sides of the
    cosine similarity live in the same space. Scores are printed from most to
    least similar, with ``place`` highlighted, and shown on a choropleth.

    Args:
        speech: Raw speech text.
        place: Country the speech was delivered on behalf of; its line is
            highlighted in the printed ranking.
        df: Unused; retained so existing calls keep working.

    Returns:
        None. Output is printed and displayed.
    """
    # 1. Preprocess and embed the input speech.
    clean_speech = clean_text(speech)
    segments = get_segments(clean_speech)
    segment_embeddings = np.array(generate_embeddings(segments))
    av_vec = np.mean(segment_embeddings, axis=0)
    av_vec = np.expand_dims(av_vec, axis=0)  # add a batch axis for the model

    # 2. Project with the network used for the country embeddings.
    model.eval()
    with torch.no_grad():
        speech_emb = torch.tensor(av_vec).float()
        fused_speech_vec = model(speech_emb).squeeze(0).cpu().numpy()

    # 3. Compare against every country (one row per country).
    country_names = list(final_country_embeddings.columns)
    country_matrix = np.stack([final_country_embeddings[c] for c in country_names])

    sims = cosine_similarity([fused_speech_vec], country_matrix)[0]
    sorted_data = sorted(zip(sims, country_names), key=lambda pair: -pair[0])
    print("\n\nPolicy Alignment Scores (Cosine Similarity: closer to 1 = better alignment)\n")
    for sim, country in sorted_data:
        if country == place:
            print(f'\n\n-----{country}------\n\n')
            print(f"{sim:.6f} — {country}")
            if sim > 0.85:
                print("You did a great job in policy alignment!!!")
            print("\n\n")
        else:
            print(f"{sim:.6f} — {country}")

    # 4. World map of the similarity scores.
    sim_df = pd.DataFrame({
        'country': [country for _, country in sorted_data],
        'value': [sim for sim, _ in sorted_data],
    })
    fig = px.choropleth(sim_df,
                        locations="country",
                        locationmode="country names",
                        color="value",
                        color_continuous_scale="Viridis",
                        projection="natural earth",
                        title="World Map: Policy Alignment via Cosine Similarity")

    fig.show()

In [None]:
def get_country_stats(country: str, num: int):
    """Print a country's nearest neighbours and its socioeconomic statistics.

    Neighbours are the ``num`` smallest non-zero entries of the country's
    column in the module-level ``full_umap`` distance matrix. Statistics come
    from the module-level ``socioeconomic`` frame.
    NOTE(review): ``socioeconomic`` is not defined in the visible notebook —
    confirm where it is loaded.

    Args:
        country: Country name, as used in ``full_umap`` and in the
            ``countries`` column of ``socioeconomic``.
        num: Number of closest countries to list.
    """
    distances = full_umap[country].copy()
    # Distance 0 is the country itself, so it is excluded.
    closest_countries = distances[distances > 0].nsmallest(num)
    print(f"{num} closest countries to {country}: (UMAP Euclidian Distance)")
    print(', '.join(closest_countries.keys()))
    print("\n")
    print(f"Socioeconomic statistics of {country} as of 2023")
    country_data = (
        socioeconomic[socioeconomic['countries'] == country]
        .drop(columns=['countries'])
        .rename(columns={'Density\n(P/Km2)': 'Density (P/Km2)'})
    )
    if country_data.empty:
        # No matching row: report it instead of failing on iloc[0].
        print("   - no socioeconomic data available\n")
        return
    stats = country_data.iloc[0]
    print(''.join(f"   - {column}: {stats[column]}\n" for column in country_data.columns))

In [None]:
# Pick one test speech at random and run the full report for it.
MUN_speeches = pd.read_csv("/kaggle/input/mun-speech-dataset/MUN Speech Testing.csv")
index = random.randint(0, len(MUN_speeches)-1)

sample = MUN_speeches.loc[index]
sample_speech = sample['Speech']
sample_country = sample['Country']

print(f"Policy similarities of speech:\n\n{sample_speech} \n\nFrom delegate of nation: {sample_country}\n\n")
print(f"\nBasic information about {sample_country}:\n")
print(f"{sample_country} is part of the following organizations")
organization_lines = [
    f"   - {organization}\n" for organization in get_organizations(sample_country)
]
print("".join(organization_lines))

get_country_stats(sample_country, 3)

generate_country_similarity(sample_speech, sample_country, df)