# Installing dependencies

In [1]:
!pip install node2vec
!pip install pycountry

Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.0->node2vec)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, node2vec
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.2
    Uninstalling scipy-1.15.2:
      Successfully uninstalled scipy-1.15.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the sou

In [2]:
speech = "Free market is an economic system based purely on supply and demand. This quintessential system however is used as a disguise to hide corruption and crony capilaism. It is used to hide a system where companies benefit not from free enterprise but from their relation with the government. Under this disguise of economic liberalization, governments privatize their companies unfairly with low prices, exclusive contracts, and regulatory capture. Corruption in privatization processes undermines economic stability, erodes public trust, and distorts the principles of free market economies. During privatization, government officials may be compelled to give unfair advantages to companies beneficial for themselves. By promoting stock holding disclosure and facilitating random audits, we can improve transparency within nations undergoing significant change. Promoting transparency in crucial in ensuring that privatization happens fairly. Transparency in privatization isn’t just a moral imperative; it is essential in attracting foreign investment and ensuring sustainable economic growth. A vote for this directive is a vote to uphold the values of free market during this critical period of economic transformation"

# Policy Embeddings
In this notebook, I vectorized each nation's political stance by:
- cleaning text
- embedding country speeches with semantic segmentation and averaging
- graph based knowledge embedding utilzied country information such as: political bloc, voting record
- combining vectors

# Cleaning Text

In [3]:
# Function for cleaning data
import re
def clean_text(text: str) -> str:
    """Clean text by removing country names while preserving stopwords"""

    country_names = [
        "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", 
        "Antigua", "Argentina", "Armenia", "Australia", "Austria",
        "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados",
        "Belarus", "Belgium", "Belize", "Benin", "Bhutan",
        "Bolivia", "Bosnia", "Botswana", "Brazil", "Brunei",
        "Bulgaria", "Burkina", "Burundi", "Cambodia", "Cameroon",
        "Canada", "Cape Verde", "Central African Republic", "Chad", "Chile",
        "China", "Colombia", "Comoros", "Congo", "Costa Rica",
        "Croatia", "Cuba", "Cyprus", "Czech Republic", "Denmark",
        "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt",
        "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini",
        "Ethiopia", "Fiji", "Finland", "France", "Gabon",
        "Gambia", "Georgia", "Germany", "Ghana", "Greece",
        "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana",
        "Haiti", "Honduras", "Hungary", "Iceland", "India",
        "Indonesia", "Iran", "Iraq", "Ireland", "Israel",
        "Italy", "Ivory Coast", "Jamaica", "Japan", "Jordan",
        "Kazakhstan", "Kenya", "Kiribati", "Korea", "Kosovo",
        "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon",
        "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania",
        "Luxembourg", "Madagascar", "Malawi", "Malaysia", "Maldives",
        "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius",
        "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia",
        "Montenegro", "Morocco", "Mozambique", "Myanmar", "Namibia",
        "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua",
        "Niger", "Nigeria", "North Korea", "North Macedonia", "Norway",
        "Oman", "Pakistan", "Palau", "Panama", "Papua New Guinea",
        "Paraguay", "Peru", "Philippines", "Poland", "Portugal",
        "Qatar", "Romania", "Russia", "Rwanda", "Saint Kitts",
        "Saint Lucia", "Saint Vincent", "Samoa", "San Marino", "Sao Tome",
        "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone",
        "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia",
        "South Africa", "South Korea", "South Sudan", "Spain", "Sri Lanka",
        "Sudan", "Suriname", "Sweden", "Switzerland", "Syria",
        "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste",
        "Togo", "Tonga", "Trinidad", "Tunisia", "Turkey",
        "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates",
        "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu",
        "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe"
    ]
    
    country_names += [
        "USA", "UK", "UAE", "PRC", "DPRK", 
        "ROK", "DRC", "U.S.", "U.K.", "America",
        "Britain", "England", "Scotland", "Wales", "Northern Ireland",
        "Hong Kong", "Macau", "Palestine", "Ivory Coast", "Czechia",
        "Macedonia", "Swaziland", "Burma", "East Timor", "Vatican"
    ]

    country_pattern = re.compile(r'\b(?:' + '|'.join(re.escape(name.lower()) for name in country_names) + r')\b')
    text = text.lower()
    text = country_pattern.sub('', text)

    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\b\d+\b', '', text)
    text = ' '.join(text.split()).strip()
    
    return text[:3000]

# Semantic Segmentation using GPT 4o

In [4]:
import json
def get_segments(text: str, max_tokens = 200) -> list[str]:
    paragraphs = [p for p in text.split('\n') if p.split()]

    num_seg = max(0, len(text.split())//max_tokens) + 3
    print(f"Num segments: {num_seg}")
    system = f"""
    You will do semantic segmentation of the following text and output in a json string list object.
    The data format should be segments: [jfidfij, jfodfjo, jfodo]
    Split the text into {num_seg} segments.
    """
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": speech}
        ],
        response_format={"type": "json_object"}
    )
    
    segments = json.loads(response.choices[0].message.content)["segments"]
    if not len(paragraphs) == 1:
        segments.append(paragraphs)
    segments.append(text)
    print(f"Generated {len(segments)} segments. ")
    return segments

In [5]:
import pandas as pd
from kaggle_secrets import UserSecretsClient
from openai import AzureOpenAI
import os
from openai import OpenAI

user_secrets = UserSecretsClient()

pair_data = pd.read_csv("/kaggle/input/inputdata-dataset-10k/pair_data.csv")

api = user_secrets.get_secret("OPENAI_API_KEY")
client = AzureOpenAI(
    api_key=api,
    api_version="2024-11-01-preview",
    azure_endpoint="https://swedencentral.api.cognitive.microsoft.com"
)

# Ensemble Embeddings using text-embedding-3-large & bge-large-en-v1.5

In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import numpy as np

def generate_embeddings(texts: list[str]):
    stm = SentenceTransformer('BAAI/bge-large-en-v1.5')
    embeddings = []
    for i in texts:
        bge = stm.encode(i)
        response = client.embeddings.create(
            model="text-embedding-3-large",
            input=i
        )
        emb1 = normalize([response.data[0].embedding])[0]
        emb2 = normalize([bge])[0]
        embedding = np.concatenate([emb1, emb2])
        embeddings.append(embedding)
    
    fin_emb = []
    
    for i in range(len(embeddings)):
        fin_emb.append(list(embeddings[i]))
    return fin_emb

texts = get_segments(speech)
embeddings = generate_embeddings(texts)
print(len(embeddings))

2025-05-28 09:15:00.765726: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748423701.103996      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748423701.204446      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Num segments: 3
Generated 4 segments. 


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

4


In [7]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

or_encode = np.array(generate_embeddings([speech])[0])
vec = np.array(embeddings)
av_vec = np.mean(vec, axis=0)
topic_consistency = cosine_similarity(av_vec.reshape(1, -1), or_encode.reshape(1, -1))[0][0]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

# Knowledge injecting using node graphs

In [8]:
import networkx as nx
from itertools import combinations
from node2vec import Node2Vec

import pandas as pd

IGO = pd.read_csv("/kaggle/input/political-igos/Major Political IGOs Expanded.csv")
IGO["Member Countries"] = IGO["Member Countries"].apply(lambda x: str(x.split("; ")))

organizations = {IGO.iloc[i]['IGO Name'] : IGO.iloc[i]['Member Countries'] for i in range(len(IGO))}

In [9]:
import ast

# Convert stringified lists into actual lists
for k in organizations:
    if isinstance(organizations[k], str):
        organizations[k] = ast.literal_eval(organizations[k])

In [10]:
edges = []
for members in organizations.values():
    edges.extend(combinations(members, 2))

G = nx.Graph()
G.add_edges_from(edges)

node2vec = Node2Vec(G, dimensions=4096, walk_length=30, num_walks=200, workers=2)

model = node2vec.fit(window=10, min_count=1, batch_words=4)

Computing transition probabilities:   0%|          | 0/200 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 100/100 [00:17<00:00,  5.66it/s]
Generating walks (CPU: 2): 100%|██████████| 100/100 [00:19<00:00,  5.09it/s]


In [11]:
import pandas as pd
import networkx as nx
from itertools import combinations
import seaborn as sns
import matplotlib.pyplot as plt
import random

def plot_country_IGO(num: int):
    country_igos = {}
    for _, row in IGO.iterrows():
        for country in row["Member Countries"]:
            if country not in country_igos:
                country_igos[country] = []
            country_igos[country].append(row["IGO Name"])
    
    shared_igos = {}
    countries = list(country_igos.keys())
    for c1, c2 in combinations(countries, 2):
        common_igos = set(country_igos[c1]) & set(country_igos[c2])
        shared_igos[(c1, c2)] = len(common_igos)
        shared_igos[(c2, c1)] = len(common_igos)
    
    G_weighted = nx.Graph()
    for (c1, c2), weight in shared_igos.items():
        if weight > 0:
            G_weighted.add_edge(c1, c2, weight=weight)
    
    selected_countries = random.sample(countries, num)
    
    adj_matrix = nx.to_numpy_array(G_weighted, nodelist=selected_countries, weight="weight")
    adj_df = pd.DataFrame(adj_matrix, index=selected_countries, columns=selected_countries)
    plt.figure(figsize=(15, 12))
    sns.heatmap(
        adj_df,
        cmap="YlOrRd",
        square=True,
        linewidths=0.3,
        annot=True,
        cbar_kws={"label": "Number of Shared IGOs"},
    )
    
    plt.title("Number of Shared IGO Memberships (30 Random Countries)", fontsize=16)
    plt.xlabel("Country", fontsize=12)
    plt.ylabel("Country", fontsize=12)
    plt.xticks(rotation=90, fontsize=9)
    plt.yticks(rotation=0, fontsize=9)
    plt.tight_layout()
    plt.show()

def get_organizations(country: str):
    return [igo for igo, members in organizations.items() if country in members]

# Generating embeddings for every country speech

In [12]:
def get_av_embedding(speech: str):
    clean_speech = clean_text(speech)
    segments = get_segments(clean_speech)
    embeddings = generate_embeddings(segments)
    vec = np.array(embeddings)
    return np.mean(vec, axis=0)

In [13]:
import pycountry
data = pd.read_csv("/kaggle/input/un-general-debates/un-general-debates.csv")
data = data.drop(columns=['session', 'year'])

code_to_country = {country.alpha_3: country.name for country in pycountry.countries}

data['country'] = data['country'].map(code_to_country)
data.head()

Unnamed: 0,country,text
0,Maldives,﻿It is indeed a pleasure for me and the member...
1,Finland,"﻿\nMay I begin by congratulating you. Sir, on ..."
2,Niger,"﻿\nMr. President, it is a particular pleasure ..."
3,Uruguay,﻿\nDuring the debate at the fortieth session o...
4,Zimbabwe,﻿I should like at the outset to express my del...


In [14]:
# import numpy as np
# import pandas as pd
# import openai
# from collections import defaultdict

# grouped = (
#     data.groupby("country")
#     .apply(lambda x: x["text"].tail(5).tolist())
# )

# country_to_embedding = {}

# for country, speeches in grouped.items():
#     all_embeddings = [get_av_embedding(speech) for speech in speeches]
#     avg_embedding = np.mean(all_embeddings, axis=0)
#     country_to_embedding[country] = avg_embedding.tolist()  # Save as list for serialization

In [15]:
# df = pd.DataFrame(country_to_embedding)
# df.to_csv("/kaggle/working/my_file.csv", index=False)

In [16]:
df = pd.read_csv("/kaggle/input/2-hour-speech-embedding-average-5200-speeches/my_file.csv")
df

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,"Venezuela, Bolivarian Republic of",Viet Nam,Yemen,Zambia,Zimbabwe
0,-0.002830,-0.002275,-0.003853,-0.003506,-0.001115,-0.002218,-0.005365,-0.002687,0.002012,-0.000911,...,0.003586,0.000595,-0.002140,0.000379,-0.000506,-0.002072,-0.000247,0.004131,0.001895,0.001542
1,-0.014219,-0.012174,-0.010564,-0.012624,-0.010476,-0.010223,-0.010036,-0.011576,-0.012566,-0.014199,...,-0.012159,-0.016181,-0.009929,-0.011209,-0.012978,-0.011788,-0.014769,-0.008517,-0.008151,-0.012103
2,-0.011768,-0.011083,-0.012586,-0.012999,-0.012179,-0.012775,-0.012056,-0.013188,-0.013387,-0.013796,...,-0.012903,-0.013879,-0.012449,-0.012461,-0.012698,-0.013479,-0.012942,-0.012585,-0.012793,-0.013329
3,-0.008219,-0.012000,-0.006651,-0.008555,-0.010844,-0.007830,-0.010371,-0.010527,-0.009601,-0.006244,...,-0.005243,-0.003971,-0.005838,-0.012097,-0.010295,-0.004901,-0.011390,-0.010391,-0.007997,-0.009904
4,-0.016086,-0.017030,-0.017446,-0.016143,-0.017607,-0.018307,-0.018296,-0.016283,-0.019728,-0.017499,...,-0.018312,-0.019540,-0.018404,-0.018047,-0.019902,-0.018091,-0.016949,-0.017802,-0.021315,-0.017591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4091,0.002061,0.005066,0.004410,0.002844,0.003889,0.004803,0.004737,-0.000585,-0.001856,-0.000996,...,-0.001049,-0.003648,0.000216,-0.002262,-0.000760,-0.001456,0.000835,-0.006713,0.001614,0.000826
4092,0.003688,0.000004,0.001837,-0.003209,0.000724,0.002078,-0.002925,0.000019,0.000974,0.000729,...,0.004469,0.003364,0.002732,-0.000927,0.001899,0.001985,0.000674,0.001287,0.000061,0.001017
4093,-0.009502,-0.007418,-0.008675,-0.007911,-0.010109,-0.008621,-0.007242,-0.008291,-0.009096,-0.008522,...,-0.008411,-0.009892,-0.009069,-0.008038,-0.010061,-0.007587,-0.009596,-0.009638,-0.008202,-0.012697
4094,-0.010405,-0.013699,-0.011363,-0.015361,-0.010800,-0.016704,-0.016813,-0.011484,-0.007736,-0.006945,...,-0.005053,-0.006501,-0.011507,-0.009072,-0.013089,-0.006775,-0.010140,-0.004523,-0.007741,-0.006711


In [17]:
df = df.drop(columns=['Holy See (Vatican City State)', 'Congo', 'Czechia', 'Congo, The Democratic Republic of the'])
df = df.rename(columns={'Viet Nam': 'Vietnam', 'Brunei Darussalam': 'Brunei','Iran, Islamic Republic of': 'Iran',"Lao People's Democratic Republic": 'Laos','Türkiye': 'Turkey','Syrian Arab Republic': 'Syria','Russian Federation': 'Russia', 'Palestine, State of': 'Palestine', 'Korea, Republic of': 'South Korea', "Korea, Democratic People's Republic of": 'North Korea', 'Bolivia, Plurinational State of': 'Bolivia', 'Moldova, Republic of': 'Moldova', 'Tanzania, United Republic of': 'Tanzania', 'Micronesia, Federated States of':'Micronesia', 'Venezuela, Bolivarian Republic of': 'Venezuela'})
for i in df.columns:
    if not i in list(model.wv.key_to_index.keys()):
        print(i)

In [18]:
#df = df.drop(columns=['Bolivia, Plurinational State of', 'Brunei Darussalam'])
countries = list(df.columns)
graph_data = []
speech_data = []
for country in countries:
    graph_data.append(list(model.wv[country]))
    speech_data.append(list(df[country]))

In [19]:
graph_data = np.array(graph_data)
speech_data = np.array(speech_data)
print(graph_data.shape, speech_data.shape)

(190, 4096) (190, 4096)


# Combining speech embeddings and knowledge based embeddings

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

class ProjectionMLP(nn.Module):
    def __init__(self, input_dim, output_dim=512):
        super().__init__()
        self.projection = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, output_dim)
        )

    def forward(self, x):
        return self.projection(x)

# Replace with your real embeddings
speech_embeddings = graph_data
graph_embeddings = speech_data      # shape (N, d_graph)

# Convert to torch tensors
speech_tensor = torch.tensor(speech_embeddings, dtype=torch.float32)
graph_tensor = torch.tensor(graph_embeddings, dtype=torch.float32)

dataset = TensorDataset(speech_tensor, graph_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

d_speech = speech_tensor.shape[1]
d_graph = graph_tensor.shape[1]

speech_proj = ProjectionMLP(d_speech)
graph_proj = ProjectionMLP(d_graph)

optimizer = torch.optim.Adam(list(speech_proj.parameters()) + list(graph_proj.parameters()), lr=1e-3)

def contrastive_loss(z1, z2, temperature=0.07):
    """
    Computes contrastive loss between two batches of embeddings (z1 and z2).
    Uses cosine similarity and InfoNCE loss.
    """
    batch_size = z1.size(0)

    # Normalize embeddings
    z1 = F.normalize(z1, dim=1)
    z2 = F.normalize(z2, dim=1)

    # Cosine similarity matrix
    sim_matrix = torch.matmul(z1, z2.T)  # Shape: (B, B)
    positives = torch.diag(sim_matrix)  # Positive pairs are diagonal

    # Compute cross entropy loss over rows
    labels = torch.arange(batch_size).to(z1.device)
    logits = sim_matrix / temperature
    loss = F.cross_entropy(logits, labels)

    # Optionally, symmetrize: also compare z2 as query
    logits_T = sim_matrix.T / temperature
    loss_T = F.cross_entropy(logits_T, labels)

    return (loss + loss_T) / 2


In [21]:
num_epochs = 50

for epoch in range(num_epochs):
    total_loss = 0
    for speech_batch, graph_batch in dataloader:
        optimizer.zero_grad()

        z_speech = speech_proj(speech_batch)
        z_graph = graph_proj(graph_batch)

        loss = contrastive_loss(z_speech, z_graph)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}")

Epoch 1: Loss = 21.0652
Epoch 11: Loss = 20.7285
Epoch 21: Loss = 20.5865
Epoch 31: Loss = 15.3172
Epoch 41: Loss = 10.7656


In [22]:
with torch.no_grad():
    speech_aligned = F.normalize(speech_proj(speech_tensor), dim=1)
    graph_aligned = F.normalize(graph_proj(graph_tensor), dim=1)

    # Optional: concatenate
    combined_embedding = 0.5 * speech_aligned + 0.5 * graph_aligned
    combined_embedding = torch.reshape(combined_embedding, (512, 190))

In [23]:
np_array = combined_embedding.detach().cpu().numpy()

final_country_embeddings = pd.DataFrame(np_array, columns=countries)

final_country_embeddings

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
0,-0.040190,-0.041954,-0.026253,-0.001382,0.035932,0.066165,0.054476,-0.015772,-0.027608,-0.023091,...,-0.049789,0.012225,-0.080704,-0.038738,0.021187,-0.035446,0.037569,-0.013480,0.050976,0.042726
1,-0.093314,-0.053561,-0.051042,-0.030337,-0.003397,-0.031506,-0.012724,-0.054902,0.033388,-0.008683,...,-0.044816,-0.030310,-0.039371,-0.020409,-0.014896,0.024907,-0.025500,0.014727,0.019582,0.003535
2,0.019660,-0.027227,0.106540,0.011336,0.023270,-0.023359,-0.064010,-0.019561,0.069561,0.084183,...,-0.060187,0.066767,-0.009031,0.000706,-0.015617,-0.043118,0.058867,-0.000074,-0.091085,-0.002227
3,0.037755,0.054669,-0.040699,-0.034803,0.003956,0.025884,0.040516,0.016047,-0.071400,-0.056527,...,-0.007433,-0.019171,-0.010437,0.015930,0.046748,0.029286,0.000495,0.043045,0.024438,0.006910
4,-0.023395,-0.067253,-0.005731,0.051525,-0.014606,-0.010224,0.039578,0.044133,0.049353,0.021759,...,0.017340,0.017067,0.051459,0.021212,-0.057324,-0.025367,0.018494,0.000557,0.036934,0.049645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,-0.024200,0.001679,0.026880,0.035120,-0.045897,0.047188,0.030400,-0.034309,-0.016019,0.059739,...,-0.087829,-0.090338,-0.054923,-0.030951,0.008220,-0.009054,-0.056052,-0.005679,0.038461,0.028548
508,0.033184,0.052944,-0.002723,-0.046482,-0.013014,-0.085046,0.013327,-0.016293,0.058550,-0.080150,...,0.019964,-0.061502,-0.042423,-0.044873,0.037170,-0.031870,-0.020138,0.065920,0.032910,0.055377
509,0.079285,0.055052,-0.006818,0.023555,0.039853,0.007986,0.030640,0.018927,-0.053704,-0.066423,...,-0.007978,0.041943,0.065115,-0.043175,0.043630,0.005344,0.051998,0.006343,-0.060261,0.043630
510,-0.036528,0.086396,0.047877,-0.000821,-0.012735,-0.088383,0.077190,0.019445,0.059376,0.039279,...,0.009087,-0.054003,-0.024168,0.016607,-0.003064,-0.060236,-0.022284,0.011604,-0.064328,0.055622


In [24]:
av_vec = torch.tensor(av_vec, dtype=torch.float)
av_vec = F.normalize(speech_proj(torch.tensor(av_vec)), dim=0)

  av_vec = F.normalize(speech_proj(torch.tensor(av_vec)), dim=0)


In [25]:
cosine_similarity(
    av_vec.detach().numpy().reshape(1, -1),
    np.array(list(final_country_embeddings['Estonia'])).reshape(1, -1)
)

array([[0.06839574]])