<a href="https://www.kaggle.com/code/rikuishiharaa/generate-country-speech-embeddings?scriptVersionId=249750375" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install pycountry
!pip install langdetect

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=47aa080ac6b4689feb9254986ea6efa96294ecf178fe73568e94d646cd7c0dcc
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfu

In [2]:
import json
import os
import re
import time

import numpy as np
from langdetect import detect
from openai import AzureOpenAI
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from tqdm import tqdm

# Setup OpenAI + BGE
# Azure OpenAI client shared by the GPT-4o segmenter and the
# text-embedding-3-large embedder below. The key is read from the
# AZURE_OPENAI_API_KEY environment variable so that a real secret never has to
# be pasted into the notebook; the original placeholder is kept as the
# fallback so behaviour is unchanged when the variable is not set.
client = AzureOpenAI(
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "your_api_key"),
    api_version="2024-11-01-preview",
    azure_endpoint="https://swedencentral.api.cognitive.microsoft.com"
)

# Local BGE sentence encoder. device='cuda' is requested explicitly, so this
# line needs a CUDA-capable GPU to be available.
stm = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cuda')

# Text cleaner
def clean_text(text: str, max_chars: int = 3000) -> str:
    """Normalise raw text before segmentation and embedding.

    Every character that is not a word character, whitespace or a square
    bracket is replaced by a space, the text is lower-cased, and all runs of
    whitespace (newlines included) are collapsed to single spaces.

    Args:
        text: Raw input text.
        max_chars: Maximum length of the returned string, in characters.

    Returns:
        The cleaned text, at most ``max_chars`` characters long, with no
        leading or trailing whitespace.
    """
    text = re.sub(r'[^\w\s\[\]]', ' ', text)
    text = ' '.join(text.lower().split())
    # The cut can land right after a separator space; strip it so the result
    # stays free of trailing whitespace even when truncated.
    return text[:max_chars].rstrip()

# GPT-4o segmenter
def _dedupe_segments(candidates: list[str], min_words: int = 5) -> list[str]:
    """Drop candidates shorter than ``min_words`` words and remove duplicates, keeping first-seen order."""
    return list(dict.fromkeys(s for s in candidates if len(s.split()) >= min_words))


def _chunk_fallback(text: str, num_seg: int) -> list[str]:
    """Split ``text`` into equal-sized word chunks (about ``num_seg`` of them) plus the full text."""
    words = text.split()
    # Clamp to 1: when there are fewer words than segments the integer
    # division yields 0, and range() rejects a zero step.
    chunk_size = max(1, len(words) // max(1, num_seg))
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    chunks.append(text)
    return _dedupe_segments(chunks)


def get_segments_with_retry(text: str, max_tokens=200, max_retries=3) -> list[str]:
    """Segment ``text`` into policy-themed pieces with GPT-4o, retrying on failure.

    The model is asked for ``len(words) // max_tokens + 3`` segments. On
    success the result combines the model's segments, the non-blank lines of
    ``text`` and ``text`` itself. If every attempt fails (or no attempt is
    allowed), the text is split into equal-sized word chunks instead.

    In both cases only candidates with more than four words are kept, and
    duplicates are removed while preserving first-seen order so the output is
    deterministic across runs.

    Args:
        text: Text to segment.
        max_tokens: Approximate number of words per requested segment.
        max_retries: Number of API attempts before falling back to chunking.
            Failed attempts are separated by an exponential back-off
            (1 s, 2 s, ...).

    Returns:
        A list of unique segments; empty when no candidate has more than
        four words.
    """
    paragraphs = [p for p in text.split('\n') if p.split()]
    num_seg = max(0, len(text.split()) // max_tokens) + 3

    system = f"""
    You will do semantic segmentation of the following text and output the result as a JSON string.
    Segment this diplomatic text into {num_seg} coherent policy segments.
    Each segment should focus on a single policy theme (e.g., economic policy, security, human rights).
    Return JSON in this format:
    {{"segments": ["segment1", "segment2", "segment3"]}}
    """

    last_error = None
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model='gpt-4o',
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": text}
                ],
                response_format={"type": "json_object"},
                timeout=30
            )
            segments = json.loads(response.choices[0].message.content)["segments"]
            segments.extend(paragraphs)
            segments.append(text)
            return _dedupe_segments(segments)
        except Exception as e:
            # Deliberately broad: API errors, malformed JSON and a missing
            # "segments" key are all handled by retrying, then chunking.
            last_error = e
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)

    # Reached when all attempts failed, or when max_retries <= 0.
    if last_error is not None:
        print(f"Segmentation failed, fallback to word chunks: {last_error}")
    return _chunk_fallback(text, num_seg)

# Batched OpenAI embeddings
def batch_openai_embeddings(texts: list[str], batch_size: int = 50):
    """Embed ``texts`` with text-embedding-3-large, one request per batch.

    Each batch is tried up to three times with exponential back-off
    (1 s, 2 s). A batch that still fails is filled with 3072-dimensional zero
    vectors so the output always has exactly one vector per input text.

    Args:
        texts: Texts to embed.
        batch_size: Number of texts sent per request; must be positive.

    Returns:
        A list with one 1-D array per input text, in input order. Successful
        vectors are L2-normalised; failed ones are all zeros.
    """
    if not texts:
        return []

    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="OpenAI embeddings"):
        batch = texts[i:i + batch_size]
        for attempt in range(3):
            try:
                response = client.embeddings.create(
                    model="text-embedding-3-large",
                    input=batch,
                    timeout=60
                )
                # Callers pair these vectors with other per-text data by
                # position, so a short or long response must count as a failure.
                if len(response.data) != len(batch):
                    raise ValueError(
                        f"expected {len(batch)} embeddings, got {len(response.data)}"
                    )
                # Normalise the whole batch in one call instead of per vector.
                emb = normalize([d.embedding for d in response.data])
                all_embeddings.extend(emb)
                break
            except Exception as e:
                if attempt == 2:
                    print(f"OpenAI embedding failed, fallback to zeros: {e}")
                    # One fresh array per text, so no two slots share memory.
                    all_embeddings.extend(np.zeros(3072) for _ in batch)
                else:
                    time.sleep(2 ** attempt)
        # Pause between requests only; nothing follows the final batch.
        if i + batch_size < len(texts):
            time.sleep(0.5)
    return all_embeddings

# Fusion embedding generator
def generate_embeddings_batch(texts: list[str]):
    """Build one fused embedding per text from the BGE and OpenAI models.

    Args:
        texts: Segments to embed.

    Returns:
        A list with one 1-D array per text: the OpenAI vector followed by the
        normalised BGE vector. An empty input yields an empty list without
        touching either model.
    """
    if not texts:
        return []

    print(f"Generating embeddings for {len(texts)} segments...")

    # Local BGE pass first, normalised row by row.
    local_vecs = normalize(
        stm.encode(texts, batch_size=32, show_progress_bar=True)
    )

    # Remote OpenAI pass second.
    remote_vecs = batch_openai_embeddings(texts, batch_size=50)

    fused = []
    for remote_vec, local_vec in zip(remote_vecs, local_vecs):
        fused.append(np.concatenate([remote_vec, local_vec]))
    return fused

# Main average embedding function
def get_av_embedding(speech: str):
    """Return a single unit-length embedding for a whole speech.

    The speech is cleaned, split into segments, each segment is embedded, and
    the segment embeddings are averaged with each segment's word count as its
    weight. The average is then scaled to unit length.

    Args:
        speech: Raw speech text.

    Returns:
        A 1-D array. It is a 4096-dimensional zero vector when the speech is
        detected as non-English or produces no usable segments.
    """
    try:
        if detect(speech) != 'en':
            print("Skipped non-English speech")
            return np.zeros(4096)
    except Exception as e:
        # Best effort: a detection failure must not drop the speech, so carry
        # on as if it were English, but say so instead of failing silently.
        # `Exception` (not a bare except) lets Ctrl-C still interrupt the run.
        print(f"Language detection failed, assuming English: {e}")

    # NOTE(review): clean_text collapses all whitespace, so the cleaned text
    # holds no newlines by the time the segmenter splits it into paragraphs.
    clean = clean_text(speech)
    segments = get_segments_with_retry(clean)

    if not segments:
        return np.zeros(4096)

    embeddings = generate_embeddings_batch(segments)
    weights = [len(s.split()) for s in segments]

    avg = np.average(np.array(embeddings), axis=0, weights=weights)
    norm = np.linalg.norm(avg)
    if norm == 0:
        # All segment embeddings were zero; return the zero vector rather
        # than dividing by zero and producing NaNs.
        return avg
    return avg / norm


2025-07-10 08:48:10.129949: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752137290.345750      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752137290.416617      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]