In [3]:
!pip install sentence-transformers pandas tqdm pyarrow
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!pip install hf_xet


Collecting ipywidgets
  Using cached ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Using cached widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Using cached jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Using cached ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Using cached jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Using cached widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
Installing collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets

   ---------------------------------------- 0/3 [widgetsnbextension]
   -------------------------- ------------- 2/3 [ipywidgets]
   -------------------------- ------------- 2/3 [ipywidgets]
   -------------------------- ------------- 2/3 [ipywidgets]
   -------------------------- ------------- 2/3 [ipywidgets]
   -------------------------- ------------- 2/3 [ipywidgets]
   --

usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook run server troubleshoot trust

Jupyter command `jupyter-nbextension` not found.


Collecting hf_xet
  Downloading hf_xet-1.1.5-cp37-abi3-win_amd64.whl.metadata (883 bytes)
Downloading hf_xet-1.1.5-cp37-abi3-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ------- -------------------------------- 0.5/2.7 MB 4.6 MB/s eta 0:00:01
   ---------------------- ----------------- 1.6/2.7 MB 5.3 MB/s eta 0:00:01
   ---------------------------------------- 2.7/2.7 MB 6.0 MB/s eta 0:00:00
Installing collected packages: hf_xet
Successfully installed hf_xet-1.1.5


In [8]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# ── 0) Настройка многопоточности до любых импортов ──
import os, multiprocessing

num_cpus = multiprocessing.cpu_count()
os.environ["OMP_NUM_THREADS"]      = str(num_cpus)
os.environ["MKL_NUM_THREADS"]      = str(num_cpus)
os.environ["OPENBLAS_NUM_THREADS"] = str(num_cpus)
os.environ["NUMEXPR_NUM_THREADS"]  = str(num_cpus)

# ── 1) Импорты ──
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# После этого можно на всякий случай задать:
torch.set_num_threads(num_cpus)
# (НЕ вызываем set_num_interop_threads)

# ── 2) Параметры ──
INPUT_CSV    = "./Dataset/poi_dataset_russia_filtered_enriched.csv"
OUTPUT_CSV   = "poi_with_embeddings.csv"
EMBED_MODEL  = "sentence-transformers/LaBSE"
BATCH_SIZE   = 64
EMB_NPY_FILE = "Embeddings/poi_embeddings.npy"

# ── 3) Считаем датасет ──
df = pd.read_csv(INPUT_CSV, dtype=str)
df["text_description"] = df["text_description"].fillna("").astype(str)
print(f"Loaded {len(df)} POIs, sample IDs: {df['id'].tolist()[:3]}")

# ── 4) Загружаем модель эмбеддинга на CPU ──
device = "cpu"
print("Running embeddings on", device)
model = SentenceTransformer(EMBED_MODEL, device=device)

# ── 5) Вычисляем эмбеддинги батчами ──
all_texts = df["text_description"].tolist()
dim = model.get_sentence_embedding_dimension()
embeddings = np.empty((len(all_texts), dim), dtype=np.float32)

for start in tqdm(range(0, len(all_texts), BATCH_SIZE), desc="Batches"):
    batch = all_texts[start:start+BATCH_SIZE]
    emb = model.encode(
        batch,
        batch_size=BATCH_SIZE,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    embeddings[start:start+len(emb)] = emb

# ── 6) Сохраняем результаты ──
np.save(EMB_NPY_FILE, embeddings)
print(f"Embeddings saved to {EMB_NPY_FILE}")

pd.DataFrame({
    "id": df["id"],
    "emb_index": np.arange(len(df))
}).to_csv(OUTPUT_CSV, index=False)
print(f"ID→index mapping saved to {OUTPUT_CSV}")


Loaded 50463 POIs, sample IDs: ['617015815', '639038291', '695065756']
Running embeddings on cpu


Batches: 100%|██████████| 789/789 [33:52<00:00,  2.58s/it] 

Embeddings saved to poi_embeddings.npy
ID→index mapping saved to poi_with_embeddings.csv



