In [3]:
!pip install sentence-transformers pandas tqdm pyarrow
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!pip install hf_xet


Collecting ipywidgets
  Using cached ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Using cached widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Using cached jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Using cached ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Using cached jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Using cached widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
Installing collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets

   ---------------------------------------- 0/3 [widgetsnbextension]
   -------------------------- ------------- 2/3 [ipywidgets]
   -------------------------- ------------- 2/3 [ipywidgets]
   -------------------------- ------------- 2/3 [ipywidgets]
   -------------------------- ------------- 2/3 [ipywidgets]
   -------------------------- ------------- 2/3 [ipywidgets]
   --

usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook run server troubleshoot trust

Jupyter command `jupyter-nbextension` not found.


Collecting hf_xet
  Downloading hf_xet-1.1.5-cp37-abi3-win_amd64.whl.metadata (883 bytes)
Downloading hf_xet-1.1.5-cp37-abi3-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ------- -------------------------------- 0.5/2.7 MB 4.6 MB/s eta 0:00:01
   ---------------------- ----------------- 1.6/2.7 MB 5.3 MB/s eta 0:00:01
   ---------------------------------------- 2.7/2.7 MB 6.0 MB/s eta 0:00:00
Installing collected packages: hf_xet
Successfully installed hf_xet-1.1.5


In [2]:
import os, multiprocessing

# Number of logical CPUs reported by the OS; reused below for torch as well.
num_cpus = multiprocessing.cpu_count()

# Point every native thread pool at the same CPU count. This runs before
# numpy / torch are imported further down in this cell.
os.environ.update({
    thread_var: str(num_cpus)
    for thread_var in (
        "OMP_NUM_THREADS",
        "MKL_NUM_THREADS",
        "OPENBLAS_NUM_THREADS",
        "NUMEXPR_NUM_THREADS",
    )
})

import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# --- Configuration ----------------------------------------------------------
# Model and batching.
EMBED_MODEL     = "sentence-transformers/LaBSE"
BATCH_SIZE      = 64

# Input / output locations, relative to the working directory.
INPUT_CSV       = "./Dataset/poi_dataset_enriched_incremental.csv"
OUTPUT_NPY      = "./Embeddings/poi_embeddings.npy"
OUTPUT_MAP_CSV  = "./Dataset/poi_with_embeddings.csv"

# Give torch the same thread count that was exported to the environment above.
torch.set_num_threads(num_cpus)

# --- Load data --------------------------------------------------------------
# Every column is read as a string and missing cells become "", so the
# truthiness checks in make_doc can treat "empty" and "missing" alike.
df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

print(f"Loaded {len(df)} POIs")

def make_doc(row):
    """Assemble the text that will be embedded for one POI record.

    Every non-empty field contributes a "<label>: <value>" fragment (labels
    are in Russian), and the fragments are joined with " . ".

    Args:
        row: Mapping-like record (e.g. a DataFrame row) supporting ``row[key]``
            and ``row.get(key, default)``. The keys ``name``, ``city``,
            ``type``, ``tags`` and ``text_description`` are indexed directly
            and must be present; ``enriched_description`` is optional.

    Returns:
        The joined document string; empty if every field is falsy.
    """
    # (label prefix, column) pairs, in the order they appear in the document.
    required_fields = (
        ("Название: ", "name"),
        ("Город: ", "city"),
        ("Тип: ", "type"),
        ("Тэги: ", "tags"),
        ("Описание: ", "text_description"),
    )
    fragments = [
        f"{prefix}{row[column]}"
        for prefix, column in required_fields
        if row[column]
    ]

    # The enriched description may be absent from the record altogether.
    enriched = row.get("enriched_description", "")
    if enriched:
        fragments.append(f"Обогащённое описание: {enriched}")

    return " . ".join(fragments)

# --- Pre-flight checks --------------------------------------------------------
# Fail fast: the embedding loop below is the expensive part of this cell, so
# everything that could make the final save steps fail is checked up front.
if "id" not in df.columns:
    raise KeyError("Column 'id' is required to build the index→ID mapping")
for out_path in (OUTPUT_NPY, OUTPUT_MAP_CSV):
    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

# --- Build the documents to embed ---------------------------------------------
# One text document per row, stored in a helper column on the frame.
tqdm.pandas(desc="Building docs")
df["__to_embed__"] = df.progress_apply(make_doc, axis=1)

# --- Load the model -----------------------------------------------------------
device = "cpu"
print("Embedding on device:", device)
model = SentenceTransformer(EMBED_MODEL, device=device)

# --- Encode in batches --------------------------------------------------------
# Pre-allocate the full matrix so each batch is written straight into place;
# row i of `embeddings` corresponds to positional row i of `df`.
dim = model.get_sentence_embedding_dimension()
embeddings = np.zeros((len(df), dim), dtype=np.float32)

for start in tqdm(range(0, len(df), BATCH_SIZE), desc="Embedding batches"):
    end = min(start + BATCH_SIZE, len(df))
    texts = df["__to_embed__"].iloc[start:end].tolist()
    emb = model.encode(
        texts,
        batch_size=BATCH_SIZE,
        convert_to_numpy=True,
        normalize_embeddings=True,   # request normalized vectors from the model
        show_progress_bar=False      # the outer tqdm loop already reports progress
    )
    embeddings[start:end] = emb

# --- Persist results ----------------------------------------------------------
np.save(OUTPUT_NPY, embeddings)
print(f"Embeddings saved to {OUTPUT_NPY}")

# Mapping from POI id to its row position in the saved .npy matrix.
pd.DataFrame({
    "id": df["id"],
    "emb_index": np.arange(len(df))
}).to_csv(OUTPUT_MAP_CSV, index=False, encoding="utf-8-sig")
print(f"Index→ID mapping saved to {OUTPUT_MAP_CSV}")


Loaded 50464 POIs


Building docs:   0%|          | 0/50464 [00:00<?, ?it/s]

Embedding on device: cpu


Embedding batches:   0%|          | 0/789 [00:00<?, ?it/s]

Embeddings saved to ./Embeddings/poi_embeddings.npy
Index→ID mapping saved to ./Dataset/poi_with_embeddings.csv
