In [None]:
import os
import json
import re
import time
from typing import Dict, Any, Optional
import pandas as pd
from tqdm import tqdm
import requests

In [None]:
OPENROUTER_API_KEY = "" 
OPENROUTER_MODEL = "openai/gpt-4o-mini"

OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

OPENROUTER_HEADERS = {
    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
    "Content-Type": "application/json"
}

In [20]:
with open("features.json", "r", encoding="utf-8") as f:
    FEATURES_SPEC = json.load(f)

In [None]:
def extract_feature_names(features_spec) -> list[str]:
    if isinstance(features_spec, dict):
        reserved = {"name", "schema", "features", "feature_names", "version"}
        names = [k for k in features_spec.keys() if k not in reserved]
        if names:
            return sorted(names)
    raise ValueError("Не смог извлечь имена фичей из features.json")

In [22]:
feature_names = extract_feature_names(FEATURES_SPEC)

In [None]:
FEATURE_NAMES = feature_names
FEATURE_KEYS_STR = ", ".join(FEATURE_NAMES)

In [None]:
import numpy as np
from typing import Dict, Any, List

FEATURES_SPEC_STR = json.dumps(FEATURES_SPEC, ensure_ascii=False, indent=2)

def build_messages(text: str) -> List[Dict[str, str]]:
    system = (
"Ты разметчик туристических направлений/мест отдыха."
"Нужно заполнить 15 фичей значениями от 0 до 1."

"Правила:"
"Используй только информацию из текста."
"- Если из текста ясно, что фича выражена слабо/отсутствует — ставь ближе к 0."
"Не старайся ставить 0.5 везде, будь объективен"
"- Верни строго JSON по схеме."
    )

    user = f"""FEATURES_JSON:
```json
{FEATURES_SPEC_STR}
```

TEXT:
""" + text

    return [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]

def _extract_json_object(s: str) -> str:
    if not s:
        raise ValueError("Empty model output")
    s = s.strip()
    if s.startswith("{") and s.endswith("}"):
        return s
    m = re.search(r"\{[\s\S]*\}", s)
    if not m:
        raise ValueError(f"Could not find JSON object in output: {s[:200]}")
    return m.group(0)

def _openrouter_chat(messages, temperature: float = 0.0, max_tokens: int = 512, timeout: int = 60, retries: int = 3):
    payload = {
        "model": OPENROUTER_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    last_err = None
    for attempt in range(retries):
        try:
            resp = requests.post(
                url=OPENROUTER_URL,
                headers=OPENROUTER_HEADERS,
                data=json.dumps(payload),
                timeout=timeout,
            )
            if resp.status_code >= 400:
                raise RuntimeError(f"HTTP {resp.status_code}: {resp.text[:500]}")
            return resp.json()
        except Exception as e:
            last_err = e
            time.sleep(2 ** attempt)
    raise last_err

def score_text_with_openrouter(text: str) -> Dict[str, float]:
    messages = build_messages(text)
    data = _openrouter_chat(messages, temperature=0.0, max_tokens=700)

    content = data["choices"][0]["message"]["content"]
    obj = json.loads(_extract_json_object(content))

    out: Dict[str, float] = {}
    for k in FEATURE_NAMES:
        v = obj.get(k, None)
        if v is None:
            out[k] = np.nan
            continue
        try:
            out[k] = float(v)
        except Exception:
            out[k] = np.nan
    return out

In [None]:
first_text = df.loc[df.index[0], "text"]
messages_example = build_messages(first_text)

example_payload = {
    "model": OPENROUTER_MODEL,
    "messages": messages_example,
}

print(json.dumps(example_payload, ensure_ascii=False, indent=2)[:8000])

{
  "model": "openai/gpt-4o-mini",
  "messages": [
    {
      "role": "system",
      "content": "Ты разметчик туристических направлений/мест отдыха.Нужно заполнить 15 фичей значениями от 0 до 1.Правила:Используй только информацию из текста.- Если из текста ясно, что фича выражена слабо/отсутствует — ставь ближе к 0.Не старайся ставить 0.5 везде, будь объективен- Верни строго JSON по схеме."
    },
    {
      "role": "user",
      "content": "FEATURES_JSON:\n```json\n{\n  \"service_quality\": \"Quality of service and interactions with staff, from 0 to 1.\",\n  \"cleanliness\": \"Cleanliness and hygiene of the place, from 0 to 1.\",\n  \"condition_state\": \"Physical condition and maintenance of the place, from 0 to 1.\",\n  \"comfort\": \"Overall comfort and convenience for visitors, from 0 to 1.\",\n  \"location_convenience\": \"Convenience of location and surroundings, from 0 to 1.\",\n  \"accessibility\": \"Ease of access for different visitors (stairs, ramps, entry), from 0 to 1.

In [None]:
df = pd.read_csv('filtred_data_less_320_len.csv')

assert 'text' in df.columns, "В датафрейме должна быть колонка 'text'"
df.head()

Unnamed: 0,address,name_ru,rating,rubrics,text,len_text,count_note
0,"Санкт-Петербург, Московский проспект, 183-185Ак4",Лик,5.0,Косметология;Салон красоты;Ногтевая студия;Эпи...,Посещаю клинику уже 4 месяца. Радует всё! Обсл...,312,5
1,"Москва, Шлюзовая набережная, 2А",Libro Beauty,5.0,"Ногтевая студия;Визажисты, стилисты;Салон бров...",Мой самый любимый салон вот уже около трёх лет...,266,8
2,"Москва, Шлюзовая набережная, 2А",Libro Beauty,5.0,"Ногтевая студия;Визажисты, стилисты;Салон бров...","Шикарный салон: прекрасное расположение, встре...",235,8
3,"Москва, Шлюзовая набережная, 2А",Libro Beauty,5.0,"Ногтевая студия;Визажисты, стилисты;Салон бров...","Периодически хожу сюда на педикюр, всегда отли...",215,8
4,"Москва, Шлюзовая набережная, 2А",Libro Beauty,5.0,"Ногтевая студия;Визажисты, стилисты;Салон бров...",Всем здравствуйте! Я постоянный клиент на прот...,293,8


In [28]:
df = df.iloc[6000:10000]

In [None]:
def enrich_and_save_batches(
    df: pd.DataFrame,
    text_col: str = "text",
    out_dir: str = "batches_out",
    batch_size: int = 50,
    base_name: str = "enriched",
    sleep_every: int = 10,
    sleep_seconds: float = 1.0,
    resume: bool = True,
) -> list[str]:
    
    os.makedirs(out_dir, exist_ok=True)

    total = len(df)
    saved_paths: list[str] = []

    df = df.reset_index(drop=False).rename(columns={"index": "_row_id"})

    n_batches = (total + batch_size - 1) // batch_size

    for b in range(n_batches):
        start = b * batch_size
        end = min(start + batch_size, total)

        out_path = os.path.join(out_dir, f"{base_name}_rows_{start:07d}_{end-1:07d}.csv")
        if resume and os.path.exists(out_path):
            saved_paths.append(out_path)
            continue

        batch_df = df.iloc[start:end].copy()

        rows = []
        for i, text in tqdm(enumerate(batch_df[text_col].tolist()), total=len(batch_df), desc=f"batch {b+1}/{n_batches}"):
            try:
                scores = score_text_with_openrouter(str(text))
                scores["_error"] = ""
            except Exception as e:
                scores = {k: 0.5 for k in FEATURE_NAMES}
                scores["_error"] = str(e)
            rows.append(scores)

            if sleep_every and ((i + 1) % sleep_every == 0):
                time.sleep(sleep_seconds)

        features_df = pd.DataFrame(rows)
        enriched_batch = pd.concat([batch_df.reset_index(drop=True), features_df.reset_index(drop=True)], axis=1)

        enriched_batch.to_csv(out_path, index=False)
        saved_paths.append(out_path)

    return saved_paths

In [None]:
BATCH_OUT_DIR = "batches_enriched"
BATCH_SIZE = 50

saved_files = enrich_and_save_batches(
    df,
    text_col="text",
    out_dir=BATCH_OUT_DIR,
    batch_size=BATCH_SIZE,
    base_name="dataset_enriched",
    resume=True,
)

print(f"Saved {len(saved_files)} batch files into: {BATCH_OUT_DIR}")
saved_files[:3]

batch 1/80: 100%|██████████| 50/50 [02:50<00:00,  3.41s/it]
batch 2/80: 100%|██████████| 50/50 [03:16<00:00,  3.93s/it]
batch 3/80: 100%|██████████| 50/50 [03:37<00:00,  4.34s/it]
batch 4/80: 100%|██████████| 50/50 [16:03<00:00, 19.26s/it]   
batch 5/80: 100%|██████████| 50/50 [02:49<00:00,  3.40s/it]
batch 6/80: 100%|██████████| 50/50 [02:57<00:00,  3.55s/it]
batch 7/80: 100%|██████████| 50/50 [02:39<00:00,  3.19s/it]
batch 8/80: 100%|██████████| 50/50 [02:47<00:00,  3.35s/it]
batch 9/80: 100%|██████████| 50/50 [02:52<00:00,  3.44s/it]
batch 10/80: 100%|██████████| 50/50 [03:49<00:00,  4.59s/it]
batch 11/80: 100%|██████████| 50/50 [02:47<00:00,  3.36s/it]
batch 12/80: 100%|██████████| 50/50 [02:37<00:00,  3.16s/it]
batch 13/80: 100%|██████████| 50/50 [02:37<00:00,  3.15s/it]
batch 14/80: 100%|██████████| 50/50 [02:49<00:00,  3.40s/it]
batch 15/80: 100%|██████████| 50/50 [02:47<00:00,  3.35s/it]
batch 16/80: 100%|██████████| 50/50 [03:57<00:00,  4.75s/it]
batch 17/80: 100%|██████████| 

Saved 80 batch files into: batches_enriched





['batches_enriched/dataset_enriched_rows_0000000_0000049.csv',
 'batches_enriched/dataset_enriched_rows_0000050_0000099.csv',
 'batches_enriched/dataset_enriched_rows_0000100_0000149.csv']

In [None]:
import glob

batch_paths = sorted(glob.glob(os.path.join(BATCH_OUT_DIR, "dataset_enriched_rows_*.csv")))
combined_df = pd.concat((pd.read_csv(p) for p in batch_paths), ignore_index=True)

In [32]:
pd.set_option('display.max_columns', None)
combined_df

Unnamed: 0,_row_id,address,name_ru,rating,rubrics,text,len_text,count_note,accessibility,atmosphere,cleanliness,comfort,condition_state,crowdedness,infrastructure_availability,location_convenience,navigation_ease,noise_level,recommend_intent,safety,service_quality,value_for_money,waiting_time,_error
0,6000,"Московская область, городской округ Истра, Дед...",Оптика 2.0,5.0,Салон оптики,Самая лучшая оптика в городе. Уже много лет бе...,297,5,0.6,0.9,0.8,0.9,1.0,0.4,0.7,0.7,0.6,0.3,1.0,0.8,1.0,1.0,0.5,
1,6001,"Московская область, городской округ Истра, Дед...",Оптика 2.0,5.0,Салон оптики,Отличная оптика. Уже дважды заказываем тут лин...,83,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,
2,6002,"Московская область, городской округ Истра, Дед...",Оптика 2.0,5.0,Салон оптики,В этой оптике много лет заказываю очки и всегд...,264,5,0.5,1.0,0.5,1.0,0.5,0.5,0.5,0.5,0.5,0.5,1.0,0.5,1.0,1.0,0.5,
3,6003,"Московская область, городской округ Истра, Дед...",Оптика 2.0,5.0,Салон оптики,"Заказывала прогрессивные линзы в свои очки , о...",265,5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,
4,6004,"Ростовская область, Таганрог, Большая Бульварн...",Гипер Лента,3.0,Продуктовый гипермаркет,"Объявили скидки в Ленте на конкретные товары, ...",216,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.2,0.1,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,9995,"Москва, проезд Одоевского, 13",Православная классическая гимназия Радонеж,5.0,Гимназия;Дополнительное образование;Детский са...,Отдали ребёнка в гимназию и пока все устраивае...,217,5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,
3996,9996,"Москва, проезд Одоевского, 13",Православная классическая гимназия Радонеж,5.0,Гимназия;Дополнительное образование;Детский са...,"Ходим в детский сад, мальчик 3, девочка 6. Дет...",262,5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,
3997,9997,"Москва, проезд Одоевского, 13",Православная классическая гимназия Радонеж,5.0,Гимназия;Дополнительное образование;Детский са...,Очень удобно добираться до гимназии. Учителя т...,180,5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,
3998,9998,"Москва, проезд Одоевского, 13",Православная классическая гимназия Радонеж,5.0,Гимназия;Дополнительное образование;Детский са...,"Потрясающее учебное заведение!! Рядом храм, хо...",200,5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,


In [None]:
FINAL_OUTPUT_CSV = os.path.join(BATCH_OUT_DIR, "dataset_enriched_all.csv")
combined_df.to_csv(FINAL_OUTPUT_CSV, index=False)
print(f"Saved combined CSV: {FINAL_OUTPUT_CSV} ({len(combined_df)} rows)")

Saved combined CSV: batches_enriched/dataset_enriched_all.csv (4000 rows)
