In [1]:
import os
import json
from openai import OpenAI

In [None]:
# Read the API key from the environment instead of hardcoding it in the notebook,
# so the secret never ends up in the saved file. Falls back to the previous
# empty-string value when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

In [3]:
# Shared OpenAI client, authenticated with OPENAI_API_KEY; label_article sends its requests through it.
client = OpenAI(api_key=OPENAI_API_KEY)

In [5]:
# Load the feature specification (parsed JSON) that drives both the prompt
# text and the output schema.
with open("features.json", encoding="utf-8") as spec_file:
    FEATURES_SPEC = json.load(spec_file)


In [10]:
def extract_feature_names(features_spec) -> list[str]:
    """Return the feature names found in a parsed features spec, sorted.

    The spec is expected to be a dict mapping feature name -> description.
    Keys that look like service metadata ("name", "schema", ...) are skipped.

    Raises:
        ValueError: if the spec is not a dict or contains no feature keys.
    """
    if not isinstance(features_spec, dict):
        raise ValueError("Не смог извлечь имена фичей из features.json")

    # Guard against service keys, in case they ever show up in the file.
    reserved = {"name", "schema", "features", "feature_names", "version"}

    # Sorted so that the feature order is stable between runs.
    names = sorted(key for key in features_spec if key not in reserved)
    if not names:
        raise ValueError("Не смог извлечь имена фичей из features.json")
    return names


In [11]:
# Sorted feature names taken from FEATURES_SPEC; OUTPUT_SCHEMA is built from this list.
feature_names = extract_feature_names(FEATURES_SPEC)


In [48]:
# JSON Schema for the model's structured reply: a "features" object with one
# number in [0, 1] per feature name, plus "confidence" and "rationale".
OUTPUT_SCHEMA = {
    "name": "PlaceLabel",
    "strict": True,
    "schema": {
        "type": "object",
        "additionalProperties": False,
        "properties": {
            "features": {
                "type": "object",
                "additionalProperties": False,
                "properties": {
                    feature: dict(type="number", minimum=0, maximum=1)
                    for feature in feature_names
                },
                # IMPORTANT for strict: true — "required" must list ALL keys of "properties".
                "required": feature_names,
            },
            "confidence": dict(type="number", minimum=0, maximum=1),
            "rationale": dict(type="string"),
        },
        "required": ["features", "confidence", "rationale"],
    },
}

In [49]:
# Display the assembled schema for a visual check.
OUTPUT_SCHEMA

{'name': 'PlaceLabel',
 'strict': True,
 'schema': {'type': 'object',
  'additionalProperties': False,
  'properties': {'features': {'type': 'object',
    'additionalProperties': False,
    'properties': {'accommodation_quality': {'type': 'number',
      'minimum': 0,
      'maximum': 1},
     'adventure_level': {'type': 'number', 'minimum': 0, 'maximum': 1},
     'beach_quality': {'type': 'number', 'minimum': 0, 'maximum': 1},
     'cost_level': {'type': 'number', 'minimum': 0, 'maximum': 1},
     'cultural_richness': {'type': 'number', 'minimum': 0, 'maximum': 1},
     'family_friendliness': {'type': 'number', 'minimum': 0, 'maximum': 1},
     'food_variety': {'type': 'number', 'minimum': 0, 'maximum': 1},
     'historical_significance': {'type': 'number', 'minimum': 0, 'maximum': 1},
     'mountain_terrain': {'type': 'number', 'minimum': 0, 'maximum': 1},
     'natural_scenery': {'type': 'number', 'minimum': 0, 'maximum': 1},
     'nightlife_intensity': {'type': 'number', 'minimum':

In [50]:
def label_article(article_text: str, source: str, date_iso: str) -> dict:
    """Ask the model to score one review text against the feature spec.

    Builds a prompt from the labelling rules, the FEATURES_SPEC descriptions
    and the given source/date/text, sends it through the shared `client`
    with OUTPUT_SCHEMA as a strict JSON-schema response format, and parses
    the reply.

    Args:
        article_text: the review or article text to label.
        source: identifier of where the text came from (inserted into the prompt).
        date_iso: date string inserted into the prompt as-is.

    Returns:
        The model's reply parsed from JSON into a dict.

    Raises:
        json.JSONDecodeError: if the reply text is not valid JSON.
    """
    prompt = f"""
Ты разметчик туристических направлений/мест отдыха.
Нужно заполнить 15 фичей значениями от 0 до 1.

Правила:
- Используй только информацию из текста.
- Если фичу невозможно уверенно вывести из текста — ставь 0.5 (unknown).
- Если из текста ясно, что фича выражена слабо/отсутствует — ставь ближе к 0.
- Верни строго JSON по схеме.

Описание фичей (JSON: ключ=имя фичи, значение=описание):
{json.dumps(FEATURES_SPEC, ensure_ascii=False)}

Данные:
source: {source}
date: {date_iso}

Текст:
{article_text}
""".strip()

    # Conversation: a short system instruction followed by the full labelling prompt.
    messages = [
        {"role": "system", "content": "Возвращай строго JSON по заданной схеме, без лишнего текста."},
        {"role": "user", "content": prompt},
    ]

    # Structured-output format built from OUTPUT_SCHEMA.
    response_format = {
        "type": "json_schema",
        "name": OUTPUT_SCHEMA["name"],
        "strict": True,
        "schema": OUTPUT_SCHEMA["schema"],
    }

    response = client.responses.create(
        model="gpt-4o-mini",
        input=messages,
        text={"format": response_format},
        temperature=0.0,
    )

    return json.loads(response.output_text)

In [15]:
if __name__ == "__main__":
    # Smoke test: label one hand-picked review and pretty-print the parsed reply.
    example_text = "Были в межсезонье на Байкале. Байкал сам еще не застыл, снег лежит и девственная природа т.к туристы почти отсутствуют. Получили массу удовольствия и наделали кучу фотографий! Погода не подвела. Смогли даже покататься на юге Ольхона по льду на озерах. Так же понравился наш водитель/экскурсовод Виктор. Было приятно с ним работать. Единственный минус наверное то, что закрыты многие заведения, но это не было особой проблемой для нас."
    # label_article takes only article_text / source / date_iso; the former
    # place_id keyword raised TypeError against the current signature.
    result = label_article(
        article_text=example_text,
        source="tg:some_channel",
        date_iso="2025-12-18",
    )
    print(json.dumps(result, ensure_ascii=False, indent=2))

{
  "place_id": "p123",
  "features": {
    "accommodation_quality": 0.5,
    "adventure_level": 0.7,
    "beach_quality": 0,
    "cost_level": 0.5,
    "cultural_richness": 0.5,
    "family_friendliness": 0.5,
    "food_variety": 0.3,
    "historical_significance": 0.5,
    "mountain_terrain": 0.6,
    "natural_scenery": 0.9,
    "nightlife_intensity": 0,
    "relaxation_level": 0.8,
    "safety": 0.5,
    "transportation_accessibility": 0.5,
    "urban_vibrancy": 0.2
  },
  "confidence": 0.8,
  "rationale": "The text describes a visit to Baikal during the off-season, highlighting the natural beauty and adventure activities available, such as ice skating. However, it mentions limited dining options and nightlife, leading to moderate scores in those areas."
}


Парсер

In [16]:
import csv
import time
from dataclasses import dataclass, asdict
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [17]:
# Site root (also used to resolve relative tour links) and the reviews listing page.
BASE_URL = "https://bolshayastrana.com"
START_URL = BASE_URL + "/otzyvy"

In [18]:
# HTTP headers sent with every page request: a desktop Chrome-on-Linux User-Agent.
HEADERS = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/120.0",
        "Safari/537.36",
    )),
}


@dataclass
class Review:
    """One review card as extracted by parse_reviews.

    parse_reviews sets a field to None when the matching element is absent
    from the card. Field order matters: save_csv derives its CSV column
    order from it.
    """
    author: str | None      # text of the `.review__name` element
    rating: int | None      # digits of the `.rating__value` text as an int; None if no digits
    tour_title: str | None  # link text of the tour anchor inside `.review__info`
    tour_url: str | None    # tour link href resolved against BASE_URL
    date_text: str | None   # raw text of `.review__date` (not parsed into a date)
    text: str | None        # review body; parts are joined with "\n"

In [19]:
def fetch_page(session: requests.Session, page: int) -> str:
    """Download one page of the reviews listing and return its HTML.

    Page 1 is the bare START_URL; any other page number is requested via
    the `?page=N` query parameter.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if page == 1:
        url = START_URL
    else:
        url = f"{START_URL}?page={page}"

    response = session.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    return response.text


In [20]:
def parse_reviews(html: str) -> list[Review]:
    """Extract every review card from one listing page.

    Each `div.review` element yields one Review. A field whose element is
    missing from the card is set to None.
    """
    soup = BeautifulSoup(html, "lxml")
    reviews: list[Review] = []

    # One review == one div.review
    for card in soup.select("div.review"):
        name_node = card.select_one(".review__name")
        rating_node = card.select_one(".rating__value")
        tour_link = card.select_one(".review__info a[href]")
        body_node = card.select_one(".review__body span")
        date_node = card.select_one(".review__date")

        # Keep only the digit characters of the rating text.
        rating = None
        if rating_node:
            rating_digits = "".join(filter(str.isdigit, rating_node.get_text(strip=True)))
            if rating_digits:
                rating = int(rating_digits)

        tour_title = None
        tour_url = None
        if tour_link:
            tour_title = tour_link.get_text(strip=True)
            if tour_link.has_attr("href"):
                tour_url = urljoin(BASE_URL, tour_link["href"])

        reviews.append(Review(
            author=name_node.get_text(strip=True) if name_node else None,
            rating=rating,
            tour_title=tour_title,
            tour_url=tour_url,
            date_text=date_node.get_text(strip=True) if date_node else None,
            # The body contains <br> tags, so join the pieces with "\n".
            text=body_node.get_text("\n", strip=True) if body_node else None,
        ))

    return reviews

In [21]:
def scrape(max_pages: int = 200, sleep_sec: float = 0.8) -> list[Review]:
    """Walk the reviews listing page by page and collect every review.

    Stops after `max_pages` pages or at the first page that yields no
    reviews, whichever comes first. Sleeps `sleep_sec` seconds after each
    non-empty page and prints a progress line for it.
    """
    collected: list[Review] = []

    with requests.Session() as http:
        page = 1
        while page <= max_pages:
            batch = parse_reviews(fetch_page(http, page))

            # An empty page means we ran past the last page of reviews.
            if not batch:
                break

            collected += batch
            print(f"page={page}: +{len(batch)} (total={len(collected)})")
            time.sleep(sleep_sec)
            page += 1

    return collected


In [22]:
def save_csv(path: str, reviews: list[Review]) -> None:
    """Write the reviews to a UTF-8 CSV file at `path`, one row per review.

    The header row uses the field names of the first review. An empty list
    writes nothing: the file is not created or touched.
    """
    if len(reviews) == 0:
        return

    records = (asdict(review) for review in reviews)
    columns = list(asdict(reviews[0]))

    with open(path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(records)

In [24]:
if __name__ == "__main__":
    # Scrape the first 20 listing pages and persist them for the labelling step.
    reviews = scrape(max_pages=20)
    save_csv("bolshayastrana_reviews.csv", reviews)
    # save_csv writes nothing for an empty list, so only report success
    # when there was something to write.
    if reviews:
        print("Saved: bolshayastrana_reviews.csv")
    else:
        print("No reviews scraped; nothing was saved")

page=1: +20 (total=20)
page=2: +20 (total=40)
page=3: +20 (total=60)
page=4: +20 (total=80)
page=5: +20 (total=100)
page=6: +20 (total=120)
page=7: +20 (total=140)
page=8: +20 (total=160)
page=9: +20 (total=180)
page=10: +20 (total=200)
page=11: +20 (total=220)
page=12: +20 (total=240)
page=13: +20 (total=260)
page=14: +20 (total=280)
page=15: +20 (total=300)
page=16: +20 (total=320)
page=17: +20 (total=340)
page=18: +20 (total=360)
page=19: +20 (total=380)
page=20: +20 (total=400)
Saved: bolshayastrana_reviews.csv


In [25]:
import pandas as pd

In [26]:
# Load the scraped reviews back from the CSV written by the scraping step.
df = pd.read_csv('bolshayastrana_reviews.csv')

In [27]:
# Preview the first rows of the scraped reviews.
df.head()

Unnamed: 0,author,rating,tour_title,tour_url,date_text,text
0,Никита,5,Знакомство с зимним или весенним Ольхоном,https://bolshayastrana.com/bajkal/znakomstvo-s...,15 декабря 2025,Были в межсезонье на Байкале. Байкал сам еще н...
1,Юлия,5,Нижегородский калейдоскоп. Зимне-осенний отдых,https://bolshayastrana.com/nizhegorodskaya-obl...,15 декабря 2025,"Все было организовано отлично: логистика, лока..."
2,Екатерина,5,Зимний мультиактив в горах Адыгеи,https://bolshayastrana.com/adygeya/multiaktiv-...,14 декабря 2025,Прекрасно проведенный отдых. По нагрузке все с...
3,Сергей,5,Осенне-зимнее знакомство с озером Байкал,https://bolshayastrana.com/bajkal/osennee-znak...,13 декабря 2025,Всем привет! Хочу поделиться отдыхом на море Б...
4,Светлана Жукова,5,Кавказская мозаика. Весна-лето,https://bolshayastrana.com/kavkaz/kavkazskaya-...,8 декабря 2025,Мы с подругой купили тур в Ставрополье «Кавка...


In [29]:
from tqdm import tqdm

In [53]:
def flatten_llm_json(data: dict, author: str, place_id: str, rating: int) -> dict:
    """Flatten one parsed model reply into a single flat record.

    The record starts with "confidence" and "rationale" from `data` (None
    when absent), followed by the given author / place_id / rating, and
    then one key per entry of `data["features"]`. A feature key that equals
    one of the earlier keys overwrites it.
    """
    record = dict(
        confidence=data.get("confidence"),
        rationale=data.get("rationale"),
        author=author,
        place_id=place_id,
        rating=rating,
    )
    # Lift the nested feature scores to top-level columns.
    record.update(data.get("features", {}).items())
    return record


In [60]:
import time

In [None]:
# Label every scraped review with the LLM and collect one flat record per success.
# `rows` is created here so the cell works on a fresh kernel instead of
# relying on a variable left over from an earlier session.
rows = []
count = 0
for i, r in tqdm(df.iterrows(), total=len(df)):
    try:
        result = label_article(
            article_text=r['text'],
            source=r['tour_url'],
            date_iso=r['date_text'],
        )
        # The DataFrame index doubles as the place_id of the record.
        flat = flatten_llm_json(result, r['author'], place_id=str(i), rating=r['rating'])
        rows.append(flat)
    except Exception as e:
        # Best effort: report the failing row and continue. The row has no
        # 'place_id' column, so identify it by its index (the same value
        # used as place_id above).
        print(f"Error for place_id={i}: {e}")
    # Throttle: pause for 20 seconds after every 10 processed rows.
    count += 1
    if count == 10:
        time.sleep(20)
        count = 0

In [64]:
# Build the labelled dataset: one row per record collected in `rows`.
final_df = pd.DataFrame(rows)

In [65]:
# Preview the first labelled rows.
final_df.head()

Unnamed: 0,confidence,rationale,author,place_id,rating,accommodation_quality,adventure_level,beach_quality,cost_level,cultural_richness,family_friendliness,food_variety,historical_significance,mountain_terrain,natural_scenery,nightlife_intensity,relaxation_level,safety,transportation_accessibility,urban_vibrancy
0,0.8,The natural scenery is described as pristine a...,Никита,0,5,0.5,0.7,0.5,0.5,0.5,0.5,0.3,0.5,0.6,0.9,0.2,0.8,0.8,0.5,0.2
1,0.8,The text indicates good organization and logis...,Юлия,1,5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5
2,0.8,The text indicates a balanced mix of activitie...,Екатерина,2,5,0.5,0.7,0.0,0.5,0.5,0.5,0.5,0.0,0.8,0.9,0.0,0.6,0.5,0.5,0.5
3,0.8,The text emphasizes the scenic beauty of Baika...,Сергей,3,5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,1.0,0.5,0.5,0.5,0.5,0.5
4,0.9,The text highlights the stunning natural scene...,Светлана Жукова,4,5,0.7,0.9,0.0,0.5,0.6,0.7,0.5,0.5,1.0,1.0,0.4,0.6,0.8,0.6,0.5


In [66]:
# (rows, columns) of the labelled dataset — shows how many reviews were labelled.
final_df.shape

(31, 20)

In [67]:
# Persist the labelled rows to CSV; index=False leaves the DataFrame index out of the file.
final_df.to_csv('1_part_dataset.csv', index=False)