In [None]:
import json
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm.auto import tqdm

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
INPUT_CSV       = "poi_dataset_russia_filtered_enriched.csv"
OUTPUT_CSV      = "poi_dataset_enriched_incremental.csv"
MODEL_NAME      = "tiiuae/falcon-7b-instruct"
MAX_TOKENS      = 256    # upper bound on newly generated tokens per description
PIPE_BATCH      = 4      # number of prompts handed to the pipeline per call
CHUNK_SIZE      = 1000   # CSV rows read, enriched and flushed to disk at a time

PROMPT_TEMPLATE = """
Преобразуй это описание POI в более развернутый, живой, «человеческий» стиль —
добавь атмосферные детали, контекст и немного рекомендаций, но не придумывай факты:

\"\"\"{desc}\"\"\"
"""


def enrich_descriptions(descriptions, generate, batch_size=PIPE_BATCH, progress_desc=None):
    """Rewrite every non-empty description with the text-generation pipeline.

    Args:
        descriptions: iterable of raw description values (strings or NaN).
        generate: callable with the text-generation pipeline interface; it is
            called with a list of prompts and a ``batch_size`` keyword.
        batch_size: how many prompts are sent to ``generate`` per call.
        progress_desc: label shown on the tqdm progress bar.

    Returns:
        A list aligned with ``descriptions``. Entries whose input is not a
        string, or is blank after stripping, are ``""``; all others hold the
        stripped generated text.
    """
    descriptions = list(descriptions)
    enriched = [""] * len(descriptions)

    # Only real text is sent to the model; remember where each prompt belongs
    # so the output stays aligned with the input rows.
    pending = [
        (pos, PROMPT_TEMPLATE.format(desc=text.strip()))
        for pos, text in enumerate(descriptions)
        if isinstance(text, str) and text.strip()
    ]

    # The progress bar advances once per batch, not once per row.
    for start in tqdm(range(0, len(pending), batch_size), desc=progress_desc):
        batch = pending[start:start + batch_size]
        results = generate([prompt for _, prompt in batch], batch_size=batch_size)
        for (pos, _), result in zip(batch, results):
            # For list input the pipeline yields one list of records per
            # prompt; tolerate a bare record as well.
            record = result[0] if isinstance(result, list) else result
            enriched[pos] = record["generated_text"].strip()

    return enriched


assert torch.cuda.is_available(), "CUDA-GPU не обнаружена!"
print(f"CUDA devices: {torch.cuda.device_count()}, запускаем в FP16")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Batched generation needs a pad token, and a decoder-only model must be
# padded on the left so that generation continues directly from the prompt.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model     = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)

# The model is already placed on devices by `device_map="auto"` above, so the
# pipeline is not given a device argument of its own.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=MAX_TOKENS,
    do_sample=False,
    batch_size=PIPE_BATCH,
    return_full_text=False,
)

# Stream the input in chunks and append each finished chunk to the output file,
# so completed work is already on disk if the run is interrupted.
with pd.read_csv(INPUT_CSV, dtype=str, chunksize=CHUNK_SIZE) as reader:
    for idx, chunk in enumerate(reader):
        chunk["enriched_description"] = enrich_descriptions(
            chunk["text_description"],
            generator,
            batch_size=PIPE_BATCH,
            progress_desc=f"Chunk {idx+1}",
        )

        # First chunk creates the file and writes the header; later chunks append.
        is_first_chunk = idx == 0
        chunk.to_csv(
            OUTPUT_CSV,
            index=False,
            header=is_first_chunk,
            mode="w" if is_first_chunk else "a",
            encoding="utf-8-sig",
        )

print("✅ Обогащение завершено. Результат в", Path(OUTPUT_CSV).resolve())


In [6]:
!python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org

!python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --trusted-host download.pytorch.org

!python -m pip install transformers accelerate pandas tqdm --trusted-host pypi.org --trusted-host files.pythonhosted.org


Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.7.1%2Bcpu-cp313-cp313-win_amd64.whl.metadata (27 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.22.1%2Bcpu-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.7.1%2Bcpu-cp313-cp313-win_amd64.whl.metadata (6.8 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading https://download.pytorch.org/whl/networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Downloading https://download.pytorch.org/whl/mpmath-1.3.0-py3-none-any.whl (536 kB)
     ---------------------------------------- 0.0/536.2 kB ? eta -:--:--
     -------------------------------------- 536.2/536.

ERROR: Could not install packages due to an OSError: [WinError 32] Процесс не может получить доступ к файлу, так как этот файл занят другим процессом: 'C:\\Users\\emil1\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\sympy\\interactive\\session.py'
Consider using the `--user` option or check the permissions.



^C


In [1]:
import os

# Pin the thread count of every numeric backend to 20. This is done before the
# heavy imports below, presumably so the backends read the values when they
# load — TODO confirm.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "MKL_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ),
        "20",
    )
)

import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

INPUT_CSV  = "poi_dataset_russia_filtered_enriched.csv"
MODEL_NAME = "tiiuae/falcon-7b-instruct"
MAX_TOKENS = 256

# Smoke test: read only the first row of the dataset and enrich its description.
df   = pd.read_csv(INPUT_CSV, nrows=1, dtype=str)
desc = df.at[0, "text_description"]
print("Original description:\n", desc, "\n")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model     = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# device=-1 runs the pipeline on the CPU; decoding is greedy (no sampling) and
# only the newly generated text is returned, without the prompt.
pipeline_options = dict(
    model=model,
    tokenizer=tokenizer,
    device=-1,
    max_new_tokens=MAX_TOKENS,
    do_sample=False,
    return_full_text=False,
)
generator = pipeline("text-generation", **pipeline_options)

prompt_template = """
Преобразуй это описание POI в более живой «человеческий» стиль —
добавь атмосферные детали, контекст и рекомендации, но не придумывай факты:

\"\"\"{desc}\"\"\"
"""
prompt = prompt_template.format(desc=desc)

first_result = generator(prompt)[0]
out = first_result["generated_text"]
print("Enriched description:\n", out)


  from .autonotebook import tqdm as notebook_tqdm


Original description:
 Библиотека КГМА. Тип: library 



Loading checkpoint shards: 100%|██████████| 2/2 [00:50<00:00, 25.39s/it]
Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


KeyboardInterrupt: 