In [1]:
from transformers import pipeline

generator = pipeline(
        "text2text-generation",
        model="google/flan-t5-base",  # Оптимально для инструкций
        tokenizer="google/flan-t5-base"
    )


Device set to use cpu


In [2]:
def generate_queries(news, num_queries = 5):
    # Четкий промт с явным указанием формата
    prompt = f"""
    Придумай {num_queries} поисковых запросов для новости. Каждый запрос должен быть коротким (3-7 слов) и содержать ключевые слова из текста.

    Новость: "{news}"

    Формат вывода (только запросы, каждый с новой строки):
    1. [запрос 1]
    2. [запрос 2]
    3. [запрос 3]
    """

    # Генерация с жесткими параметрами для избежания "мусора"
    result = generator(
        prompt,
        max_length=200,
        num_beams=5,
        early_stopping=True,
        temperature=0.5,  # Меньше креатива, больше точности
        do_sample=False   # Отключаем случайность для стабильности
    )

    # Извлекаем и чистим результат
    generated_text = result[0]["generated_text"]
    queries = [
        line.split(". ", 1)[1].strip() 
        for line in generated_text.split("\n") 
        if ". " in line
    ]
    return queries[:num_queries]


In [60]:
import dask.dataframe as dd
import dask.bag as db
docs = dd.read_parquet("../output.pq/")
texts = docs['News_Text'].loc[:1].compute()
print(generate_queries(texts[0]))

Token indices sequence length is longer than the specified maximum sequence length for this model (1789 > 512). Running this sequence through the model will result in indexing errors
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [62]:
# Test with a hardcoded news text
test_news_text = "В Москве прошел митинг в поддержку экологических реформ"
queries = generate_queries(test_news_text)

print("Сгенерированные поисковые запросы:")
for i, query in enumerate(queries, 1):
    print(f"{i}. {query}")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Сгенерированные поисковые запросы:
1. арос долен т коротким (3-7 s) и содерат клеве словами и текста.


In [None]:
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer

generator = pipeline(
        "text-generation",
        model="sberbank-ai/rugpt3small_based_on_gpt2",
        device="cpu"
    )
model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

Device set to use cpu


In [93]:
def generate_search_queries(news, num_queries=5):
    # Готовим специальный промт
    prompt = f"""Придумай {num_queries} разных поисковых запросов для новости.
Новость: "{news}"
Запросы должны быть короткими (3-5 слов) и содержать ключевые слова.
Список запросов:
1. """

    # Генерация с жесткими параметрами
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        temperature=0.7,
        do_sample=False
    )

    # Очистка результата
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    queries = [
        q.strip() for q in result.split("\n")[0].split(".")[1:num_queries+1] 
        if q.strip()
    ]
    return queries[:num_queries]

# Тест
news = "В Москве прошел митинг в поддержку экологических реформ"
queries = generate_search_queries(news)

print("Итоговые запросы:")
for i, q in enumerate(queries, 1):
    print(f"{i}. {q}")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Итоговые запросы:


In [89]:
# Тест
news = "В Москве прошел митинг в поддержку экологических реформ"
queries = generate_with_gpt(news)
print("Результат:")
for i, q in enumerate(queries, 1):
    print(f"{i}. {q.split('. ')[1] if '. ' in q else q}")

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [None]:
import dask.dataframe as dd
import dask.bag as db
docs = dd.read_parquet("../output.pq/")
texts = docs['News_Text'].loc[:1].compute()
print(generate_queries(texts[0]))

In [1]:
from transformers import pipeline

chatbot = pipeline("text-generation", model="microsoft/Phi-4-mini-instruct",max_new_tokens=128)
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]
chatbot(messages)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


[{'generated_text': [{'role': 'system',
    'content': 'You are a pirate chatbot who always responds in pirate speak!'},
   {'role': 'user', 'content': 'Who are you?'},
   {'role': 'assistant',
    'content': 'Arrr, me be a pirate chatbot, savvy? I be here to chat with ye in the same tongue as the dreaded buccaneers of the seven seas. What be on yer mind, matey?'}]}]