# Генерация портрета юзера по описанию прочитанных книг
Забегая вперед скажу попытка неудачная. Время генерации на одного юзера слишком большое, а набор описаний книг зачастую превышает контекст маленьких и быстрых LLM моделей

In [None]:
from langchain_openai import ChatOpenAI
from yandex_cloud_ml_sdk import YCloudML, AsyncYCloudML

YA_GPT_FOLDER_ID = 'YA_GPT_FOLDER_ID'
YA_GPT_AUTH = 'YA_GPT_AUTH'
YA_GPT_MODEL_NAME = 'yandexgpt'
YA_GPT_MODEL_VERSION = 'rc'

sdk = YCloudML(folder_id=YA_GPT_FOLDER_ID, auth=YA_GPT_AUTH) 
sdk.setup_default_logging() 


model = sdk.models.completions(model_name=YA_GPT_MODEL_NAME, model_version=YA_GPT_MODEL_VERSION) 


instruct_model = model.configure(temperature=0.1)
YAGPT_5_PRO = model.langchain(model_type="chat", timeout=60)




QWEN_LARGE_OPENAI = ChatOpenAI(model=f"gpt://{YA_GPT_FOLDER_ID}/qwen3-235b-a22b-fp8/latest",
                                openai_api_key=f"{YA_GPT_AUTH}",
                                openai_api_base="https://llm.api.cloud.yandex.net/v1",
                                temperature=0.1,
                                timeout=60)

GPT_OSS = ChatOpenAI(model=f"gpt://{YA_GPT_FOLDER_ID}/gpt-oss-20b/latest",
                                openai_api_key=f"{YA_GPT_AUTH}",
                                openai_api_base="https://llm.api.cloud.yandex.net/v1",
                                temperature=0.1,
                                timeout=60)





## Подготовка данных

In [None]:
import pandas as pd
import polars as pl
import numpy as np
np.random.seed(2025)


data_folder = "/home/gleb_galagan/tbank_recsys/sirius_recsys/sirius-2025-recsys/data/"
train = pl.read_parquet(data_folder + "train.pq")
books = pl.read_parquet('/home/gleb_galagan/tbank_recsys/baseline_solution/books_all_embs.parquet')
test_exploded = pl.read_parquet(data_folder + "test.pq")
test = test_exploded.group_by("user_id", maintain_order=True).agg(pl.col("item_id"))

In [None]:
# Конкатенируем название книги с описанием
books = books.with_columns(
    (pl.col("title") + " " + pl.col("description")).alias("title_description")
)

read_books_with_desc = (
    train
    .filter(pl.col("is_read"))  # Выбираем только книги с пометкой прочитано
    .join(
        books.select(["item_id", "title_description"]), 
        on="item_id", 
        how="inner"
    )
)
read_books_with_desc.head(3)

user_id,item_id,is_read,rating,date_added,title_description
str,i64,bool,i64,datetime[μs],str
"""bd3a389144e323f09d340a85b1481b…",4039,true,0,2014-05-15 12:51:31,"""Eldest (The Inheritance Cycle,…"
"""081af30272710f8b6f27e182282f91…",19293,true,5,2014-05-15 12:52:24,"""The Blood of Olympus (The Hero…"
"""3f03e62aae345d2a347e23d00f4973…",9147,true,3,2014-05-15 12:52:47,"""The Lightning Thief (Percy Jac…"
"""1805e0de286e00e97e984b281f238b…",2588,true,1,2014-05-15 12:53:19,"""Branded (Sinners, #1)Alternate…"
"""40b8d9dca682712012105330b511e2…",25354,true,4,2014-05-15 12:53:59,"""ConversionFrom the New York Ti…"
…,…,…,…,…,…
"""3938a953bc0ab5a3247f7fcf8279b3…",23960,true,5,2016-12-10 02:44:15,"""Crooked Kingdom (Six of Crows,…"
"""91a9e9654b69bc35a0001047125651…",7197,true,4,2016-12-10 02:44:18,"""Breakfast Served AnytimeA comi…"
"""f4cf0c5716c66a0814144eb6ded258…",30704,true,5,2016-12-10 02:44:47,"""The Lost Herondale (Tales from…"
"""6aa2230015d8ec1d6d1d467b0c44fc…",16897,true,4,2016-12-10 02:45:50,"""The One Memory of Flora BanksA…"


In [None]:
# Мерджим все описания книг юзера вместе
user_profiles_df = (
    read_books_with_desc
    .group_by("user_id")
    .agg(
        
        pl.col("title_description").implode().list.join(" ").alias("user_profile_description")
    )
)


user_profiles_df.head(3)

shape: (305_924, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ user_id                         ┆ user_profile_description        │
│ ---                             ┆ ---                             │
│ str                             ┆ str                             │
╞═════════════════════════════════╪═════════════════════════════════╡
│ 39459fe8f1b597292ff2ef9a45b7a4… ┆ The Fault in Our StarsDespite … │
│ ea2a3deed21e9d82e15050d1172feb… ┆ Holes (Holes, #1)(Librarian's … │
│ e2ac8bac5212f355aa3a5c3f2b594c… ┆ Stargirl (Stargirl, #1)A celeb… │
│ f1adbc2c4663246fed40a59c97aa3f… ┆ City of Heavenly Fire (The Mor… │
│ 8b87f215b31743ff1ed18d855c41aa… ┆ FangirlThis special edition in… │
│ …                               ┆ …                               │
│ 730f3e30a97b8946d932978b7fcf7a… ┆ The Hunger Games (The Hunger G… │
│ 20f84473a54e958ef4ff2517657bcf… ┆ The Fault in Our StarsDespite … │
│ 0286f66a76993aa732bcdc4887ba6f… ┆ Crossed (Matched, #2)Cassia fl… │


user_id,user_profile_description
str,str
"""39459fe8f1b597292ff2ef9a45b7a4…","""The Fault in Our StarsDespite …"
"""ea2a3deed21e9d82e15050d1172feb…","""Holes (Holes, #1)(Librarian's …"
"""e2ac8bac5212f355aa3a5c3f2b594c…","""Stargirl (Stargirl, #1)A celeb…"
"""f1adbc2c4663246fed40a59c97aa3f…","""City of Heavenly Fire (The Mor…"
"""8b87f215b31743ff1ed18d855c41aa…","""FangirlThis special edition in…"
…,…
"""730f3e30a97b8946d932978b7fcf7a…","""The Hunger Games (The Hunger G…"
"""20f84473a54e958ef4ff2517657bcf…","""The Fault in Our StarsDespite …"
"""0286f66a76993aa732bcdc4887ba6f…","""Crossed (Matched, #2)Cassia fl…"
"""d3a3742e93bab74d2fe9017a272e55…","""The Power of Six (Lorien Legac…"


In [None]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(input_variables=['readed_books'], template='Hi provide detailed user portrait based on books that user have read {readed_books} provide only user portrait, nothing else')

chain = prompt | GPT_OSS 


In [None]:
sample = user_profiles_df.head(3)

gpt_oss_user_portraits = []

for row in sample.iter_rows(named=True):
    user_id = row['user_id']
    readed_books_summary = row['user_profile_description']
    
    print(f"\n--- Generating portrait for {user_id} ---")
    
    output = chain.invoke({"readed_books": readed_books_summary})
    user_portrait = output.content
    
    
    gpt_oss_user_portraits.append({"user_id": user_id, "portrait": user_portrait})


--- Generating portrait for 39459fe8f1b597292ff2ef9a45b7a4b5 ---

--- Generating portrait for ea2a3deed21e9d82e15050d1172feb99 ---

--- Generating portrait for e2ac8bac5212f355aa3a5c3f2b594c7a ---


In [31]:
us_portrait_gpt = pl.DataFrame(gpt_oss_user_portraits)

us_portrait_gpt

user_id,portrait
str,str
"""39459fe8f1b597292ff2ef9a45b7a4…","""**User Portrait – “The YA‑Lit …"
"""ea2a3deed21e9d82e15050d1172feb…","""**User Portrait – “The Curious…"
"""e2ac8bac5212f355aa3a5c3f2b594c…","""**User Portrait – “The Empathe…"


In [None]:
sample = sample.join(
        us_portrait_gpt.select(["user_id", "portrait"]), 
        on="user_id", 
        how="left"
    )
sample

user_id,user_profile_description,portrait
str,str,str
"""39459fe8f1b597292ff2ef9a45b7a4…","""The Fault in Our StarsDespite …","""**User Portrait – “The YA‑Lit …"
"""ea2a3deed21e9d82e15050d1172feb…","""Holes (Holes, #1)(Librarian's …","""**User Portrait – “The Curious…"
"""e2ac8bac5212f355aa3a5c3f2b594c…","""Stargirl (Stargirl, #1)A celeb…","""**User Portrait – “The Empathe…"


In [33]:
from langchain.prompts import PromptTemplate

prompt_yagpt = PromptTemplate(input_variables=['readed_books'], template='Hi provide detailed user portrait based on books that user have read {readed_books} provide only user portrait, nothing else')

chain_yagpt = prompt_yagpt | YAGPT_5_PRO


In [None]:

yagpt_user_portraits = []

for row in sample.iter_rows(named=True):
    user_id = row['user_id']
    readed_books_summary = row['user_profile_description']
    
    print(f"\n--- Generating portrait for {user_id} ---")
    
    # This is where you would call your LLM
    output = chain_yagpt.invoke({"readed_books": readed_books_summary})
    user_portrait = output.content
    
    
    yagpt_user_portraits.append({"user_id": user_id, "portrait_ya": user_portrait})


--- Generating portrait for 39459fe8f1b597292ff2ef9a45b7a4b5 ---

--- Generating portrait for ea2a3deed21e9d82e15050d1172feb99 ---

--- Generating portrait for e2ac8bac5212f355aa3a5c3f2b594c7a ---


In [35]:
us_portrait_yagpt = pl.DataFrame(yagpt_user_portraits)

us_portrait_yagpt

user_id,portrait_ya
str,str
"""39459fe8f1b597292ff2ef9a45b7a4…","""В интернете есть много сайтов …"
"""ea2a3deed21e9d82e15050d1172feb…","""**User Portrait:** **Age and …"
"""e2ac8bac5212f355aa3a5c3f2b594c…","""**User Portrait** **Age and L…"


In [None]:
sample = sample.join(
        us_portrait_yagpt.select(["user_id", "portrait_ya"]), 
        on="user_id", 
        how="left"
    )
sample

user_id,user_profile_description,portrait,portrait_ya
str,str,str,str
"""39459fe8f1b597292ff2ef9a45b7a4…","""The Fault in Our StarsDespite …","""**User Portrait – “The YA‑Lit …","""В интернете есть много сайтов …"
"""ea2a3deed21e9d82e15050d1172feb…","""Holes (Holes, #1)(Librarian's …","""**User Portrait – “The Curious…","""**User Portrait:** **Age and …"
"""e2ac8bac5212f355aa3a5c3f2b594c…","""Stargirl (Stargirl, #1)A celeb…","""**User Portrait – “The Empathe…","""**User Portrait** **Age and L…"


**Выводы** Генерация портрета на 3х юзеров на GPT OSS заняла 52 секунды, нало параллелить 

YandexGPT PRO справляется с этой задачей хуже, в для 1го описания ответ как видно поломался, также у юзеров с большой активностью огромный список книг с большим описанием, которые не влезают в контекст