In [13]:

import os
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype

import xgboost as xgb
from catboost import CatBoostClassifier, Pool

W_READ = 0.7
W_ANY  = 0.3

W_XGB = 0.5
W_CB  = 0.5

N_COLD  = 15   
SPLIT_Q = 0.8  

DATA_DIR = "/Users/steksov_grigoriy/Desktop/НТО/individ/public" #  указать путь

TRAIN_PATH       = os.path.join(DATA_DIR, "train.csv")
TARGETS_PATH     = os.path.join(DATA_DIR, "targets.csv")
CANDIDATES_PATH  = os.path.join(DATA_DIR, "candidates.csv")
BOOKS_PATH       = os.path.join(DATA_DIR, "books.csv")
GENRES_PATH      = os.path.join(DATA_DIR, "genres.csv")
BOOK_GENRES_PATH = os.path.join(DATA_DIR, "book_genres.csv")


In [14]:

train = pd.read_csv(TRAIN_PATH)
targets = pd.read_csv(TARGETS_PATH)
candidates_raw = pd.read_csv(CANDIDATES_PATH)

print("train", train.shape)
print("targets", targets.shape)
print("candidates", candidates_raw.shape)
print(train.head())
print(candidates_raw.head())

if os.path.exists(BOOKS_PATH):
    books = pd.read_csv(BOOKS_PATH)
    print("books shape", books.shape)
else:
    books = None
    print("error 1")

if os.path.exists(GENRES_PATH) and os.path.exists(BOOK_GENRES_PATH):
    genres = pd.read_csv(GENRES_PATH)
    book_genres = pd.read_csv(BOOK_GENRES_PATH)
    print("genres shape", genres.shape)
    print("book genres shape", book_genres.shape)
else:
    genres = None
    book_genres = None
    print("error 2")

train["timestamp"] = pd.to_datetime(train["timestamp"])


train (269061, 5)
targets (3512, 1)
candidates (3512, 2)
   user_id  book_id  has_read  rating            timestamp
0     3870   310170         0       0  2008-04-27 21:06:16
1     3870   306406         0       0  2008-06-07 11:51:01
2     4091   195676         0       0  2008-08-06 00:40:55
3     3870   554261         1       8  2008-08-07 09:16:12
4     3870    33078         1       2  2008-08-07 09:17:20
   user_id                                       book_id_list
0      210  11936,254097,709075,840500,971259,1037723,1074...
1     1380  8369,28302,145975,482934,625734,998313,1098150...
2     2050  4902,8369,18790,308364,317849,460492,822326,86...
3     2740  39221,112023,149611,162418,181062,317050,43565...
4     4621  28638,28639,28642,28901,31479,307058,475353,57...
books shape (55785, 8)
genres shape (439, 3)
book genres shape (103646, 2)


In [15]:

split_point = train["timestamp"].quantile(SPLIT_Q)
print("T split", split_point)

train_hist = train[train["timestamp"] <= split_point].copy()
val_period = train[train["timestamp"] > split_point].copy()

print("train hist shape", train_hist.shape)
print("val period shape", val_period.shape)


T split 2020-09-11 23:28:35
train hist shape (215249, 5)
val period shape (53812, 5)


In [16]:

def build_history_and_popularity(df: pd.DataFrame):
    user_hist_books = (
        df.groupby("user_id")["book_id"]
          .agg(lambda x: set(x.tolist()))
          .to_dict()
    )
    book_pop = (
        df.groupby("book_id")["user_id"]
          .nunique()
          .sort_values(ascending=False)
    )
    popular_books = book_pop.index.to_numpy()
    return user_hist_books, popular_books


def sample_cold_candidates_for_user(user_id, user_hist_books, popular_books, n_cold=15):
    seen = user_hist_books.get(user_id, set())
    cold = [b for b in popular_books if b not in seen]
    if len(cold) > n_cold:
        cold = cold[:n_cold]
    return cold


def build_cold_candidates(users, user_hist_books, popular_books, n_cold=15):
    rows = []
    for u in users:
        for b in sample_cold_candidates_for_user(u, user_hist_books, popular_books, n_cold=n_cold):
            rows.append((u, b, 0)) 
    return pd.DataFrame(rows, columns=["user_id", "book_id", "rel"])


def dcg_at_k(rels, k=20):
    rels = np.asarray(rels)[:k]
    if rels.size == 0:
        return 0.0
    return float(np.sum(rels / np.log2(np.arange(2, len(rels) + 2))))


def ndcg_for_user(df_u, k=20):
    df_sorted = df_u.sort_values("pred", ascending=False)
    rels_pred = df_sorted["rel"].values
    dcg = dcg_at_k(rels_pred, k=k)
    ideal_rels = np.sort(df_u["rel"].values)[::-1]
    idcg = dcg_at_k(ideal_rels, k=k)
    if idcg == 0:
        return 0.0
    return dcg / idcg


def mean_ndcg(df_all, k=20):
    scores = []
    for _, df_u in df_all.groupby("user_id"):
        scores.append(ndcg_for_user(df_u, k=k))
    return float(np.mean(scores)) if scores else 0.0


def make_submission_user_list(df_pred: pd.DataFrame, top_k: int = 20) -> pd.DataFrame:
    rows = []
    for uid, df_u in df_pred.groupby("user_id"):
        df_sorted = df_u.sort_values("pred", ascending=False)
        df_sorted = df_sorted.drop_duplicates(subset="book_id", keep="first")
        top_books = df_sorted["book_id"].head(top_k).tolist()
        rows.append((uid, ",".join(map(str, top_books))))
    return pd.DataFrame(rows, columns=["user_id", "book_id_list"])


In [17]:

def compute_user_book_stats(df: pd.DataFrame):
    book_stats = (
        df.groupby("book_id")
          .agg(
              b_n_users=("user_id", "nunique"),
              b_n_events=("user_id", "size"),
              b_n_read=("has_read", lambda x: int((x == 1).sum())),
              b_n_plan=("has_read", lambda x: int((x == 0).sum())),
          )
          .reset_index()
    )
    denom = book_stats["b_n_read"] + book_stats["b_n_plan"] + 1e-6
    book_stats["b_read_rate"] = book_stats["b_n_read"] / denom

    user_stats = (
        df.groupby("user_id")
          .agg(
              u_n_books=("book_id", "nunique"),
              u_n_events=("book_id", "size"),
              u_n_read=("has_read", lambda x: int((x == 1).sum())),
              u_n_plan=("has_read", lambda x: int((x == 0).sum())),
          )
          .reset_index()
    )
    user_stats["u_read_share"] = user_stats["u_n_read"] / (user_stats["u_n_events"] + 1e-6)
    return user_stats, book_stats


user_stats_hist, book_stats_hist = compute_user_book_stats(train_hist)
print(user_stats_hist.head())
print(book_stats_hist.head())


   user_id  u_n_books  u_n_events  u_n_read  u_n_plan  u_read_share
0      151         75          75        36        39      0.480000
1      210         31          31         0        31      0.000000
2      560          5           5         0         5      0.000000
3     1380         46          46        19        27      0.413043
4     1850         77          77        38        39      0.493506
   book_id  b_n_users  b_n_events  b_n_read  b_n_plan  b_read_rate
0       20        111         111        94        17     0.846847
1       35          1           1         1         0     0.999999
2       52          1           1         1         0     0.999999
3       54          5           5         4         1     0.800000
4       69          1           1         1         0     0.999999


In [18]:

def add_interaction_features(candidates: pd.DataFrame, hist_df: pd.DataFrame) -> pd.DataFrame:
    df = candidates.copy()

    pair_hist = hist_df[["user_id", "book_id"]].drop_duplicates()
    pair_hist["has_interacted"] = 1
    df = df.merge(pair_hist, on=["user_id", "book_id"], how="left")
    df["has_interacted"] = df["has_interacted"].fillna(0).astype("int8")

    if "rating" in hist_df.columns:
        rating_map = (
            hist_df.dropna(subset=["rating"])
                   .groupby(["user_id", "book_id"])["rating"]
                   .mean()
                   .reset_index()
                   .rename(columns={"rating": "user_book_rating"})
        )
        df = df.merge(rating_map, on=["user_id", "book_id"], how="left")
        df["user_book_rating"] = df["user_book_rating"].fillna(0).astype("float32")
    else:
        df["user_book_rating"] = 0.0

    return df


In [19]:
def add_temporal_features(candidates: pd.DataFrame,
                          hist_df: pd.DataFrame,
                          split_time=None) -> pd.DataFrame:
    df = candidates.copy()
    logs = hist_df.copy()

    user_read_ts = (
        logs[logs["has_read"] == 1]
        .groupby("user_id")["timestamp"]
        .agg(["mean", "max", "min"])
        .add_prefix("u_read_ts_")
        .reset_index()
    )

    book_ts = (
        logs.groupby("book_id")["timestamp"]
            .agg(["mean", "max", "min"])
            .add_prefix("b_ts_")
            .reset_index()
    )

    df = df.merge(user_read_ts, on="user_id", how="left")
    df = df.merge(book_ts, on="book_id", how="left")

    last_ts = logs["timestamp"].max()
    df["days_since_book_event"] = (
        (last_ts - df["b_ts_max"]).dt.total_seconds() / (24 * 3600)
    )
    df["days_since_book_event"] = df["days_since_book_event"].fillna(1e4).astype("float32")

    logs["year"] = logs["timestamp"].dt.year
    last_year = logs["year"].max()
    pop_last_year = (
        logs[logs["year"] == last_year]
        .groupby("book_id")["user_id"]
        .nunique()
        .rename("book_popularity_last_year")
        .reset_index()
    )
    df = df.merge(pop_last_year, on="book_id", how="left")
    df["book_popularity_last_year"] = df["book_popularity_last_year"].fillna(0).astype("float32")

    time_cols = [c for c in df.columns if np.issubdtype(df[c].dtype, np.datetime64)]
    if time_cols:
        print("Удаляю datetime-фичи:", time_cols)
        df = df.drop(columns=time_cols)

    return df


def add_popularity_trend_features(candidates: pd.DataFrame,
                                  hist_df: pd.DataFrame,
                                  window_days: int = 180) -> pd.DataFrame:
    df = candidates.copy()
    logs = hist_df.copy()

    logs["ts_days"] = logs["timestamp"].view("int64") // (24 * 3600 * 10**9)
    last_day = logs["ts_days"].max()

    recent_start = last_day - window_days
    prev_start = last_day - 2 * window_days
    prev_end = recent_start

    recent = (
        logs[logs["ts_days"] >= recent_start]
        .groupby("book_id")["user_id"]
        .nunique()
        .reset_index()
        .rename(columns={"user_id": "b_users_recent"})
    )
    prev = (
        logs[(logs["ts_days"] >= prev_start) & (logs["ts_days"] < prev_end)]
        .groupby("book_id")["user_id"]
        .nunique()
        .reset_index()
        .rename(columns={"user_id": "b_users_prev"})
    )

    trend = recent.merge(prev, on="book_id", how="outer").fillna(0)
    trend["b_pop_ratio"] = (trend["b_users_recent"] + 1.0) / (trend["b_users_prev"] + 1.0)
    trend["b_pop_diff"] = trend["b_users_recent"] - trend["b_users_prev"]

    df = df.merge(trend, on="book_id", how="left").fillna(0)
    return df


In [20]:

def build_books_metadata(books_df, genres_df, book_genres_df):
    if books_df is None or genres_df is None or book_genres_df is None:
        return None

    g = genres_df.copy()
    if "genre" not in g.columns:
        for cand in ["genre_name", "name", "title"]:
            if cand in g.columns:
                g = g.rename(columns={cand: "genre"})
                break
    g = g[["genre_id", "genre"]]

    bg = book_genres_df[["book_id", "genre_id"]].copy()

    meta = (
        bg.merge(g, on="genre_id", how="left")
          .merge(books_df, on="book_id", how="left")
    )
    return meta


books_meta = build_books_metadata(books, genres, book_genres)


def add_genre_features(candidates: pd.DataFrame,
                       hist_df: pd.DataFrame,
                       books_meta_df=None) -> pd.DataFrame:
    df = candidates.copy()

    if books_meta_df is None or "genre" not in books_meta_df.columns:
        df["user_genre_share"] = 0.0
        df["genre_match"] = 0
        return df

    df = df.merge(books_meta_df[["book_id", "genre"]].drop_duplicates(),
                  on="book_id", how="left")

    tmp = (
        hist_df[["user_id", "book_id"]]
        .merge(books_meta_df[["book_id", "genre"]].drop_duplicates(),
               on="book_id", how="left")
        .dropna(subset=["genre"])
    )

    user_genre_counts = (
        tmp.groupby(["user_id", "genre"])["book_id"]
           .size()
           .reset_index(name="cnt")
    )

    total = user_genre_counts.groupby("user_id")["cnt"].transform("sum")
    user_genre_counts["genre_share"] = user_genre_counts["cnt"] / total

    idx = user_genre_counts.groupby("user_id")["cnt"].idxmax()
    fav_genre = (
        user_genre_counts.loc[idx, ["user_id", "genre"]]
        .rename(columns={"genre": "preferred_genre"})
    )

    df = df.merge(
        user_genre_counts[["user_id", "genre", "genre_share"]],
        on=["user_id", "genre"],
        how="left"
    )
    df["user_genre_share"] = df["genre_share"].fillna(0).astype("float32")
    df = df.drop(columns=["genre_share"], errors="ignore")

    df = df.merge(fav_genre, on="user_id", how="left")
    df["genre_match"] = (df["genre"] == df["preferred_genre"]).astype("int8")
    df["genre_match"] = df["genre_match"].fillna(0)

    return df


In [21]:

def add_basic_features(candidates: pd.DataFrame,
                       user_stats: pd.DataFrame,
                       book_stats: pd.DataFrame) -> pd.DataFrame:
    df = candidates.copy()
    df = df.merge(user_stats, on="user_id", how="left")
    df = df.merge(book_stats, on="book_id", how="left")
    return df.fillna(0)


def add_all_features(candidates: pd.DataFrame,
                     hist_df: pd.DataFrame,
                     user_stats: pd.DataFrame,
                     book_stats: pd.DataFrame,
                     split_time,
                     books_meta_df=None) -> pd.DataFrame:
    df = add_basic_features(candidates, user_stats, book_stats)
    df = add_interaction_features(df, hist_df)
    df = add_temporal_features(df, hist_df, split_time=split_time)
    df = add_popularity_trend_features(df, hist_df)
    df = add_genre_features(df, hist_df, books_meta_df)
    return df.fillna(0)


def convert_datetime_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in df.columns:
        if is_datetime64_any_dtype(df[c]):
            df[c] = df[c].view("int64") // 10**9
    return df


In [22]:
val_period = val_period.copy()
val_period["rel"] = np.where(val_period["has_read"] == 1, 2, 1)
val_pos = val_period[["user_id", "book_id", "rel"]].drop_duplicates()
print("val_pos", val_pos.shape)

user_hist_books_hist, popular_books_hist = build_history_and_popularity(train_hist)
val_users = val_period["user_id"].unique()
val_cold = build_cold_candidates(val_users, user_hist_books_hist,
                                popular_books_hist, n_cold=N_COLD)
print("val_cold", val_cold.shape)

val_candidates = pd.concat([val_pos, val_cold], ignore_index=True)
val_candidates = val_candidates.drop_duplicates(["user_id", "book_id"])
print("val_candidates", val_candidates.shape)


val_pos (53812, 3)
val_cold (60495, 3)
val_candidates (113171, 3)


In [23]:
train_pos = train_hist.copy()
train_pos["rel"] = np.where(train_pos["has_read"] == 1, 2, 1)
train_pos = train_pos[["user_id", "book_id", "rel"]].drop_duplicates()

train_users = train_hist["user_id"].unique()
user_hist_books_train, popular_books_train = build_history_and_popularity(train_hist)
train_cold = build_cold_candidates(train_users, user_hist_books_train, popular_books_train, n_cold=N_COLD)

train_candidates_hist = pd.concat([train_pos, train_cold], ignore_index=True)
train_candidates_hist = train_candidates_hist.drop_duplicates(["user_id", "book_id"])
print("train", train_candidates_hist.shape)

train_features_hist = add_all_features(
    train_candidates_hist, train_hist,
    user_stats_hist, book_stats_hist,
    split_time=split_point,
    books_meta_df=books_meta,
)
val_features = add_all_features(
    val_candidates, train_hist,
    user_stats_hist, book_stats_hist,
    split_time=split_point,
    books_meta_df=books_meta,
)

train_features_hist = convert_datetime_to_numeric(train_features_hist)
val_features = convert_datetime_to_numeric(val_features)

feature_cols = [c for c in train_features_hist.columns if c not in ["user_id", "book_id", "rel"]]

cat_features = [c for c in ["genre", "preferred_genre"] if c in feature_cols]

xgb_feature_cols = [c for c in feature_cols if train_features_hist[c].dtype != "O"]

print("num features", len(feature_cols))
print("cat features", cat_features)
print("xgb feature", len(xgb_feature_cols))

X_train_full = train_features_hist[feature_cols]
X_val_full = val_features[feature_cols]

X_train_xgb = train_features_hist[xgb_feature_cols]
X_val_xgb = val_features[xgb_feature_cols]

y_train_rel  = train_features_hist["rel"]
y_train_read = (y_train_rel == 2).astype(int)
y_train_any  = (y_train_rel > 0).astype(int)

y_val_rel = val_features["rel"]
y_val_read = (y_val_rel == 2).astype(int)
y_val_any = (y_val_rel > 0).astype(int)

print("Распределение rel (train)")
print(y_train_rel.value_counts().sort_index())
print("Распределение rel (val)")
print(y_val_rel.value_counts().sort_index())


train (305519, 3)
Удаляю datetime-фичи: ['u_read_ts_mean', 'u_read_ts_max', 'u_read_ts_min', 'b_ts_mean', 'b_ts_max', 'b_ts_min']


  logs["ts_days"] = logs["timestamp"].view("int64") // (24 * 3600 * 10**9)


Удаляю datetime-фичи: ['u_read_ts_mean', 'u_read_ts_max', 'u_read_ts_min', 'b_ts_mean', 'b_ts_max', 'b_ts_min']


  logs["ts_days"] = logs["timestamp"].view("int64") // (24 * 3600 * 10**9)


num features 22
cat features ['genre', 'preferred_genre']
xgb feature 20
Распределение rel (train)
rel
0    242819
1    198545
2    289893
Name: count, dtype: int64
Распределение rel (val)
rel
0    158821
1     51859
2     69355
Name: count, dtype: int64


In [24]:
xgb_read = xgb.XGBClassifier(
    n_estimators=700,
    max_depth=8,
    learning_rate=0.03,
    subsample=0.85,
    colsample_bytree=0.85,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)

xgb_any = xgb.XGBClassifier(
    n_estimators=700,
    max_depth=8,
    learning_rate=0.03,
    subsample=0.85,
    colsample_bytree=0.85,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=43,
    n_jobs=-1,
)

xgb_read.fit(X_train_xgb.values, y_train_read.values)
xgb_any.fit(X_train_xgb.values, y_train_any.values)

p_read_xgb_val = xgb_read.predict_proba(X_val_xgb.values)[:, 1]
p_any_xgb_val  = xgb_any.predict_proba(X_val_xgb.values)[:, 1]
score_xgb_val  = W_READ * p_read_xgb_val + W_ANY * p_any_xgb_val

cat_idx = [X_train_full.columns.get_loc(c) for c in cat_features if c in X_train_full.columns]

cb_read = CatBoostClassifier(
    loss_function="Logloss",
    iterations=600,
    depth=7,
    learning_rate=0.05,
    random_seed=42,
    l2_leaf_reg=3.0,
    verbose=False
)
cb_any = CatBoostClassifier(
    loss_function="Logloss",
    iterations=600,
    depth=7,
    learning_rate=0.05,
    random_seed=43,
    l2_leaf_reg=3.0,
    verbose=False
)

train_pool_read = Pool(
    data=X_train_full,
    label=y_train_read,
    cat_features=cat_idx
)
train_pool_any = Pool(
    data=X_train_full,
    label=y_train_any,
    cat_features=cat_idx
)

cb_read.fit(train_pool_read)
cb_any.fit(train_pool_any)

val_pool = Pool(
    data=X_val_full,
    cat_features=cat_idx
)

p_read_cb_val = cb_read.predict_proba(val_pool)[:, 1]
p_any_cb_val  = cb_any.predict_proba(val_pool)[:, 1]
score_cb_val  = W_READ * p_read_cb_val + W_ANY * p_any_cb_val

val_features["score_xgb"] = score_xgb_val
val_features["score_cb"]  = score_cb_val
val_features["pred"] = W_XGB * score_xgb_val + W_CB * score_cb_val

print(val_features[["user_id", "book_id", "rel", "pred"]].head())

ndcg20 = mean_ndcg(val_features, k=20)
print(f"NDCG@20: {ndcg20:.6f}")


   user_id  book_id  rel      pred
0  1551451  2573361    2  0.291631
1  1551451  2573361    2  0.264879
2  1397150  2538344    2  0.075261
3  1397150  2538344    2  0.075234
4  1358090  2019613    2  0.000487
NDCG@20: 0.936428


In [25]:

user_stats_full, book_stats_full = compute_user_book_stats(train)

cand = candidates_raw.copy()
if "book_id_list" in cand.columns:
    cand["book_id_list"] = cand["book_id_list"].fillna("").astype(str)
    cand["book_id_list"] = cand["book_id_list"].str.split(",")
    cand_long = cand.explode("book_id_list")
    cand_long = cand_long[cand_long["book_id_list"].str.strip() != ""]
    cand_long["book_id"] = cand_long["book_id_list"].astype(int)
    candidates_long = cand_long[["user_id", "book_id"]].drop_duplicates()
else:
    candidates_long = cand[["user_id", "book_id"]].drop_duplicates()

print("candidates", candidates_long.shape)

train_pos_full = train.copy()
train_pos_full["rel"] = np.where(train_pos_full["has_read"] == 1, 2, 1)
train_pos_full = train_pos_full[["user_id", "book_id", "rel"]].drop_duplicates()

train_users_full = train["user_id"].unique()
user_hist_books_full, popular_books_full = build_history_and_popularity(train)
train_cold_full = build_cold_candidates(train_users_full, user_hist_books_full, popular_books_full, n_cold=N_COLD)

train_full_pairs = pd.concat([train_pos_full, train_cold_full], ignore_index=True)
train_full_pairs = train_full_pairs.drop_duplicates(["user_id", "book_id"])
print("train full pairs", train_full_pairs.shape)

train_features_full = add_all_features(
    train_full_pairs, train,
    user_stats_full, book_stats_full,
    split_time=split_point,
    books_meta_df=books_meta,
)
train_features_full = convert_datetime_to_numeric(train_features_full)

common_features = [c for c in feature_cols if c in train_features_full.columns]
xgb_common = [c for c in xgb_feature_cols if c in common_features]

X_full_all = train_features_full[common_features]
X_full_xgb = train_features_full[xgb_common]

y_full_rel  = train_features_full["rel"]
y_full_read = (y_full_rel == 2).astype(int)
y_full_any  = (y_full_rel > 0).astype(int)

print("full rel distribution")
print(y_full_rel.value_counts().sort_index())

xgb_read_full = xgb.XGBClassifier(
    n_estimators=700,
    max_depth=8,
    learning_rate=0.03,
    subsample=0.85,
    colsample_bytree=0.85,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=52,
    n_jobs=-1,
)
xgb_any_full = xgb.XGBClassifier(
    n_estimators=700,
    max_depth=8,
    learning_rate=0.03,
    subsample=0.85,
    colsample_bytree=0.85,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=53,
    n_jobs=-1,
)

xgb_read_full.fit(X_full_xgb.values, y_full_read.values)
xgb_any_full.fit(X_full_xgb.values, y_full_any.values)

cat_idx_full = [X_full_all.columns.get_loc(c) for c in cat_features if c in X_full_all.columns]

cb_read_full = CatBoostClassifier(
    loss_function="Logloss",
    iterations=600,
    depth=7,
    learning_rate=0.05,
    random_seed=52,
    l2_leaf_reg=3.0,
    verbose=False
)
cb_any_full = CatBoostClassifier(
    loss_function="Logloss",
    iterations=600,
    depth=7,
    learning_rate=0.05,
    random_seed=53,
    l2_leaf_reg=3.0,
    verbose=False
)

train_pool_read_full = Pool(
    data=X_full_all,
    label=y_full_read,
    cat_features=cat_idx_full
)
train_pool_any_full = Pool(
    data=X_full_all,
    label=y_full_any,
    cat_features=cat_idx_full
)

cb_read_full.fit(train_pool_read_full)
cb_any_full.fit(train_pool_any_full)

test_features = add_all_features(
    candidates_long, train,
    user_stats_full, book_stats_full,
    split_time=split_point,
    books_meta_df=books_meta,
)
test_features = convert_datetime_to_numeric(test_features)

test_features = test_features.merge(
    train_features_full[["user_id", "book_id"] + common_features],
    on=["user_id", "book_id"],
    how="left",
    suffixes=("", "_train_full")
)

for c in common_features:
    if c not in test_features.columns:
        test_features[c] = 0.0

X_test_all = test_features[common_features]
X_test_xgb = test_features[xgb_common]

p_read_xgb_test = xgb_read_full.predict_proba(X_test_xgb.values)[:, 1]
p_any_xgb_test  = xgb_any_full.predict_proba(X_test_xgb.values)[:, 1]
score_xgb_test  = W_READ * p_read_xgb_test + W_ANY * p_any_xgb_test

test_pool = Pool(
    data=X_test_all,
    cat_features=[X_test_all.columns.get_loc(c) for c in cat_features if c in X_test_all.columns]
)
p_read_cb_test = cb_read_full.predict_proba(test_pool)[:, 1]
p_any_cb_test  = cb_any_full.predict_proba(test_pool)[:, 1]
score_cb_test  = W_READ * p_read_cb_test + W_ANY * p_any_cb_test

test_features["pred"] = W_XGB * score_xgb_test + W_CB * score_cb_test

print(test_features[["user_id", "book_id", "pred"]].head())


candidates (81048, 2)
train full pairs (378396, 3)
Удаляю datetime-фичи: ['u_read_ts_mean', 'u_read_ts_max', 'u_read_ts_min', 'b_ts_mean', 'b_ts_max', 'b_ts_min']


  logs["ts_days"] = logs["timestamp"].view("int64") // (24 * 3600 * 10**9)


full rel distribution
rel
0    302883
1    250404
2    359248
Name: count, dtype: int64
Удаляю datetime-фичи: ['u_read_ts_mean', 'u_read_ts_max', 'u_read_ts_min', 'b_ts_mean', 'b_ts_max', 'b_ts_min']


  logs["ts_days"] = logs["timestamp"].view("int64") // (24 * 3600 * 10**9)


   user_id  book_id      pred
0      210    11936  0.000011
1      210    11936  0.000011
2      210    11936  0.000010
3      210   254097  0.000032
4      210   254097  0.000024


In [26]:

submission_user_list = make_submission_user_list(test_features, top_k=20)
print(submission_user_list.head())

SUBMIT_PATH = os.path.join(DATA_DIR, "submission.csv")
submission_user_list.to_csv(SUBMIT_PATH, index=False)
print("Саб сохранён в", SUBMIT_PATH)


   user_id                                       book_id_list
0      210  1673950,971259,1281035,3015694,2447113,2225251...
1     1380  2548861,2290484,482934,1326209,2379664,1098150...
2     2050  1021078,460492,317849,2053462,867246,2254200,2...
3     2740  987516,1296620,2327258,1834192,2307893,549194,...
4     4621  2595660,1809950,1964216,2446687,2347566,244668...
Саб сохранён в /Users/steksov_grigoriy/Desktop/НТО/individ/public/submission.csv
