In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory,
# in os.walk order, then echo them one per line.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/eto-ai-nto-suchka/book_genres.csv
/kaggle/input/eto-ai-nto-suchka/sample_submission.csv
/kaggle/input/eto-ai-nto-suchka/book_descriptions.csv
/kaggle/input/eto-ai-nto-suchka/users.csv
/kaggle/input/eto-ai-nto-suchka/constants.py
/kaggle/input/eto-ai-nto-suchka/genres.csv
/kaggle/input/eto-ai-nto-suchka/books.csv
/kaggle/input/eto-ai-nto-suchka/config.py
/kaggle/input/eto-ai-nto-suchka/submission.csv
/kaggle/input/eto-ai-nto-suchka/train.csv
/kaggle/input/eto-ai-nto-suchka/test.csv


In [3]:
SEED = 42


def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            legacy global generator and PyTorch (CPU and every CUDA device).

    Side effects:
        Sets ``PYTHONHASHSEED`` in ``os.environ`` and switches cuDNN to
        deterministic, non-benchmarking mode.
    """
    # 1. Python standard library
    random.seed(seed)
    # Only affects interpreters started after this point (child processes);
    # the hash seed of the already-running kernel is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # 2. NumPy global generator
    np.random.seed(seed)

    # 3. PyTorch
    torch.manual_seed(seed)
    # manual_seed_all seeds every visible GPU, not just the current device;
    # it is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Call it once at the start
seed_everything(SEED)

pd.set_option('display.max_columns', None)

In [4]:
# Single place to repoint this cell at a different copy of the competition data.
DATA_DIR = '/kaggle/input/eto-ai-nto-suchka'

# Load every competition CSV into its own module-level DataFrame.
book_decs = pd.read_csv(f'{DATA_DIR}/book_descriptions.csv')
book_genres = pd.read_csv(f'{DATA_DIR}/book_genres.csv')
genres = pd.read_csv(f'{DATA_DIR}/genres.csv')
books = pd.read_csv(f'{DATA_DIR}/books.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
train = pd.read_csv(f'{DATA_DIR}/train.csv')
users = pd.read_csv(f'{DATA_DIR}/users.csv')

In [5]:
# ============================
# 1. LOAD DATA
# ============================
# Re-read the inputs so this cell starts from pristine frames on every run
# (train/test are reassigned below).
train = pd.read_csv('/kaggle/input/eto-ai-nto-suchka/train.csv')
test = pd.read_csv('/kaggle/input/eto-ai-nto-suchka/test.csv')
books = pd.read_csv('/kaggle/input/eto-ai-nto-suchka/books.csv')
# NOTE: from here on `genres` holds book_genres.csv, not genres.csv.
genres = pd.read_csv('/kaggle/input/eto-ai-nto-suchka/book_genres.csv')

# Only rows flagged has_read == 1 are used for the rating statistics.
train = train[train["has_read"] == 1].copy()
global_mean = train["rating"].mean()

# ============================
# 2. MERGE META
# ============================
# Keep exactly one author_id per book_id before merging. If books.csv ever
# holds several rows for the same book_id, a plain left merge would multiply
# train rows (biasing every mean) and test rows (changing the submission row
# count). With unique book_ids this is a no-op.
book_authors = books[["book_id", "author_id"]].drop_duplicates(subset="book_id")

train = train.merge(book_authors, on="book_id", how="left")
test  = test.merge(book_authors, on="book_id", how="left")

# Working copy of the frame read from book_genres.csv (held in `genres`).
book_genres_full = genres.copy()

# ============================
# 2.1 USER STD (used by the std > 4 rebalancing rule)
# ============================
# Per-user mean, standard deviation and number of ratings in train.
user_stats_full = (
    train.groupby("user_id")["rating"]
        .agg(["mean", "std", "count"])
        .reset_index()
        .rename(columns={"mean": "user_mean", "std": "user_std", "count": "user_count"})
)
# std is NaN for a user with a single rating; treat that as zero noise
# so the `user_std > 4` rule never fires for such users.
user_stats_full["user_std"] = user_stats_full["user_std"].fillna(0.0)

# ============================
# 3. EXPAND TRAIN TO MULTI-GENRE
# ============================
# One output row per (train row, genre of its book); the left join keeps
# train rows whose book has no entry in book_genres_full.
train_expanded = train.merge(book_genres_full, on="book_id", how="left")

# ============================
# 4. USER–GENRE MEAN & COUNT
# ============================
# Mean rating and number of ratings for every (user, genre) pair.
user_genre_stats = (
    train_expanded.groupby(["user_id", "genre_id"])["rating"]
        .agg(["mean", "count"])
        .reset_index()
        .rename(columns={"mean": "user_genre_mean", "count": "user_genre_count"})
)

# ============================
# 5. USER–AUTHOR MEAN
# ============================
# Mean rating a user gave to each author.
user_author_mean = (
    train.groupby(["user_id", "author_id"])["rating"]
        .mean()
        .reset_index()
        .rename(columns={"rating": "user_author_mean"})
)

# ============================
# 6. USER MEAN (already present in user_stats_full; kept as a separate frame for compatibility)
# ============================
user_mean = user_stats_full[["user_id", "user_mean"]].copy()

# ============================
# 7. BOOK MEAN
# ============================
# Mean rating of each book over all users.
book_mean = (
    train.groupby("book_id")["rating"]
        .mean()
        .reset_index()
        .rename(columns={"rating": "book_mean"})
)

# ============================
# 8. AUTHOR GLOBAL MEAN
# ============================
# Mean rating of each author over all users.
author_global_mean = (
    train.groupby("author_id")["rating"]
        .mean()
        .reset_index()
        .rename(columns={"rating": "author_global_mean"})
)

# ============================
# 9. GENRE GLOBAL MEAN + BOOK GENRE GLOBAL MEAN
# ============================
# Mean rating of each genre over all users.
genre_global_mean = (
    train_expanded.groupby("genre_id")["rating"]
        .mean()
        .reset_index()
        .rename(columns={"rating": "genre_global_mean"})
)

# Per book: plain (unweighted) average of the global means of its genres.
book_genre_global_mean = (
    book_genres_full
        .merge(genre_global_mean, on="genre_id", how="left")
        .groupby("book_id")["genre_global_mean"]
        .mean()
        .reset_index()
        .rename(columns={"genre_global_mean": "book_genre_global_mean"})
)

# ============================
# 10. EXPAND TEST TO MULTI-GENRE
# ============================
# One row per (test row, genre of its book), then attach the user's
# statistics for that genre.
test_multi = test.merge(book_genres_full, on="book_id", how="left")

test_multi = test_multi.merge(
    user_genre_stats,
    on=["user_id", "genre_id"],
    how="left"
)

# (user, genre) pairs never seen in train fall back to the global mean
# with a count of zero.
test_multi["user_genre_mean"]  = test_multi["user_genre_mean"].fillna(global_mean)
test_multi["user_genre_count"] = test_multi["user_genre_count"].fillna(0)

# ============================
# 11. GENRE WEIGHTS
# ============================
# +1 smoothing so genres the user never rated still get a non-zero weight.
test_multi["user_genre_count_smooth"] = test_multi["user_genre_count"] + 1

# Per-user total of the smoothed counts, summed over that user's rows in test_multi.
test_multi["user_total_genre_reads"] = (
    test_multi.groupby("user_id")["user_genre_count_smooth"].transform("sum")
)

test_multi["genre_weight"] = (
    test_multi["user_genre_count_smooth"] / test_multi["user_total_genre_reads"]
)

# Re-normalise so weights sum to 1 within each (user, book) pair. The per-user
# total above is constant inside such a group, so it cancels out here.
test_multi["normalized_weight"] = (
    test_multi["genre_weight"] /
    test_multi.groupby(["user_id","book_id"])["genre_weight"].transform("sum")
).fillna(0)

# ============================
# 12. WEIGHTED GENRE MEAN
# ============================
# Each row's contribution; summing per (user, book) gives the weighted mean.
test_multi["weighted_genre_mean"] = (
    test_multi["normalized_weight"] * test_multi["user_genre_mean"]
)

user_multi_genre_mean = (
    test_multi.groupby(["user_id", "book_id"])["weighted_genre_mean"]
        .sum()
        .reset_index()
        .rename(columns={"weighted_genre_mean": "genre_weighted_prediction"})
)

# ============================
# 13. MERGE EVERYTHING INTO TEST2
# ============================
# test2 = test plus one column per signal. Signals documented as "number" in
# conditional_pred are filled with global_mean here; user_author_mean,
# user_mean and book_mean are intentionally left as NaN when unavailable.
test2 = test.merge(user_multi_genre_mean, on=["user_id", "book_id"], how="left")
test2["genre_weighted_prediction"] = test2["genre_weighted_prediction"].fillna(global_mean)

test2 = test2.merge(user_author_mean, on=["user_id", "author_id"], how="left")

test2 = test2.merge(user_mean, on="user_id", how="left")
test2 = test2.merge(user_stats_full[["user_id", "user_std"]], on="user_id", how="left")
# Users absent from train get std 0, so the std > 4 rule cannot trigger for them.
test2["user_std"] = test2["user_std"].fillna(0.0)

test2 = test2.merge(book_mean, on="book_id", how="left")

test2 = test2.merge(author_global_mean, on="author_id", how="left")
test2["author_global_mean"] = test2["author_global_mean"].fillna(global_mean)

test2 = test2.merge(book_genre_global_mean, on="book_id", how="left")
test2["book_genre_global_mean"] = test2["book_genre_global_mean"].fillna(global_mean)

# ============================
# 14. FLAGS: known user / known book
# ============================
# "Known" means the id appears at least once in the filtered train frame.
known_users = set(train["user_id"].unique())
known_books = set(train["book_id"].unique())

test2["known_user"] = test2["user_id"].isin(known_users)
test2["known_book"] = test2["book_id"].isin(known_books)

# ============================
# 15. CONDITIONAL PREDICTIONS (WITH YOUR STD>4 RULE)
# ============================
def weighted_avg(candidates):
    """Weighted mean of ``(value, weight)`` pairs, skipping missing values.

    Args:
        candidates: Iterable of ``(value, weight)`` tuples. Pairs whose value
            is missing (``pd.notna(value)`` is False) are ignored.

    Returns:
        float: The weighted mean with weights re-normalised over the usable
        pairs. Returns the module-level ``global_mean`` when no pair is
        usable, and the plain unweighted mean when the usable weights sum
        to zero.
    """
    vals = [(v, w) for (v, w) in candidates if pd.notna(v)]
    if not vals:
        return global_mean
    xs = np.array([v for v, _ in vals], dtype=float)
    ws = np.array([w for _, w in vals], dtype=float)
    total = ws.sum()
    if total == 0:
        # Dividing by a zero total would yield NaN; with no usable weighting
        # information, treat every remaining signal equally instead.
        return float(xs.mean())
    ws = ws / total
    return float((xs * ws).sum())

def rebalance_user_vs_global(cands, user_keys, global_keys, force_equal=True):
    """Optionally equalise the total weight of user-level and global signals.

    Args:
        cands: List of ``(key, value, weight)`` tuples. Entries whose value
            is missing are discarded before anything else happens.
        user_keys: Keys that count as user-specific signals.
        global_keys: Keys that count as global signals.
        force_equal: When truthy and both groups have at least one available
            signal, rescale each group so both end up with the same total
            weight (half of their combined weight), keeping the proportions
            inside each group.

    Returns:
        List of ``(value, weight)`` tuples. Without rebalancing the input
        order is kept; with rebalancing the order is user signals, global
        signals, then signals that belong to neither group (weights of the
        latter are untouched). An empty list when no value is available.
    """
    usable = [(key, val, wt) for key, val, wt in cands if pd.notna(val)]
    if len(usable) == 0:
        return []

    # Single pass: sort every available signal into its group(s).
    user_part, global_part, rest = [], [], []
    for key, val, wt in usable:
        in_user = key in user_keys
        in_global = key in global_keys
        if in_user:
            user_part.append((val, wt))
        if in_global:
            global_part.append((val, wt))
        if not (in_user or in_global):
            rest.append((val, wt))

    # Nothing to equalise when a group is empty or rebalancing is disabled.
    if not (user_part and global_part and force_equal):
        return [(val, wt) for _, val, wt in usable]

    user_total = sum(wt for _, wt in user_part)
    global_total = sum(wt for _, wt in global_part)

    # Each group should end up carrying half of the combined weight.
    target = 0.5 * (user_total + global_total)
    user_scale = target / user_total if user_total > 0 else 1.0
    global_scale = target / global_total if global_total > 0 else 1.0

    rebalanced = [(val, wt * user_scale) for val, wt in user_part]
    rebalanced += [(val, wt * global_scale) for val, wt in global_part]
    rebalanced += rest
    return rebalanced

def conditional_pred(row):
    """Blend the available rating signals for one row of ``test2``.

    The set of signals and their weights depend on whether the user and the
    book are flagged as known. For known users whose ``user_std`` exceeds 4,
    the weights are first passed through ``rebalance_user_vs_global`` so that
    user-specific and global signals carry equal total weight.

    Args:
        row: A row holding ``known_user``, ``known_book``, ``user_std`` and
            the six signal columns read below.

    Returns:
        The value produced by ``weighted_avg`` for the selected signals.
    """
    is_known_user = row["known_user"]
    is_known_book = row["known_book"]
    user_std = row["user_std"]

    # Signal name -> value for this row; the first three entries on the
    # "may be NaN" list are skipped later by weighted_avg when missing.
    signals = {
        "genre_pred":    row["genre_weighted_prediction"],   # always a number
        "author_user":   row["user_author_mean"],            # may be NaN
        "user_m":        row["user_mean"],                   # may be NaN
        "book_m":        row["book_mean"],                   # may be NaN
        "author_global": row["author_global_mean"],          # always a number
        "genre_global":  row["book_genre_global_mean"],      # always a number
    }

    # Which signals are user-specific and which are global.
    USER_KEYS = {"genre_pred", "author_user", "user_m"}
    GLOBAL_KEYS = {"author_global", "genre_global", "book_m"}

    if is_known_user and is_known_book:
        # 1) known user & known book
        weights = (
            ("genre_pred",    0.30),
            ("author_user",   0.20),
            ("user_m",        0.15),
            ("book_m",        0.15),
            ("author_global", 0.10),
            ("genre_global",  0.10),
        )
    elif is_known_user:
        # 2) known user, new book
        weights = (
            ("genre_pred",    0.40),
            ("author_user",   0.35),
            ("user_m",        0.15),
            ("author_global", 0.10),
            ("genre_global",  0.10),
        )
    elif is_known_book:
        # 3) new user, known book
        weights = (
            ("book_m",        0.40),
            ("author_global", 0.30),
            ("genre_global",  0.30),
        )
    else:
        # 4) cold start: neither the user nor the book is known
        weights = (
            ("author_global", 0.50),
            ("genre_global",  0.50),
        )

    base = [(name, signals[name], weight) for name, weight in weights]

    # Very noisy known users: make the user part weigh as much as the global part.
    if is_known_user and user_std > 4:
        blend = rebalance_user_vs_global(base, USER_KEYS, GLOBAL_KEYS, force_equal=True)
    else:
        blend = [(value, weight) for _, value, weight in base]

    return weighted_avg(blend)

# Multiplicative calibration factor applied to the blended prediction.
RATING_SCALE = 1.03

test2["rating_predict"] = test2.apply(conditional_pred, axis=1)
# Scale first, clip second. Clipping before the multiplication let predictions
# leave the [0, 10] range again (up to 10.3).
test2["rating_predict"] = np.clip(test2["rating_predict"] * RATING_SCALE, 0, 10)

# ============================
# 16. SUBMISSION
# ============================
test2[["user_id","book_id","rating_predict"]].to_csv(
    "submission_superblend_conditional_3.0roundedto0.csv", index=False
)

print("DONE! submission_superblend_conditional_3.0roundedto0.csv saved")


DONE! submission_superblend_conditional_3.0roundedto0.csv saved


In [None]:
# Additive bias model: global mean + shrunk user offset + shrunk book offset
# + shrunk per-(user, genre) offset, each fitted on the residual left by the
# previous step. The inputs are re-read because train/test were modified above.
train = pd.read_csv('/kaggle/input/eto-ai-nto-suchka/train.csv')
test = pd.read_csv('/kaggle/input/eto-ai-nto-suchka/test.csv')
books = pd.read_csv('/kaggle/input/eto-ai-nto-suchka/books.csv')
book_genres = pd.read_csv('/kaggle/input/eto-ai-nto-suchka/book_genres.csv')

# Only rows flagged has_read == 1 are used.
train = train[train['has_read'] == 1]

# Shrinkage strengths: the larger the value, the more an offset backed by few
# ratings is pulled towards zero.
user_smooth = 10
book_smooth = 5
genre_smooth = 5

average_rating = train['rating'].mean()

# User offset: smoothed user mean (user_smooth pseudo-ratings at the global
# mean) minus the global mean.
user_group = train.groupby('user_id')['rating'].agg(['sum', 'count'])
user_group['user_adjust'] = (user_group['sum'] + user_smooth * average_rating) / (user_group['count'] + user_smooth) - average_rating
user_adj_dict = user_group['user_adjust'].to_dict()

# Residual after removing the global mean and the user offset.
train = train.merge(user_group[['user_adjust']], on='user_id', how='left')
train['temp_resid'] = train['rating'] - average_rating - train['user_adjust']

# Book offset: sum of residuals divided by (count + book_smooth).
book_group = train.groupby('book_id')['temp_resid'].agg(['sum', 'count'])
book_group['book_adjust'] = book_group['sum'] / (book_group['count'] + book_smooth)
book_adj_dict = book_group['book_adjust'].to_dict()

# Residual after additionally removing the book offset, expanded to one row
# per (rating, genre of the book). The default inner merge drops ratings of
# books that have no row in book_genres.
train_genres = train[['user_id', 'book_id', 'temp_resid']].merge(book_genres, on='book_id')
train_genres['book_adjust_val'] = train_genres['book_id'].map(book_adj_dict).fillna(0)
train_genres['genre_resid'] = train_genres['temp_resid'] - train_genres['book_adjust_val']

# Per-(user, genre) offset; the resulting dict is keyed by (user_id, genre_id) tuples.
user_genre_group = train_genres.groupby(['user_id', 'genre_id'])['genre_resid'].agg(['sum', 'count'])
user_genre_group['genre_adjust'] = user_genre_group['sum'] / (user_genre_group['count'] + genre_smooth)
user_genre_dict = user_genre_group['genre_adjust'].to_dict()

# book_id -> list of its genre_ids, used for lookups at prediction time.
book_to_genres = book_genres.groupby('book_id')['genre_id'].apply(list).to_dict()

def make_prediction(row):
    """Predict one rating with the additive bias model.

    Starts from ``average_rating`` and adds the user offset, the book offset
    and the mean of the user's genre offsets over the book's genres; every
    lookup that misses contributes nothing.

    Args:
        row: A row providing ``user_id`` and ``book_id``.

    Returns:
        The prediction clipped to the range [0, 10].
    """
    user, book = row['user_id'], row['book_id']

    pred_value = average_rating + user_adj_dict.get(user, 0.0)
    pred_value += book_adj_dict.get(book, 0.0)

    # Offsets for those genres of the book that this user has a fitted value for.
    looked_up = (user_genre_dict.get((user, g)) for g in book_to_genres.get(book, []))
    genre_vals = [offset for offset in looked_up if offset is not None]
    if genre_vals:
        pred_value += np.mean(genre_vals)

    return np.clip(pred_value, 0, 10)

# Score every test row with the bias model and write the submission file.
submission_columns = ['user_id', 'book_id', 'rating_predict']

test['rating_predict'] = test.apply(make_prediction, axis=1)

submission = test[submission_columns]
submission.to_csv('submission_algo2.csv', index=False)

Global Mean: 7.6632
Генерация V2...
Готово! Пробуй submission_algo_v2.csv


In [7]:
# Final blend of three prediction files: the provided submission.csv, the
# bias model and the conditional blend written by the cells above.
base_pred = pd.read_csv('/kaggle/input/eto-ai-nto-suchka/submission.csv')
bias_pred = pd.read_csv('/kaggle/working/submission_algo2.csv')
my_pred = pd.read_csv('/kaggle/working/submission_superblend_conditional_3.0roundedto0.csv')

# Blend weights (sum to 1.0).
MY_PRED_WEIGHT = 0.70
BIAS_PRED_WEIGHT = 0.175
BASE_PRED_WEIGHT = 0.125

# The blend below combines the files row by row, so every file must list the
# same (user_id, book_id) pairs in the same order. Fail loudly instead of
# silently mixing predictions that belong to different pairs.
BLEND_KEY_COLUMNS = ['user_id', 'book_id']
for label, frame in (('bias_pred', bias_pred), ('my_pred', my_pred)):
    rows_match = len(frame) == len(base_pred) and bool(
        (frame[BLEND_KEY_COLUMNS].to_numpy() == base_pred[BLEND_KEY_COLUMNS].to_numpy()).all()
    )
    if not rows_match:
        raise ValueError(
            f"{label} is not row-aligned with base_pred on {BLEND_KEY_COLUMNS}; "
            "cannot blend predictions by position"
        )

super_mega_blend = pd.DataFrame({
    'user_id': base_pred['user_id'],
    'book_id': base_pred['book_id'],
    'rating_predict': my_pred['rating_predict']*MY_PRED_WEIGHT + bias_pred['rating_predict']*BIAS_PRED_WEIGHT + base_pred['rating_predict']*BASE_PRED_WEIGHT})
super_mega_blend.to_csv('SUPER_MEGA_BLEND_SIGMA_SOTA_KILLER_GONCHAROV_KIRPICHENKO_KHLOPOTNUKH_SOLUTION676767.csv', index=False)
super_mega_blend.head()

Unnamed: 0,user_id,book_id,rating_predict
0,281,2461928,8.381563
1,1250,31957,6.698717
2,4241,196603,8.108635
3,5140,468894,8.487695
4,7781,2141951,7.523158
