In [1]:
# Mount Google Drive inside the Colab VM so the competition archive can be read from it.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!unzip /content/drive/MyDrive/12_pfl/stage1_individual_data.zip

Archive:  /content/drive/MyDrive/12_pfl/stage1_individual_data.zip
  inflating: book_descriptions.csv   
  inflating: book_genres.csv         
  inflating: books.csv               
  inflating: genres.csv              
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               
  inflating: users.csv               


In [5]:
# Display the `train` DataFrame.
# NOTE(review): no assignment to `train` is visible above this cell; in the notebook as shown
# it is first assigned in the later data-loading cell, so on a fresh kernel this cell
# presumably raises NameError unless that cell runs first — confirm the intended cell order.
train

Unnamed: 0,user_id,book_id,has_read,rating,timestamp,user_bias,rating_resid_user
0,281,441829,1,10,2007-04-11 06:09:42,0.034606,2.302193
1,281,168663,1,10,2007-04-11 06:10:12,0.034606,2.302193
3,1851,431081,1,10,2007-11-26 23:25:24,-0.900034,3.236833
19,3421,3903655,1,10,2008-03-25 16:40:46,0.307693,2.029106
34,2661,2599190,1,10,2008-09-02 09:44:40,0.212436,2.124363
...,...,...,...,...,...,...,...
268570,5699230,610508,1,8,2021-09-06 00:30:47,-0.262097,0.598896
268571,5699230,1769543,1,6,2021-09-06 00:30:58,-0.262097,-1.401104
268572,5699230,1817682,1,4,2021-09-06 00:31:09,-0.262097,-3.401104
268573,5699230,459282,1,8,2021-09-06 00:31:23,-0.262097,0.598896


Unnamed: 0,book_id,genre_id
0,20,433
1,20,1217
2,35,141
3,35,142
4,35,146
...,...,...
94949,8508107,1136
94950,8508119,446
94951,8536319,434
94952,8536319,1132


In [17]:
# Display the `test` DataFrame.
# NOTE(review): no assignment to `test` is visible above this cell; in the notebook as shown
# it is first assigned in the later data-loading cell — confirm the intended cell order.
test

Unnamed: 0,user_id,book_id,author_id,rating_predict
0,281,2461928,849103,7.623114
1,1250,31957,1900171,6.924859
2,4241,196603,166428,7.898254
3,5140,468894,231907,7.946770
4,7781,2141951,180063,7.508178
...,...,...,...,...
2889,5689120,2215170,648837,7.772581
2890,5699230,4627999,1078900,7.906437
2891,6459040,443150,202274,7.862342
2892,10335850,2435229,16611,7.771867


In [32]:
import pandas as pd
import numpy as np

# ============================
# 1. LOAD DATA
# ============================



# Read the four input tables from the current working directory.
train, test, books, genres = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'books.csv', 'book_genres.csv')
)

# Keep only interactions flagged as read; their mean rating is the global fallback value.
train = train.loc[train["has_read"] == 1]
global_mean = train["rating"].mean()

# ============================
# 2. MERGE META
# ============================

# Attach author_id to both interaction tables; a left join keeps every interaction row.
book_authors = books[["book_id", "author_id"]]
train = train.merge(book_authors, how="left", on="book_id")
test = test.merge(book_authors, how="left", on="book_id")

# Independent copy of the book -> genre mapping.
book_genres_full = genres.copy()

# ============================
# 3. EXPAND TRAIN TO MULTI-GENRE
# ============================

# A book listed under several genres yields one train row per genre.
train_expanded = train.merge(book_genres_full, how="left", on="book_id")

# ============================
# 4. USER–GENRE MEAN & COUNT
# ============================

# Per (user, genre): mean rating and number of ratings, produced with named aggregation.
user_genre_stats = (
    train_expanded
    .groupby(["user_id", "genre_id"])["rating"]
    .agg(user_genre_mean="mean", user_genre_count="count")
    .reset_index()
)

# ============================
# 5. USER–AUTHOR MEAN
# ============================

def _mean_rating_by(frame, keys, out_name):
    """Return the mean of the "rating" column of `frame` per group of `keys`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table that contains a "rating" column and the column(s) named in `keys`.
    keys : str or list of str
        Column name(s) to group by.
    out_name : str
        Name given to the mean-rating column in the result.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the key column(s) followed by `out_name`.
    """
    return (
        frame.groupby(keys)["rating"]
            .mean()
            .reset_index()
            .rename(columns={"rating": out_name})
    )


user_author_mean = _mean_rating_by(train, ["user_id", "author_id"], "user_author_mean")

# ============================
# 6. USER MEAN
# ============================

user_mean = _mean_rating_by(train, "user_id", "user_mean")

# ============================
# 7. BOOK MEAN
# ============================

book_mean = _mean_rating_by(train, "book_id", "book_mean")

# ============================
# 8. AUTHOR GLOBAL MEAN
# ============================

author_global_mean = _mean_rating_by(train, "author_id", "author_global_mean")

# ============================
# 9. GENRE GLOBAL MEAN + BOOK GENRE GLOBAL MEAN
# ============================

# Mean rating of every genre, computed over the genre-expanded train rows.
genre_global_mean = (
    train_expanded
    .groupby("genre_id")["rating"]
    .mean()
    .rename("genre_global_mean")
    .reset_index()
)

# For each book: the average of its genres' global mean ratings (across all users).
genre_mean_per_book_genre = book_genres_full.merge(genre_global_mean, how="left", on="genre_id")
book_genre_global_mean = (
    genre_mean_per_book_genre
    .groupby("book_id")["genre_global_mean"]
    .mean()
    .rename("book_genre_global_mean")
    .reset_index()
)

# ============================
# 10. EXPAND TEST TO MULTI-GENRE
# ============================

# One row per (test pair, genre of the book), annotated with the user's stats for that genre.
test_multi = (
    test
    .merge(book_genres_full, how="left", on="book_id")
    .merge(user_genre_stats, how="left", on=["user_id", "genre_id"])
)

# No stats for this (user, genre): use the global mean rating and a count of zero.
test_multi = test_multi.fillna({"user_genre_mean": global_mean, "user_genre_count": 0})

# ============================
# 11. GENRE WEIGHTS
# ============================

# Add-one smoothing: a genre with a zero count still receives a non-zero weight.
smoothed_count = test_multi["user_genre_count"] + 1
test_multi["user_genre_count_smooth"] = smoothed_count

# Sum of the smoothed counts over all of the user's rows in test_multi.
per_user_total = smoothed_count.groupby(test_multi["user_id"]).transform("sum")
test_multi["user_total_genre_reads"] = per_user_total

# Share of each row within its user's total.
row_share = smoothed_count / per_user_total
test_multi["genre_weight"] = row_share

# Rescale the shares so that they sum to 1 within each (user, book) pair.
per_pair_total = row_share.groupby(
    [test_multi["user_id"], test_multi["book_id"]]
).transform("sum")
test_multi["normalized_weight"] = (row_share / per_pair_total).fillna(0)

# ============================
# 12. WEIGHTED GENRE MEAN
# ============================

# Contribution of each genre row: its normalized weight times the user's mean for that genre.
test_multi["weighted_genre_mean"] = test_multi["user_genre_mean"].mul(
    test_multi["normalized_weight"]
)

# Summing the contributions per (user, book) gives the weighted mean over the book's genres.
user_multi_genre_mean = (
    test_multi
    .groupby(["user_id", "book_id"])["weighted_genre_mean"]
    .sum()
    .rename("genre_weighted_prediction")
    .reset_index()
)

# ============================
# 13. MERGE EVERYTHING INTO TEST2
# ============================

# Personalised genre signal; pairs without one fall back to the global mean rating.
test2 = test.merge(user_multi_genre_mean, on=["user_id", "book_id"], how="left")
test2["genre_weighted_prediction"] = test2["genre_weighted_prediction"].fillna(global_mean)

# user_author_mean, user_mean and book_mean are left as NaN when missing:
# conditional_pred applies its own fallback for each of them.
# (The former `fillna(np.nan)` calls on these columns were no-ops and have been removed.)
test2 = test2.merge(user_author_mean, on=["user_id", "author_id"], how="left")
test2 = test2.merge(user_mean, on="user_id", how="left")
test2 = test2.merge(book_mean, on="book_id", how="left")

# Global signals; missing values fall back to the global mean rating.
test2 = test2.merge(author_global_mean, on="author_id", how="left")
test2["author_global_mean"] = test2["author_global_mean"].fillna(global_mean)

test2 = test2.merge(book_genre_global_mean, on="book_id", how="left")
test2["book_genre_global_mean"] = test2["book_genre_global_mean"].fillna(global_mean)

# ============================
# 14. FLAGS: known user / known book
# ============================

# Distinct ids that occur in train.
known_users = set(pd.unique(train["user_id"]))
known_books = set(pd.unique(train["book_id"]))

# Flag test rows whose user / book occurs in train.
for flag_column, id_column, seen_ids in (
    ("known_user", "user_id", known_users),
    ("known_book", "book_id", known_books),
):
    test2[flag_column] = test2[id_column].isin(seen_ids)

# ============================
# 15. CONDITIONAL PREDICTIONS
# ============================

def conditional_pred(row):
    """Blend the signals of one test row into a rating, with weights chosen by cold-start case.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``)
        Must provide the keys "known_user", "known_book", "genre_weighted_prediction",
        "user_author_mean", "user_mean", "book_mean", "author_global_mean" and
        "book_genre_global_mean".

    Returns
    -------
    float
        Weighted sum of the signals. The weights in every branch sum to 1.

    Notes
    -----
    Missing personal signals (NaN, None or pd.NA) are replaced before blending:
    "user_author_mean" by "author_global_mean", "user_mean" by the module-level
    ``global_mean``, and "book_mean" by "book_genre_global_mean".
    """
    ku = row["known_user"]
    kb = row["known_book"]

    # base signal
    genre_pred = row["genre_weighted_prediction"]

    # global signals
    author_global = row["author_global_mean"]
    genre_global = row["book_genre_global_mean"]

    # personal / per-book signals with fallbacks;
    # pd.isna also accepts None and pd.NA, on which np.isnan raises TypeError
    author_user = row["user_author_mean"]
    if pd.isna(author_user):
        author_user = author_global
    user_m = row["user_mean"]
    if pd.isna(user_m):
        user_m = global_mean
    book_m = row["book_mean"]
    if pd.isna(book_m):
        book_m = genre_global

    # 1) User and book are both known -> use ALL 6 signals
    if ku and kb:
        return (
            0.30 * genre_pred +               # personal user x genre
            0.20 * author_user +              # personal user x author
            0.15 * user_m +                   # the user's overall taste
            0.15 * book_m +                   # mean rating of the book
            0.10 * author_global +            # global mean of the author
            0.10 * genre_global               # global mean of the book's genres
        )

    # 2) New book, known user
    if ku and not kb:
        return (
            0.40 * genre_pred +
            0.25 * author_user +
            0.20 * user_m +
            0.15 * author_global
        )

    # 3) New user, known book
    if not ku and kb:
        return (
            0.4 * book_m +
            0.3 * author_global +
            0.3 * genre_global
        )

    # 4) Full cold start
    return (
        0.5 * author_global +
        0.5 * genre_global
    )


# Score every test row, then bound the predictions to the [0, 10] interval.
row_predictions = test2.apply(conditional_pred, axis=1)
test2["rating_predict"] = row_predictions.clip(lower=0, upper=10)

# ============================
# 16. SUBMISSION
# ============================

# Writing the stand-alone submission is opt-in (the write was commented out in the
# original cell). The printed message now states whether a file was actually written,
# instead of always claiming that it was saved.
SAVE_CONDITIONAL_SUBMISSION = False
CONDITIONAL_SUBMISSION_PATH = "submission_superblend_conditional_2.0.csv"

if SAVE_CONDITIONAL_SUBMISSION:
    test2[["user_id", "book_id", "rating_predict"]].to_csv(CONDITIONAL_SUBMISSION_PATH, index=False)
    print(f"DONE! {CONDITIONAL_SUBMISSION_PATH} saved")
else:
    print("DONE! predictions are in test2['rating_predict'] (no file written)")


DONE! submission_superblend_conditional_2.0.csv saved


In [20]:
# Load a previously generated submission file and show it.
LGB_SUBMISSION_PATH = "/content/submission_ensemble_lgb_cat8 (4).csv"
lgb = pd.read_csv(LGB_SUBMISSION_PATH)
lgb


Unnamed: 0,user_id,book_id,rating_predict
0,281,2461928,8.897913
1,1250,31957,7.811142
2,4241,196603,8.535448
3,5140,468894,8.638038
4,7781,2141951,7.932904
...,...,...,...
2889,5689120,2215170,8.786316
2890,5699230,4627999,7.809387
2891,6459040,443150,8.540019
2892,10335850,2435229,7.757765


In [None]:
# 50/50 blend of the `lgb` submission with the conditional prediction in `test2`.

# The two frames are combined row by row, so they must list the same
# (user_id, book_id) pairs in the same order; fail loudly instead of blending mismatched rows.
_blend_keys = ["user_id", "book_id"]
if len(lgb) != len(test2):
    raise ValueError(f"row count mismatch: lgb has {len(lgb)} rows, test2 has {len(test2)}")
if not (lgb[_blend_keys].to_numpy() == test2[_blend_keys].to_numpy()).all():
    raise ValueError("lgb and test2 do not list the same (user_id, book_id) rows in the same order")

# Keep the un-blended prediction in its own column, so that re-running this cell
# blends from the original value rather than from an already blended one.
if "rating_predict_conditional" not in test2.columns:
    test2["rating_predict_conditional"] = test2["rating_predict"]

# Re-index the lgb predictions onto test2's index so the sum is positional, not label-aligned.
y_lgb = pd.Series(lgb['rating_predict'].to_numpy(), index=test2.index, name="rating_predict")
y_alco = test2['rating_predict_conditional']

blended_pred = 0.5 * y_lgb + 0.5 * y_alco
if blended_pred.isna().any():
    raise ValueError(f"blend produced {int(blended_pred.isna().sum())} NaN predictions")

test2['rating_predict'] = blended_pred

test2[["user_id", "book_id", "rating_predict"]].to_csv("ultra_mega_super_blend_for_kolegov_cum.csv", index=False)

In [26]:
# Display the assembled test frame with all signals and the prediction column.
# NOTE(review): the stored output of this cell has execution count 26, lower than the
# feature-building cell's 32, so it presumably reflects an earlier kernel state — re-run to refresh.
test2

Unnamed: 0,user_id,book_id,author_id,genre_weighted_prediction,user_author_mean,user_mean,book_mean,author_global_mean,book_genre_global_mean,known_user,known_book,rating_predict
0,281,2461928,849103,8.000000,,7.714286,10.000000,6.666667,8.165793,True,True,
1,1250,31957,1900171,5.611111,,7.375000,,6.000000,7.829067,True,False,
2,4241,196603,166428,7.336457,,8.000000,8.388889,8.330097,7.704309,True,True,
3,5140,468894,231907,7.663201,,9.500000,,10.000000,7.651603,True,False,
4,7781,2141951,180063,7.663201,,,7.190476,7.292683,7.553533,False,True,
...,...,...,...,...,...,...,...,...,...,...,...,...
2889,5689120,2215170,648837,7.663201,,10.000000,7.487805,7.853448,7.681294,True,True,
2890,5699230,4627999,1078900,8.200000,,7.263158,,8.689655,7.796678,True,False,
2891,6459040,443150,202274,7.663201,,9.285714,6.833333,8.084337,7.607441,True,True,
2892,10335850,2435229,16611,8.000000,,6.969697,,8.500000,7.580951,True,False,
