In [1]:
!pip install deepctr-torch

Collecting deepctr-torch
  Downloading deepctr_torch-0.2.9-py3-none-any.whl.metadata (12 kB)
Downloading deepctr_torch-0.2.9-py3-none-any.whl (82 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m83.0/83.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deepctr-torch
Successfully installed deepctr-torch-0.2.9


In [6]:
import os
import gc
import json
import pickle
import glob
import pandas as pd
import numpy as np
import polars as pl
import lightgbm as lgb
from datetime import date
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import DeepFM

# ==================== 1. PATH CONFIGURATION ====================
# DATA_DIR holds the LightGBM model and the train/val/inference parquet files;
# RAW_DATA_DIR holds the raw parquet chunks used by the cold-start fallback.
DATA_DIR = "/kaggle/input/recsys-data"
RAW_DATA_DIR = "/kaggle/input/sales-dataset"

# Ground-truth selection: prefer the final-test file when it is present,
# otherwise fall back to the older ground-truth file.
GT_PATH_FINAL = "/kaggle/input/final-test/final_groundtruth.pkl"
GT_PATH_OLD = "/kaggle/input/sales-test/groundtruth.pkl"
if os.path.exists(GT_PATH_FINAL):
    GT_PATH = GT_PATH_FINAL
else:
    GT_PATH = GT_PATH_OLD

# Files located under DATA_DIR.
LGBM_MODEL_PATH = f"{DATA_DIR}/lightgbm_model.txt"
TRAIN_PATH = f"{DATA_DIR}/train_data.parquet"
VAL_PATH = f"{DATA_DIR}/val_data.parquet"
INF_PATH = f"{DATA_DIR}/inf_data.parquet"

# Ensemble weights applied to the LightGBM and DeepFM scores.
W_LGBM = 0.6
W_DEEP = 0.4

print(f">>> USING GROUND TRUTH AT: {GT_PATH}")

# ==================== 2. HELPER FUNCTIONS (ADVANCED COLD START) ====================
def load_data_cold_start():
    """Build lazy Polars scans over the raw purchase and user parquet chunks.

    Returns:
        (lz_trans, lz_users): two lazy frames. ``lz_trans`` exposes
        ``customer_id`` (Int32), ``item_id`` (String) and ``date`` (Date parsed
        from ``date_key`` with format ``%Y%m%d``) for non-deleted rows with a
        positive quantity. ``lz_users`` exposes ``customer_id`` (Int32) and
        ``province`` (nulls replaced by "Unknown") for non-deleted users.
        ``(None, None)`` is returned when no purchase chunk matches the glob
        or when building the scans raises.
    """
    print(f">>> Loading raw data for Cold Start from: {RAW_DATA_DIR}...")
    try:
        purchase_path = f"{RAW_DATA_DIR}/sales_pers.purchase_history_daily_chunk_*.parquet"
        user_path = f"{RAW_DATA_DIR}/sales_pers.user_chunk_*.parquet"

        # Bail out early when the purchase-history chunks are not mounted.
        matched_files = glob.glob(purchase_path)
        if len(matched_files) == 0:
            print(f"‚ö†Ô∏è Warning: Kh√¥ng t√¨m th·∫•y file t·∫°i {purchase_path}.")
            return None, None

        # Transactions: keep live rows with a positive quantity.
        trans_scan = pl.scan_parquet(purchase_path)
        is_valid_purchase = (pl.col("is_deleted") == False) & (pl.col("quantity") > 0)
        trans_columns = [
            pl.col("customer_id").cast(pl.Int32),
            pl.col("item_id").cast(pl.String),
            pl.col("date_key").cast(pl.String).str.strptime(pl.Date, "%Y%m%d").alias("date"),
        ]
        lz_trans = trans_scan.filter(is_valid_purchase).select(trans_columns)

        # Users: keep live rows and normalise missing provinces.
        user_scan = pl.scan_parquet(user_path)
        user_columns = [
            pl.col("customer_id").cast(pl.Int32),
            pl.col("province").fill_null("Unknown").cast(pl.String),
        ]
        lz_users = user_scan.filter(pl.col("is_deleted") == False).select(user_columns)
    except Exception as e:
        # Best effort: the caller treats (None, None) as "no cold-start data".
        print(f"‚ö†Ô∏è Warning: Cold start load failed ({e}).")
        return None, None

    return lz_trans, lz_users

def _parse_user_id(uid):
    """Return ``uid`` converted with ``int()``, or ``None`` when it cannot be converted."""
    try:
        return int(uid)
    except (TypeError, ValueError):
        return None


def _extend_unique(items, candidates, limit):
    """Append candidates to ``items`` in order, skipping duplicates, until ``limit`` items are held.

    Duplicates are detected on the string form so that an id present as an int
    and the same id present as a string are treated as one item.
    """
    seen = {str(existing) for existing in items}
    for candidate in candidates:
        if len(items) >= limit:
            break
        key = str(candidate)
        if key not in seen:
            items.append(candidate)
            seen.add(key)


def fill_cold_start_advanced(pred_recs, target_ids, lz_trans, lz_users):
    """Pad every target user's recommendation list up to 10 items.

    Users whose list is shorter than 10 (or missing) are filled, in priority
    order, from: their own most frequently purchased items, the top items of
    their province (transactions from 2024-11-01), then the global top items
    (transactions from 2024-12-01).

    Args:
        pred_recs: dict mapping user id (str) to a list of recommended items.
            It is not modified, and neither are its lists.
        target_ids: iterable of user ids that must appear in the result.
        lz_trans: lazy frame of transactions as returned by
            ``load_data_cold_start`` (or ``None``).
        lz_users: lazy frame of users as returned by ``load_data_cold_start``
            (or ``None``).

    Returns:
        A new dict with one entry (at most 10 items) per target id, plus the
        untouched entries of ``pred_recs`` for other users. When either lazy
        frame is ``None``, ``pred_recs`` itself is returned unchanged.
    """
    if lz_trans is None or lz_users is None:
        return pred_recs

    top_k = 10  # length of the final recommendation list

    print(f">>> [Advanced] Filling Cold Start for {len(target_ids)} users...")

    # Ids that cannot be converted to int cannot match the Int32 customer_id column.
    safe_target_ids_int = [
        parsed for parsed in (_parse_user_id(uid) for uid in target_ids) if parsed is not None
    ]

    # 1. Repurchase History: up to 20 most frequently bought items per user.
    print("   -> Computing User Purchase History...")
    user_history = (
        lz_trans.filter(pl.col("customer_id").is_in(safe_target_ids_int))
        .group_by(["customer_id", "item_id"])
        .len()
        .sort(["customer_id", "len"], descending=True)
        .group_by("customer_id")
        .head(20)
        .group_by("customer_id")
        .agg(pl.col("item_id"))
        .collect()
    )
    hist_dict = dict(zip(user_history["customer_id"].to_list(), user_history["item_id"].to_list()))

    # 2. Top Global: 20 most purchased items since 2024-12-01.
    top_global = (
        lz_trans.filter(pl.col("date") >= date(2024, 12, 1))
        .group_by("item_id")
        .len()
        .sort("len", descending=True)
        .limit(20)
        .select("item_id")
        .collect()
        .to_series()
        .to_list()
    )

    # 3. Top Province: 10 most purchased items per province since 2024-11-01.
    user_prov = lz_users.filter(pl.col("customer_id").is_in(safe_target_ids_int)).collect()
    u_prov_map = dict(zip(user_prov["customer_id"].to_list(), user_prov["province"].to_list()))

    top_prov = (
        lz_trans.filter(pl.col("date") >= date(2024, 11, 1))
        .join(lz_users, on="customer_id")
        .group_by(["province", "item_id"]).len()
        .sort(["province", "len"], descending=True)
        .group_by("province").head(10)
        .group_by("province").agg(pl.col("item_id"))
        .collect()
    )
    prov_dict = dict(zip(top_prov["province"].to_list(), top_prov["item_id"].to_list()))

    # 4. Filling Logic
    final_preds = pred_recs.copy()

    for u in target_ids:
        u_str = str(u)
        # Copy the list: dict.copy() is shallow, so appending to the original
        # list would silently modify the caller's predictions.
        current_items = list(final_preds.get(u_str, []))

        # Priority: Repurchase -> Province -> Global
        if len(current_items) < top_k:
            # Same conversion as the query filter above, so every user that was
            # included in the history query is also looked up here.
            u_int = _parse_user_id(u)

            _extend_unique(current_items, hist_dict.get(u_int, []), top_k)

            if len(current_items) < top_k:
                prov = u_prov_map.get(u_int, "Unknown")
                _extend_unique(current_items, prov_dict.get(prov, []), top_k)

            if len(current_items) < top_k:
                _extend_unique(current_items, top_global, top_k)

        final_preds[u_str] = current_items[:top_k]

    return final_preds

# --- Evaluation helper: accepts ground truth stored as a dict or as a DataFrame ---
def evaluate_robust(pred, gt_path, model_users_set):
    """Print Precision@10 overall and split into warm and cold user groups.

    Args:
        pred: dict mapping user id (str) to a list of recommended items; only
            the first 10 items of each list are scored.
        gt_path: path to a pickle holding either a dict ``{user: items}`` or a
            DataFrame whose first column is the user id and whose second column
            holds that user's items.
        model_users_set: set of user ids (str) that received model predictions;
            these users are reported as "warm", all others as "cold".

    Returns:
        None. The report is printed. Users present in the ground truth but
        absent from ``pred`` are skipped. Ground-truth values that are not a
        list, tuple, set, frozenset, ndarray or Series count as having no
        relevant items.
    """
    if not os.path.exists(gt_path):
        print(f"‚ö†Ô∏è GT Path not found.")
        return

    print(f">>> Calculating Precision by Groups (Robust Mode)...")
    # SECURITY: pickle.load can execute arbitrary code; only use trusted files.
    with open(gt_path, 'rb') as f:
        gt_data = pickle.load(f)

    # Normalise the ground truth to {str(user): items}.
    gt_dict = {}
    if isinstance(gt_data, dict):
        gt_dict = {str(k): v for k, v in gt_data.items()}
    elif isinstance(gt_data, pd.DataFrame):
        print("   -> Detected DataFrame Ground Truth. Converting...")
        # Assumes column 0 is the user and column 1 is that user's item list.
        # NOTE(review): with several rows per user only the last row is kept.
        user_col = gt_data.columns[0]
        item_col = gt_data.columns[1]
        gt_dict = gt_data.set_index(user_col)[item_col].to_dict()
        gt_dict = {str(k): v for k, v in gt_dict.items()}
    else:
        print(f"   -> Unsupported ground-truth type: {type(gt_data).__name__}")

    # Containers accepted as "a collection of item ids".
    item_containers = (list, tuple, set, frozenset, np.ndarray, pd.Series)

    warm_precs = []
    cold_precs = []

    for u, true_items in gt_dict.items():
        if u not in pred:
            continue
        rec_items = pred[u][:10]
        try:
            # Compare on the string form so int and str ids match.
            if isinstance(true_items, item_containers):
                t_set = {str(x) for x in true_items}
            else:
                t_set = set()
            r_set = {str(x) for x in rec_items}
        except (TypeError, ValueError):
            # Malformed entry: skip this user instead of aborting the report.
            continue

        precision = len(t_set & r_set) / 10.0

        # Classify the user by whether the model produced predictions for them.
        if u in model_users_set:
            warm_precs.append(precision)
        else:
            cold_precs.append(precision)

    # Group means; an empty group reports 0.0.
    warm_score = np.mean(warm_precs) if warm_precs else 0.0
    cold_score = np.mean(cold_precs) if cold_precs else 0.0
    all_precs = warm_precs + cold_precs
    total_score = np.mean(all_precs) if all_precs else 0.0

    print("-" * 50)
    print(f"üìä REPORT FOR {len(gt_dict)} USERS:")
    print(f"   1. Warm Users (Model Predict):  {len(warm_precs)} users | Precision: {warm_score:.4f}")
    print(f"   2. Cold Users (Fill Strategy):  {len(cold_precs)} users | Precision: {cold_score:.4f}")
    print(f"   --------------------------------------------------")
    print(f"   üèÜ OVERALL PRECISION@10:        {total_score:.4f}")
    print("-" * 50)

# ==================== 3. DATA PREPARATION ====================
# Loads the train / inference frames, optionally merges the validation split
# into training, then label-encodes the sparse features and min-max scales the
# dense features for DeepFM.
print(">>> [DeepFM] LOADING DATA...")
train_df = pd.read_parquet(TRAIN_PATH)
inf_df = pd.read_parquet(INF_PATH)

# When a validation file exists its rows are appended to the training data.
if os.path.exists(VAL_PATH):
    print("   -> Merging Val into Train...")
    val_df = pd.read_parquet(VAL_PATH)
    train_df = pd.concat([train_df, val_df], axis=0, ignore_index=True)
    del val_df

# Shuffle all rows with a fixed seed (the merged validation rows were appended
# at the end of the frame).
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Columns treated as categorical ids (embedded) vs. numeric values.
sparse_features = ['customer_id', 'item_id', 'brand', 'category_l1', 
                   'cat_l1_lower', 'age_group', 'segment_value', 'segment_variety']
dense_features = ['price', 'avg_order_value', 'unique_cats', 'baby_age_months', 
                  'item_popularity', 'days_since_last_purchase', 'user_brand_buy_count']

# A dense column missing from either frame is created and filled with 0.
# NOTE(review): sparse columns get no such guard; a missing one raises KeyError below.
for col in dense_features:
    if col not in train_df.columns: train_df[col] = 0
    if col not in inf_df.columns: inf_df[col] = 0

# Work on copies so train_df / inf_df keep their original values.
train_deep = train_df.copy()
inf_deep = inf_df.copy()

# Missing sparse values become the string '-1'; missing dense values become 0.
for df in [train_deep, inf_deep]:
    df[sparse_features] = df[sparse_features].fillna('-1').astype(str)
    df[dense_features] = df[dense_features].fillna(0)

print("   -> Encoding & Scaling...")
# The item encoder is fitted on the union of train and inference items and is
# kept under its own name.
item_encoder = LabelEncoder()
all_items = pd.concat([train_deep['item_id'], inf_deep['item_id']]).unique()
item_encoder.fit(all_items)

# Every sparse feature is encoded with an encoder fitted on train + inference
# values, so transform() never sees an unknown label.
for feat in sparse_features:
    lbe = LabelEncoder()
    if feat == 'item_id': lbe = item_encoder
    else: lbe.fit(pd.concat([train_deep[feat], inf_deep[feat]]).unique())
    train_deep[feat] = lbe.transform(train_deep[feat])
    inf_deep[feat] = lbe.transform(inf_deep[feat])

# Dense features are scaled to [0, 1] with min/max taken over train + inference.
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(pd.concat([train_deep[dense_features], inf_deep[dense_features]]))
train_deep[dense_features] = mms.transform(train_deep[dense_features])
inf_deep[dense_features] = mms.transform(inf_deep[dense_features])

# ==================== 4. TRAIN DEEPFM ====================
# Builds the DeepFM feature columns, trains for 3 epochs and scores the
# inference rows.
print(">>> [DeepFM] TRAINING...")
# One 16-dimensional embedding per sparse feature; the vocabulary size is the
# largest encoded id over train + inference, plus one. Each dense feature is a
# single scalar input.
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=pd.concat([train_deep[feat], inf_deep[feat]]).max() + 1, embedding_dim=16)
    for feat in sparse_features
] + [DenseFeat(feat, 1) for feat in dense_features]

# The linear part and the DNN part use the same feature columns.
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Model inputs are dicts keyed by feature name.
train_model_input = {name: train_deep[name] for name in feature_names}
inf_model_input = {name: inf_deep[name] for name in feature_names}

# Use the first GPU when available, otherwise the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary',
               dnn_hidden_units=(256, 128, 64), l2_reg_embedding=1e-5, device=device)

model.compile("adam", "binary_crossentropy", metrics=['auc'])
# NOTE(review): in the recorded run val_auc falls every epoch
# (0.8775 -> 0.8703 -> 0.8529) while training auc rises, so the final-epoch
# weights used for prediction below look overfit. Consider fewer epochs or
# early stopping.
model.fit(train_model_input, train_deep['label'].values, batch_size=1024, epochs=3, verbose=1, validation_split=0.1)

# One score per inference row, flattened to a 1-D array.
deepfm_scores = model.predict(inf_model_input, batch_size=2048).flatten()
print("   -> DeepFM Prediction Done.")

# Drop the training artifacts before the next stage; inf_df is not deleted here.
del train_deep, inf_deep, model, train_model_input, inf_model_input, train_df
gc.collect()

# ==================== 5. LIGHTGBM ====================
# Scores the inference rows with a pre-trained LightGBM booster. When the model
# file is missing or prediction fails, the pipeline falls back to DeepFM only
# and the ensemble weights are updated to reflect that.
print(">>> [LightGBM] PREDICTING...")
# Placeholder so `lgbm_scores` is always defined; it is only blended into the
# final score when `lgbm_success` is True.
lgbm_scores = deepfm_scores
lgbm_success = False

if os.path.exists(LGBM_MODEL_PATH):
    try:
        print("   -> Loading Inf Data via Polars...")
        df_inf_lgbm = pl.read_parquet(INF_PATH).to_pandas()
        cat_feats = ["brand", "category_l1", "age_group", "segment_value", "segment_variety"]
        ignore_cols = ["customer_id", "item_id", "label", "last_buy_date", "date", "cat_l1_lower", "score", "temp_score"]

        # LightGBM treats pandas 'category' columns as categorical features.
        for c in cat_feats:
            if c in df_inf_lgbm.columns: df_inf_lgbm[c] = df_inf_lgbm[c].astype('category')

        # NOTE(review): assumes the remaining columns match, in name and order,
        # the features the booster was trained on - confirm against training.
        features = [c for c in df_inf_lgbm.columns if c not in ignore_cols]
        bst = lgb.Booster(model_file=LGBM_MODEL_PATH)
        lgbm_scores = bst.predict(df_inf_lgbm[features])

        print("   -> LightGBM Prediction Done.")
        lgbm_success = True
        del df_inf_lgbm
    except Exception as e:
        # Best effort: report the error and continue with DeepFM only.
        print(f"‚ö†Ô∏è ERROR LightGBM: {e}")
    gc.collect()
else:
    print(f"   -> LightGBM model not found at {LGBM_MODEL_PATH}. Using DeepFM only.")

if not lgbm_success:
    # Keep the weights consistent with what is actually blended, whether the
    # model file was missing or prediction raised; otherwise the ensemble log
    # would still report the 0.6 / 0.4 split.
    W_LGBM = 0.0
    W_DEEP = 1.0

# ==================== 6. ENSEMBLE & RANKING ====================
# Blends the two score vectors and keeps the 10 best items per customer.
print(f">>> ENSEMBLING: {W_LGBM}*LGBM + {W_DEEP}*DeepFM")
# Only the id columns are re-read; the score arrays are attached by position,
# which relies on every read of INF_PATH returning rows in the same order.
inf_df_final = pd.read_parquet(INF_PATH, columns=['customer_id', 'item_id'])

# NOTE(review): the weighted sum assumes both models emit scores on a
# comparable scale - confirm the LightGBM objective used at training time.
if lgbm_success:
    inf_df_final['final_score'] = (W_LGBM * lgbm_scores) + (W_DEEP * deepfm_scores)
else:
    inf_df_final['final_score'] = deepfm_scores

print(">>> RANKING...")
# Sort by customer, then by descending score, and keep the first 10 rows of each customer.
inf_df_final = inf_df_final.sort_values(['customer_id', 'final_score'], ascending=[True, False])
top_k_df = inf_df_final.groupby('customer_id').head(10)

# Build {customer_id (str): [item_id, ...]} for the users the model scored (warm users).
top_k_df = top_k_df.copy()
top_k_df['customer_id'] = top_k_df['customer_id'].astype(str)
grouped = top_k_df.groupby('customer_id')['item_id'].apply(list).to_dict()

# Important set: used later to tell warm users (model predictions) from cold users.
MODEL_USERS_SET = set(grouped.keys())

final_submission = grouped

# ==================== 7. SUBMIT & ROBUST EVALUATION ====================
# Fills recommendations for every target user, evaluates against the ground
# truth when it is available, and writes the submission JSON.
print("\n>>> üèÅ FINALIZING SUBMISSION...")

# Load Cold Start Data
lz_trans, lz_users = load_data_cold_start()

# Determine the target user ids from the ground-truth file.
target_ids = []
if os.path.exists(GT_PATH):
    print(f"   -> Loading Target IDs from GT: {GT_PATH}")
    # SECURITY: pickle.load can execute arbitrary code; only use trusted files.
    with open(GT_PATH, 'rb') as f: 
        gt_data = pickle.load(f)
    
    # Any other ground-truth type leaves target_ids empty.
    if isinstance(gt_data, dict):
        target_ids = list(gt_data.keys())
    elif isinstance(gt_data, pd.DataFrame):
        # Use the first column as the user id.
        target_ids = gt_data.iloc[:, 0].astype(str).tolist()
else:
    print("‚ö†Ô∏è No GT found. Using IDs from submission.")
    target_ids = list(final_submission.keys())

# Fill the gaps (cold-start users and lists shorter than 10 items).
final_submission = fill_cold_start_advanced(final_submission, target_ids, lz_trans, lz_users)

# EVALUATE
if os.path.exists(GT_PATH):
    evaluate_robust(final_submission, GT_PATH, MODEL_USERS_SET)

# Save
with open("submission_final.json", 'w', encoding='utf-8') as f:
    json.dump(final_submission, f, ensure_ascii=False, indent=4)
print("\n‚úÖ Saved results to submission_final.json")

>>> USING GROUND TRUTH AT: /kaggle/input/final-test/final_groundtruth.pkl
>>> [DeepFM] LOADING DATA...
   -> Merging Val into Train...
   -> Encoding & Scaling...
>>> [DeepFM] TRAINING...
cuda:0
Train on 3213837 samples, validate on 357094 samples, 3139 steps per epoch


3139it [00:58, 53.38it/s]


Epoch 1/3
60s - loss:  0.4484 - auc:  0.8645 - val_auc:  0.8775


3139it [00:58, 53.70it/s]


Epoch 2/3
60s - loss:  0.3775 - auc:  0.9076 - val_auc:  0.8703


3139it [00:58, 53.72it/s]


Epoch 3/3
61s - loss:  0.2919 - auc:  0.9460 - val_auc:  0.8529
   -> DeepFM Prediction Done.
>>> [LightGBM] PREDICTING...
   -> Loading Inf Data via Polars...
   -> LightGBM Prediction Done.
>>> ENSEMBLING: 0.6*LGBM + 0.4*DeepFM
>>> RANKING...

>>> üèÅ FINALIZING SUBMISSION...
>>> Loading raw data for Cold Start from: /kaggle/input/sales-dataset...
   -> Loading Target IDs from GT: /kaggle/input/final-test/final_groundtruth.pkl


  gt_data = pickle.load(f)


>>> [Advanced] Filling Cold Start for 644970 users...
   -> Computing User Purchase History...
>>> Calculating Precision by Groups (Robust Mode)...


  gt_data = pickle.load(f)


   -> Detected DataFrame Ground Truth. Converting...
--------------------------------------------------
üìä REPORT FOR 644970 USERS:
   1. Warm Users (Model Predict):  225552 users | Precision: 0.1323
   2. Cold Users (Fill Strategy):  419418 users | Precision: 0.0379
   --------------------------------------------------
   üèÜ OVERALL PRECISION@10:        0.0709
--------------------------------------------------

‚úÖ Saved results to submission_final.json
