In [1]:
import joblib
import polars as pl
import numpy as np

In [2]:
# Load the persisted model object; it is used further down via predict_proba.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
model = joblib.load("train_data/model.pkl")

In [3]:
# Open the parquet file lazily so the full table is never pulled into RAM here
df_lazy = pl.scan_parquet("train_data/inference_final.parquet")

# Materialise only the distinct customer ids, as a plain Python list
unique_customers = (
    df_lazy
    .select("customer_id")
    .unique()
    .collect()
    .to_series()
    .to_list()
)
print(f"Total customers: {len(unique_customers)}")

# Columns passed to the model, listed group by group.
# The order below is the order in which they are selected for prediction.
feature_cols = [
    # --- Feature 1: Frequency ---
    'feat1_customer_item_freq',
    # --- Feature 2: Recency Decay ---
    'feat2_brand_affinity',
    'feat2_type_affinity',
    # --- Feature 3: Urgency (Window-Based) ---
    'feat3_dist_to_window_center',
    'feat3_is_in_window',
    # --- Feature 4: Popularity ---
    'feat4_pop_30d_log',
    'feat4_pop_trend',
    'feat4_pop_category_rank',
    'feat4_pop_global_rank',
    # --- Feature 5: Baby Age Alignment ---
    'feat5_score_age_end_hist',
    'feat5_score_age_midpoint',
    # --- Feature 6: Price Compatibility ---
    'feat6_price_compatibility',
    'feat6_is_above_user_capacity',
    # --- Feature 7: Brand Loyalty ---
    'feat7_brand_repeat_rate',
    'feat7_brand_rank',
    'feat7_user_brand_affinity',
    # --- Feature 8: Co-purchase ---
    'feat8_co_purchase_max',
    'feat8_co_purchase_sum',
    'feat8_co_purchase_count',
]

Total customers: 2442305


In [4]:
# Score customers batch by batch so that only one batch of rows is held in
# RAM at any time.
import gc

BATCH_SIZE = 100000  # customers per batch (tune to the available RAM)
n_batches = (len(unique_customers) + BATCH_SIZE - 1) // BATCH_SIZE  # ceil division

print(f"Processing {len(unique_customers)} customers in {n_batches} batches")

# Only these columns are used below. Selecting them on the lazy frame before
# collect() means the query does not have to materialise any other column of
# the parquet file, which keeps each batch smaller in memory.
needed_cols = ["customer_id", "item_id", *feature_cols]

# Per-batch top-10 frames; concatenated once after the loop
all_predictions = []

for batch_idx in range(n_batches):
    print(f"\n{'='*50}")
    print(f"Processing batch {batch_idx + 1}/{n_batches}")

    # Slice of customer ids handled in this iteration
    start_idx = batch_idx * BATCH_SIZE
    end_idx = min((batch_idx + 1) * BATCH_SIZE, len(unique_customers))
    batch_customers = unique_customers[start_idx:end_idx]

    # Pull only this batch's rows (and only the needed columns) into memory
    df_batch = (
        df_lazy
        .select(needed_cols)
        .filter(pl.col("customer_id").is_in(batch_customers))
        .collect()
    )

    print(f"Batch size: {len(df_batch)} rows for {len(batch_customers)} customers")

    # Feature matrix in the column order given by feature_cols
    X_batch = df_batch.select(feature_cols).to_pandas()

    # Column 1 of predict_proba is used as the ranking score
    y_pred_proba = model.predict_proba(X_batch)[:, 1]

    # Attach the score to the batch rows
    df_batch = df_batch.with_columns(pl.Series("pred_score", y_pred_proba))

    # Sort by score within each customer, then keep the first 10 items per
    # customer. group_by keeps the row order inside each group, so head(10)
    # returns the 10 highest-scored items.
    top_predictions_batch = (
        df_batch.sort(["customer_id", "pred_score"], descending=[False, True])
        .group_by("customer_id")
        .agg(pl.col("item_id").head(10).alias("item_id"))
    )

    all_predictions.append(top_predictions_batch)

    # Drop the large per-batch objects before loading the next batch
    del df_batch, X_batch, y_pred_proba, top_predictions_batch
    gc.collect()

    print(f"Batch {batch_idx + 1} completed")

# Stack the per-batch results into a single frame
print(f"\n{'='*50}")
print("Merging all predictions...")
top_predictions = pl.concat(all_predictions)

print(f"Total customers with predictions: {len(top_predictions)}")
print(top_predictions.head())

Processing 2442305 customers in 25 batches

Processing batch 1/25
Batch size: 10000681 rows for 100000 customers
Batch 1 completed

Processing batch 2/25
Batch size: 10002744 rows for 100000 customers
Batch 2 completed

Processing batch 3/25
Batch size: 10008220 rows for 100000 customers
Batch 3 completed

Processing batch 4/25
Batch size: 10008190 rows for 100000 customers
Batch 4 completed

Processing batch 5/25
Batch size: 10005615 rows for 100000 customers
Batch 5 completed

Processing batch 6/25
Batch size: 10002229 rows for 100000 customers
Batch 6 completed

Processing batch 7/25
Batch size: 10006186 rows for 100000 customers
Batch 7 completed

Processing batch 8/25
Batch size: 10004131 rows for 100000 customers
Batch 8 completed

Processing batch 9/25
Batch size: 10003239 rows for 100000 customers
Batch 9 completed

Processing batch 10/25
Batch size: 10004926 rows for 100000 customers
Batch 10 completed

Processing batch 11/25
Batch size: 10004452 rows for 100000 customers
Batc

In [5]:
# Convert the per-customer top items to a JSON mapping and write it to disk
import json

# customer id -> list of item ids; every id is stored as a string
predictions_dict = {
    str(record["customer_id"]): list(map(str, record["item_id"]))
    for record in top_predictions.iter_rows(named=True)
}

# Write the mapping to pred.json
with open("pred.json", "w") as out_file:
    json.dump(predictions_dict, out_file, indent=2)

print(f"Đã lưu predictions cho {len(predictions_dict)} customers vào pred.json")


Đã lưu predictions cho 2442305 customers vào pred.json


In [2]:
def precision_at_k(pred, gt, hist, filter_bought_items=True, K=10):
    """Compute mean precision@K over the users that can be evaluated.

    Args:
        pred: mapping user -> ranked sequence of predicted items.
        gt: mapping user -> iterable of ground-truth items.
        hist: mapping user -> iterable of items from the user's history.
        filter_bought_items: when True, items present in ``hist[user]`` are
            removed from the user's relevant set before hits are counted.
        K: number of leading predictions considered per user. The hit count
            is always divided by ``K``, even if fewer than ``K`` predictions
            exist for a user.

    Returns:
        A tuple ``(mean_precision, cold_start_users)``. Users found in ``gt``
        but missing from ``hist`` or ``pred`` are skipped and returned in
        ``cold_start_users``. ``mean_precision`` is NaN when no user could be
        evaluated.

    Raises:
        ValueError: if ``K`` is not a positive number.
    """
    if K <= 0:
        # K == 0 would divide by zero; a negative K would slice from the end
        raise ValueError(f"K must be positive, got {K}")

    precisions = []
    cold_start_users = []
    for user, gt_items in gt.items():
        if (user not in hist) or (user not in pred):
            # NOTE: widening the HISTORY range might reduce cold-start users
            cold_start_users.append(user)
            continue
        relevant_items = set(gt_items)
        if filter_bought_items:
            relevant_items -= set(hist[user])
        # Precision@K: distinct top-K predictions that are relevant, over K
        hits = len(set(pred[user][:K]) & relevant_items)
        precisions.append(hits / K)

    if not precisions:
        # np.mean([]) also yields NaN but emits a RuntimeWarning
        return float("nan"), cold_start_users
    return np.mean(precisions), cold_start_users

In [3]:
# Load các argument để tính precision
import json
import joblib
import polars as pl

# 1. Predictions previously written to pred.json
with open("pred.json", "r") as pred_file:
    pred = json.load(pred_file)

# 2. Ground truth
gt = joblib.load("train_data/groundtruth.pkl")

# 3. Purchase history from processed_purchase.parquet
purchase_df = pl.read_parquet("processed_data/processed_purchase.parquet")

# One row per customer, holding the list of all item_ids in that customer's rows
hist_df = purchase_df.group_by("customer_id").agg(
    pl.col("item_id").alias("item_ids")
)

# History as a dictionary: customer id -> list of item ids, all as strings
hist = {
    str(record["customer_id"]): [str(item) for item in record["item_ids"]]
    for record in hist_df.iter_rows(named=True)
}

print(f"Loaded {len(pred)} predictions")
print(f"Loaded {len(gt)} ground truth customers")
print(f"Loaded {len(hist)} customers with history")


Loaded 2442305 predictions
Loaded 391900 ground truth customers
Loaded 2442305 customers with history


In [4]:
# Stringify gt keys and items so they compare equal to the string ids
# used in pred and hist
gt = {
    str(user_id): list(map(str, items))
    for user_id, items in gt.items()
}

In [6]:
import numpy as np
# Compute precision@10; items found in a user's history are excluded from
# the relevant set (filter_bought_items=True)
precision, cold_start_users = precision_at_k(pred, gt, hist, filter_bought_items=True, K=10)

print(f"Precision@10: {precision:.4f}")
print(f"Number of cold start users: {len(cold_start_users)}")


Precision@10: 0.0158
Number of cold start users: 58180
