# Thư viện

In [1]:
import polars as pl
import numpy as np
import datetime

# Tải dữ liệu

In [2]:
user_path = "processed_data/processed_user.parquet"
item_path = "processed_data/processed_item.parquet"
purchase_path = "processed_data/processed_purchase.parquet"

user_df = pl.scan_parquet(user_path)
item_df = pl.scan_parquet(item_path)
purchase_df = pl.scan_parquet(purchase_path)

# Hàm tạo dữ liệu base

In [None]:
import polars as pl
import numpy as np

def createBaseData(user_df: pl.LazyFrame, item_df: pl.LazyFrame, purchase_df: pl.LazyFrame, begin_hist, end_hist, begin_recent, end_recent, k, seed):
    # ---------------------------------------------------------
    # 1. TẠO POSITIVE
    # ---------------------------------------------------------
    positive_df = (
        purchase_df
        .filter(pl.col('datetime').is_between(begin_recent, end_recent))
        .unique(subset=['customer_id', 'item_id'], keep='first')
        .with_columns(pl.lit(1).alias('Y'))
        .select(['customer_id', 'item_id', 'Y'])
    )
    
    # Collect
    positive_df = positive_df.collect() 
    n_positive = positive_df.height
    
    # Danh sách Active Users
    active_user_ids = positive_df.get_column("customer_id").unique().to_numpy()

    # ---------------------------------------------------------
    # 2. TẠO NEGATIVE LOẠI 1: HISTORY (Món cũ đã chán)
    # ---------------------------------------------------------
    hist_candidates = (
        purchase_df
        .filter(pl.col('datetime').is_between(begin_hist, end_hist))
        .filter(pl.col('customer_id').is_in(active_user_ids))
        .unique(subset=['customer_id', 'item_id'])
        .join(positive_df.lazy().select(['customer_id', 'item_id']), on=['customer_id', 'item_id'], how='anti')
        
        # --- QUAN TRỌNG: CHỈ LẤY 2 CỘT ID ĐỂ KHỚP VỚI HARD NEGATIVE ---
        .select(['customer_id', 'item_id']) 
    )
    
    n_neg_hist_needed = int((k / 2) * n_positive)
    hist_candidates_collected = hist_candidates.collect()
    
    # Sampling History
    if hist_candidates_collected.height > n_neg_hist_needed:
        np.random.seed(seed)
        indices = np.random.choice(hist_candidates_collected.height, size=n_neg_hist_needed, replace=False)
        negative_hist_df = hist_candidates_collected[indices]
    else:
        negative_hist_df = hist_candidates_collected

    # ---------------------------------------------------------
    # 3. TẠO NEGATIVE LOẠI 2: HARD POPULAR (Direct Sampling)
    # ---------------------------------------------------------
    top_50_item_ids = (
        purchase_df
        .filter(pl.col('datetime').is_between(begin_hist, end_hist))
        .group_by('item_id')
        .agg(pl.len().alias('cnt'))
        .sort('cnt', descending=True)
        .head(50)
        .select('item_id')
        .collect()
        .get_column('item_id')
        .to_numpy()
    )

    n_neg_hard_needed = max(0, int((k * n_positive) - negative_hist_df.height))
    
    if n_neg_hard_needed > 0:
        safety_factor = 1.2 
        n_generate = int(n_neg_hard_needed * safety_factor)
        
        np.random.seed(seed + 1)
        rand_users = np.random.choice(active_user_ids, size=n_generate)
        rand_items = np.random.choice(top_50_item_ids, size=n_generate)
        
        pop_candidates_df = pl.DataFrame({
            "customer_id": rand_users,
            "item_id": rand_items
        })
        
        # Đảm bảo schema khớp với negative_hist_df (chỉ 2 cột)
        pop_candidates_df = pop_candidates_df.select(['customer_id', 'item_id'])
        
        negative_hard_df = (
            pop_candidates_df.lazy()
            .unique()
            .join(positive_df.lazy().select(['customer_id', 'item_id']), on=['customer_id', 'item_id'], how='anti')
            .head(n_neg_hard_needed) 
            .collect()
        )
    else:
        # Tạo bảng rỗng với schema đúng (2 cột)
        negative_hard_df = pl.DataFrame(schema={"customer_id": positive_df.schema["customer_id"], "item_id": positive_df.schema["item_id"]})

    # ---------------------------------------------------------
    # 4. GỘP LẠI VÀ XUẤT FILE
    # ---------------------------------------------------------
    final_negative_df = pl.concat([negative_hist_df, negative_hard_df], how="vertical").with_columns(pl.lit(0).alias('Y'))
    
    print(f"✅ Đã tạo dữ liệu (Fixed ShapeError):")
    print(f"   - Positive: {n_positive}")
    print(f"   - Negative History: {negative_hist_df.height}")
    print(f"   - Negative Hard: {negative_hard_df.height}")
    print(f"   - Tổng mẫu: {n_positive + final_negative_df.height}")
    
    pl.concat([positive_df.lazy(), final_negative_df.lazy()]) \
      .sort(['customer_id', 'item_id']) \
      .sink_parquet("train_data/base.parquet")

    return True

# Tạo dữ liệu train với k = 6, seed = 42
### Nhãn negative gấp 6 lần positive
### Nhãn positive từ tháng 1/2025

In [5]:
begin_hist = datetime.datetime(
    day=1,
    month=1,
    year=2024
)
end_hist = datetime.datetime(
    day=31,
    month=12,
    year=2024
)
begin_recent = datetime.datetime(
    day=1,
    month=1,
    year=2025
)
end_recent = datetime.datetime(
    day=30,
    month=1,
    year=2025
)

createBaseData(user_df, item_df, purchase_df, begin_hist, end_hist, begin_recent, end_recent, k=6, seed=42)

✅ Đã tạo dữ liệu (Fixed ShapeError):
   - Positive: 2912531
   - Negative History: 8737593
   - Negative Hard: 8737593
   - Tổng mẫu: 20387717


True

# Xem thử dữ liệu

In [6]:
base_df = pl.scan_parquet("train_data/base.parquet")

In [7]:
base_df.collect().n_unique("customer_id")

648833

In [8]:
base_df.filter(
    pl.col('Y') == 1
).collect()

customer_id,item_id,Y
i32,str,i32
28879,"""0007090000157""",1
28879,"""0029130000030""",1
28879,"""2678000000002""",1
28879,"""2700000000002""",1
28879,"""3052000000001""",1
…,…,…
8296522,"""1771000000002""",1
8296522,"""4603024000001""",1
8296522,"""5950000000001""",1
8296522,"""6498000000007""",1


In [9]:
base_df.filter(
    pl.col('Y') == 0
).collect()

customer_id,item_id,Y
i32,str,i32
28879,"""0020010000210""",0
28879,"""0020130000004""",0
28879,"""0029130000029""",0
28879,"""0068000000033""",0
28879,"""0068000000159""",0
…,…,…
8296523,"""2803000000011""",0
8296523,"""3880000000002""",0
8296523,"""4603024000001""",0
8296523,"""4950000000001""",0
