In [1]:
import polars as pl
import numpy as np
import scipy.sparse as sparse
from datetime import datetime

# Configuration
FILE_PATH = "processed_data/processed_purchase.parquet"
ALPHA = 0.01  # time-decay coefficient, multiplied by a purchase's age in days

print("--- B∆Ø·ªöC 1: KH·ªûI T·∫†O LAZY FRAME ---")

# Lazily scan the parquet file; nothing is read until a query is collected.
q = pl.scan_parquet(FILE_PATH)

# Reference date for the decay: the largest value in the "date" column.
max_date = q.select(pl.max("date")).collect().item()

print(f"ƒê√£ load LazyFrame. Ng√†y m·ªëc (Max Date): {max_date}")

--- B∆Ø·ªöC 1: KH·ªûI T·∫†O LAZY FRAME ---
ƒê√£ load LazyFrame. Ng√†y m·ªëc (Max Date): 2025-01-30


In [2]:
print("--- B∆Ø·ªöC 2: T√çNH TO√ÅN TIME DECAY (LAZY) ---")

# Each stage lives in its own with_columns call because it reads the column
# produced by the previous stage. The plan stays lazy; nothing runs here.
q_weighted = (
    q
    # 1. Age of each purchase in whole days: max_date - date.
    .with_columns(days_ago=(max_date - pl.col("date")).dt.total_days())
    # 2. Decay factor 1 / (1 + ALPHA * days_ago): 1.0 on max_date, smaller for older rows.
    .with_columns(decay_factor=1.0 / (1.0 + ALPHA * pl.col("days_ago")))
    # 3. Final interaction weight: purchased quantity scaled by the decay factor.
    .with_columns(weighted_quantity=pl.col("quantity") * pl.col("decay_factor"))
)

--- B∆Ø·ªöC 2: T√çNH TO√ÅN TIME DECAY (LAZY) ---


In [3]:
print("--- B∆Ø·ªöC 3: GOM NH√ìM & TH·ª∞C THI (COLLECT) ---")

# One row per (customer, item) pair holding the summed decayed quantity.
# collect() is the point where the lazy plan is actually executed.
df_grouped = (
    q_weighted
    .group_by("customer_id", "item_id")
    .agg(pl.sum("weighted_quantity"))
    .collect()
)

print(f"D·ªØ li·ªáu sau khi g·ªôp: {df_grouped.height} d√≤ng")
df_grouped.head(5)

--- B∆Ø·ªöC 3: GOM NH√ìM & TH·ª∞C THI (COLLECT) ---
D·ªØ li·ªáu sau khi g·ªôp: 26790043 d√≤ng


customer_id,item_id,weighted_quantity
i32,str,f64
3322262,"""0020090000043""",0.204499
5515344,"""3052000000001""",0.472813
7157628,"""0064000000006""",0.228311
7833907,"""0199000000002""",0.8
1343940,"""3500000000069""",0.389105


In [4]:
print("--- B∆Ø·ªöC 4: MAPPING ID -> INDEX ---")

# 1. Distinct users and items, sorted so the assigned indices are stable
#    from one run to the next.
unique_users = df_grouped.select(pl.col("customer_id").unique().sort())
unique_items = df_grouped.select(pl.col("item_id").unique().sort())

# 2. The row number of each sorted ID becomes its matrix index.
user_map_df = unique_users.with_row_index(name="user_index")
item_map_df = unique_items.with_row_index(name="item_index")

# 3. Attach both indices to every (customer, item) row.
df_final = (
    df_grouped
    .join(user_map_df, on="customer_id", how="left")
    .join(item_map_df, on="item_id", how="left")
)

# 4. Reverse lookups: index -> original ID.
user_lookup = dict(user_map_df.select(["user_index", "customer_id"]).iter_rows())
item_lookup = dict(item_map_df.select(["item_index", "item_id"]).iter_rows())

print("Mapping ho√†n t·∫•t.")
df_final.select(["user_index", "item_index", "weighted_quantity"]).head()

--- B∆Ø·ªöC 4: MAPPING ID -> INDEX ---
Mapping ho√†n t·∫•t.


user_index,item_index,weighted_quantity
u32,u32,f64
517449,2191,0.204499
966896,9919,0.472813
1492352,3006,0.228311
2135505,3603,0.8
188827,12149,0.389105


In [5]:
print("--- B∆Ø·ªöC 5: T·∫†O MA TR·∫¨N CSR (SCIPY) ---")

# Pull the (row, col, value) triplets out of Polars as NumPy arrays.
rows = df_final.get_column("user_index").to_numpy()         # matrix rows are users
cols = df_final.get_column("item_index").to_numpy()         # matrix columns are items
data = df_final.get_column("weighted_quantity").to_numpy()  # decayed quantities

# Matrix dimensions come from the size of each ID -> index map.
n_users, n_items = user_map_df.height, item_map_df.height

# Build the user x item matrix in CSR form.
sparse_user_item = sparse.csr_matrix((data, (rows, cols)), shape=(n_users, n_items))

# Fraction of cells that hold no stored value.
sparsity = 1.0 - (sparse_user_item.nnz / (n_items * n_users))

print("‚úÖ HO√ÄN TH√ÄNH! Ma tr·∫≠n s·∫µn s√†ng.")
print(f"Shape: {sparse_user_item.shape}")
print(f"Sparsity: {sparsity:.6f}")

--- B∆Ø·ªöC 5: T·∫†O MA TR·∫¨N CSR (SCIPY) ---
‚úÖ HO√ÄN TH√ÄNH! Ma tr·∫≠n s·∫µn s√†ng.
Shape: (2569977, 21095)
Sparsity: 0.999506


In [6]:
import implicit
import tqdm as tqdm

print("--- B∆Ø·ªöC 6: HU·∫§N LUY·ªÜN MODEL ---")

knn = implicit.nearest_neighbours

# Settings shared by both recommenders.
# K is the number of nearest neighbours kept per item when computing similarity;
# the author's guidance is to keep it larger than the N requested from recommend().
# num_threads=0 is intended to let the library choose the thread count automatically.
knn_params = {"K": 100, "num_threads": 0}

# 1. TF-IDF weighted item-item recommender, fitted on the user x item matrix.
model_tfidf = knn.TFIDFRecommender(**knn_params)
model_tfidf.fit(sparse_user_item)
print("‚úÖ ƒê√£ train xong TF-IDF")

# 2. Cosine-similarity item-item recommender, fitted on the same matrix.
model_cosine = knn.CosineRecommender(**knn_params)
model_cosine.fit(sparse_user_item)
print("‚úÖ ƒê√£ train xong Cosine")

--- B∆Ø·ªöC 6: HU·∫§N LUY·ªÜN MODEL ---


  from .autonotebook import tqdm as notebook_tqdm
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21095/21095 [00:01<00:00, 18338.09it/s]


‚úÖ ƒê√£ train xong TF-IDF


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21095/21095 [00:01<00:00, 17567.33it/s]

‚úÖ ƒê√£ train xong Cosine





In [7]:
print("--- B∆Ø·ªöC 7: T·∫†O CANDIDATE (BATCH PROCESSING) ---")

def generate_candidates(model, model_name, sparse_user_item, user_indices, N=50, batch_size=1000):
    """
    Generate top-N item candidates for a list of users as a Polars DataFrame.

    Users are processed in slices of ``batch_size`` so that only one batch of
    recommendations is held as a 2-D array at a time. Per-batch results are
    kept as NumPy arrays and concatenated once at the end.

    Parameters
    ----------
    model
        Fitted recommender exposing
        ``recommend(userids, user_items, N=..., filter_already_liked_items=...)``
        and returning ``(ids, scores)`` as 2-D arrays (one row per user).
    model_name : str
        Label shown in the progress bar.
    sparse_user_item
        User x item matrix that can be row-indexed with an array of user indices.
    user_indices
        1-D sequence of user row indices to generate candidates for.
    N : int
        Number of items requested per user.
    batch_size : int
        Number of users passed to ``model.recommend`` per call.

    Returns
    -------
    pl.DataFrame
        Columns ``user_index`` and ``item_index``, one row per recommended
        pair. Rows with a negative ``item_index`` (placeholder for "no item")
        are removed. Scores are not part of the result.
    """
    user_indices = np.asarray(user_indices)
    user_chunks = []
    item_chunks = []

    for start in tqdm.tqdm(range(0, len(user_indices), batch_size), desc=f"Running {model_name}"):
        # Slicing clamps at the end of the array, so the last batch may be shorter.
        batch_users = user_indices[start:start + batch_size]

        # filter_already_liked_items=False keeps already-purchased items in the
        # output so that repeat purchases can still be candidates.
        # Scores are returned by the model but are not needed downstream.
        ids, _scores = model.recommend(
            batch_users,
            sparse_user_item[batch_users],
            N=N,
            filter_already_liked_items=False,
        )
        ids = np.asarray(ids)

        # Repeat each user once per returned column (not a hard-coded N) so the
        # user and item arrays stay aligned even if fewer than N columns come back.
        user_chunks.append(np.repeat(batch_users, ids.shape[1]))
        item_chunks.append(ids.ravel())

    if user_chunks:
        users = np.concatenate(user_chunks)
        items = np.concatenate(item_chunks)
    else:
        # No users requested: return an empty, correctly named frame.
        users = np.empty(0, dtype=np.int32)
        items = np.empty(0, dtype=np.int32)

    # Drop negative item indices, which mark slots with no item.
    return pl.DataFrame({
        "user_index": users,
        "item_index": items,
    }).filter(pl.col("item_index") >= 0)

# Every user row index in the matrix (n_users was set when the CSR matrix was built).
all_user_indices = np.arange(n_users)

# 1. Candidates from the TF-IDF model.
df_tfidf = generate_candidates(
    model=model_tfidf,
    model_name="tfidf",
    sparse_user_item=sparse_user_item,
    user_indices=all_user_indices,
    N=50,
)

# 2. Candidates from the Cosine model.
df_cosine = generate_candidates(
    model=model_cosine,
    model_name="cosine",
    sparse_user_item=sparse_user_item,
    user_indices=all_user_indices,
    N=50,
)

print("Sample TF-IDF Result:")
df_tfidf.head()

--- B∆Ø·ªöC 7: T·∫†O CANDIDATE (BATCH PROCESSING) ---


Running tfidf: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2570/2570 [01:58<00:00, 21.73it/s]
Running cosine: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2570/2570 [02:08<00:00, 20.01it/s]


Sample TF-IDF Result:


user_index,item_index
i32,i32
0,7275
0,18708
0,2613
0,2612
0,9529


---
## ✅ LƯU KẾT QUẢ BƯỚC 7 - Sau đó có thể **RESTART KERNEL**

In [8]:
# Checkpoint everything later steps need, so the kernel can be restarted:
# the two candidate frames first, then the ID maps and the grouped interactions.
step7_outputs = {
    "temp_df_tfidf.parquet": df_tfidf,
    "temp_df_cosine.parquet": df_cosine,
    "temp_user_map.parquet": user_map_df,
    "temp_item_map.parquet": item_map_df,
    "temp_df_grouped.parquet": df_grouped,
}
for parquet_path, frame in step7_outputs.items():
    frame.write_parquet(parquet_path)

print("‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ B∆∞·ªõc 7. B·∫°n c√≥ th·ªÉ RESTART KERNEL v√† ch·∫°y cell ti·∫øp theo.")

‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ B∆∞·ªõc 7. B·∫°n c√≥ th·ªÉ RESTART KERNEL v√† ch·∫°y cell ti·∫øp theo.


---
## 🔵 LOAD LẠI DỮ LIỆU TỪ BƯỚC 7 (Chạy cell này sau khi RESTART KERNEL)

In [1]:
import polars as pl

# Reload the candidate frames written at the end of step 7.
df_tfidf, df_cosine = (
    pl.read_parquet(path)
    for path in ("temp_df_tfidf.parquet", "temp_df_cosine.parquet")
)

# Reload the ID maps and the grouped interactions.
user_map_df, item_map_df, df_grouped = (
    pl.read_parquet(path)
    for path in (
        "temp_user_map.parquet",
        "temp_item_map.parquet",
        "temp_df_grouped.parquet",
    )
)

print("‚úÖ ƒê√£ load l·∫°i d·ªØ li·ªáu t·ª´ B∆∞·ªõc 7.")

‚úÖ ƒê√£ load l·∫°i d·ªØ li·ªáu t·ª´ B∆∞·ªõc 7.


In [2]:
print("--- B∆Ø·ªöC 8: G·ªòP K·∫æT QU·∫¢ (CONCAT) + TOP 50 POPULAR ITEMS ---")

index_cols = ["user_index", "item_index"]

# Union of both models' candidates, de-duplicated per (user, item) pair.
# Both index columns are cast to UInt32 so they line up with the map tables.
df_candidates = (
    pl.concat([df_tfidf, df_cosine])
    .unique(subset=index_cols)
    .with_columns(pl.col(index_cols).cast(pl.UInt32))
)

# The 50 items with the largest total weighted quantity, as item indices.
item_popularity = df_grouped.group_by("item_id").agg(pl.sum("weighted_quantity"))
top_items = (
    item_popularity
    .sort("weighted_quantity", descending=True)
    .head(50)
    .join(item_map_df, on="item_id")
    .select("item_index")
)

# Pair every user with every popular item, then merge into the candidate set.
df_popular = user_map_df.select("user_index").join(top_items, how="cross")
df_candidates = pl.concat([df_candidates, df_popular]).unique(subset=index_cols)

print(f"T·ªïng s·ªë d√≤ng Candidate: {df_candidates.height}")

--- B∆Ø·ªöC 8: G·ªòP K·∫æT QU·∫¢ (CONCAT) + TOP 50 POPULAR ITEMS ---
T·ªïng s·ªë d√≤ng Candidate: 256561832


---
## ✅ LƯU KẾT QUẢ BƯỚC 8 - Sau đó có thể **RESTART KERNEL**

In [None]:
# Checkpoint the merged candidate set so the kernel can be restarted before step 9.
candidates_path = "temp_df_candidates.parquet"
df_candidates.write_parquet(candidates_path)

print("‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ B∆∞·ªõc 8. B·∫°n c√≥ th·ªÉ RESTART KERNEL v√† ch·∫°y cell ti·∫øp theo.")

‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ B∆∞·ªõc 8. B·∫°n c√≥ th·ªÉ RESTART KERNEL v√† ch·∫°y cell ti·∫øp theo.


---
## 🔵 LOAD LẠI DỮ LIỆU TỪ BƯỚC 8 (Chạy cell này sau khi RESTART KERNEL)

In [1]:
import polars as pl

# Reload the merged candidate set written at the end of step 8.
df_candidates = pl.read_parquet("temp_df_candidates.parquet")

# Reload the index -> ID maps.
user_map_df, item_map_df = (
    pl.read_parquet(path)
    for path in ("temp_user_map.parquet", "temp_item_map.parquet")
)

print("‚úÖ ƒê√£ load l·∫°i d·ªØ li·ªáu t·ª´ B∆∞·ªõc 8.")

‚úÖ ƒê√£ load l·∫°i d·ªØ li·ªáu t·ª´ B∆∞·ªõc 8.


In [2]:
print("--- B∆Ø·ªöC 9: MAPPING NG∆Ø·ª¢C V·ªÄ ID TH·∫¨T ---")

# user_map_df has columns [user_index, customer_id];
# item_map_df has columns [item_index, item_id].
# The index columns are cast to UInt32 so the join keys match df_candidates.
df_final = (
    df_candidates
    # 1. Look up the real customer ID for each user index.
    .join(
        user_map_df.with_columns(pl.col("user_index").cast(pl.UInt32)),
        on="user_index",
        how="left",
    )
    # 2. Look up the real item ID for each item index.
    .join(
        item_map_df.with_columns(pl.col("item_index").cast(pl.UInt32)),
        on="item_index",
        how="left",
    )
    # 3. Keep only the real IDs.
    .select(["customer_id", "item_id"])
)

# Persist first, so the completion message is only printed once the file exists.
df_final.write_parquet("train_data/inference_base.parquet")

print("‚úÖ HO√ÄN TH√ÄNH T·∫¨P CANDIDATE!")

# Must be the last expression in the cell, otherwise the notebook discards it.
df_final.head(10)

--- B∆Ø·ªöC 9: MAPPING NG∆Ø·ª¢C V·ªÄ ID TH·∫¨T ---
‚úÖ HO√ÄN TH√ÄNH T·∫¨P CANDIDATE!
