# Thư viện

In [1]:
import datetime
import os
import warnings

import numpy as np
import polars as pl

# Tải dữ liệu

In [2]:
# Locations of the preprocessed parquet inputs
user_path, item_path, purchase_path = (
    "processed_data/processed_user.parquet",
    "processed_data/processed_item.parquet",
    "processed_data/processed_purchase.parquet",
)

# Open each file as a lazy scan (same order as the paths above)
user_df, item_df, purchase_df = (
    pl.scan_parquet(parquet_path)
    for parquet_path in (user_path, item_path, purchase_path)
)

# Hàm tạo dữ liệu base

In [3]:
def createBaseData(user_df: pl.LazyFrame, item_df: pl.LazyFrame, purchase_df: pl.LazyFrame, begin_hist, end_hist, begin_recent, end_recent, k, seed, output_path="train_data/base.parquet"):
    """Build the labelled (customer_id, item_id, Y) base table and write it to parquet.

    Positives (Y = 1) are the distinct customer/item pairs purchased between
    ``begin_recent`` and ``end_recent``. Negatives (Y = 0) are sampled without
    replacement from the distinct pairs purchased between ``begin_hist`` and
    ``end_hist`` that do not appear among the positives.

    Args:
        user_df: Lazy user table. Not used by this function; kept so existing
            callers keep working.
        item_df: Lazy item table. Not used by this function; kept so existing
            callers keep working.
        purchase_df: Lazy purchase log with ``customer_id``, ``item_id`` and
            ``datetime`` columns.
        begin_hist: Lower bound of the history window (negative candidates).
        end_hist: Upper bound of the history window.
        begin_recent: Lower bound of the recent window (positives).
        end_recent: Upper bound of the recent window.
        k: Requested number of negatives per positive; must be non-negative.
        seed: Seed for the negative sampling.
        output_path: Destination parquet file. Defaults to the original
            hard-coded location.

    Returns:
        None. The result is written to ``output_path``, sorted by
        ``customer_id`` and ``item_id``.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    pair_cols = ['customer_id', 'item_id']

    # Positive labels: one row per distinct pair bought in the recent window
    positive_df = (
        purchase_df
        .filter(
            pl.col('datetime').is_between(begin_recent, end_recent)
        )
        .unique(
            subset=pair_cols,
            keep='first'
        )
        .with_columns(
            pl.lit(1).alias('Y')
        )
        .select(
            pl.col(pair_cols + ['Y'])
        )
    )
    # Number of positive samples
    n_positive = positive_df.select(pl.len()).collect().item()

    # Negative candidates: pairs bought in the history window but not in the recent one.
    # The final sort fixes the row order so that positional sampling is reproducible.
    negative_candidates_df = (
        purchase_df
        .filter(
            pl.col('datetime').is_between(begin_hist, end_hist)
        )
        .unique(subset=pair_cols, keep='first')
        .join(
            positive_df.select(pair_cols),
            on=pair_cols,
            how='anti'
        )
        .select(
            pl.col(pair_cols)
        )
        .sort(pair_cols)
    )
    n_negative_candidates = negative_candidates_df.select(pl.len()).collect().item()

    # Aim for k negatives per positive, but never ask for more rows than exist:
    # sampling without replacement would otherwise raise inside numpy.
    n_requested = k * n_positive
    n_negative = min(n_requested, n_negative_candidates)
    if n_negative < n_requested:
        warnings.warn(
            f"Only {n_negative_candidates} negative candidates available; "
            f"requested {n_requested}. Using all candidates."
        )

    # A local RandomState yields the same draw as np.random.seed(seed) followed by
    # np.random.choice, without touching the global numpy RNG state.
    if n_negative > 0:
        rng = np.random.RandomState(seed)
        indices = np.sort(
            rng.choice(n_negative_candidates, size=n_negative, replace=False)
        ).tolist()
    else:
        indices = []

    # Negative labels: the sampled candidate rows tagged with Y = 0
    negative_df = (
        negative_candidates_df
        .with_columns(
            pl.lit(0).alias('Y')
        )
        .select(
            pl.all().gather(indices)
        )
    )

    # Make sure the destination directory exists before writing
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Stack positives and negatives, sort, and write the file
    pl.concat(items=[positive_df, negative_df]).sort(pair_cols).sink_parquet(output_path)

# Tạo dữ liệu train với k = 3, seed = 42
### Số mẫu negative gấp 3 lần số mẫu positive (lấy từ lịch sử mua hàng tháng 1-10/2024)
### Nhãn positive từ tháng 11-12/2024

In [4]:
# History window (source of negative candidates): 1 Jan 2024 through the end of 31 Oct 2024.
begin_hist = datetime.datetime(year=2024, month=1, day=1)
# createBaseData filters with is_between() on the exact timestamp, so a midnight upper
# bound would drop purchases made during the last day. Use the last microsecond of the
# day instead. (If the `datetime` column is date-only this selects the same rows.)
end_hist = datetime.datetime(
    year=2024, month=10, day=31,
    hour=23, minute=59, second=59, microsecond=999999,
)

# Recent window (source of positives): 1 Nov 2024 through the end of 31 Dec 2024.
begin_recent = datetime.datetime(year=2024, month=11, day=1)
end_recent = datetime.datetime(
    year=2024, month=12, day=31,
    hour=23, minute=59, second=59, microsecond=999999,
)

createBaseData(user_df, item_df, purchase_df, begin_hist, end_hist, begin_recent, end_recent, k=3, seed=42)

# Xem thử dữ liệu

In [8]:
# Reopen the base table written by createBaseData as a lazy scan for inspection
base_df = pl.scan_parquet("train_data/base.parquet")

In [11]:
# Show the positive rows (Y == 1) of the base table
base_df.filter(pl.col('Y').eq(1)).collect()

customer_id,item_id,Y
i32,str,i32
14732,"""1386000000008""",1
14732,"""5468000000001""",1
17212,"""3775000000003""",1
17286,"""2005000000004""",1
17286,"""2155000000024""",1
…,…,…
8205007,"""1667000000003""",1
8205007,"""3669000000001""",1
8205007,"""3670000000001""",1
8205007,"""5952000000002""",1


In [12]:
# Show the negative rows (Y == 0) of the base table
base_df.filter(pl.col('Y').eq(0)).collect()

customer_id,item_id,Y
i32,str,i32
14732,"""0029140000032""",0
15126,"""1237000000008""",0
15126,"""1606000000012""",0
15126,"""2006000000006""",0
15126,"""2278000000038""",0
…,…,…
8003804,"""3523000000152""",0
8003804,"""3944000000408""",0
8003806,"""1386000000005""",0
8003807,"""1974000000006""",0
