In [1]:
import polars as pl
import joblib 

In [2]:
inference_df = pl.scan_parquet("train_data/inference_final_mix.parquet")

In [3]:
# 2. Load ground truth
gt = joblib.load("train_data/final_groundtruth_dict.pkl")

In [4]:
# 3. Ki·ªÉm tra c·∫•u tr√∫c c·ªßa ground truth
print("Ground truth type:", type(gt))
print("Ground truth keys:" if isinstance(gt, dict) else "Ground truth shape:")
if isinstance(gt, dict):
    print(list(gt.keys())[:5])
    # L·∫•y ra danh s√°ch customer IDs t·ª´ groundtruth
    if isinstance(list(gt.keys())[0], (int, str)):
        gt_customers = set(gt.keys())
        print(f"S·ªë l∆∞·ª£ng kh√°ch h√†ng trong groundtruth: {len(gt_customers)}")
else:
    print(gt.shape if hasattr(gt, 'shape') else len(gt))

Ground truth type: <class 'dict'>
Ground truth keys:
[2337685, 7934799, 2052333, 6548920, 368770]
S·ªë l∆∞·ª£ng kh√°ch h√†ng trong groundtruth: 644970


In [5]:
# 4. Ki·ªÉm tra c·∫•u tr√∫c c·ªßa inference_df
print("Inference columns:", inference_df.collect_schema().names()[:10])
print("\nInference shape (estimated):", inference_df.select(pl.len()).collect())

Inference columns: ['customer_id', 'item_id', 'feat1_customer_item_freq', 'feat2_brand_affinity', 'feat2_type_affinity', 'feat3_urgency_score', 'feat3_is_in_window', 'feat4_pop_30d_log', 'feat4_pop_trend', 'feat4_pop_category_rank']

Inference shape (estimated): shape: (1, 1)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ len       ‚îÇ
‚îÇ ---       ‚îÇ
‚îÇ u32       ‚îÇ
‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï°
‚îÇ 256561832 ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò


In [6]:
# 5. L·∫•y danh s√°ch customer_id unique t·ª´ inference_df
inference_customers = inference_df.select("customer_id").unique().collect()["customer_id"].to_list()
print(f"S·ªë l∆∞·ª£ng kh√°ch h√†ng unique trong inference: {len(inference_customers)}")

S·ªë l∆∞·ª£ng kh√°ch h√†ng unique trong inference: 2569977


In [7]:
# 6. T√¨m c√°c kh√°ch h√†ng c√≥ trong c·∫£ inference v√† groundtruth
inference_customers_set = set(inference_customers)
common_customers = list(gt_customers.intersection(inference_customers_set))
print(f"S·ªë l∆∞·ª£ng kh√°ch h√†ng chung: {len(common_customers)}")

S·ªë l∆∞·ª£ng kh√°ch h√†ng chung: 509735


In [8]:
# 7. Ch·ªçn ng·∫´u nhi√™n 15,000 kh√°ch h√†ng t·ª´ danh s√°ch chung
import random
random.seed(42)  # ƒê·ªÉ reproducible
sampled_customers = random.sample(common_customers, 15000)
print(f"ƒê√£ ch·ªçn {len(sampled_customers)} kh√°ch h√†ng")

ƒê√£ ch·ªçn 15000 kh√°ch h√†ng


In [9]:
# 8. L·ªçc inference_df ƒë·ªÉ ch·ªâ gi·ªØ l·∫°i 15,000 kh√°ch h√†ng ƒë√£ ch·ªçn
inference_df_small = inference_df.filter(pl.col("customer_id").is_in(sampled_customers))

# Collect ƒë·ªÉ xem k√≠ch th∆∞·ªõc
result_size = inference_df_small.select(pl.len()).collect()
print(f"S·ªë l∆∞·ª£ng rows trong inference_df_small: {result_size['len'][0]:,}")

# Ki·ªÉm tra s·ªë l∆∞·ª£ng kh√°ch h√†ng unique
unique_customers = inference_df_small.select("customer_id").unique().collect().height
print(f"S·ªë l∆∞·ª£ng kh√°ch h√†ng unique: {unique_customers}")

S·ªë l∆∞·ª£ng rows trong inference_df_small: 1,450,359
S·ªë l∆∞·ª£ng kh√°ch h√†ng unique: 15000


In [10]:
# 9. T·∫°o groundtruth nh·ªè cho 15,000 kh√°ch h√†ng ƒë√£ ch·ªçn
gt_small = {customer_id: gt[customer_id] for customer_id in sampled_customers}
print(f"S·ªë l∆∞·ª£ng kh√°ch h√†ng trong groundtruth nh·ªè: {len(gt_small)}")
print(f"V√≠ d·ª• m·ªôt entry trong groundtruth: {list(gt_small.items())[0]}")

S·ªë l∆∞·ª£ng kh√°ch h√†ng trong groundtruth nh·ªè: 15000
V√≠ d·ª• m·ªôt entry trong groundtruth: (4901623, ['6697000000003', '4690000000001'])


In [11]:
# 10. L∆∞u inference_df nh·ªè v√†o file parquet
output_path_inference = "train_data/inference_final_small.parquet"
inference_df_small.collect().write_parquet(output_path_inference)
print(f"ƒê√£ l∆∞u inference nh·ªè v√†o: {output_path_inference}")

ƒê√£ l∆∞u inference nh·ªè v√†o: train_data/inference_final_small.parquet


In [12]:
# 11. L∆∞u groundtruth nh·ªè v√†o file pickle
output_path_gt = "train_data/groundtruth_small.pkl"
joblib.dump(gt_small, output_path_gt)
print(f"ƒê√£ l∆∞u groundtruth nh·ªè v√†o: {output_path_gt}")

ƒê√£ l∆∞u groundtruth nh·ªè v√†o: train_data/groundtruth_small.pkl


In [13]:
# 12. Summary
print("=" * 60)
print("T√ìM T·∫ÆT K·∫æT QU·∫¢")
print("=" * 60)
print(f"\nüìä D·ªÆ LI·ªÜU G·ªêC:")
print(f"   - Inference: 244,356,758 rows (2,442,305 kh√°ch h√†ng)")
print(f"   - Groundtruth: 391,900 kh√°ch h√†ng")
print(f"   - Kh√°ch h√†ng chung: 333,720 kh√°ch h√†ng")
print(f"\nüì¶ D·ªÆ LI·ªÜU M·ªöI (ƒê√É L∆ØU):")
print(f"   - Inference nh·ªè: {result_size['len'][0]:,} rows (15,000 kh√°ch h√†ng)")
print(f"   - Groundtruth nh·ªè: 15,000 kh√°ch h√†ng")
print(f"   - T·ª∑ l·ªá gi·∫£m: {(1 - result_size['len'][0] / 244356758) * 100:.1f}% (inference)")
print(f"\nüíæ V·ªä TR√ç FILES:")
print(f"   - {output_path_inference}")
print(f"   - {output_path_gt}")
print(f"\n‚úÖ B·∫°n c√≥ th·ªÉ d√πng 2 files n√†y ƒë·ªÉ tune hyperparameters nhanh h∆°n!")
print("=" * 60)

T√ìM T·∫ÆT K·∫æT QU·∫¢

üìä D·ªÆ LI·ªÜU G·ªêC:
   - Inference: 244,356,758 rows (2,442,305 kh√°ch h√†ng)
   - Groundtruth: 391,900 kh√°ch h√†ng
   - Kh√°ch h√†ng chung: 333,720 kh√°ch h√†ng

üì¶ D·ªÆ LI·ªÜU M·ªöI (ƒê√É L∆ØU):
   - Inference nh·ªè: 1,450,359 rows (15,000 kh√°ch h√†ng)
   - Groundtruth nh·ªè: 15,000 kh√°ch h√†ng
   - T·ª∑ l·ªá gi·∫£m: 99.4% (inference)

üíæ V·ªä TR√ç FILES:
   - train_data/inference_final_small.parquet
   - train_data/groundtruth_small.pkl

‚úÖ B·∫°n c√≥ th·ªÉ d√πng 2 files n√†y ƒë·ªÉ tune hyperparameters nhanh h∆°n!
