In [1]:
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# =============================================================================
# 1. LOAD DATA & DEFINE FEATURES
# =============================================================================

In [2]:
# Load the training table; every later cell reads from `df`.
df = pl.read_parquet("train_data/train.parquet")

# Feature columns, grouped by feature family as listed in the provided HTML
# feature description. The flattened order below is the column order of X,
# and therefore the order of the fitted model coefficients.
feature_groups = {
    # Feature 1: Frequency
    "frequency": [
        'feat1_customer_item_freq',
    ],
    # Feature 2: Recency Decay
    "recency_decay": [
        'feat2_brand_affinity',
        'feat2_type_affinity',
    ],
    # Feature 3: Urgency (Window-Based)
    "urgency": [
        'feat3_dist_to_window_center',
        'feat3_is_in_window',
    ],
    # Feature 4: Popularity
    "popularity": [
        'feat4_pop_30d_log',
        'feat4_pop_trend',
        'feat4_pop_category_rank',
        'feat4_pop_global_rank',
    ],
    # Feature 5: Baby Age Alignment
    "baby_age_alignment": [
        'feat5_score_age_end_hist',
        'feat5_score_age_midpoint',
    ],
    # Feature 6: Price Compatibility
    "price_compatibility": [
        'feat6_price_compatibility',
        'feat6_is_above_user_capacity',
    ],
    # Feature 7: Brand Loyalty
    "brand_loyalty": [
        'feat7_brand_repeat_rate',
        'feat7_brand_rank',
        'feat7_user_brand_affinity',
    ],
    # Feature 8: Co-purchase
    "co_purchase": [
        'feat8_co_purchase_max',
        'feat8_co_purchase_sum',
        'feat8_co_purchase_count',
    ],
}
# Dicts preserve insertion order, so this reproduces the flat list exactly.
feature_cols = [name for group in feature_groups.values() for name in group]

target_col = 'Y'

# Hand pandas / NumPy objects to scikit-learn: X as a DataFrame of the
# feature columns, y as a flat 1-D array of the target column.
X = df.select(feature_cols).to_pandas()
y = df.select(target_col).to_pandas().to_numpy().ravel()

print(f"Training shape: {X.shape}")
print(f"Positive Rate: {y.mean():.4f}")

Training shape: (19907800, 19)
Positive Rate: 0.2500


# =============================================================================
# 2. MODELING PIPELINE (Logistic Regression)
# =============================================================================

In [None]:
# Build each stage as a named object, then chain them so the same imputation
# and scaling are applied at both fit and predict time.
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)  # safe fallback: missing values become 0
standardizer = StandardScaler()
classifier = LogisticRegression(
    solver='saga',
    max_iter=1000,
    class_weight=None,
    n_jobs=-1,
    random_state=42,
)

pipeline = Pipeline(steps=[
    ('imputer', zero_imputer),
    ('scaler', standardizer),
    ('logreg', classifier),
])

# Fit on the full training matrix; kept as the cell's last expression so the
# fitted pipeline is displayed as the cell output.
pipeline.fit(X, y)

# =============================================================================
# 3. PREDICTION & SCORING
# =============================================================================

In [None]:
print("üîÆ Predicting Probabilities...")

# predict_proba returns one column per class; column 1 is the probability of
# the positive class (label 1).
class_probabilities = pipeline.predict_proba(X)
scores = class_probabilities[:, 1]

# Attach the scores back onto the identifier columns of the original frame.
# Row order of `scores` matches `df` because X was selected from `df` as-is.
score_column = pl.Series(name="score", values=scores)
scored_df = df.select(['customer_id', 'item_id']).with_columns(score_column)

# =============================================================================
# 4. RANKING & GENERATING OUTPUT
# =============================================================================

In [None]:
# Rank items per user: sort by customer (ascending) and score (descending),
# then keep the first 10 item ids of each customer group.
#
# maintain_order=True makes the groups come out in the order they appear in
# the sorted frame. Without it polars does not guarantee group order, so the
# ascending customer sort would be discarded and the resulting dict's key
# order could differ from run to run.
top_10_recommendations = (
    scored_df
    .sort(['customer_id', 'score'], descending=[False, True])  # user ascending, score descending
    .group_by("customer_id", maintain_order=True)
    .agg([
        pl.col("item_id").head(10).alias("top_10_items")
    ])
)

# Convert to a dictionary {customer_id: [item_id_1, ..., item_id_10]}.
# A customer with fewer than 10 scored items gets a shorter list.
output_dict = dict(zip(
    top_10_recommendations["customer_id"].to_list(),
    top_10_recommendations["top_10_items"].to_list()
))

# =============================================================================
# 5. VERIFY OUTPUT
# =============================================================================

In [None]:
# Sanity-check the generated recommendations.
print(f"Generated recommendations for {len(output_dict)} users.")
if output_dict:
    # Take the first key straight from the iterator instead of materialising
    # a list of every customer id just to read element 0.
    sample_user = next(iter(output_dict))
    print(f"Example User {sample_user}: {output_dict[sample_user]}")
else:
    # Previously an empty result raised IndexError here.
    print("No recommendations were generated.")

# Feature Importance Analysis
# coef_[0] is the coefficient row of the fitted 'logreg' step. These apply to
# the scaled features, because the pipeline runs StandardScaler before the
# classifier. Features are ranked by absolute coefficient size.
coefficients = pipeline.named_steps['logreg'].coef_[0]
feature_importance = sorted(
    zip(feature_cols, coefficients),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)
print("\nüî• Top 5 Most Important Features:")
for feat, coef in feature_importance[:5]:
    print(f"   {feat}: {coef:.4f}")