In [3]:
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib

# =============================================================================
# 1. LOAD DATA & DEFINE FEATURES
# =============================================================================

In [None]:
# Read the complete training table from disk.
df = pl.read_parquet("train_data/train.parquet")

# Model input columns, grouped by the feature family they belong to.
# The order of this list is the column order of X below.
feature_cols = [
    # Feature 1: Frequency
    "feat1_customer_item_freq",
    # Feature 2: Recency Decay
    "feat2_brand_affinity",
    "feat2_type_affinity",
    # Feature 3: Urgency (Window-Based)
    "feat3_dist_to_window_center",
    "feat3_is_in_window",
    # Feature 4: Popularity
    "feat4_pop_30d_log",
    "feat4_pop_trend",
    "feat4_pop_category_rank",
    "feat4_pop_global_rank",
    # Feature 5: Baby Age Alignment
    "feat5_score_age_end_hist",
    "feat5_score_age_midpoint",
    # Feature 6: Price Compatibility
    "feat6_price_compatibility",
    "feat6_is_above_user_capacity",
    # Feature 7: Brand Loyalty
    "feat7_brand_repeat_rate",
    "feat7_brand_rank",
    "feat7_user_brand_affinity",
    # Feature 8: Co-purchase
    "feat8_co_purchase_max",
    "feat8_co_purchase_sum",
    "feat8_co_purchase_count",
]

# Name of the label column.
target_col = "Y"

# X: the selected feature columns converted to a pandas DataFrame.
X = df.select(feature_cols).to_pandas()
# y: the label column flattened into a 1-D numpy array.
y = np.ravel(df.select(target_col).to_pandas().values)

# Quick sanity report: matrix dimensions and the mean of the label
# (the positive rate, presuming Y is encoded as 0/1 -- TODO confirm).
print(f"Training shape: {X.shape}")
print(f"Positive Rate: {y.mean():.4f}")

Training shape: (19907800, 19)
Positive Rate: 0.2500


# =============================================================================
# 2. MODELING PIPELINE (Logistic Regression)
# =============================================================================

In [None]:
# Three-stage pipeline: constant imputation -> standardization -> logistic
# regression. The estimators are created in the same order they run.
pipeline = Pipeline(
    steps=[
        # Safe fallback: any missing feature value is replaced with 0.
        (
            "imputer",
            SimpleImputer(fill_value=0, strategy="constant"),
        ),
        # Scale the imputed features before they reach the linear model.
        (
            "scaler",
            StandardScaler(),
        ),
        # SAGA-solved logistic regression with no class re-weighting and a
        # fixed seed so repeated runs use the same random state.
        (
            "logreg",
            LogisticRegression(
                random_state=42,
                n_jobs=-1,
                class_weight=None,
                max_iter=1000,
                solver="saga",
            ),
        ),
    ]
)

# Fit every stage on the full training set (X and y come from the
# data-loading cell).
pipeline.fit(X, y)

# Persist the fitted pipeline; the cell displays the list of written paths
# returned by joblib.dump.
joblib.dump(pipeline, "train_data/model.pkl")

['train_data/model.pkl']

: 