In [None]:
import os 
import polars as pl
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import KFold, TimeSeriesSplit
import pickle

# Single seed reused as the default `random_state` of the functions defined below.
SEED = 42
# Seeds NumPy's legacy global RNG; the LightGBM model further down receives the
# seed explicitly through its `random_state` argument instead.
np.random.seed(SEED)

In [None]:
def lgbm_rf_feature_selection(X, y, threshold="mean", random_state=SEED,
                              n_estimators=100, n_jobs=16):
    """Fit a LightGBM random-forest classifier and derive a feature mask from it.

    Args:
        X: Feature matrix passed to ``LGBMClassifier.fit``.
        y: Target labels passed to ``LGBMClassifier.fit``.
        threshold: Forwarded to ``SelectFromModel``; features whose importance
            reaches the threshold are kept. Defaults to ``"mean"``.
        random_state: Seed handed to the classifier.
        n_estimators: Number of trees in the forest (default 100, the value
            that used to be hard-coded).
        n_jobs: Number of threads LightGBM may use (default 16, the value that
            used to be hard-coded). Lower it on machines with fewer cores.

    Returns:
        tuple: ``(mask, clf)`` where ``mask`` is the boolean support mask from
        ``SelectFromModel.get_support()`` (one entry per column of ``X``) and
        ``clf`` is the fitted classifier.

    Side effects:
        Prints the ten largest feature importances, identified by column index.
    """
    clf = LGBMClassifier(
        boosting_type='rf',
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        random_state=random_state,
        # Random-forest mode needs bagging switched on, hence the explicit
        # native bagging / feature-subsampling parameters.
        bagging_freq=1,
        bagging_fraction=0.9,
        feature_fraction=0.9,
        # `subsample`, `colsample_bytree` and `subsample_freq` are LightGBM's
        # aliases for the three parameters above. They are set to None,
        # presumably so the wrapper's defaults for the aliases do not compete
        # with the explicit values -- confirm against the installed version.
        subsample=None,
        colsample_bytree=None,
        subsample_freq=None,
        verbose=-1
    )
    clf.fit(X, y)

    # prefit=True: reuse the classifier fitted above instead of refitting it.
    selector = SelectFromModel(clf, threshold=threshold, prefit=True)
    mask = selector.get_support()

    importances = clf.feature_importances_
    # argsort is ascending; reverse it to list the most important columns first.
    feature_ranking = np.argsort(importances)[::-1]
    print("Top 10 Feature Importances:")
    for i in feature_ranking[:10]:
        print(f"Feature {i}: {importances[i]:.4f}")

    return mask, clf


def process_dataset(name, train_path, test_path, output_dir, random_state=SEED,
                    n_splits=5):
    """Select features for one dataset and persist the reduced data and CV folds.

    Steps:
        1. Read the train and test parquet inputs and sort both by ``id``.
        2. Treat every train column except ``id``, ``smpl`` and ``target`` as a
           feature and run ``lgbm_rf_feature_selection`` on them.
        3. Write the reduced train/test frames, the ``TimeSeriesSplit`` fold
           indices (pickle) and the boolean feature mask (``.npy``) into
           ``output_dir``, all prefixed with ``name``.

    Args:
        name: Dataset identifier; used in log lines and output file names.
        train_path: Parquet input containing ``id``, ``smpl``, ``target`` and
            the feature columns.
        test_path: Parquet input containing ``id``, ``smpl`` and at least the
            selected feature columns.
        output_dir: Directory for all outputs; created if missing.
        random_state: Seed forwarded to the feature-selection model.
        n_splits: Number of ``TimeSeriesSplit`` folds (default 5, the value
            that used to be hard-coded).

    Returns:
        None. Results are written to disk and progress is printed.

    Notes:
        * The fold indices are row positions in the ``id``-sorted training
          frame, i.e. the row order of the saved train parquet.
        * NOTE(review): ``TimeSeriesSplit`` only makes sense if ascending ``id``
          corresponds to chronological order -- confirm for each dataset.
        * NOTE(review): the selector is fit on all training rows, including the
          rows that later act as validation folds, so downstream CV scores may
          be optimistic.
    """
    print(f"Processing dataset: {name}")

    df_train = pl.read_parquet(train_path).sort("id")
    df_test = pl.read_parquet(test_path).sort("id")

    feature_cols = [col for col in df_train.columns if col not in ["id", "smpl", "target"]]

    X_train = df_train.select(feature_cols).to_numpy()
    y_train = df_train["target"].to_numpy()

    # Only the mask is needed here; the fitted model is not persisted.
    feature_mask, _ = lgbm_rf_feature_selection(
        X_train, y_train, threshold="mean", random_state=random_state
    )

    selected_features = [col for col, keep in zip(feature_cols, feature_mask) if keep]

    # Select by column name directly on the frames. No dense copy of the
    # selected train columns and no dense test matrix is materialised, since
    # neither was ever used beyond its shape.
    df_train_selected = df_train.select(["id", "smpl", "target"] + selected_features)
    df_test_selected = df_test.select(["id", "smpl"] + selected_features)

    os.makedirs(output_dir, exist_ok=True)

    train_output = os.path.join(output_dir, f"{name}_train_selected.parquet")
    test_output = os.path.join(output_dir, f"{name}_test_selected.parquet")
    df_train_selected.write_parquet(train_output)
    df_test_selected.write_parquet(test_output)

    tscv = TimeSeriesSplit(n_splits=n_splits)
    cv_splits = []
    # The splitter only needs the number of rows, so the full matrix yields the
    # same indices as the column-reduced one would.
    for fold, (tr_idx, val_idx) in enumerate(tscv.split(X_train, y_train), 1):
        cv_splits.append({
            "fold": fold,
            "train_index": tr_idx,
            "validation_index": val_idx
        })
        print(f"Fold {fold}: Train indices {tr_idx[0]} to {tr_idx[-1]}, Validation indices {val_idx[0]} to {val_idx[-1]}")

    cv_splits_path = os.path.join(output_dir, f"{name}_cv_splits.pkl")
    with open(cv_splits_path, "wb") as f:
        pickle.dump(cv_splits, f, protocol=pickle.HIGHEST_PROTOCOL)

    # Save feature mask (aligned with `feature_cols`, i.e. train column order
    # minus id/smpl/target).
    feature_mask_path = os.path.join(output_dir, f"{name}_feature_mask.npy")
    np.save(feature_mask_path, feature_mask)

    print(f"Selected {len(selected_features)} out of {X_train.shape[1]} features.")
    print(f"Train saved to: {train_output}")
    print(f"Test saved to: {test_output}")
    print(f"CV splits saved to: {cv_splits_path}")
    print(f"Feature mask saved to: {feature_mask_path}")
    print("-" * 50)

In [None]:
# Root folder holding one sub-directory per dataset (note the trailing slash:
# the paths below are built by plain string interpolation).
DATA_DIR = "/home/rbparchiev/alpha_hackathon/alpha_step_2/data/Alfa Hack. Данные для финалистов/data/"

# Training input for each dataset, keyed by dataset name.
# NOTE(review): unlike every other entry, `invest_prop_4` points at
# `invest_prop_4/train` with no `.parquet` suffix -- presumably a directory of
# parquet parts; confirm this is intentional.
data_paths = {
    "fl_credit_card_tendency": f"{DATA_DIR}fl_credit_card_tendency/fl_credit_card_tendency_train.parquet",
    "invest_prop_4": f"{DATA_DIR}invest_prop_4/train",
    "outflow_12": f"{DATA_DIR}outflow_12/outflow_12_train.parquet",
    "pd_fl": f"{DATA_DIR}pd_fl/pd_fl_train.parquet",
    "pd_ul_9": f"{DATA_DIR}pd_ul_9/pd_ul_9_train.parquet",
    "ul_leasing_outflow": f"{DATA_DIR}ul_leasing_outflow/ul_leasing_outflow_train.parquet",
}

# Test input for each dataset; uses the same keys as `data_paths`.
data_paths_test = {
    "fl_credit_card_tendency": f"{DATA_DIR}fl_credit_card_tendency/fl_credit_card_tendency_test.parquet",
    "invest_prop_4": f"{DATA_DIR}invest_prop_4/invest_prop_4_test.parquet",
    "outflow_12": f"{DATA_DIR}outflow_12/outflow_12_test.parquet",
    "pd_fl": f"{DATA_DIR}pd_fl/pd_fl_test.parquet",
    "pd_ul_9": f"{DATA_DIR}pd_ul_9/pd_ul_9_test.parquet",
    "ul_leasing_outflow": f"{DATA_DIR}ul_leasing_outflow/ul_leasing_outflow_test.parquet",
}

# Destination directory for the files written by `process_dataset`.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
output_dir = "/home/rbparchiev/alpha_hackathon/alpha_step_2/data/data_fs_rf/"
# Created up front; `process_dataset` issues the same call itself, so this is
# only a convenience for inspecting the folder before the run.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Run feature selection for every configured dataset. The train and test inputs
# are paired through the shared dataset-name key of the two path dictionaries.
for name in data_paths:
    train_path, test_path = data_paths[name], data_paths_test[name]
    process_dataset(name, train_path, test_path, output_dir, random_state=SEED)

Processing dataset: fl_credit_card_tendency
Top 10 Feature Importances:
Feature 150: 349.0000
Feature 63: 283.0000
Feature 172: 275.0000
Feature 233: 223.0000
Feature 34: 208.0000
Feature 153: 196.0000
Feature 49: 190.0000
Feature 420: 172.0000
Feature 342: 133.0000
Feature 156: 96.0000
Fold 1: Train indices 0 to 147548, Validation indices 147549 to 295095
Fold 2: Train indices 0 to 295095, Validation indices 295096 to 442642
Fold 3: Train indices 0 to 442642, Validation indices 442643 to 590189
Fold 4: Train indices 0 to 590189, Validation indices 590190 to 737736
Fold 5: Train indices 0 to 737736, Validation indices 737737 to 885283
Selected 36 out of 500 features.
Train saved to: /home/rbparchiev/alpha_hackathon/alpha_step_2/data/data_fs_rf/fl_credit_card_tendency_train_selected.parquet
Test saved to: /home/rbparchiev/alpha_hackathon/alpha_step_2/data/data_fs_rf/fl_credit_card_tendency_test_selected.parquet
CV splits saved to: /home/rbparchiev/alpha_hackathon/alpha_step_2/data/data_