In [1]:
import os 
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import KFold, TimeSeriesSplit
import pickle

# Single seed for the notebook: it is the default `random_state` of the
# functions defined in this file and also seeds NumPy's global RNG.
SEED = 42
np.random.seed(SEED)

In [2]:
def lgbm_rf_feature_selection(X, y, threshold="mean", random_state=SEED):
    """Fit a LightGBM random-forest classifier and derive a feature mask from it.

    The fitted classifier is wrapped in ``SelectFromModel`` (``prefit=True``)
    to obtain a support mask, and the ten largest feature importances are
    printed, identified by their positional index in ``X``.

    Args:
        X: Training feature matrix passed to ``LGBMClassifier.fit``.
        y: Training labels passed to ``LGBMClassifier.fit``.
        threshold: Importance threshold forwarded to ``SelectFromModel``.
        random_state: Seed forwarded to ``LGBMClassifier``.

    Returns:
        A ``(mask, model)`` tuple: the selector's support mask and the fitted
        classifier.
    """
    rf_params = dict(
        boosting_type='rf',
        n_estimators=146,        # number of trees
        max_depth=16,            # maximum tree depth
        min_child_samples=211,   # minimum number of samples per leaf
        min_child_weight=649,    # minimum sum of observation weights in a child
        n_jobs=16,
        random_state=random_state,
        bagging_freq=1,          # bagging frequency
        bagging_fraction=0.9,    # fraction of rows used for bagging
        feature_fraction=0.9,    # fraction of features used for training
        # Explicitly set to None -- presumably so these sklearn-style aliases
        # do not override the bagging_*/feature_fraction values above; verify.
        subsample=None,
        colsample_bytree=None,
        subsample_freq=None,
        verbose=-1,
        device='gpu',
        gpu_device_id=0,
    )
    model = LGBMClassifier(**rf_params)
    model.fit(X, y)

    # The model is already fitted, hence prefit=True.
    support = SelectFromModel(model, threshold=threshold, prefit=True).get_support()

    scores = model.feature_importances_
    top_ten = np.argsort(scores)[::-1][:10]  # indices sorted by descending importance
    print("Top 10 Feature Importances:")
    for idx in top_ten:
        print(f"Feature {idx}: {scores[idx]:.4f}")

    return support, model


def process_dataset(name, train_path, test_path, output_dir, random_state=SEED):
    """Run feature selection for one dataset and save the reduced train/test frames.

    Both parquet inputs are read and sorted by ``id``. Every training column
    other than ``id``, ``smpl`` and ``target`` is treated as a feature and
    passed to ``lgbm_rf_feature_selection`` with ``threshold="mean"``. The
    selected columns are then written, together with ``id``/``smpl``
    (and ``target`` for train), to ``{name}_train_selected.parquet`` and
    ``{name}_test_selected.parquet`` inside ``output_dir``.

    Args:
        name: Dataset name; used in log messages and output file names.
        train_path: Path given to ``pd.read_parquet`` for the training data.
        test_path: Path given to ``pd.read_parquet`` for the test data.
        output_dir: Directory for the output files (created if missing).
        random_state: Seed forwarded to ``lgbm_rf_feature_selection``.

    Returns:
        None. Results are written to disk and a summary is printed.
    """
    print(f"Processing dataset: {name}")

    df_train = pd.read_parquet(train_path).sort_values("id")
    df_test = pd.read_parquet(test_path).sort_values("id")

    feature_cols = [col for col in df_train.columns if col not in ["id", "smpl", "target"]]

    X_train = df_train[feature_cols].to_numpy()
    y_train = df_train["target"].to_numpy()

    # Only the mask is needed here; the fitted model is discarded.
    feature_mask, _ = lgbm_rf_feature_selection(
        X_train, y_train, threshold="mean", random_state=random_state
    )
    selected_features = [col for col, keep in zip(feature_cols, feature_mask) if keep]

    # Select by column name directly on the frames; no dense copies of the
    # train/test matrices are required for this.
    df_train_selected = df_train[["id", "smpl", "target"] + selected_features]
    df_test_selected = df_test[["id", "smpl"] + selected_features]

    os.makedirs(output_dir, exist_ok=True)

    train_output = os.path.join(output_dir, f"{name}_train_selected.parquet")
    test_output = os.path.join(output_dir, f"{name}_test_selected.parquet")
    df_train_selected.to_parquet(train_output)
    df_test_selected.to_parquet(test_output)

    n_selected = int(np.count_nonzero(feature_mask))
    print(f"Selected {n_selected} out of {X_train.shape[1]} features.")
    print(f"Train saved to: {train_output}")
    print(f"Test saved to: {test_output}")
    print("-" * 50)

In [3]:
# Previous data location, kept for reference:
# DATA_DIR = "/home/rbparchiev/alpha_hackathon/alpha_hack/data"
DATA_DIR = "/home/rbparchiev/alpha_hackathon/alpha_step_2/data/Alfa Hack. Данные для финалистов/data/"

# Train files follow the "<name>/<name>_train.parquet" layout under DATA_DIR.
data_paths = {
    name: f"{DATA_DIR}{name}/{name}_train.parquet"
    for name in (
        "fl_credit_card_tendency",
        "invest_prop_4",
        "outflow_12",
        "pd_fl",
        "pd_ul_9",
        "ul_leasing_outflow",
    )
}
# Exception: the invest_prop_4 training data is addressed as "invest_prop_4/train"
# rather than a "*_train.parquet" file.
data_paths["invest_prop_4"] = f"{DATA_DIR}invest_prop_4/train"

# Test files follow the "<name>/<name>_test.parquet" layout for every dataset.
data_paths_test = {
    name: f"{DATA_DIR}{name}/{name}_test.parquet"
    for name in data_paths
}

output_dir = "./data_fs_rf/"
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Process every configured dataset in dictionary order, pairing each train
# path with the test path registered under the same name.
for name in data_paths:
    train_path, test_path = data_paths[name], data_paths_test[name]
    process_dataset(name, train_path, test_path, output_dir, random_state=SEED)

Processing dataset: fl_credit_card_tendency
Top 10 Feature Importances:
Feature 150: 601.0000
Feature 63: 393.0000
Feature 342: 350.0000
Feature 233: 323.0000
Feature 34: 298.0000
Feature 153: 286.0000
Feature 49: 282.0000
Feature 420: 258.0000
Feature 172: 148.0000
Feature 156: 147.0000
Selected 33 out of 500 features.
Train saved to: ./data_fs_rf/fl_credit_card_tendency_train_selected.parquet
Test saved to: ./data_fs_rf/fl_credit_card_tendency_test_selected.parquet
--------------------------------------------------
Processing dataset: invest_prop_4
Top 10 Feature Importances:
Feature 46: 491.0000
Feature 79: 416.0000
Feature 382: 343.0000
Feature 10: 319.0000
Feature 365: 293.0000
Feature 47: 260.0000
Feature 352: 212.0000
Feature 30: 154.0000
Feature 102: 145.0000
Feature 243: 140.0000
Selected 40 out of 500 features.
Train saved to: ./data_fs_rf/invest_prop_4_train_selected.parquet
Test saved to: ./data_fs_rf/invest_prop_4_test_selected.parquet
-------------------------------------