In [2]:
import numpy as np
import pandas as pd 
import gc
import lightgbm as lgb
import xgboost as xgb

In [3]:
# Parquet file holding the competition training data (Kaggle input mount).
train_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'

# Model input columns: 79 names, feature_00 ... feature_78 (two-digit, zero-padded).
features_names = [
    f"feature_{i:02d}"
    for i in range(79)
]

# Column the models are trained to predict.
target = 'responder_6'

# Number of validation dates to use.
# NOTE: the identifier is misspelled ("vaild"); kept as-is because other cells may reference it.
num_vaild_dates = 98

# Lower bound on date_id when reading the data: only rows with date_id >= skip_dates are loaded.
skip_dates = 1400

In [5]:
# ============================
# Reduce Memory Usage Function
# ============================
def reduce_memory_usage(df,float16_as32=False):
    """Downcast integer and float columns of ``df`` to the narrowest dtype that fits.

    For every column whose dtype kind is signed integer (``'i'``) or float
    (``'f'``), the column's min and max are compared against the
    representable range of progressively wider dtypes, and the column is
    cast to the first one that contains both. All other columns are skipped.

    Only the value *range* is checked. float16 keeps roughly three
    significant decimal digits, so values cast to float16 are rounded; pass
    ``float16_as32=True`` to use float32 for columns that would otherwise
    become float16.

    A column is never widened as a side effect of the range check: when no
    narrower candidate matches (values genuinely need 64 bits, or min/max is
    NaN/inf because the column is empty, all-NaN or contains infinities),
    the column keeps its current dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    float16_as32 : bool, default False
        If True, columns that fit the float16 range are cast to float32
        instead of float16.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Side effects
    ------------
    Prints the memory footprint before and after, and the percentage saved.
    """
    start_mem = df.memory_usage().sum()/1024**2
    print(f'df memory usage before reduce : {start_mem} MB')

    # Integer candidates, narrowest first. int64 is deliberately absent: a
    # column that fits none of these is left untouched instead of being cast.
    int_candidates = (np.int8, np.int16, np.int32)

    # Float candidates as (dtype whose range is tested, dtype to cast to).
    # float64 is absent for the same reason as int64 above.
    float_candidates = (
        (np.float16, np.float32 if float16_as32 else np.float16),
        (np.float32, np.float32),
    )

    for col in df.columns:
        col_type = df[col].dtype

        # Skip non-numeric columns
        if col_type.kind not in ['i','f']:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Integer types
        if col_type.kind in ['i']:
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

        # Float types
        else:
            # Comparisons against NaN are always False, so an all-NaN (or
            # empty) column matches no candidate and keeps its dtype.
            for range_dtype, cast_dtype in float_candidates:
                limits = np.finfo(range_dtype)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(cast_dtype)
                    break

    end_mem = df.memory_usage().sum()/1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    print(f"Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%")
    return df

In [6]:
# Read only the rows with date_id >= skip_dates (restricted through the
# `filters` argument of read_parquet), then downcast the numeric columns.
# reduce_memory_usage prints the before/after memory footprint.
df = reduce_memory_usage(
    pd.read_parquet(train_path, filters=[('date_id', '>=', skip_dates)])
)

df memory usage before reduce : 3711.112632751465 MB
Memory usage after optimization is: 1907.97 MB
Decreased by 48.6%


In [26]:
import numpy as np
import pandas as pd

# ------------------------------------------------------------------
# Settings: date_id windows and number of training folds
# ------------------------------------------------------------------
START_TRAIN, END_TRAIN = 1400, 1599   # first / last date_id of the training window
VALID_START, VALID_END = 1600, 1698   # first / last date_id of the final validation window
N_FOLDS = 2                           # how many chunks the training dates are split into

# ===============================#
#     LOAD DATA (Train + Final Valid)
# ===============================#
# Training rows: date_id in [START_TRAIN, END_TRAIN] (between() includes both ends).
# kind="stable" keeps the existing row order inside each date_id; the default
# quicksort gives no such guarantee and may shuffle rows that share a date.
train_df = (
    df[df["date_id"].between(START_TRAIN, END_TRAIN)]
    .sort_values("date_id", kind="stable")
    .reset_index(drop=True)
)

# Final hold-out rows: date_id in [VALID_START, VALID_END]. This slice keeps
# the index of `df` (it is neither sorted nor re-indexed).
valid_df = df[df["date_id"].between(VALID_START, VALID_END)]

# Features, label and sample weights of the hold-out set. The label column is
# taken from the `target` constant instead of repeating the column name here.
X_valid, y_valid, w_valid = (
    valid_df[features_names],
    valid_df[target],
    valid_df["weight"],
)

# ===============================#
#     CREATE FOLDS FROM DATES
# ===============================#
# Every date_id of the training window, END_TRAIN included.
all_dates = np.arange(START_TRAIN, END_TRAIN + 1)
# Split the dates into N_FOLDS consecutive chunks; array_split also accepts a
# date count that is not divisible by N_FOLDS.
folds = np.array_split(all_dates, N_FOLDS)

# ===============================#
#     LOOP THROUGH FOLDS
# ===============================#
for fold, train_dates in enumerate(folds, start=1):

    # Each fold trains only on the rows whose date_id is in its own chunk.
    fold_df = train_df[train_df["date_id"].isin(train_dates)]

    X_train = fold_df[features_names]
    y_train = fold_df[target]
    w_train = fold_df["weight"]

    print(f"Fold {fold}/{N_FOLDS}")
    print(f"  Train dates: {train_dates.min()} → {train_dates.max()}  ({len(train_dates)} days)")
    print(f"  Train rows : {len(fold_df):,}")
    # Report the validation window from the settings so the message cannot go
    # stale when VALID_START / VALID_END are changed.
    print(f"  Final Valid: {VALID_START} → {VALID_END}  ({len(X_valid):,} rows)")
    print("-" * 50)

    # TODO: train the fold model here, e.g.
    #   model.fit(X_train, y_train, sample_weight=w_train)


Fold 1/2
  Train dates: 1400 → 1499  (100 days)
  Train rows : 3,595,152
  Final Valid: 1600 → 1698  (3,679,368 rows)
--------------------------------------------------
Fold 2/2
  Train dates: 1500 → 1599  (100 days)
  Train rows : 3,718,088
  Final Valid: 1600 → 1698  (3,679,368 rows)
--------------------------------------------------
