In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# --- Tunable settings ---------------------------------------------------------
# Share of the engineered training rows (taken from the end) that is modelled.
TRAIN_WINDOW_PCT = 0.3
# Share of that modelling window (again from the end) held out for validation.
VAL_SPLIT_PCT = 0.20
# Regularisation strength handed to Ridge(alpha=...).
RIDGE_ALPHA = 10

# Module-level placeholder; nothing visible in this notebook reads or fills it.
features_global = []

# --- Raw data -----------------------------------------------------------------
train_df = pd.read_csv('train (1).csv')
test_df = pd.read_csv('test (1).csv')


def _lagged(df, field, lag, is_test):
    """Return the `field` series as of `lag` steps back for either frame layout."""
    if is_test:
        # Test layout: lags are already materialised as <Field>_Lag_<k> columns.
        return df[f'{field}_Lag_{lag}']
    # Train layout: derive the lag by shifting the raw column down `lag` rows.
    return df[field].shift(lag)


def get_raw_lag_features(df, is_test=False):
    """Build scale-free lag features relative to the current close / volume.

    For each lag 0..29 the Close/Open/High/Low value is divided by the current
    close (minus 1) and the Volume value by the current volume (minus 1).
    Two summary features are added: the 30-period mean close relative to the
    current close (``SMA_30_Rel``) and ``4 * std / mean`` of the same 30 closes
    (``BB_Bandwidth``).

    All new columns are collected first and attached with a single
    ``pd.concat``; inserting ~150 columns one by one fragments the frame and
    makes pandas emit ``PerformanceWarning``.

    Parameters
    ----------
    df : pandas.DataFrame
        ``is_test=False``: needs ``Open``, ``High``, ``Low``, ``Close``,
        ``Volume`` and ``Target`` columns; lags come from ``shift``/``rolling``,
        so rows are assumed to be in chronological order (TODO confirm).
        ``is_test=True``: needs ``<Field>_Lag_<k>`` columns for the five fields
        and k in 0..29; lag 0 is used as the current value.
    is_test : bool, default False
        Selects which of the two layouts above ``df`` follows.

    Returns
    -------
    (pandas.DataFrame, list[str])
        A new frame (the input is not modified) holding the original columns
        followed by the engineered ones, and the ordered list of model feature
        names. In train mode the frame also carries ``Target_LogRet`` =
        ``log(Target / Close)`` and rows containing any NaN are dropped; in
        test mode every row is kept.
    """
    window = 30
    new_cols = {}
    features = []

    if is_test:
        current_close = df['Close_Lag_0']
        current_vol = df['Volume_Lag_0']
    else:
        current_close = df['Close']
        current_vol = df['Volume']

    for lag in range(window):
        for field in ('Close', 'Open', 'High', 'Low'):
            name = f'{field}_Lag_{lag}_Rel'
            new_cols[name] = (_lagged(df, field, lag, is_test) / current_close) - 1
            features.append(name)

        # Volume is normalised by the current volume; the epsilon guards
        # against division by a zero-volume row.
        name = f'Volume_Lag_{lag}_Rel'
        new_cols[name] = (_lagged(df, 'Volume', lag, is_test) / (current_vol + 1e-9)) - 1
        features.append(name)

    if is_test:
        # The 30 closes of the window sit side by side in one row.
        close_window = df[[f'Close_Lag_{i}' for i in range(window)]]
        sma = close_window.mean(axis=1)
        std = close_window.std(axis=1)
    else:
        rolling_close = df['Close'].rolling(window=window)
        sma = rolling_close.mean()
        std = rolling_close.std()

    new_cols['SMA_30'] = sma
    new_cols['SMA_30_Rel'] = (sma / current_close) - 1
    new_cols['BB_Bandwidth'] = 4 * std / sma
    features.extend(['SMA_30_Rel', 'BB_Bandwidth'])

    if not is_test:
        new_cols['Target_LogRet'] = np.log(df['Target'] / df['Close'])

    # One concat instead of ~150 inserts keeps the result unfragmented.
    result = pd.concat([df, pd.concat(new_cols, axis=1)], axis=1)

    if is_test:
        return result, features
    # Leading rows lack a full lag/rolling history and are removed.
    return result.dropna(), features


# --- Feature engineering --------------------------------------------------------
train_processed, feature_cols = get_raw_lag_features(train_df, is_test=False)
test_processed, _ = get_raw_lag_features(test_df, is_test=True)

# --- Restrict to the most recent TRAIN_WINDOW_PCT share of the rows -------------
cutoff_idx = int((1 - TRAIN_WINDOW_PCT) * len(train_processed))
regime_data = train_processed.iloc[cutoff_idx:].reset_index(drop=True)

# --- Hold out the tail of that window for validation ----------------------------
val_size = int(VAL_SPLIT_PCT * len(regime_data))
train_split, val_split = regime_data.iloc[:-val_size], regime_data.iloc[-val_size:]

X_t, y_t = train_split[feature_cols], train_split['Target_LogRet']
X_v, price_v, close_v = val_split[feature_cols], val_split['Target'], val_split['Close']

# The scaler is fitted on the training part only and reused for validation.
scaler_val = StandardScaler().fit(X_t)
X_t_s = scaler_val.transform(X_t)
X_v_s = scaler_val.transform(X_v)

ridge_v = Ridge(alpha=RIDGE_ALPHA).fit(X_t_s, y_t)
raw_pred_log = ridge_v.predict(X_v_s)

# --- Damping sweep --------------------------------------------------------------
# The predicted log-return is shrunk by a factor d before being turned back into
# a price; the factor with the lowest validation RMSE (in price units) wins.
# NOTE(review): in the recorded run the smallest grid value (0.70) won, so the
# optimum may lie below the searched range - consider widening the grid.
best_damping, best_rmse = 0.9, float('inf')

for d in np.arange(0.70, 1.01, 0.05):
    damped_pred_log = d * raw_pred_log
    damped_price = close_v * np.exp(damped_pred_log)

    rmse = np.sqrt(mean_squared_error(price_v, damped_price))
    print(f"Damping {d:.2f} -> RMSE: ${rmse:.4f}")

    if rmse < best_rmse:
        best_rmse, best_damping = rmse, d

print(f"\nBest Damping Factor is {best_damping:.2f} (RMSE: ${best_rmse:.4f})")

# --- Refit on the whole window and predict the test set -------------------------
X_final, y_final = regime_data[feature_cols], regime_data['Target_LogRet']
X_test = test_processed[feature_cols]

scaler_final = StandardScaler().fit(X_final)
X_final_s = scaler_final.transform(X_final)
X_test_s = scaler_final.transform(X_test)

ridge_final = Ridge(alpha=RIDGE_ALPHA).fit(X_final_s, y_final)

# Damped log-return -> price, anchored on the latest known close of each test row.
pred_log_test = best_damping * ridge_final.predict(X_test_s)
final_price = test_df['Close_Lag_0'] * np.exp(pred_log_test)

submission_lr_best = pd.DataFrame({'ID': test_df['ID'], 'Target': final_price})


  df[f'Low_Lag_{lag}_Rel']   = (df['Low'].shift(lag) / current_close) - 1
  df[f'Volume_Lag_{lag}_Rel'] = (df['Volume'].shift(lag) / (current_vol + 1e-9)) - 1
  df[f'Close_Lag_{lag}_Rel'] = (df['Close'].shift(lag) / current_close) - 1
  df[f'Open_Lag_{lag}_Rel']  = (df['Open'].shift(lag) / current_close) - 1
  df[f'High_Lag_{lag}_Rel']  = (df['High'].shift(lag) / current_close) - 1
  df[f'Low_Lag_{lag}_Rel']   = (df['Low'].shift(lag) / current_close) - 1
  df[f'Volume_Lag_{lag}_Rel'] = (df['Volume'].shift(lag) / (current_vol + 1e-9)) - 1
  df[f'Close_Lag_{lag}_Rel'] = (df['Close'].shift(lag) / current_close) - 1
  df[f'Open_Lag_{lag}_Rel']  = (df['Open'].shift(lag) / current_close) - 1
  df[f'High_Lag_{lag}_Rel']  = (df['High'].shift(lag) / current_close) - 1
  df[f'Low_Lag_{lag}_Rel']   = (df['Low'].shift(lag) / current_close) - 1
  df[f'Volume_Lag_{lag}_Rel'] = (df['Volume'].shift(lag) / (current_vol + 1e-9)) - 1
  df[f'Close_Lag_{lag}_Rel'] = (df['Close'].shift(lag) / current_close)

Damping 0.70 -> RMSE: $3.0630
Damping 0.75 -> RMSE: $3.0755
Damping 0.80 -> RMSE: $3.0899
Damping 0.85 -> RMSE: $3.1062
Damping 0.90 -> RMSE: $3.1241
Damping 0.95 -> RMSE: $3.1439
Damping 1.00 -> RMSE: $3.1654

Best Damping Factor is 0.70 (RMSE: $3.0630)


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Raw training data for the random-forest pipeline (the same file the first
# cell loads as `train_df`).
df = pd.read_csv('train (1).csv')

def get_optimized_features(df):
    """Derive return, trend, range and volume features from raw OHLCV rows.

    Works on a copy of ``df`` (needs ``High``, ``Low``, ``Close``, ``Volume``)
    and relies on row order for every ``pct_change``/``rolling``/``diff`` call.

    Added columns, in order: ``Return_1D``, ``Return_2D``, ``Return_3D``,
    ``Return_5D``, ``SMA_30``, ``Dist_MA30``, ``ATR``, ``BB_Bandwidth``,
    ``Vol_Shock``, ``Vol_Flow``, ``OBV_Pressure``.

    Returns the enriched frame with every row that contains a NaN removed.
    """
    out = df.copy()
    close = out['Close']
    volume = out['Volume']

    # Close-to-close percentage change over several look-back horizons.
    for horizon in (1, 2, 3, 5):
        out[f'Return_{horizon}D'] = close.pct_change(horizon)

    # 30-row moving average and the close's relative distance from it.
    close_30 = close.rolling(30)
    out['SMA_30'] = close_30.mean()
    out['Dist_MA30'] = close / out['SMA_30'] - 1

    # Despite the name this is the 14-row high/low range, not a true-range average.
    highest_14 = out['High'].rolling(14).max()
    lowest_14 = out['Low'].rolling(14).min()
    out['ATR'] = highest_14 - lowest_14

    # Width of a +/-2 std band around the 30-row mean, relative to that mean.
    out['BB_Bandwidth'] = (4 * close_30.std()) / out['SMA_30']

    # Current volume against its 14-row average.
    out['Vol_Shock'] = volume / volume.rolling(14).mean()

    # Signed volume (sign of the close change), summed over 5 rows and scaled
    # by the 20-row average volume.
    out['Vol_Flow'] = np.sign(close.diff()) * volume
    out['OBV_Pressure'] = out['Vol_Flow'].rolling(5).sum() / volume.rolling(20).mean()

    return out.dropna()


# Engineer features and express the target as a return relative to the row's Close.
df_opt = get_optimized_features(df)
df_opt['Target_Return'] = df_opt['Target'] / df_opt['Close'] - 1

features = [
    'Return_1D', 'Return_2D', 'Return_3D', 'Return_5D',
    'Dist_MA30', 'BB_Bandwidth',
    'Vol_Shock', 'OBV_Pressure', 'ATR',
]

X, y = df_opt[features], df_opt['Target_Return']

# Ordered split without shuffling.
# NOTE(review): the model is fitted on the FIRST 30% of the rows and scored on
# the remaining 70%, and this same model later produces the submission -
# confirm that proportion is intended.
split = int(0.3 * len(X))
X_train, y_train = X.iloc[:split], y.iloc[:split]
X_test, y_test = X.iloc[split:], y.iloc[split:]
close_test = df_opt['Close'].iloc[split:]
target_test = df_opt['Target'].iloc[split:]

# Shallow, heavily regularised forest; the seed is fixed for reproducibility.
model1 = RandomForestRegressor(
    n_estimators=200,
    max_depth=3,
    min_samples_leaf=20,
    random_state=42,
).fit(X_train, y_train)

# Predicted return -> predicted price, scored in price units.
pred1 = close_test * (1 + model1.predict(X_test))
rmse1 = np.sqrt(mean_squared_error(target_test, pred1))

print(f"Model 1 (User Params + Clean Features) RMSE: {rmse1}")

# Raw test rows for the submission step further down.
test = pd.read_csv('test (1).csv')
def get_features_test(df):
    """Rebuild the random-forest features from pre-lagged test columns.

    Mirrors ``get_optimized_features`` for a frame whose history is stored
    row-wise as ``Close_Lag_<k>``, ``High_Lag_<k>``, ``Low_Lag_<k>`` and
    ``Volume_Lag_<k>`` columns (lag 0 is used as the current value). Needs
    Close lags 0..29, Volume lags 0..19 and High/Low lags 0..13.

    ``OBV_Pressure`` is scaled by the mean of the 20 most recent volumes so it
    matches the 20-row rolling mean used when the training features are built;
    averaging 30 lags here would feed the model a differently scaled feature
    at prediction time.

    Returns a copy of ``df`` with ``Return_1D``, ``Return_2D``, ``Return_3D``,
    ``Return_5D``, ``SMA_30``, ``Dist_MA30``, ``BB_Bandwidth``, ``Vol_Shock``,
    ``OBV_Pressure`` and ``ATR`` appended; the input frame is not modified.
    """
    df = df.copy()
    close_now = df['Close_Lag_0']

    # Close-to-close returns over 1, 2, 3 and 5 steps.
    for horizon in (1, 2, 3, 5):
        past_close = df[f'Close_Lag_{horizon}']
        df[f'Return_{horizon}D'] = (close_now - past_close) / past_close

    # 30-step mean close, distance from it, and relative +/-2 std band width.
    close_lags = [f'Close_Lag_{i}' for i in range(30)]
    df['SMA_30'] = df[close_lags].mean(axis=1)
    df['Dist_MA30'] = close_now / df['SMA_30'] - 1
    df['BB_Bandwidth'] = (4 * df[close_lags].std(axis=1)) / df['SMA_30']

    # Current volume against the mean of the 14 most recent volumes.
    volume_lags_14 = [f'Volume_Lag_{i}' for i in range(14)]
    df['Vol_Shock'] = df['Volume_Lag_0'] / df[volume_lags_14].mean(axis=1)

    # Signed volume over the last 5 steps: +volume on an up-close, -volume on a
    # down-close, 0 when the close is unchanged.
    obv_sum = 0
    for i in range(5):
        direction = np.sign(df[f'Close_Lag_{i}'] - df[f'Close_Lag_{i + 1}'])
        obv_sum = obv_sum + direction * df[f'Volume_Lag_{i}']

    # 20-step volume mean, matching the training-time denominator.
    volume_lags_20 = [f'Volume_Lag_{i}' for i in range(20)]
    df['OBV_Pressure'] = obv_sum / df[volume_lags_20].mean(axis=1)

    # 14-step high/low range (stored under the name the model was trained with).
    high_lags_14 = [f'High_Lag_{i}' for i in range(14)]
    low_lags_14 = [f'Low_Lag_{i}' for i in range(14)]
    df['ATR'] = df[high_lags_14].max(axis=1) - df[low_lags_14].min(axis=1)

    return df

# Build the test-time features and convert predicted returns into prices.
test_with_features = get_features_test(test)
features = ['Return_1D', 'Return_2D', 'Return_3D', 'Return_5D',
            'Dist_MA30', 'BB_Bandwidth',
            'Vol_Shock', 'OBV_Pressure', 'ATR']

# From here on `close_test` / `X_test` refer to the real test set and replace
# the validation slices of the same name created earlier in this cell.
close_test = test_with_features['Close_Lag_0']
X_test = test_with_features[features]
submission_rf = close_test * (1 + model1.predict(X_test))

# Take the IDs from the frame loaded in this cell (`test`) so the IDs and the
# predictions come from the same object, instead of reaching for `test_df`,
# a global that only exists if the Ridge cell has been run.
submission_rf_best = pd.DataFrame({'ID': test['ID'], 'Target': submission_rf})







Model 1 (User Params + Clean Features) RMSE: 2.8027005318566593


In [None]:
# Weighted blend of the two submissions: 60% damped Ridge, 40% random forest.
blended_target = (
    0.6 * submission_lr_best['Target']
    + 0.4 * submission_rf_best['Target']
)
final = pd.DataFrame({'ID': test_df['ID'], 'Target': blended_target})