In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np
import pandas as pd

# 1. DATA PREP
# Row-over-row change of the 'Data' column; the first row has no predecessor
# and therefore becomes NaN.
df['Data_Delta'] = df['Data'].diff()
# Same-row price copied under an explicit feature name. This is NOT a lag:
# it is the current price, used as a predictor of the following row's price.
df['Price_Today'] = df['Price']
# Target: the price of the following row; the last row has no successor and
# therefore becomes NaN.
df['Next_Day_Price'] = df['Price'].shift(-1)
# Drop, in place, every row that has a NaN in any column of df (this removes
# at least the first and last rows created above).
df.dropna(axis=0, how='any', inplace=True)

# 2. FEATURE SELECTION
# Predict the next price from the change in 'Data' plus the current price.
feature_cols = ['Data_Delta', 'Price_Today']
target_col = 'Next_Day_Price'
X = df[feature_cols]
y = df[target_col]

# 3. SPLIT & DOUBLE SCALING
# TimeSeriesSplit yields folds in chronological order (training indices always
# precede test indices). Only the final fold is used: the model below is
# trained and evaluated on that single split, so select it explicitly instead
# of looping over folds whose results would be discarded.
tscv = TimeSeriesSplit(n_splits=5)
*_, (train_idx, test_idx) = tscv.split(X)

# Fit both scalers on the TRAINING rows only. Fitting them on the full data
# set would let the mean/std of the held-out (future) rows leak into training.
y_column = y.values.reshape(-1, 1)  # scalers expect a 2-D array
scaler_X = StandardScaler().fit(X.iloc[train_idx])
scaler_y = StandardScaler().fit(y_column[train_idx])

# Apply the train-fitted statistics to every row, then slice out the fold.
X_scaled = scaler_X.transform(X)
y_scaled = scaler_y.transform(y_column).flatten()

X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
y_train, y_test = y_scaled[train_idx], y_scaled[test_idx]

# 4. TRAIN
# Shallow trees with a small learning rate to limit overfitting.
model = XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.05)
model.fit(X_train, y_train)

# Report the fitted model's importance score for each input column,
# expressed as a percentage.
importances = model.feature_importances_
for col, weight in zip(X.columns, importances):
    print(f"Influence of {col}: {weight*100:.2e}%")

# 5. EVALUATION (Inverse Scale to get real numbers)
# The model was trained on a scaled target, so both its predictions and the
# held-out targets are mapped back through scaler_y before scoring.
y_pred_scaled = model.predict(X_test)

# inverse_transform expects 2-D input: go to a single column and back to 1-D.
pred_column = y_pred_scaled.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
y_pred = scaler_y.inverse_transform(pred_column).flatten()
y_actual = scaler_y.inverse_transform(actual_column).flatten()

# RMSE is in the target's original units; R2 is unitless.
mse = mean_squared_error(y_actual, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_actual, y_pred)

print(f"Final RMSE: {rmse:.4f}")
print(f"Final R2 Score: {r2:.4f}")

Influence of Data_Delta: 3.97e-02%
Influence of Price_Today: 1.00e+02%
Final RMSE: 452.3528
Final R2 Score: 0.4923
