<a href="https://colab.research.google.com/github/ManjuRama/FinMath/blob/main/RandomRegressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Example DataFrame: Ensure your real dataset is properly loaded
# df = pd.read_csv('your_dataset.csv')

# Build a reproducible synthetic dataset for demonstration purposes.
np.random.seed(42)
n = 60 * 24  # one day of minute-by-minute rows (1440 minutes)

# Multiplier applied to each column's np.random.rand draw. Insertion order
# matters: columns are sampled from the seeded global RNG in exactly this
# order, so reordering the entries changes the generated values.
column_scales = {
    'volumelastminute': 100,
    'volumelasthour': 500,
    'spread': 5,
    'midprice': 1000,
    'nexthourvolume': 600,
}
data = {
    name: np.random.rand(n) * scale
    for name, scale in column_scales.items()
}
df = pd.DataFrame(data)

# Length of the trailing window, in rows (60 rows = 60 minutes of data)
window_size = 60

# Function to generate rolling window features for prediction
def generate_rolling_features(df, window_size):
    """Return lagged trailing-mean features for the predictor columns.

    For each of 'volumelastminute', 'volumelasthour', 'spread' and
    'midprice', the output column '<name>_avg' at row t holds the mean of
    the ``window_size`` rows immediately before t. Row t itself is excluded
    by the one-step shift, so a feature never includes the value observed
    at the row it describes.

    Args:
        df: DataFrame containing the four source columns listed above.
        window_size: Number of consecutive rows averaged per feature.

    Returns:
        DataFrame indexed by ``df.index[window_size:]`` (the rows that have
        a full window behind them), with any row that still contains a NaN
        removed.
    """
    source_columns = ('volumelastminute', 'volumelasthour', 'spread', 'midprice')

    # Start from the rows that can have a complete window of history.
    lagged_means = pd.DataFrame(index=df.index[window_size:])

    for column in source_columns:
        trailing_mean = df[column].rolling(window=window_size).mean()
        # shift(1) moves each mean down one row so it summarises only
        # earlier rows; assignment aligns on the index labels.
        lagged_means[f'{column}_avg'] = trailing_mean.shift(1)

    # Discard rows where any feature is still undefined.
    return lagged_means.dropna()

# Generate the rolling features (one row per timestamp with a full window of history)
rolling_features = generate_rolling_features(df, window_size)

# Target: the 'nexthourvolume' value recorded `forecast_horizon` rows later.
# shift(-h) leaves the last h rows without a target, so they are dropped.
# NOTE(review): 'nexthourvolume' is, by its name, presumably already
# forward-looking; confirm that shifting it a further 60 rows is intended.
forecast_horizon = 60
rolling_target = df['nexthourvolume'].shift(-forecast_horizon).dropna()

# Align features and target on the rows that have BOTH.
# The features lack the first `window_size` rows and the target lacks the last
# `forecast_horizon` rows, so neither index contains the other. Looking up one
# index in the other with .loc raises KeyError for the missing labels, and
# slicing the un-aligned target would pair features with the wrong rows.
common_index = rolling_features.index.intersection(rolling_target.index)
rolling_features = rolling_features.loc[common_index]
rolling_target = rolling_target.loc[common_index]

# Chronological split (first 80% train, last 20% test). No shuffling, so the
# model is always evaluated on rows that come after its training rows.
# .iloc makes the positional slicing explicit for both objects.
train_size = int(len(rolling_features) * 0.8)
X_train = rolling_features.iloc[:train_size]
y_train = rolling_target.iloc[:train_size]
X_test = rolling_features.iloc[train_size:]
y_test = rolling_target.iloc[train_size:]

# Fit a regression model (e.g., RandomForestRegressor); fixed seed for reproducibility
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out tail
y_pred = model.predict(X_test)

# Evaluate the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Plot actual vs predicted values for the test set.
# y_pred is a plain array in the same row order as X_test, so it shares
# y_test's index on the x-axis.
plt.figure(figsize=(10, 6))
plt.plot(y_test.index, y_test, label='Actual nexthourvolume', color='blue')
plt.plot(y_test.index, y_pred, label='Predicted nexthourvolume', color='red')
plt.xlabel('Time')
plt.ylabel('nexthourvolume')
plt.title('Actual vs Predicted nexthourvolume')
plt.legend()
plt.show()
