In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

data = pd.read_csv("fintech-x-ds-ucl-hackathon/train.csv")

data['Date'] = pd.to_datetime(data['Date'], errors='coerce', utc=True)

data = data.dropna(subset=['Date'])

data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['DayOfWeek'] = data['Date'].dt.dayofweek
data['Quarter'] = data['Date'].dt.quarter

data['MA_5'] = data['Close'].rolling(window=5).mean()
data['MA_10'] = data['Close'].rolling(window=10).mean()

data['Volatility_5'] = data['Close'].rolling(window=5).std()
data['Volatility_10'] = data['Close'].rolling(window=10).std()

data['Momentum_5'] = data['Close'] - data['Close'].shift(5)
data['Momentum_10'] = data['Close'] - data['Close'].shift(10)

delta = data['Close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
data['RSI'] = 100 - (100 / (1 + rs))

data['Daily_Return'] = data['Close'].pct_change()

data['Cumulative_Volume'] = data['Volume'].cumsum()

data['Lag_1_Close'] = data['Close'].shift(1)
data['Lag_2_Close'] = data['Close'].shift(2)
data['Lag_3_Close'] = data['Close'].shift(3)

data = data.dropna()

X = data[['Open', 'High', 'Low', 'Volume', 'Year', 'Month', 'Day', 'DayOfWeek', 'Quarter',
          'MA_5', 'MA_10', 'Volatility_5', 'Volatility_10', 'Momentum_5', 'Momentum_10',
          'RSI', 'Daily_Return', 'Cumulative_Volume', 'Lag_1_Close', 'Lag_2_Close', 'Lag_3_Close']]
y = data['Close']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

data.loc[X_test.index, 'Predicted_Close'] = y_pred

data.to_csv("train_with_predictions.csv", index=False)

print("Predictions saved to 'train_with_predictions.csv'")

Mean Squared Error: 99.39674652658891
Predictions saved to 'train_with_predictions.csv'


In [None]:
import pandas as pd
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb

data = pd.read_csv("fintech-x-ds-ucl-hackathon/train.csv")

X = data.drop(columns=['Close', 'ID', 'Date']) 
y = data['Close']


base_models = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('xgb', xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42))
]

stack_model = StackingRegressor(estimators=base_models, final_estimator=Ridge())

stack_model.fit(X_train, y_train)

y_pred_stack = stack_model.predict(X_test)
mse_stack = mean_squared_error(y_test, y_pred_stack)
print("Stacked Model Mean Squared Error:", mse_stack)

results_df = pd.DataFrame({'Actual_Close': y_test, 'Predicted_Close': y_pred_stack})

results_df.to_csv("actual_vs_predicted_close.csv", index=False)