In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# --- Load and Preprocess Data ---
def load_stock_data(root_folder):
    all_data = []
    
    for subdir, _, files in os.walk(root_folder):
        for file in files:
            if file.endswith(".mat"):
                file_path = os.path.join(subdir, file)
                df = pd.read_csv(file_path)
                if {'Date', 'Open', 'High', 'Low', 'Close', 'Volume'}.issubset(df.columns):
                    df['Date'] = pd.to_datetime(df['Date'])
                    df.set_index('Date', inplace=True)
                    all_data.append(df)
    
    combined_df = pd.concat(all_data).sort_index()
    return combined_df

# Load data
root_folder = "../../data"
df = load_stock_data(root_folder)

df['Return'] = df['Close'].pct_change()
df.dropna(inplace=True)

features = df[['Open', 'High', 'Low', 'Volume']]
targets = df['Close']

# Polynomial Features
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(features)
feature_names = poly.get_feature_names_out(features.columns)
poly_df = pd.DataFrame(poly_features, columns=feature_names)
poly_df["Close"] = targets.reset_index(drop=True)

# --- Correlation Analysis ---
plt.figure(figsize=(10, 8))
corr_matrix = poly_df.corr()
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

# --- Train-Test Split ---
X = poly_df.drop(columns=["Close"])
y = poly_df["Close"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- XGBoost Model ---
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    max_depth=8,
    learning_rate=0.001,
    n_estimators=50,
    subsample=0.5,
    colsample_bytree=0.5,
    random_state=42
)
xgb_model.fit(X_train_scaled, y_train)

y_pred_xgb = xgb_model.predict(X_test_scaled)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mape_xgb = mean_absolute_percentage_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost - MAE: {mae_xgb:.4f}, MAPE: {mape_xgb:.4f}, MSE: {mse_xgb:.4f}, R²: {r2_xgb:.4f}")


Using device: cuda


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x90 in position 444: invalid start byte