In [None]:
import yfinance as yf
import os
import pandas as pd

# Create a directory to store S&P 500 stock data (no error if it already exists)
directory = "SP500_10_25"
os.makedirs(directory, exist_ok=True)

# Retrieve the list of S&P 500 stocks from Wikipedia (first table on the page)
sp500_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
table = pd.read_html(sp500_url)[0]

# Wikipedia writes share classes with a dot (e.g. BRK.B, BF.B) while Yahoo
# Finance expects a dash (BRK-B, BF-B). Without this normalisation those
# tickers come back empty and are silently skipped by the length check below.
tickers = [str(symbol).strip().replace(".", "-") for symbol in table['Symbol']]

# Add VIX and S&P 500 index (^VIX, ^GSPC)
tickers.extend(["^VIX", "^GSPC"])

# Fetch stock data and save one CSV per ticker
for ticker in tickers:
    print(f"Fetching data for: {ticker}")
    stock_data = yf.download(ticker, start="2010-01-01", end="2025-04-01")

    # A failed download yields an empty frame (or None in some yfinance
    # versions -- TODO confirm); treat both as "no data".
    n_days = 0 if stock_data is None else len(stock_data)

    # Skip tickers with fewer than 100 rows of history
    if n_days < 100:
        print(f"Skipping {ticker}, insufficient data: {n_days} days")
        continue  # Skip this stock and do not save data

    # Keep only the required columns (Open, High, Low, Close, Volume)
    stock_data = stock_data[['Open', 'High', 'Low', 'Close', 'Volume']]

    # Save data as CSV; the next cell cleans up the header rows of these files
    stock_data.to_csv(os.path.join(directory, f"{ticker}.csv"))
    print(f"Data for {ticker} has been saved")

print("All eligible stock data has been successfully saved.")


In [None]:
import os
import pandas as pd

# Set storage path
directory = "SP500_10_25"

# Clean every downloaded CSV in place. The cleaning is written to be
# idempotent: running this cell again on already-cleaned files changes nothing
# (an unconditional "drop the first two rows" would delete two real trading
# days on every re-run).
for ticker in os.listdir(directory):
    if not ticker.endswith(".csv"):
        continue

    file_path = os.path.join(directory, ticker)

    # Read the CSV file
    df = pd.read_csv(file_path)
    first_col = df.columns[0]

    # Keep only rows whose first column is an actual date. This removes the
    # extra header rows left by the multi-level column header of the download
    # (the original code dropped them by position) and is a no-op once the
    # file has been cleaned. utc=True only affects this throw-away mask and
    # lets tz-aware and naive date strings parse alike.
    parsed_dates = pd.to_datetime(df[first_col], errors="coerce", utc=True)
    df = df[parsed_dates.notna()]

    # Remove rows containing NaN values
    df = df.dropna().reset_index(drop=True)

    # Rename the first column to 'Date' (rename() instead of mutating
    # df.columns.values, which pokes at the Index's internal buffer)
    df = df.rename(columns={first_col: "Date"})

    # Save the modified file over the original
    df.to_csv(file_path, index=False)

    print(f"Processing complete: {ticker}")

print("🎉 All CSV files have been cleaned!")


In [None]:
import os
import pandas as pd

directory = "SP500_10_25"

# Names of every per-ticker CSV in the download directory
csv_files = [name for name in os.listdir(directory) if name.endswith(".csv")]

# Load each file and tag its rows with the ticker taken from the file name
dfs = [
    pd.read_csv(os.path.join(directory, name), parse_dates=["Date"]).assign(
        Stock_ID=name.replace(".csv", "")
    )
    for name in csv_files
]

# Stack all tickers into one long frame, ordered by Date and then Stock_ID
training_df = (
    pd.concat(dfs, ignore_index=True)
    .sort_values(by=["Date", "Stock_ID"])
    .reset_index(drop=True)
)

# Put the two key columns first; every other column keeps its relative order
leading = ["Date", "Stock_ID"]
cols = leading + [col for col in training_df.columns if col not in leading]
training_df = training_df[cols]

# Output directory for the merged dataset (note: `directory` is rebound here)
directory = "training"
if not os.path.exists(directory):
    os.makedirs(directory)

# Write the merged frame to a single CSV
training_csv_path = os.path.join(directory, "10_25_merged_stocks.csv")
training_df.to_csv(training_csv_path, index=False)

print(f"Merging completed. Merged file saved as '{training_csv_path}'.")

training_df.head(10)

In [None]:
import pandas as pd
import numpy as np

# Load the merged dataset. The path is relative to the notebook's working
# directory, matching the `training/` directory the merge cell writes to, so
# the notebook is not tied to one machine's home directory.
file_path = "training/10_25_merged_stocks.csv"
df = pd.read_csv(file_path, parse_dates=["Date"])

# Sort by Stock_ID first, then by Date, so each stock's rows are contiguous and
# chronological; reset the index so it is a clean 0..n-1 range.
df = df.sort_values(by=["Stock_ID", "Date"]).reset_index(drop=True)

# Function to calculate technical indicators
def calculate_features(df):
    """Add the prediction target and technical-indicator columns to ``df``.

    ``df`` must contain ``Stock_ID``, ``Close`` and ``Volume`` columns, have a
    unique index, and be in chronological order within each ``Stock_ID``.
    The frame is modified in place and also returned.

    Columns added (all computed separately for every ``Stock_ID``):

    * ``Return_1d`` -- the *next* day's close-to-close return. This is the
      forward-looking target and is NaN on each stock's last row.
    * ``Return_5d`` / ``Return_10d`` / ``Return_50d`` -- trailing returns.
    * ``Volatility_5d`` / ``_10d`` / ``_20d`` -- rolling standard deviation of
      the trailing daily return.
    * ``SMA_10`` / ``SMA_50`` / ``SMA_200`` -- rolling mean of ``Close``.
    * ``RSI_14`` -- ``100 - 100 / (1 + RS)`` where RS is the mean up-day
      return divided by the mean down-day loss over the last 14 daily returns.
    * ``Volume_Change_5d`` / ``_10d`` -- trailing percentage change in volume.

    Every feature uses information up to and including the current row only.
    In particular the volatility and RSI features are built from the trailing
    daily return, not from ``Return_1d``; rolling over the forward-shifted
    target would leak the value being predicted into the features.
    """
    stock_ids = df["Stock_ID"]
    close_by_stock = df.groupby("Stock_ID")["Close"]
    volume_by_stock = df.groupby("Stock_ID")["Volume"]

    def per_stock_rolling(series, window):
        # Rolling window that restarts at every stock boundary.
        return series.groupby(stock_ids).rolling(window)

    def flatten(rolled):
        # Drop the Stock_ID index level so the result aligns with df's index.
        return rolled.reset_index(level=0, drop=True)

    def relative_strength(window_returns):
        # Ratio of mean gain to mean loss for one window of daily returns.
        gains = window_returns[window_returns > 0]
        losses = -window_returns[window_returns < 0]
        if losses.size == 0:
            # No down days: RS is infinite (RSI 100). A window with no moves
            # at all leaves RS undefined.
            return np.inf if gains.size else np.nan
        if gains.size == 0:
            return 0.0  # only down days -> RSI 0
        return gains.mean() / losses.mean()

    # Trailing daily return (yesterday's close -> today's close). Used as the
    # input for the volatility / RSI features; not stored as a column.
    past_return_1d = close_by_stock.pct_change(1)

    # Target: next day's return. The shift is done per stock so the last row
    # of one stock can never pick up a value from the following stock.
    df["Return_1d"] = past_return_1d.groupby(stock_ids).shift(-1)

    # Trailing multi-day returns
    df["Return_5d"] = close_by_stock.pct_change(5)
    df["Return_10d"] = close_by_stock.pct_change(10)
    df["Return_50d"] = close_by_stock.pct_change(50)

    # Rolling volatility of trailing daily returns
    df["Volatility_5d"] = flatten(per_stock_rolling(past_return_1d, 5).std())
    df["Volatility_10d"] = flatten(per_stock_rolling(past_return_1d, 10).std())
    df["Volatility_20d"] = flatten(per_stock_rolling(past_return_1d, 20).std())

    # Momentum indicators
    df["SMA_10"] = flatten(close_by_stock.rolling(10).mean())
    df["SMA_50"] = flatten(close_by_stock.rolling(50).mean())
    df["SMA_200"] = flatten(close_by_stock.rolling(200).mean())

    # raw=True hands each window to relative_strength as a NumPy array
    rs = flatten(per_stock_rolling(past_return_1d, 14).apply(relative_strength, raw=True))
    df["RSI_14"] = 100 - (100 / (1 + rs))

    # Volume-based features (can be +/-inf when the base volume is zero)
    df["Volume_Change_5d"] = volume_by_stock.pct_change(5)
    df["Volume_Change_10d"] = volume_by_stock.pct_change(10)

    return df

# Apply feature calculations (adds the target and indicator columns to df)
df = calculate_features(df)

# Save the new dataset with features. The path is relative to the notebook's
# working directory (the same `training/` directory the merge cell writes to)
# rather than an absolute path that only exists on one machine.
output_path = "training/10_25_merged_stocks_features.csv"
df.to_csv(output_path, index=False)
print(f"Feature engineering complete. Saved to {output_path}")

df.head(10)

In [None]:
# The volume-change columns are, in the author's experience, the most
# troublesome ones, so summarise their distribution including the 1st and
# 99th percentiles before any cleaning.
volume_cols = ["Volume_Change_5d", "Volume_Change_10d"]
summary_percentiles = [0.01, 0.25, 0.5, 0.75, 0.99]
volume_summary = df[volume_cols].describe(percentiles=summary_percentiles)
print(volume_summary)

In [None]:
# Replace infinite values with NaN (so they can be dropped). Rebinding `df`
# instead of using inplace=True keeps this step from mutating the frame object
# that earlier cells displayed.
df = df.replace([np.inf, -np.inf], np.nan)

# Drop every row that has a NaN in any column. This also removes rows whose
# rolling features are still NaN and rows with no next-day Return_1d.
df = df.dropna().reset_index(drop=True)

# Save the cleaned dataset over the feature file. The path is relative to the
# notebook's working directory (the `training/` directory created by the merge
# cell) rather than a machine-specific absolute path.
file_path = "training/10_25_merged_stocks_features.csv"
df.to_csv(file_path, index=False)

print(f"Cleaned dataset saved as '{file_path}'.")

df.head(10)

In [None]:
# Re-run the same distribution summary of the volume-change columns to double
# check them after the cleaning step.
checked_cols = ["Volume_Change_5d", "Volume_Change_10d"]
checked_percentiles = [0.01, 0.25, 0.5, 0.75, 0.99]
cleaned_summary = df[checked_cols].describe(percentiles=checked_percentiles)
print(cleaned_summary)

In [None]:
import pandas as pd

# Load the cleaned feature dataset. All paths in this cell are relative to the
# notebook's working directory (the `training/` directory created by the merge
# cell) instead of a machine-specific absolute path.
output_dir = "training"
file_path = f"{output_dir}/10_25_merged_stocks_features.csv"
df = pd.read_csv(file_path, parse_dates=["Date"])

# Define split dates (all bounds are inclusive)
train_end_date = "2020-12-31"
valid_start_date = "2021-01-01"
valid_end_date = "2021-12-31"
test_start_date = "2022-01-01"
test_end_date = "2025-04-01"

# Chronological split into training, validation, and testing sets:
# everything up to 2020 -> train, 2021 -> validation, 2022 onwards -> test
train_df = df[df["Date"] <= train_end_date]
valid_df = df[(df["Date"] >= valid_start_date) & (df["Date"] <= valid_end_date)]
test_df = df[(df["Date"] >= test_start_date) & (df["Date"] <= test_end_date)]

# Save each split to its own CSV
train_path = f"{output_dir}/train_2010_2020.csv"
valid_path = f"{output_dir}/valid_2021_2021.csv"
test_path = f"{output_dir}/test_2022_2025.csv"

train_df.to_csv(train_path, index=False)
valid_df.to_csv(valid_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"Training set saved: {train_path} ({len(train_df)} rows)")
print(f"Validation set saved: {valid_path} ({len(valid_df)} rows)")
print(f"Testing set saved: {test_path} ({len(test_df)} rows)")
