In [16]:
import os
from pathlib import Path
import yfinance as yf
import json
import matplotlib.pyplot as plt
from typing import List

import pandas as pd
import numpy as np

In [19]:
tickers = {
    "tech": ["AAPL", "MSFT", "TSLA"],
    "healthcare": ["JNJ", "PFE", "MRK"],
    "financials": ["JPM", "BAC", "WFC"],
    "energy": ["XOM", "CVX", "SLB"],
    "consumer": ["PG", "KO", "WMT"],
    "crypto": ["BTC-USD"],
    "commodities": ["GC=F", "CL=F", "LITM"]
}

In [8]:
for sector, symbols in tickers.items():
    print(f"Fetching data for {sector} sector...")
    for symbol in symbols:
        try:
            ticker_data = yf.Ticker(symbol)
            hist = ticker_data.history(period="max")
            hist.to_csv(f"historical_data/{symbol}_data.csv")
            print(f"{symbol} data fetched successfully.")
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")
    print(f"Finished fetching data for {sector} sector.\n")

Fetching data for tech sector...
AAPL data fetched successfully.
MSFT data fetched successfully.
TSLA data fetched successfully.
Finished fetching data for tech sector.

Fetching data for healthcare sector...
JNJ data fetched successfully.
PFE data fetched successfully.
MRK data fetched successfully.
Finished fetching data for healthcare sector.

Fetching data for financials sector...
JPM data fetched successfully.
BAC data fetched successfully.
WFC data fetched successfully.
Finished fetching data for financials sector.

Fetching data for energy sector...
XOM data fetched successfully.
CVX data fetched successfully.
SLB data fetched successfully.
Finished fetching data for energy sector.

Fetching data for consumer sector...
PG data fetched successfully.
KO data fetched successfully.
WMT data fetched successfully.
Finished fetching data for consumer sector.

Fetching data for crypto sector...
BTC-USD data fetched successfully.
Finished fetching data for crypto sector.

Fetching data f

In [None]:
#!/usr/bin/env python3
"""
split_csv_by_date.py

For each CSV in `file_paths`, read it with pandas, split into train/test based on the 'Date' column,
and write out two new files:
    [original_filename]_train.csv   (all rows with Date < 2024-01-01)
    [original_filename]_test.csv    (all rows with Date >= 2024-01-01)

Usage:
    python split_csv_by_date.py

Adjust `file_paths` below (or modify this script to accept command-line arguments).
"""
# ----------------------------------------------------------------------
# STEP 1: Specify your CSV files here. You can list absolute or relative paths.
# ----------------------------------------------------------------------
file_paths = Path("historical_data").glob("*.csv")

# ----------------------------------------------------------------------
# STEP 2: Define the cutoff date for the test set
# ----------------------------------------------------------------------
# We want test = all rows with Date >= January 1, 2024
TEST_CUTOFF = pd.Timestamp("2024-01-01", tz="UTC")  # Ensure timezone consistency if needed

# ----------------------------------------------------------------------
# STEP 3: Loop over each CSV, read, split, and write out train/test
# ----------------------------------------------------------------------
for csv_path in file_paths:

    # 2. Read the CSV, parsing 'Date' as datetime:
    try:
        df = pd.read_csv(csv_path, parse_dates=["Date"])
    except Exception as e:
        print(f"Error reading '{csv_path}': {e}")
        continue

    # 3. Verify that the 'Date' column exists and is datetime64:
    if "Date" not in df.columns:
        print(f"Warning: 'Date' column not found in '{csv_path}'. Skipping.")
        continue

    if not pd.api.types.is_datetime64_any_dtype(df["Date"]):
        # If pandas did not parse it as datetime, try to convert explicitly:
        try:
            df["Date"] = pd.to_datetime(df["Date"], errors="raise", utc=True)
        except Exception as e:
            print(f"Could not convert 'Date' to datetime in '{csv_path}': {e}")
            continue

    # 4. Split into train/test
    mask_test = df["Date"] >= TEST_CUTOFF
    df_test = df.loc[mask_test].copy()
    df_train = df.loc[~mask_test].copy()

    # 5. Construct output filenames:
    base, ext = os.path.splitext(csv_path)
    train_path = f"{base}_train{ext}"
    test_path  = f"{base}_test{ext}"

    # 6. Write them out (index=False to avoid writing the index column unless you want it):
    try:
        df_train.to_csv(train_path, index=False)
        df_test.to_csv(test_path, index=False)
        print(f"Split '{csv_path}' →")
        print(f"    TRAIN ({len(df_train)} rows) → {train_path}")
        print(f"    TEST  ({len(df_test)} rows) → {test_path}")
    except Exception as e:
        print(f"Error writing split files for '{csv_path}': {e}")

Split 'historical_data/BTC-USD_data.csv' →
    TRAIN (3393 rows) → historical_data/BTC-USD_data_train.csv
    TEST  (522 rows) → historical_data/BTC-USD_data_test.csv
Split 'historical_data/SLB_data.csv' →
    TRAIN (10588 rows) → historical_data/SLB_data_train.csv
    TEST  (358 rows) → historical_data/SLB_data_test.csv
Split 'historical_data/JPM_data.csv' →
    TRAIN (11041 rows) → historical_data/JPM_data_train.csv
    TEST  (358 rows) → historical_data/JPM_data_test.csv
Split 'historical_data/GC=F_data.csv' →
    TRAIN (5854 rows) → historical_data/GC=F_data_train.csv
    TEST  (359 rows) → historical_data/GC=F_data_test.csv
Split 'historical_data/JNJ_data.csv' →
    TRAIN (15606 rows) → historical_data/JNJ_data_train.csv
    TEST  (357 rows) → historical_data/JNJ_data_test.csv
Split 'historical_data/BTC-USD_data_test.csv' →
    TRAIN (0 rows) → historical_data/BTC-USD_data_test_train.csv
    TEST  (522 rows) → historical_data/BTC-USD_data_test_test.csv
Split 'historical_data/BAC_d

In [25]:
def moving_average_crossover(df: pd.DataFrame,
                             short_window: int = 20,
                             long_window: int = 50) -> pd.DataFrame:
    """
    Compute a simple moving‐average crossover strategy on `df`.
    Assumes `df` has a 'Date' column (or datetime index) and a 'Close' column.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Date' (or be indexed by datetime) and 'Close'.
    short_window : int
        Window size for the short SMA (e.g., 20 days).
    long_window : int
        Window size for the long SMA (e.g., 50 days).

    Returns
    -------
    pd.DataFrame
        Original DataFrame with the following added columns:
          - 'SMA_short' : short‐period moving average
          - 'SMA_long'  : long‐period moving average
          - 'Signal'    : 1 when short crosses above long,
                          -1 when short crosses below long, else 0
    """

    df = df.copy()

    # Ensure 'Date' is a datetime type and set as index if not already
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], errors="raise", utc=True)
        df.set_index("Date", inplace=True)
    elif not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("DataFrame must have a 'Date' column or be indexed by datetime.")

    # 1) Compute Simple Moving Averages
    df["SMA_short"] = df["Close"].rolling(window=short_window, min_periods=1).mean()
    df["SMA_long"]  = df["Close"].rolling(window=long_window, min_periods=1).mean()

    # 2) Generate raw crossover signals: 
    #    - When SMA_short > SMA_long → 1 (long), when SMA_short < SMA_long → 0
    df["Crossover"] = np.where(df["SMA_short"] > df["SMA_long"], 1, 0)

    # 3) The actual “Signal” is the change in Crossover:
    #    - A rising edge (0→1) means a buy signal (1)
    #    - A falling edge (1→0) means a sell signal (−1)
    df["Signal"] = df["Crossover"].diff().fillna(0)
    df["Signal"] = df["Signal"].map({1.0: 1, -1.0: -1}).fillna(0).astype(int)

    # Rows where no crossover event happen remain at 0
    # Cleanup temporary column if you don’t need it:
    df.drop(columns=["Crossover"], inplace=True)

    return df

In [26]:
def bollinger_bands_strategy(df: pd.DataFrame,
                             window: int = 20,
                             num_std: float = 2.0) -> pd.DataFrame:
    """
    Compute a simple Bollinger Bands breakout strategy on `df`.
    Assumes `df` has a 'Date' column (or datetime index) and a 'Close' column.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Date' (or be indexed by datetime) and 'Close'.
    window : int
        Rolling window size for moving average and standard deviation.
    num_std : float
        Number of standard deviations for upper/lower bands.

    Returns
    -------
    pd.DataFrame
        Original DataFrame with these added columns:
          - 'BB_mid'   : rolling mean (middle band)
          - 'BB_std'   : rolling standard deviation
          - 'BB_upper' : BB_mid + num_std * BB_std
          - 'BB_lower' : BB_mid − num_std * BB_std
          - 'Signal'   :  1 when Close < BB_lower (buy)
                          -1 when Close > BB_upper (sell), else 0
    """

    df = df.copy()

    # Ensure 'Date' is datetime and set as index if needed
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], errors="raise", utc=True)
        df.set_index("Date", inplace=True)
    elif not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("DataFrame must have a 'Date' column or be indexed by datetime.")

    # 1) Compute rolling mean (mid) and rolling std (std) over 'window'
    df["BB_mid"] = df["Close"].rolling(window=window, min_periods=1).mean()
    df["BB_std"] = df["Close"].rolling(window=window, min_periods=1).std(ddof=0)

    # 2) Compute upper and lower bands
    df["BB_upper"] = df["BB_mid"] + (num_std * df["BB_std"])
    df["BB_lower"] = df["BB_mid"] - (num_std * df["BB_std"])

    # 3) Generate signals:
    #    - Buy (1)  when price closes below the lower band
    #    - Sell (-1) when price closes above the upper band
    conditions = [
        df["Close"] < df["BB_lower"],  # price has broken below lower band
        df["Close"] > df["BB_upper"],  # price has broken above upper band
    ]
    choices = [1, -1]

    df["Signal"] = np.select(conditions, choices, default=0).astype(int)

    return df

In [27]:
def backtest_with_shorts(
    df: pd.DataFrame,
    price_col: str = "Close",
    signal_col: str = "Signal"
) -> List[float]:
    """
    Backtest a strategy that can go long (signal=+1), short (signal=-1), or flat (signal=0).
    For each change in signal, it closes any open position and (if new signal != 0) opens a new position.
    Returns a list of percent PnLs for each completed trade.

    Parameters
    ----------
    df : pd.DataFrame
        Must be sorted in ascending date order and contain:
          - price_col   : price at which to execute trades (float)
          - signal_col  : integer signal (+1=open long, -1=open short, 0=flat)
    price_col : str
        Name of the column with execution prices.
    signal_col : str
        Name of the column with signals (+1, -1, 0).

    Returns
    -------
    List[float]
        Percent PnLs for each round-trip (long or short) trade.
    """
    pnls: List[float] = []
    current_position = 0   # +1 for long, -1 for short, 0 for flat
    entry_price = None     # price at which the current position was opened

    for idx, row in df.iterrows():
        sig = int(row[signal_col])
        price = float(row[price_col])

        # If the signal changes from whatever we currently hold:
        if sig != current_position:
            # 1) Close any existing position
            if current_position != 0 and entry_price is not None:
                if current_position == 1:
                    # closing a long
                    pnl = (price - entry_price) / entry_price * 100.0
                else:  # current_position == -1, closing a short
                    pnl = (entry_price - price) / entry_price * 100.0

                pnls.append(pnl)
                entry_price = None
                current_position = 0

            # 2) Open a new position if sig != 0
            if sig != 0:
                current_position = sig
                entry_price = price

        # If sig == current_position, do nothing (hold the position)
        # If sig == 0 and current_position == 0, do nothing (remain flat)

    # At the end: if a position is still open, close it at the last price
    if current_position != 0 and entry_price is not None:
        last_price = float(df.iloc[-1][price_col])
        if current_position == 1:
            pnl = (last_price - entry_price) / entry_price * 100.0
        else:  # current_position == -1
            pnl = (entry_price - last_price) / entry_price * 100.0

        pnls.append(pnl)

    return pnls

In [28]:
for sector, symbols in tickers.items():
    print(f"Processing strategies for {sector} sector...")
    for symbol in symbols:
        try:
            file_path = Path(f"historical_data/{symbol}_data.csv")
            if not file_path.exists():
                print(f"File {file_path} does not exist. Skipping.")
                continue

            df = pd.read_csv(file_path)

            # Apply moving average crossover strategy
            df_ma = moving_average_crossover(df)
            ma_pnls = backtest_with_shorts(df_ma)
            print(f"{symbol} MA Strategy PnLs: {ma_pnls}")

            with open(f"backtest_pnl/{symbol}_ma_strategy.json", "w") as f:
                json.dump(ma_pnls, f)

            # Apply Bollinger Bands strategy
            df_bb = bollinger_bands_strategy(df)
            bb_pnls = backtest_with_shorts(df_bb)
            print(f"{symbol} BB Strategy PnLs: {bb_pnls}")

            with open(f"backtest_pnl/{symbol}_bb_strategy.json", "w") as f:
                json.dump(bb_pnls, f)

        except Exception as e:
            print(f"Error processing {symbol}: {e}")
    print(f"Finished processing strategies for {sector} sector.\n")

Processing strategies for tech sector...
AAPL MA Strategy PnLs: [0.4098232207753784, -3.619667059805339, 0.0, -3.97985033450906, 1.2657470373739659, 3.3331112518271135, -4.3722809115622585, -0.6210764410992258, -1.0100015282172428, 5.579047496288184, 0.7490067921926988, 2.662526403280507, -0.9998939805271775, -0.2644635607854559, 4.2778261592897895, -4.499753200571398, 0.46941920990140296, -5.318821631949819, 5.976380372473626, -2.979280800160789, -2.2220800906675735, 0.9301622854902969, 2.4278797012246995, 6.075171978364753, -1.515035208620282, 3.7591384669264163, -2.3075525468034286, 2.4588713629576273, -0.7873596035147118, 2.3573368563395634, -1.9866365885743538, -1.1110956620613657, 0.3533220601420053, 3.4014504133595547, 4.658358039481357, 0.2984732279324393, 0.8063863579928209, -1.2196999619130087, -3.3334207370313824, 0.840529381037427, -1.5062020113282084, 1.2050009668552684, -1.7965851881888226, 0.0, 1.8690198699896374, 0.0, 0.857111423706889, -1.1301333533890294, 0.3389452395

In [17]:
# # Cumulative PnL plot
# cumulative_pnl = np.cumsum(backtest_results)
# plt.figure(figsize=(12, 6))
# plt.plot(cumulative_pnl, marker='o', linestyle='-', color='blue')
# plt.title("Cumulative PnL from Backtest")
# plt.xlabel("Trade Number")
# plt.ylabel("Cumulative PnL (%)")
# plt.grid()
# plt.axhline(0, color='red', linestyle='--', linewidth=1)
# plt.show()

In [32]:
pnls_df = pd.DataFrame()
for sector, symbols in tickers.items():
    print(f"Processing strategies for {sector} sector...")
    for symbol in symbols:
        try:
            backtest_ema_path = Path(f"backtest_pnl/{symbol}_ma_strategy.json")
            if not backtest_ema_path.exists():
                print(f"File {backtest_ema_path} does not exist. Skipping.")
                continue
            with open(backtest_ema_path, "r") as f:
                ma_pnls = json.load(f)
            
            backtest_bb_path = Path(f"backtest_pnl/{symbol}_bb_strategy.json")
            if not backtest_bb_path.exists():
                print(f"File {backtest_bb_path} does not exist. Skipping.")
                continue
            with open(backtest_bb_path, "r") as f:
                bb_pnls = json.load(f)
            
            # Handle different lengths by padding with NaN
            max_len = max(len(ma_pnls), len(bb_pnls))
            ma_pnls_padded = ma_pnls + [np.nan] * (max_len - len(ma_pnls))
            bb_pnls_padded = bb_pnls + [np.nan] * (max_len - len(bb_pnls))
            
            # Create a DataFrame for the PnLs
            symbol_df = pd.DataFrame({
                f"{symbol}_MA_PnL": ma_pnls_padded,
                f"{symbol}_BB_PnL": bb_pnls_padded
            })
            pnls_df = pd.concat([pnls_df, symbol_df], axis=1)
        except Exception as e:
            print(f"Error processing {symbol}: {e}")

Processing strategies for tech sector...
Processing strategies for healthcare sector...
Processing strategies for financials sector...
Processing strategies for energy sector...
Processing strategies for consumer sector...
Processing strategies for crypto sector...
Processing strategies for commodities sector...


In [33]:
pnls_df.head()

Unnamed: 0,AAPL_MA_PnL,AAPL_BB_PnL,MSFT_MA_PnL,MSFT_BB_PnL,TSLA_MA_PnL,TSLA_BB_PnL,JNJ_MA_PnL,JNJ_BB_PnL,PFE_MA_PnL,PFE_BB_PnL,...,WMT_MA_PnL,WMT_BB_PnL,BTC-USD_MA_PnL,BTC-USD_BB_PnL,GC=F_MA_PnL,GC=F_BB_PnL,CL=F_MA_PnL,CL=F_BB_PnL,LITM_MA_PnL,LITM_BB_PnL
0,0.409823,-1.408318,0.434728,2.499858,-4.205355,-3.730999,0.934788,-8.396756,0.29586,0.584696,...,0.0,-6.508539,-1.953947,0.368847,-0.628924,0.483267,-3.829158,-1.087271,-7.291666,8.724832
1,-3.619667,-2.212264,0.0,-3.936761,-5.392702,0.047514,-0.526474,1.67463,0.605989,-4.046283,...,1.135864,-0.368625,2.620586,2.894592,0.112608,1.005743,-3.338732,1.030372,-1.11524,8.061011
2,0.0,-4.762606,-0.819644,-3.030956,2.726774,4.73155,-0.167294,-1.648263,0.290557,0.546504,...,-5.20962,0.770333,0.171375,6.077758,-0.305455,-0.713758,0.797553,9.466665,5.078133,4.615386
3,-3.97985,-10.415958,0.840364,-4.274165,-0.743629,7.142813,-3.914073,-5.952346,0.0,1.538303,...,-1.112792,-0.371325,0.245751,1.195566,0.346563,1.369868,-0.412369,-0.57389,3.488378,-2.163458
4,1.265747,1.716632,0.0,8.333897,-3.927946,1.968886,2.491096,-0.542389,-0.284879,-2.906873,...,1.672598,1.901602,-0.506589,-2.818739,-0.075096,0.518324,-1.544942,1.067408,-1.582282,19.322709


In [34]:
cov_matrix = pnls_df.cov()

In [35]:
cov_matrix

Unnamed: 0,AAPL_MA_PnL,AAPL_BB_PnL,MSFT_MA_PnL,MSFT_BB_PnL,TSLA_MA_PnL,TSLA_BB_PnL,JNJ_MA_PnL,JNJ_BB_PnL,PFE_MA_PnL,PFE_BB_PnL,...,WMT_MA_PnL,WMT_BB_PnL,BTC-USD_MA_PnL,BTC-USD_BB_PnL,GC=F_MA_PnL,GC=F_BB_PnL,CL=F_MA_PnL,CL=F_BB_PnL,LITM_MA_PnL,LITM_BB_PnL
AAPL_MA_PnL,6.612058,1.047884,-0.039571,0.182523,-0.582529,0.01331,0.14348,0.324252,0.233974,0.859866,...,0.664274,-0.254922,-0.408901,-1.100821,-0.022583,0.023313,0.491267,-0.262265,22.3657,1.919502
AAPL_BB_PnL,1.047884,12.04113,-0.079131,1.049853,0.898736,-3.402344,0.108467,-0.142894,0.376143,0.112294,...,0.035594,-0.261867,-2.437087,3.531281,-0.845223,-0.061,0.340248,-1.895009,2.460203,-0.973584
MSFT_MA_PnL,-0.039571,-0.079131,6.936777,0.069529,-0.237782,0.759083,0.086165,0.005531,0.080032,-0.230675,...,0.434681,-0.238397,-1.210558,-0.387984,0.046374,0.188448,-0.303054,-0.150038,-15.673501,8.679506
MSFT_BB_PnL,0.182523,1.049853,0.069529,10.738644,-1.893901,-0.122449,0.584907,-0.519511,0.277329,-0.245826,...,0.522727,-1.334315,0.407966,-0.897525,-0.028556,-0.182787,-0.915011,0.441402,-12.600879,-22.575186
TSLA_MA_PnL,-0.582529,0.898736,-0.237782,-1.893901,16.488339,0.022888,0.177176,0.595431,-1.177133,0.387746,...,-0.970118,3.20401,0.783902,2.48321,-0.430089,0.339389,0.300595,-1.061876,14.156971,-5.747745
TSLA_BB_PnL,0.01331,-3.402344,0.759083,-0.122449,0.022888,41.867903,-0.596771,0.921788,0.550154,-0.727909,...,-0.674879,1.012062,2.864335,-2.328421,0.338887,-0.484048,-0.149129,0.218221,-3.349855,-10.20826
JNJ_MA_PnL,0.14348,0.108467,0.086165,0.584907,0.177176,-0.596771,1.974212,-0.286883,0.101659,-0.07646,...,0.141119,-0.01551,-0.012557,0.213178,0.149092,0.039101,-0.207054,0.282441,-3.726938,-0.915324
JNJ_BB_PnL,0.324252,-0.142894,0.005531,-0.519511,0.595431,0.921788,-0.286883,4.162703,0.294328,0.255643,...,-0.033791,-0.51915,2.367399,1.444769,0.051235,-0.163872,0.295211,-1.065563,10.789669,-1.598727
PFE_MA_PnL,0.233974,0.376143,0.080032,0.277329,-1.177133,0.550154,0.101659,0.294328,2.490745,0.254345,...,0.102718,0.037766,0.876191,1.112527,0.17241,-0.145856,-0.015447,-0.155972,6.299783,-4.203641
PFE_BB_PnL,0.859866,0.112294,-0.230675,-0.245826,0.387746,-0.727909,-0.07646,0.255643,0.254345,5.127396,...,0.360032,0.054583,-0.382907,0.068324,-0.228264,-0.161083,0.040386,-0.675884,1.356473,-4.42789


In [36]:
# Save the covariance matrix to a CSV file for later use
cov_matrix.to_csv("covariance_matrix.csv")