In [1]:
import yfinance as yf

In [5]:
tickers = {
    "tech": ["AAPL", "MSFT", "TSLA"],
    "healthcare": ["JNJ", "PFE", "MRK"],
    "financials": ["JPM", "BAC", "WFC"],
    "energy": ["XOM", "CVX", "SLB"],
    "consumer": ["PG", "KO", "WMT"],
    "crypto": ["BTC-USD"],
    "commodities": ["GC=F", "CL=F", "LITM"]
}

In [8]:
for sector, symbols in tickers.items():
    print(f"Fetching data for {sector} sector...")
    for symbol in symbols:
        try:
            ticker_data = yf.Ticker(symbol)
            hist = ticker_data.history(period="max")
            hist.to_csv(f"historical_data/{symbol}_data.csv")
            print(f"{symbol} data fetched successfully.")
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")
    print(f"Finished fetching data for {sector} sector.\n")

Fetching data for tech sector...
AAPL data fetched successfully.
MSFT data fetched successfully.
TSLA data fetched successfully.
Finished fetching data for tech sector.

Fetching data for healthcare sector...
JNJ data fetched successfully.
PFE data fetched successfully.
MRK data fetched successfully.
Finished fetching data for healthcare sector.

Fetching data for financials sector...
JPM data fetched successfully.
BAC data fetched successfully.
WFC data fetched successfully.
Finished fetching data for financials sector.

Fetching data for energy sector...
XOM data fetched successfully.
CVX data fetched successfully.
SLB data fetched successfully.
Finished fetching data for energy sector.

Fetching data for consumer sector...
PG data fetched successfully.
KO data fetched successfully.
WMT data fetched successfully.
Finished fetching data for consumer sector.

Fetching data for crypto sector...
BTC-USD data fetched successfully.
Finished fetching data for crypto sector.

Fetching data f

In [11]:
#!/usr/bin/env python3
"""
split_csv_by_date.py

For each CSV in `file_paths`, read it with pandas, split into train/test based on the 'Date' column,
and write out two new files:
    [original_filename]_train.csv   (all rows with Date < 2024-01-01)
    [original_filename]_test.csv    (all rows with Date >= 2024-01-01)

Usage:
    python split_csv_by_date.py

Adjust `file_paths` below (or modify this script to accept command-line arguments).
"""

import os
import pandas as pd
from pathlib import Path

# ----------------------------------------------------------------------
# STEP 1: Specify your CSV files here. You can list absolute or relative paths.
# ----------------------------------------------------------------------
file_paths = Path("historical_data").glob("*.csv")

# ----------------------------------------------------------------------
# STEP 2: Define the cutoff date for the test set
# ----------------------------------------------------------------------
# We want test = all rows with Date >= January 1, 2024
TEST_CUTOFF = pd.Timestamp("2024-01-01", tz="UTC")  # Ensure timezone consistency if needed

# ----------------------------------------------------------------------
# STEP 3: Loop over each CSV, read, split, and write out train/test
# ----------------------------------------------------------------------
for csv_path in file_paths:

    # 2. Read the CSV, parsing 'Date' as datetime:
    try:
        df = pd.read_csv(csv_path, parse_dates=["Date"])
    except Exception as e:
        print(f"Error reading '{csv_path}': {e}")
        continue

    # 3. Verify that the 'Date' column exists and is datetime64:
    if "Date" not in df.columns:
        print(f"Warning: 'Date' column not found in '{csv_path}'. Skipping.")
        continue

    if not pd.api.types.is_datetime64_any_dtype(df["Date"]):
        # If pandas did not parse it as datetime, try to convert explicitly:
        try:
            df["Date"] = pd.to_datetime(df["Date"], errors="raise", utc=True)
        except Exception as e:
            print(f"Could not convert 'Date' to datetime in '{csv_path}': {e}")
            continue

    # 4. Split into train/test
    mask_test = df["Date"] >= TEST_CUTOFF
    df_test = df.loc[mask_test].copy()
    df_train = df.loc[~mask_test].copy()

    # 5. Construct output filenames:
    base, ext = os.path.splitext(csv_path)
    train_path = f"{base}_train{ext}"
    test_path  = f"{base}_test{ext}"

    # 6. Write them out (index=False to avoid writing the index column unless you want it):
    try:
        df_train.to_csv(train_path, index=False)
        df_test.to_csv(test_path, index=False)
        print(f"Split '{csv_path}' →")
        print(f"    TRAIN ({len(df_train)} rows) → {train_path}")
        print(f"    TEST  ({len(df_test)} rows) → {test_path}")
    except Exception as e:
        print(f"Error writing split files for '{csv_path}': {e}")

Split 'historical_data/BTC-USD_data.csv' →
    TRAIN (3393 rows) → historical_data/BTC-USD_data_train.csv
    TEST  (522 rows) → historical_data/BTC-USD_data_test.csv
Split 'historical_data/SLB_data.csv' →
    TRAIN (10588 rows) → historical_data/SLB_data_train.csv
    TEST  (358 rows) → historical_data/SLB_data_test.csv
Split 'historical_data/JPM_data.csv' →
    TRAIN (11041 rows) → historical_data/JPM_data_train.csv
    TEST  (358 rows) → historical_data/JPM_data_test.csv
Split 'historical_data/GC=F_data.csv' →
    TRAIN (5854 rows) → historical_data/GC=F_data_train.csv
    TEST  (359 rows) → historical_data/GC=F_data_test.csv
Split 'historical_data/JNJ_data.csv' →
    TRAIN (15606 rows) → historical_data/JNJ_data_train.csv
    TEST  (357 rows) → historical_data/JNJ_data_test.csv
Split 'historical_data/BTC-USD_data_test.csv' →
    TRAIN (0 rows) → historical_data/BTC-USD_data_test_train.csv
    TEST  (522 rows) → historical_data/BTC-USD_data_test_test.csv
Split 'historical_data/BAC_d

In [1]:
import pandas as pd
import numpy as np

def moving_average_crossover(df: pd.DataFrame,
                             short_window: int = 20,
                             long_window: int = 50) -> pd.DataFrame:
    """
    Compute a simple moving‐average crossover strategy on `df`.
    Assumes `df` has a 'Date' column (or datetime index) and a 'Close' column.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Date' (or be indexed by datetime) and 'Close'.
    short_window : int
        Window size for the short SMA (e.g., 20 days).
    long_window : int
        Window size for the long SMA (e.g., 50 days).

    Returns
    -------
    pd.DataFrame
        Original DataFrame with the following added columns:
          - 'SMA_short' : short‐period moving average
          - 'SMA_long'  : long‐period moving average
          - 'Signal'    : 1 when short crosses above long,
                          -1 when short crosses below long, else 0
    """

    df = df.copy()

    # Ensure 'Date' is a datetime type and set as index if not already
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], errors="raise")
        df.set_index("Date", inplace=True)
    elif not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("DataFrame must have a 'Date' column or be indexed by datetime.")

    # 1) Compute Simple Moving Averages
    df["SMA_short"] = df["Close"].rolling(window=short_window, min_periods=1).mean()
    df["SMA_long"]  = df["Close"].rolling(window=long_window, min_periods=1).mean()

    # 2) Generate raw crossover signals: 
    #    - When SMA_short > SMA_long → 1 (long), when SMA_short < SMA_long → 0
    df["Crossover"] = np.where(df["SMA_short"] > df["SMA_long"], 1, 0)

    # 3) The actual “Signal” is the change in Crossover:
    #    - A rising edge (0→1) means a buy signal (1)
    #    - A falling edge (1→0) means a sell signal (−1)
    df["Signal"] = df["Crossover"].diff().fillna(0)
    df["Signal"] = df["Signal"].map({1.0: 1, -1.0: -1}).fillna(0).astype(int)

    # Rows where no crossover event happen remain at 0
    # Cleanup temporary column if you don’t need it:
    df.drop(columns=["Crossover"], inplace=True)

    return df

In [16]:
import pandas as pd
import numpy as np

def bollinger_bands_strategy(df: pd.DataFrame,
                             window: int = 20,
                             num_std: float = 2.0) -> pd.DataFrame:
    """
    Compute a simple Bollinger Bands breakout strategy on `df`.
    Assumes `df` has a 'Date' column (or datetime index) and a 'Close' column.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Date' (or be indexed by datetime) and 'Close'.
    window : int
        Rolling window size for moving average and standard deviation.
    num_std : float
        Number of standard deviations for upper/lower bands.

    Returns
    -------
    pd.DataFrame
        Original DataFrame with these added columns:
          - 'BB_mid'   : rolling mean (middle band)
          - 'BB_std'   : rolling standard deviation
          - 'BB_upper' : BB_mid + num_std * BB_std
          - 'BB_lower' : BB_mid − num_std * BB_std
          - 'Signal'   :  1 when Close < BB_lower (buy)
                          -1 when Close > BB_upper (sell), else 0
    """

    df = df.copy()

    # Ensure 'Date' is datetime and set as index if needed
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], errors="raise")
        df.set_index("Date", inplace=True)
    elif not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("DataFrame must have a 'Date' column or be indexed by datetime.")

    # 1) Compute rolling mean (mid) and rolling std (std) over 'window'
    df["BB_mid"] = df["Close"].rolling(window=window, min_periods=1).mean()
    df["BB_std"] = df["Close"].rolling(window=window, min_periods=1).std(ddof=0)

    # 2) Compute upper and lower bands
    df["BB_upper"] = df["BB_mid"] + (num_std * df["BB_std"])
    df["BB_lower"] = df["BB_mid"] - (num_std * df["BB_std"])

    # 3) Generate signals:
    #    - Buy (1)  when price closes below the lower band
    #    - Sell (-1) when price closes above the upper band
    conditions = [
        df["Close"] < df["BB_lower"],  # price has broken below lower band
        df["Close"] > df["BB_upper"],  # price has broken above upper band
    ]
    choices = [1, -1]

    df["Signal"] = np.select(conditions, choices, default=0).astype(int)

    return df

In [2]:
import pandas as pd
from typing import List

def backtest_with_shorts(
    df: pd.DataFrame,
    price_col: str = "Close",
    signal_col: str = "Signal"
) -> List[float]:
    """
    Backtest a strategy that can go long (signal=+1), short (signal=-1), or flat (signal=0).
    For each change in signal, it closes any open position and (if new signal != 0) opens a new position.
    Returns a list of percent PnLs for each completed trade.

    Parameters
    ----------
    df : pd.DataFrame
        Must be sorted in ascending date order and contain:
          - price_col   : price at which to execute trades (float)
          - signal_col  : integer signal (+1=open long, -1=open short, 0=flat)
    price_col : str
        Name of the column with execution prices.
    signal_col : str
        Name of the column with signals (+1, -1, 0).

    Returns
    -------
    List[float]
        Percent PnLs for each round-trip (long or short) trade.
    """
    pnls: List[float] = []
    current_position = 0   # +1 for long, -1 for short, 0 for flat
    entry_price = None     # price at which the current position was opened

    for idx, row in df.iterrows():
        sig = int(row[signal_col])
        price = float(row[price_col])

        # If the signal changes from whatever we currently hold:
        if sig != current_position:
            # 1) Close any existing position
            if current_position != 0 and entry_price is not None:
                if current_position == 1:
                    # closing a long
                    pnl = (price - entry_price) / entry_price * 100.0
                else:  # current_position == -1, closing a short
                    pnl = (entry_price - price) / entry_price * 100.0

                pnls.append(pnl)
                entry_price = None
                current_position = 0

            # 2) Open a new position if sig != 0
            if sig != 0:
                current_position = sig
                entry_price = price

        # If sig == current_position, do nothing (hold the position)
        # If sig == 0 and current_position == 0, do nothing (remain flat)

    # At the end: if a position is still open, close it at the last price
    if current_position != 0 and entry_price is not None:
        last_price = float(df.iloc[-1][price_col])
        if current_position == 1:
            pnl = (last_price - entry_price) / entry_price * 100.0
        else:  # current_position == -1
            pnl = (entry_price - last_price) / entry_price * 100.0

        pnls.append(pnl)

    return pnls

In [3]:
df = pd.read_csv("historical_data/AAPL_data_train.csv")
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1980-12-12 05:00:00+00:00,0.098597,0.099025,0.098597,0.098597,469033600,0.0,0.0
1,1980-12-15 05:00:00+00:00,0.093881,0.093881,0.093453,0.093453,175884800,0.0,0.0
2,1980-12-16 05:00:00+00:00,0.087022,0.087022,0.086594,0.086594,105728000,0.0,0.0
3,1980-12-17 05:00:00+00:00,0.088737,0.089165,0.088737,0.088737,86441600,0.0,0.0
4,1980-12-18 05:00:00+00:00,0.091310,0.091738,0.091310,0.091310,73449600,0.0,0.0
...,...,...,...,...,...,...,...,...
10848,2023-12-22 05:00:00+00:00,193.761051,193.989390,191.567126,192.192551,37122800,0.0,0.0
10849,2023-12-26 05:00:00+00:00,192.202487,192.480450,191.428159,191.646561,28919300,0.0,0.0
10850,2023-12-27 05:00:00+00:00,191.090629,192.093281,189.700797,191.745819,48087700,0.0,0.0
10851,2023-12-28 05:00:00+00:00,192.728641,193.244865,191.765691,192.172714,34049900,0.0,0.0


In [4]:
df = moving_average_crossover(df, short_window=20, long_window=50)

In [5]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,SMA_short,SMA_long,Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980-12-12 05:00:00+00:00,0.098597,0.099025,0.098597,0.098597,469033600,0.0,0.0,0.098597,0.098597,0
1980-12-15 05:00:00+00:00,0.093881,0.093881,0.093453,0.093453,175884800,0.0,0.0,0.096025,0.096025,0
1980-12-16 05:00:00+00:00,0.087022,0.087022,0.086594,0.086594,105728000,0.0,0.0,0.092881,0.092881,0
1980-12-17 05:00:00+00:00,0.088737,0.089165,0.088737,0.088737,86441600,0.0,0.0,0.091845,0.091845,0
1980-12-18 05:00:00+00:00,0.091310,0.091738,0.091310,0.091310,73449600,0.0,0.0,0.091738,0.091738,0
...,...,...,...,...,...,...,...,...,...,...
2023-12-22 05:00:00+00:00,193.761051,193.989390,191.567126,192.192551,37122800,0.0,0.0,192.260551,183.959353,0
2023-12-26 05:00:00+00:00,192.202487,192.480450,191.428159,191.646561,28919300,0.0,0.0,192.422367,184.245960,0
2023-12-27 05:00:00+00:00,191.090629,192.093281,189.700797,191.745819,48087700,0.0,0.0,192.558868,184.537131,0
2023-12-28 05:00:00+00:00,192.728641,193.244865,191.765691,192.172714,34049900,0.0,0.0,192.767838,184.867971,0


In [6]:
backtest_results = backtest_with_shorts(df, price_col="Close", signal_col="Signal")