In [1]:
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt

In [6]:
def get_signals(data: pd.DataFrame, horizon: int, alpha: float, ticker: str,
                drop_unlabeled: bool = False) -> pd.DataFrame:
    """
    Label every row with a trading signal derived from its forward return.

    The forward return of a row is ``price[t + horizon] / price[t] - 1``, computed
    on the ``ticker`` column. A row is labelled 1 when that return is above
    ``alpha``, -1 when it is below ``-alpha`` and 0 otherwise.

    Parameters:
        data (pd.DataFrame): The input DataFrame containing price data, one column
            per ticker. It is not modified.
        horizon (int): The number of rows to look ahead for the price movement.
            Must be at least 1.
        alpha (float): The non-negative return threshold for buy/sell signals.
        ticker (str): Name of the column in ``data`` whose prices are labelled.
        drop_unlabeled (bool): Rows whose forward return is unknown (the last
            ``horizon`` rows, or rows touching a missing price) cannot be labelled.
            By default they are kept with signal 0, exactly as before; pass True
            to remove them from the result instead.

    Returns:
        pd.DataFrame: A copy of ``data`` with an additional integer 'signal' column.

    Raises:
        ValueError: If ``horizon`` is smaller than 1 or ``alpha`` is negative.
    """

    if horizon < 1:
        # horizon == 0 yields a constant 0 return; a negative one looks backwards.
        raise ValueError(f"horizon must be a positive number of rows, got {horizon}")
    if alpha < 0:
        # With a negative threshold the buy and sell conditions overlap.
        raise ValueError(f"alpha must be non-negative, got {alpha}")

    prices = data[ticker]
    # Computed as a standalone Series so no scratch column is written into (and
    # later dropped from) the frame, and written out explicitly so the result
    # does not depend on pct_change's deprecated implicit forward-fill.
    future_return = prices.shift(-horizon) / prices - 1

    labelled = data.copy()
    # Comparisons against NaN are False, so unknown forward returns end up as 0.
    buy = (future_return > alpha).astype('int64')
    sell = (future_return < -alpha).astype('int64')
    labelled['signal'] = buy - sell

    if drop_unlabeled:
        labelled = labelled.loc[future_return.notna().to_numpy()]

    return labelled


def signal_distribution(data: pd.DataFrame):
    """
    Print how often each trading signal occurs in the DataFrame.

    One line is printed per distinct value of the 'signal' column, most frequent
    first, showing its row count and its share of all rows in ``data``.

    Parameters:
        data (pd.DataFrame): The input DataFrame containing a 'signal' column.
    """

    tally = data['signal'].value_counts()
    # Shares are relative to the full frame length, not to the non-null count.
    shares = tally / len(data)

    report = ["Signal Distribution:"]
    report.extend(
        f"Signal {label}: Count = {occurrences}, Percentage = {share:.2%}"
        for label, occurrences, share in zip(tally.index, tally, shares)
    )
    print("\n".join(report))


def plot_price(data: pd.DataFrame, ticker: str):
    """
    Draw the price series of one ticker as a line chart and show it.

    Parameters:
        data (pd.DataFrame): The input DataFrame containing price data; its index
            is used for the x-axis.
        ticker (str): The stock ticker symbol, i.e. the column of ``data`` to plot.
    """

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(data.index, data[ticker], label=f'{ticker} Closing Price')
    ax.set_title(f'{ticker} Closing Price Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.legend()
    ax.grid()
    plt.show()

def split_data(data: pd.DataFrame, train_size: float = 0.6,
               test_size: float = 0.2) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split the DataFrame, in row order, into training, testing and validation sets.

    Rows are never shuffled: the first ``train_size`` share of the rows is the
    training set, the following ``test_size`` share is the testing set and all
    remaining rows form the validation set.

    Parameters:
        data (pd.DataFrame): The DataFrame to be split. It is not modified.
        train_size (float): The proportion of rows used for training (default is 0.6).
        test_size (float): The proportion of rows used for testing (default is 0.2).

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the
        training, testing, and validation DataFrames, each an independent copy.

    Raises:
        ValueError: If either proportion lies outside [0, 1] or if the two together
            exceed 1, which would otherwise silently truncate the testing set and
            leave the validation set empty.
    """

    if not 0 <= train_size <= 1:
        raise ValueError(f"train_size must be within [0, 1], got {train_size}")
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size}")
    # Small tolerance so that pairs such as 0.7 + 0.3 are not rejected by rounding.
    if train_size + test_size > 1 + 1e-9:
        raise ValueError(
            f"train_size + test_size must not exceed 1, got {train_size + test_size}"
        )

    # Row counts are truncated towards zero; leftover rows go to validation.
    n_rows = len(data)
    train_end = int(n_rows * train_size)
    test_end = train_end + int(n_rows * test_size)

    # .iloc makes the slicing positional regardless of the index type, and the
    # copies let callers modify one split without touching the others.
    train = data.iloc[:train_end].copy()
    test = data.iloc[train_end:test_end].copy()
    validation = data.iloc[test_end:].copy()

    return train, test, validation

In [3]:
# Symbols to download, kept in the order they were originally listed.
tickers = (
    'AAPL NVDA MSFT GOOGL AMZN TSLA SBUX JPM AXP MCD KO '
    'NFLX CMG CP WMT V GLD BLK PG JNJ TMUS MA BX LULU DPZ BAC '
    'FDX DIS GE HSY HP COST HD K ADBE CSCO T F NKE CVX XOM '
    'PEP PFE RL AZN VZ WBD HSBC'
).split()

# Daily bars for every symbol; only the 'Close' field is kept, which leaves one
# price column per ticker indexed by date.
df = yf.download(
    tickers,
    start='2010-10-28',
    end='2025-10-28',
    interval='1d',
    auto_adjust=True,
)['Close']
df

[*********************100%***********************]  48 of 48 completed


Ticker,AAPL,ADBE,AMZN,AXP,AZN,BAC,BLK,BX,CMG,COST,...,RL,SBUX,T,TMUS,TSLA,V,VZ,WBD,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-10-28,9.163338,28.100000,8.342000,33.386787,14.320225,9.007682,114.134804,6.273259,4.208400,45.166828,...,72.891602,10.717488,7.394421,16.619528,1.412667,17.153399,15.197101,22.979050,13.175977,37.348217
2010-10-29,9.035454,28.150000,8.261500,33.581165,14.328752,8.945182,116.076202,6.377339,4.204200,45.145264,...,75.188889,10.842775,7.399611,16.732147,1.456000,17.537083,15.229918,22.825754,13.197905,37.500484
2010-11-01,9.131520,28.219999,8.129000,33.662174,14.183928,8.984247,116.660057,6.363147,4.221600,45.512062,...,74.862938,10.937691,7.451503,16.909122,1.427333,17.348600,15.192416,22.687786,13.232010,37.759914
2010-11-02,9.287024,29.020000,8.230500,34.334454,14.269116,8.906119,117.467842,6.618620,4.304200,45.828514,...,75.282028,10.964257,7.508582,17.343513,1.416667,17.595411,15.379972,22.733776,13.348957,38.261890
2010-11-03,9.390296,28.799999,8.423500,34.075256,14.294674,8.999868,112.457985,6.708508,4.386400,45.425751,...,74.994865,11.047788,7.539714,17.440046,1.451333,17.725550,15.469067,21.768011,13.378197,38.335201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-10-21,262.769989,357.549988,222.029999,355.220001,83.220001,51.520000,1130.000000,161.429993,42.360001,936.175537,...,334.980011,85.900002,26.049999,229.080002,442.600006,347.209991,40.299999,20.330000,106.220001,112.709999
2025-10-22,258.450012,354.089996,217.949997,352.000000,83.430000,51.099998,1129.680054,161.720001,41.610001,943.345398,...,332.119995,85.500000,25.549999,227.399994,438.970001,345.359985,39.799999,20.530001,107.139999,114.709999
2025-10-23,259.579987,354.119995,221.089996,354.929993,83.400002,51.759998,1125.000000,154.979996,41.799999,940.719116,...,336.290009,85.440002,24.620001,219.990005,448.980011,345.959991,38.400002,21.250000,106.860001,115.980003
2025-10-24,262.820007,353.519989,224.210007,357.559998,83.290001,52.570000,1136.630005,154.600006,41.189999,930.823120,...,335.670013,86.089996,25.139999,217.770004,433.720001,347.380005,38.820000,21.150000,106.169998,115.389999


In [4]:
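# Optional visual check, left disabled because it draws one figure per ticker.
# Uncomment to plot the price history of every symbol in `tickers`:
# for ticker in tickers:
#     plot_price(df, ticker)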

In [7]:
# Split the price frame in row order (no shuffling): the first 60% of the rows
# for training, the next 20% for testing and the remaining rows for validation.
# NOTE(review): `df` is rebound by a later cell, so this cell has to run while
# `df` still holds the downloaded prices.
train, test, validation = split_data(df, train_size=0.6)

In [14]:
# Labelling setup for a single ticker of the training split.
# Base alpha 0.02
# HP, LULU, RL, KO, PFE  -- NOTE(review): presumably other tickers tried; confirm.
ticker = 'VZ'
alpha = 0.01   # return threshold separating buy/sell from hold
horizon = 5    # number of rows to look ahead

# NOTE(review): this rebinds `df` (until now the downloaded price frame) to the
# labelled training split, so the split cell must not be re-run after this one.
df = get_signals(train, horizon=horizon, alpha=alpha, ticker=ticker)
signal_distribution(df)

Signal Distribution:
Signal 1: Count = 827, Percentage = 36.54%
Signal 0: Count = 825, Percentage = 36.46%
Signal -1: Count = 611, Percentage = 27.00%
