In [1]:
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt

In [2]:
def get_signals(data: pd.DataFrame, horizon: int, alpha: float, ticker: str,
                drop_unlabeled: bool = False) -> pd.DataFrame:
    """
    Generate trading signals based on future price movements.

    For every row, the percentage change of the `ticker` column between that
    row and the row `horizon` positions later is compared against `alpha`:

        *  1  when the forward change is greater than  `alpha`
        * -1  when the forward change is less than    `-alpha`
        *  0  otherwise

    The last `horizon` rows have no forward change (it is NaN). NaN fails both
    comparisons, so by default those rows are labelled 0 even though their
    outcome is unknown. Pass `drop_unlabeled=True` to remove them instead.

    Parameters:
        data (pd.DataFrame): The input DataFrame containing price data.
        horizon (int): The number of periods to look ahead for price movement.
        alpha (float): The threshold for generating buy/sell signals.
        ticker (str): Name of the price column in `data` to label.
        drop_unlabeled (bool): If True, drop rows whose forward change is NaN
            (default False keeps every row).

    Returns:
        pd.DataFrame: A copy of `data` with an additional integer 'signal'
        column. The input DataFrame is not modified.
    """

    labeled = data.copy()

    # Forward percentage change, aligned to the row the decision is made on.
    # Kept as a local Series so no temporary column is added to (or removed
    # from) the frame.
    future_return = labeled[ticker].pct_change(periods=horizon).shift(-horizon)

    # Generate signals; anything not beyond the +/- alpha band stays at 0.
    labeled['signal'] = 0
    labeled.loc[future_return > alpha, 'signal'] = 1
    labeled.loc[future_return < -alpha, 'signal'] = -1

    if drop_unlabeled:
        # Rows without a known forward change carry no real label.
        labeled = labeled.loc[future_return.notna()]

    return labeled


def signal_distribution(data: pd.DataFrame):
    """
    Print how often each trading signal occurs in the DataFrame.

    One line is printed per distinct value of the 'signal' column, showing
    its count and its share of the total number of rows in `data`.

    Parameters:
        data (pd.DataFrame): The input DataFrame containing a 'signal' column.
    """

    total_rows = len(data)
    tally = data['signal'].value_counts()

    print("Signal Distribution:")
    for signal in tally.index:
        count = tally[signal]
        share = count / total_rows
        print(f"Signal {signal}: Count = {count}, Percentage = {share:.2%}")


def plot_price(data: pd.DataFrame, ticker: str):
    """
    Draw a line chart of one ticker's price column against the DataFrame index.

    Parameters:
        data (pd.DataFrame): The input DataFrame containing price data,
            with one column per ticker.
        ticker (str): The stock ticker symbol (column name) to plot.
    """

    _, ax = plt.subplots(figsize=(10, 5))

    ax.plot(data.index, data[ticker], label=f'{ticker} Closing Price')

    ax.set_title(f'{ticker} Closing Price Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.legend()
    ax.grid()

    plt.show()

def split_data(data: pd.DataFrame, train_size: float = 0.6, test_size: float = 0.2) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split the DataFrame into consecutive training, testing, and validation sets.

    Rows are taken by position, in their existing order and without shuffling:
    the first `train_size` share of rows is the training set, the next
    `test_size` share is the testing set, and all remaining rows form the
    validation set.

    Parameters:
        data (pd.DataFrame): The DataFrame to be split.
        train_size (float): The proportion of the data to be used for training (default is 0.6).
        test_size (float): The proportion of the data to be used for testing (default is 0.2).

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the
        training, testing, and validation DataFrames, in that order.

    Note:
        No validation of the fractions is performed. If `train_size + test_size`
        exceeds 1, the testing set is cut short and the validation set is empty.
    """

    data = data.copy()

    # Convert the fractions into positional boundaries (truncating toward zero).
    n_rows = len(data)
    train_end = int(n_rows * train_size)
    test_end = train_end + int(n_rows * test_size)

    # Positional slicing, independent of the index type.
    train = data.iloc[:train_end]
    test = data.iloc[train_end:test_end]
    validation = data.iloc[test_end:]

    return train, test, validation

In [3]:
# Universe of symbols to download.
tickers = [
    'AAPL', 'NVDA', 'MSFT', 'GOOGL', 'AMZN', 'TSLA', 'SBUX', 'JPM', 'AXP', 'MCD',
    'KO', 'NFLX', 'CMG', 'CP', 'WMT', 'V', 'GLD', 'BLK', 'PG', 'JNJ',
    'TMUS', 'MA', 'BX', 'LULU', 'DPZ', 'BAC', 'FDX', 'DIS', 'GE', 'HSY',
    'HP', 'COST', 'HD', 'K', 'ADBE', 'CSCO', 'T', 'F', 'NKE', 'CVX',
    'XOM', 'PEP', 'PFE', 'RL', 'AZN', 'VZ', 'WBD', 'HSBC', 'EME',
]

# Daily data for the whole universe; only the 'Close' field is kept,
# giving one column per ticker.
df = yf.download(
    tickers,
    start='2010-10-28',
    end='2025-10-28',
    interval='1d',
    auto_adjust=True,
)['Close']
df

[*********************100%***********************]  49 of 49 completed


Ticker,AAPL,ADBE,AMZN,AXP,AZN,BAC,BLK,BX,CMG,COST,...,RL,SBUX,T,TMUS,TSLA,V,VZ,WBD,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-10-28,9.163342,28.100000,8.342000,33.386795,14.320231,9.007681,114.134750,6.218073,4.208400,45.166836,...,72.891602,10.717488,7.394424,16.619528,1.412667,17.153400,15.197101,22.979050,13.175973,37.348209
2010-10-29,9.035454,28.150000,8.261500,33.581173,14.328748,8.945181,116.076263,6.321241,4.204200,45.145252,...,75.188881,10.842773,7.399611,16.732147,1.456000,17.537071,15.229923,22.825754,13.197901,37.500469
2010-11-01,9.131519,28.219999,8.129000,33.662182,14.183927,8.984243,116.660049,6.307168,4.221600,45.512051,...,74.862900,10.937686,7.451500,16.909122,1.427333,17.348600,15.192413,22.687786,13.232017,37.759914
2010-11-02,9.287024,29.020000,8.230500,34.334450,14.269120,8.906121,117.467880,6.560396,4.304200,45.828510,...,75.282021,10.964260,7.508580,17.343515,1.416667,17.595417,15.379968,22.733776,13.348955,38.261879
2010-11-03,9.390293,28.799999,8.423500,34.075256,14.294676,8.999868,112.457993,6.649494,4.386400,45.425755,...,74.994881,11.047782,7.539715,17.440046,1.451333,17.725548,15.469060,21.768011,13.378197,38.335209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-10-21,262.769989,357.549988,222.029999,355.220001,83.220001,51.520000,1130.000000,160.009888,42.360001,936.175537,...,334.980011,85.900002,26.049999,229.080002,442.600006,347.209991,40.299999,20.330000,106.220001,112.709999
2025-10-22,258.450012,354.089996,217.949997,352.000000,83.430000,51.099998,1129.680054,160.297348,41.610001,943.345398,...,332.119995,85.500000,25.549999,227.399994,438.970001,345.359985,39.799999,20.530001,107.139999,114.709999
2025-10-23,259.579987,354.119995,221.089996,354.929993,83.400002,51.759998,1125.000000,153.616623,41.799999,940.719116,...,336.290009,85.440002,24.620001,219.990005,448.980011,345.959991,38.400002,21.250000,106.860001,115.980003
2025-10-24,262.820007,353.519989,224.210007,357.559998,83.290001,52.570000,1136.630005,153.239975,41.189999,930.823120,...,335.670013,86.089996,25.139999,217.770004,433.720001,347.380005,38.820000,21.150000,106.169998,115.389999


In [4]:
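# Optional: uncomment to plot the price history of every downloaded ticker (one figure per ticker).
#for ticker in tickers:
    #plot_price(df, ticker)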

In [5]:
# Split the price frame by row position, in order: first 60% -> train, next 20% -> test, remainder -> validation.
train, test, validation = split_data(df, train_size=0.6)

In [10]:
# NOTE(review): `a` holds the same symbols as `tickers` and is not used in this cell.
a = [
    'AAPL', 'NVDA', 'MSFT', 'GOOGL', 'AMZN', 'TSLA', 'SBUX', 'JPM', 'AXP', 'MCD',
    'KO', 'NFLX', 'CMG', 'CP', 'WMT', 'V', 'GLD', 'BLK', 'PG', 'JNJ',
    'TMUS', 'MA', 'BX', 'LULU', 'DPZ', 'BAC', 'FDX', 'DIS', 'GE', 'HSY',
    'HP', 'COST', 'HD', 'K', 'ADBE', 'CSCO', 'T', 'F', 'NKE', 'CVX',
    'XOM', 'PEP', 'PFE', 'RL', 'AZN', 'VZ', 'WBD', 'HSBC', 'EME',
]

# Labelling parameters.
ticker = 'MSFT'   # column to label
horizon = 5       # look-ahead, in rows
alpha = 0.015     # +/- threshold on the forward percentage change
# Base alpha 0.02
# HP, LULU, RL, KO, PFE

# NOTE(review): this rebinds `df`, replacing the downloaded price frame with the
# labelled training frame; re-running the split cell afterwards would split this
# frame instead of the prices.
df = get_signals(train, horizon, alpha, ticker)
signal_distribution(df)

Signal Distribution:
Signal 0: Count = 975, Percentage = 43.08%
Signal 1: Count = 785, Percentage = 34.69%
Signal -1: Count = 503, Percentage = 22.23%
