1. For each date, get the gics level (sector, industry group, or industry) with the highest returns over the next day.
2. Construct a label for each FOMC statement with the label being the gics level with the highest and lowest returns over the next day.

In [100]:
from collections import defaultdict
import json
import numpy as np
import pandas as pd
import tiktoken

In [73]:
sp = pd.read_pickle('data/sp500_constituents.pkl')
fomc = pd.read_pickle('data/fomc_statements.pkl')
fomc = fomc.drop(columns='statement')
fomc = fomc.rename(columns={'cleaned_statement':'statement'})
fomc = fomc[fomc['date']>='2000-01-03'].reset_index(drop=True)
fomc = fomc[fomc['date'] <= '2024-01-01'].reset_index(drop=True)

In [93]:
# gics_level = 'sector'
gics_level = 'group'
# gics_level = 'industry'

In [76]:
# get return for next day of each stock 
sp['next_ret'] = sp.groupby('gvkey')['ret'].shift(-1)

# for each date, get returns of each gics group
gics_returns = pd.DataFrame(sp.groupby(['date', gics_level])['next_ret'].mean().reset_index())

gics_returns = gics_returns.dropna().reset_index(drop=True)
gics_returns

Unnamed: 0,date,group,next_ret
0,2000-01-03,Automobiles & Components,-0.015753
1,2000-01-03,Banks,-0.033881
2,2000-01-03,Capital Goods,-0.020263
3,2000-01-03,Commercial & Professional Services,-0.023463
4,2000-01-03,Consumer Durables & Apparel,-0.026479
...,...,...,...
147847,2023-12-28,Software & Services,-0.003382
147848,2023-12-28,Technology Hardware & Equipment,-0.004529
147849,2023-12-28,Telecommunication Services,0.005558
147850,2023-12-28,Transportation,-0.009013


In [77]:
# get the gics group with the highest and lowest return over the next day

def get_high_low(df):
    df = df.dropna(subset=['next_ret'])
    
    # find the group with the highest and lowest return for each date
    highest_return_gics = df.loc[df['next_ret'].idxmax(), gics_level]
    highest_return = df['next_ret'].max()
    
    lowest_return_gics = df.loc[df['next_ret'].idxmin(), gics_level]
    lowest_return = df['next_ret'].min()
    
    return pd.Series({
        f'highest_return_{gics_level}': highest_return_gics,
        'highest_return': highest_return,
        f'lowest_return_{gics_level}': lowest_return_gics,
        'lowest_return': lowest_return
    })

labels = gics_returns.groupby('date').apply(get_high_low).reset_index()

  labels = gics_returns.groupby('date').apply(get_high_low).reset_index()


In [78]:
labels

Unnamed: 0,date,highest_return_group,highest_return,lowest_return_group,lowest_return
0,2000-01-03,Utilities,0.006866,Technology Hardware & Equipment,-0.063528
1,2000-01-04,Real Estate,0.043004,Software & Services,-0.041469
2,2000-01-05,Energy,0.039145,Technology Hardware & Equipment,-0.028927
3,2000-01-06,"Pharmaceuticals, Biotechnology & Life Sciences",0.061160,Real Estate,-0.010203
4,2000-01-07,Semiconductors & Semiconductor Equipment,0.077122,Insurance,-0.031050
...,...,...,...,...,...
6031,2023-12-21,Software & Services,0.009323,Consumer Durables & Apparel,-0.013113
6032,2023-12-22,Semiconductors & Semiconductor Equipment,0.017757,Consumer Services,0.000384
6033,2023-12-26,"Pharmaceuticals, Biotechnology & Life Sciences",0.006470,Energy,-0.005782
6034,2023-12-27,Utilities,0.007503,Energy,-0.014100


In [79]:
# create new col that shows the desired label from chatgpt
labels['strategy'] = labels.apply(
    lambda row: f"long: {row[f'highest_return_{gics_level}']}, short: {row[f'lowest_return_{gics_level}']}", axis=1
)

labels


Unnamed: 0,date,highest_return_group,highest_return,lowest_return_group,lowest_return,strategy
0,2000-01-03,Utilities,0.006866,Technology Hardware & Equipment,-0.063528,"long: Utilities, short: Technology Hardware & ..."
1,2000-01-04,Real Estate,0.043004,Software & Services,-0.041469,"long: Real Estate, short: Software & Services"
2,2000-01-05,Energy,0.039145,Technology Hardware & Equipment,-0.028927,"long: Energy, short: Technology Hardware & Equ..."
3,2000-01-06,"Pharmaceuticals, Biotechnology & Life Sciences",0.061160,Real Estate,-0.010203,"long: Pharmaceuticals, Biotechnology & Life Sc..."
4,2000-01-07,Semiconductors & Semiconductor Equipment,0.077122,Insurance,-0.031050,long: Semiconductors & Semiconductor Equipment...
...,...,...,...,...,...,...
6031,2023-12-21,Software & Services,0.009323,Consumer Durables & Apparel,-0.013113,"long: Software & Services, short: Consumer Dur..."
6032,2023-12-22,Semiconductors & Semiconductor Equipment,0.017757,Consumer Services,0.000384,long: Semiconductors & Semiconductor Equipment...
6033,2023-12-26,"Pharmaceuticals, Biotechnology & Life Sciences",0.006470,Energy,-0.005782,"long: Pharmaceuticals, Biotechnology & Life Sc..."
6034,2023-12-27,Utilities,0.007503,Energy,-0.014100,"long: Utilities, short: Energy"


In [80]:
fomc

Unnamed: 0,date,statement
0,2000-02-02,immediate release federal open market committe...
1,2000-03-21,immediate release federal open market committe...
2,2000-05-16,immediate release federal open market committe...
3,2000-06-28,immediate release federal open market committe...
4,2000-08-22,immediate release federal open market committe...
...,...,...
154,2023-06-14,recent indicators suggest economic activity co...
155,2023-07-26,recent indicators suggest economic activity ex...
156,2023-09-20,recent indicators suggest economic activity ex...
157,2023-11-01,recent indicators suggest economic activity ex...


In [85]:
# create the labeled dataset (input: FOMC and date of FOMC, output: desired output from GPT)
labeled_fomcs = pd.merge(fomc, labels[['date', 'strategy']], on='date', how='left')

In [94]:
labeled_fomcs.isna().sum()

date         0
statement    0
strategy     0
dtype: int64

In [95]:
labeled_fomcs

Unnamed: 0,date,statement,strategy
0,2000-02-02,immediate release federal open market committe...,long: Semiconductors & Semiconductor Equipment...
1,2000-03-21,immediate release federal open market committe...,long: Semiconductors & Semiconductor Equipment...
2,2000-05-16,immediate release federal open market committe...,"long: Energy, short: Food & Staples Retailing"
3,2000-06-28,immediate release federal open market committe...,"long: Health Care Equipment & Services, short:..."
4,2000-08-22,immediate release federal open market committe...,long: Semiconductors & Semiconductor Equipment...
...,...,...,...
154,2023-06-14,recent indicators suggest economic activity co...,"long: Telecommunication Services, short: Semic..."
155,2023-07-26,recent indicators suggest economic activity ex...,long: Semiconductors & Semiconductor Equipment...
156,2023-09-20,recent indicators suggest economic activity ex...,"long: Food, Beverage & Tobacco, short: Real Es..."
157,2023-11-01,recent indicators suggest economic activity ex...,"long: Banks, short: Automobiles & Components"


In [92]:
# properly format above data for training gpt model

# Create a list to hold each formatted conversation
chat_data = []

# Loop through each row and format the data
for index, row in labeled_fomcs.iterrows():
    date = row['date']
    statement = row['statement']
    # Check if the strategy is not NaN before adding to the dataset
    if pd.notna(row['strategy']):
        chat_data.append({
            "messages": [{
            # system message to describe the chatbot
            "role": "system",
            "content": f"""As of {date.strftime('%Y-%m-%d')}, you are a financial analyst specializing in 
            interpreting FOMC statements to predict GICS industry group returns in the stock market."""
        },
        {   
            # system to describe what we are asking the
            "role": "user",
            "content": f"""Based on the FOMC statement released on {date.strftime('%Y-%m-%d')}, please identify:

        - The industry group that will have the highest returns over the next day.
        - The industry group that will have the lowest returns over the next day.

        Provide your answer in the following format:

        'long: industry_group, short: industry_group'

        Recall the list of industry groups to choose from are:
        ['Energy',
        'Materials',
        'Capital Goods',
        'Commercial & Professional Services',
        'Transportation',
        'Automobiles & Components',
        'Consumer Durables & Apparel',
        'Consumer Services',
        'Retailing',
        'Media & Entertainment',
        'Food & Staples Retailing',
        'Food, Beverage & Tobacco',
        'Household & Personal Products',
        'Health Care Equipment & Services',
        'Pharmaceuticals, Biotechnology & Life Sciences',
        'Banks',
        'Diversified Financials',
        'Insurance',
        'Real Estate',
        'Software & Services',
        'Technology Hardware & Equipment',
        'Semiconductors & Semiconductor Equipment',
        'Telecommunication Services',
        'Media & Entertainment',
        'Utilities',
        'Real Estate',
        'Real Estate Management & Development']


        Here is the FOMC Statement:
        \"\"\"
        {statement}
        \"\"\"
        """
        },
        {
            "role": "assistant", 
            "content": row['strategy']
        }
        ]})

#  path to save the JSONL file
output_file = "fine_tuning_chat_data.jsonl"

# write data to a JSONL file
with open(output_file, 'w') as f:
    for entry in chat_data:
        f.write(json.dumps(entry) + "\n")

print(f"Data successfully saved to {output_file}")


Data successfully saved to fine_tuning_chat_data.jsonl


In [96]:
labeled_fomcs.dtypes

date         datetime64[ns]
statement            object
strategy             object
dtype: object

In [97]:
# split dataset into train, val, and test sets
train_df = labeled_fomcs[labeled_fomcs['date'].between('2000-01-01', '2015-12-31')]
val_df = labeled_fomcs[labeled_fomcs['date'].between('2016-01-01', '2018-12-31')]
test_df = labeled_fomcs[labeled_fomcs['date'] >= '2019-01-01']

# format the data into the required structure
def format_chat_data(df):
    # create list to hold each formatted conversation
    chat_data = []

    # loop through each row and format the data
    for index, row in labeled_fomcs.iterrows():
        date = row['date']
        statement = row['statement']

        # check if the strategy is not null before adding to the dataset
        if pd.notna(row['strategy']):
            chat_data.append({
                "messages": [{
                # system message to describe the chatbot
                "role": "system",
                "content": f"""As of {date.strftime('%Y-%m-%d')}, you are a financial analyst specializing in 
                interpreting FOMC statements to predict GICS industry group returns in the stock market."""
            },
            {   
                # system to describe what we are asking the
                "role": "user",
                "content": f"""Based on the FOMC statement released on {date.strftime('%Y-%m-%d')}, please identify:

            - The industry group that will have the highest returns over the next day.
            - The industry group that will have the lowest returns over the next day.

            Provide your answer in the following format:

            'long: industry_group, short: industry_group'

            Recall the list of industry groups to choose from are:
            ['Energy',
            'Materials',
            'Capital Goods',
            'Commercial & Professional Services',
            'Transportation',
            'Automobiles & Components',
            'Consumer Durables & Apparel',
            'Consumer Services',
            'Retailing',
            'Media & Entertainment',
            'Food & Staples Retailing',
            'Food, Beverage & Tobacco',
            'Household & Personal Products',
            'Health Care Equipment & Services',
            'Pharmaceuticals, Biotechnology & Life Sciences',
            'Banks',
            'Diversified Financials',
            'Insurance',
            'Real Estate',
            'Software & Services',
            'Technology Hardware & Equipment',
            'Semiconductors & Semiconductor Equipment',
            'Telecommunication Services',
            'Media & Entertainment',
            'Utilities',
            'Real Estate',
            'Real Estate Management & Development']


            Here is the FOMC Statement:
            \"\"\"
            {statement}
            \"\"\"
            """
            },
            {
                "role": "assistant", 
                "content": row['strategy']
            }
            ]})

    return chat_data


In [98]:
# format each subset of the data
train_data = format_chat_data(train_df)
val_data = format_chat_data(val_df)
test_data = format_chat_data(test_df)

# save data to JSONL
def save_to_jsonl(data, filename):
    with open(filename, 'w') as f:
        for entry in data:
            f.write(json.dumps(entry) + "\n")
    print(f"Data successfully saved to {filename}")

save_to_jsonl(train_data, "train_data.jsonl")
save_to_jsonl(val_data, "validation_data.jsonl")
save_to_jsonl(test_data, "test_data.jsonl")

Data successfully saved to train_data.jsonl
Data successfully saved to validation_data.jsonl
Data successfully saved to test_data.jsonl


### Ensure Data in Proper Format  
[Data Formatting](https://cookbook.openai.com/examples/chat_finetuning_data_prep)

In [102]:
# Format error checks
format_errors = defaultdict(int)

for ex in train_data:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [103]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [104]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in train_data:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 382, 926
mean / median: 587.7672955974842, 569.0
p5 / p95: 477.6, 708.2

#### Distribution of num_assistant_tokens_per_example:
min / max: 7, 19
mean / median: 12.11320754716981, 12.0
p5 / p95: 8.0, 16.0

0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning


In [105]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 16385

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(train_data)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~93455 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~280365 tokens
