## Synthetic data creation (simplified)
This notebook contains the simple code from the blog post. The final script used for creating the numbers in the blog post with asynchronous functions with batching, train-test splitting, comparisons to OpenAI models etc. is `synthetic_data_creation.ipynb`. 

In [2]:
# install requirements
%%bash
pip install --upgrade pip -q
pip install transformers~=4.37.2
pip install huggingface_hub~=0.20.3
pip install datasets~=2.16.1
pip install openai~=1.11.0
pip install scikit-learn
pip install pandas
pip install tqdm
pip install python-dotenv

'%%bash\npip install --upgrade pip -q\npip install transformers~=4.37.2\npip install huggingface_hub~=0.20.3\npip install datasets~=2.16.1\npip install openai~=1.11.0\npip install scikit-learn\npip install pandas\npip install tqdm\npip install python-dotenv'

In [2]:
import os
from tqdm import tqdm
import ast
import numpy as np
import pandas as pd
import random
import json
from datetime import datetime
import os
import requests
from datasets import load_dataset
import random
 
print("Notebook running")

Notebook running


### Global Variables

In [None]:
# login via the huggingface hub with you hf_token
# you need a huggingface account and create a token here: https://huggingface.co/settings/tokens
# we can then call on the token with huggingface_hub.get_token()
import huggingface_hub
huggingface_hub.login()

In [4]:
# global variables for experiment variations
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1"
SEED = 42
N_SAMPLE = False  # You can sample parts of the data for faster testing. False for run on full dataset, int for sampling
SELF_CONSISTENCY_ITERATIONS = 3  # How many times should the model try to predict the same text for self-consistency?
DATA_SUBSET = "sentences_allagree"  # "sentences_allagree", "sentences_66agree", "sentences_75agree"

### Load and prepare dataset

In [5]:
# financial_phrasebank paper: https://arxiv.org/pdf/1307.5336.pdf
random.seed(SEED)

# load dataset
dataset = load_dataset(
    "financial_phrasebank", DATA_SUBSET, 
    split="train"  # note that the dataset does not have a default test split
)

# sample for faster testing
if N_SAMPLE: 
    dataset = dataset.select(random.sample(range(len(dataset)), N_SAMPLE))


Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 453
})


In [6]:
# create a new column with the numeric label verbalised as label_text (e.g. "positive" instead of "0")
label_map = {i: label_text for i, label_text in enumerate(dataset.features["label"].names)}

def add_label_text(example):
    example["label_text"] = label_map[example["label"]]
    return example

dataset = dataset.map(add_label_text)

print(dataset)

Dataset({
    features: ['sentence', 'label', 'idx', 'label_text'],
    num_rows: 453
})


### Prompts / Instructions

In [7]:
# prompt is inspired by the annotator instructions provided in section "Annotation task and instructions"
# in the financial_phrasebank paper: https://arxiv.org/pdf/1307.5336.pdf

prompt_financial_sentiment = """\
You are a highly qualified expert trained to annotate machine learning training data.

Your task is to analyze the sentiment in the TEXT below from an investor perspective and label it with only one the three labels:
positive, negative, or neutral.

Base your label decision only on the TEXT and do not speculate e.g. based on prior knowledge about a company. 

Do not provide any explanations and only respond with one of the labels as one word: negative, positive, or neutral

Examples:
Text: Operating profit increased, from EUR 7m to 9m compared to the previous reporting period.
Label: positive
Text: The company generated net sales of 11.3 million euro this year.
Label: neutral
Text: Profit before taxes decreased to EUR 14m, compared to EUR 19m in the previous period.	
Label: negative

Your TEXT to analyse:
TEXT: {text}
Label: """



prompt_financial_sentiment_cot = """\
You are a highly qualified expert trained to annotate machine learning training data.

Your task is to briefly analyze the sentiment in the TEXT below from an investor perspective and then label it with only one the three labels:
positive, negative, neutral.

Base your label decision only on the TEXT and do not speculate e.g. based on prior knowledge about a company. 

You first reason step by step about the correct label and then return your label.

You ALWAYS respond only in the following JSON format: {{"reason": "...", "label": "..."}}
You only respond with one single JSON response. 

Examples:
Text: Operating profit increased, from EUR 7m to 9m compared to the previous reporting period.
JSON response: {{"reason": "An increase in operating profit is positive for investors", "label": "positive"}}
Text: The company generated net sales of 11.3 million euro this year.
JSON response: {{"reason": "The text only mentions financials without indication if they are better or worse than before", "label": "neutral"}}
Text: Profit before taxes decreased to EUR 14m, compared to EUR 19m in the previous period.	
JSON response: {{"reason": "A decrease in profit is negative for investors", "label": "negative"}}

Your TEXT to analyse:
TEXT: {text}
JSON response: """



In [None]:
# apply chat templates
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")

chat_financial_sentiment = [{"role": "user", "content": prompt_financial_sentiment}]
chat_financial_sentiment_cot = [{"role": "user", "content": prompt_financial_sentiment_cot}]

prompt_financial_sentiment = tokenizer.apply_chat_template(chat_financial_sentiment, tokenize=False)
prompt_financial_sentiment_cot = tokenizer.apply_chat_template(chat_financial_sentiment_cot, tokenize=False)

# The prompt now includes special tokens: '<s>[INST] You are a highly qualified expert ...  [/INST]'

### Test simplified code for blog

In [8]:

labels = ["positive", "negative", "neutral"]

def clean_output(string, random_choice=True):
    for category in labels:
        if category.lower() in string.lower():
            return category
    # if the output string cannot be mapped to one of the categories, we either return "FAIL" or choose a random label
    if random_choice:
        return random.choice(labels)
    else:
        return "FAIL"


def process_output_cot(output):
    try: 
        output_dic = ast.literal_eval(output) 
        return output_dic
    except Exception as e:
        # if json/dict parse fails, do simple search for occurance of first label term
        print(f"Parsing failed for output: {output}, Error: {e}")
        output_cl = clean_output(output, random_choice=False)
        output_dic = {"reason": "FAIL", "label": output_cl}
        return output_dic


# docs on different parameters: https://huggingface.co/docs/api-inference/detailed_parameters#text-generation-task
generation_params = dict(
    top_p=0.90,
    temperature=0.8,
    max_new_tokens=128,
    return_full_text=False,
    use_cache=False,
)

def generate_text(prompt=None, generation_params=None):
    payload = {
        "inputs": prompt, 
        "parameters": {**generation_params}
    }
    response = requests.post(
				API_URL, 
				headers={"Authorization": f"Bearer {huggingface_hub.get_token()}"}, 
				json=payload
		)
    return response.json()[0]["generated_text"]



In [9]:
output_simple = []
for text in tqdm(dataset["sentence"]):
	# add text into the prompt template
    prompt_formatted = prompt_financial_sentiment.format(text=text)
    # send text to API
    output = generate_text(
        prompt=prompt_formatted, generation_params=generation_params
    )
	# clean output
    output_cl = clean_output(output, random_choice=True)
    output_simple.append(output_cl)
    
    
SELF_CONSISTENCY_ITERATIONS = 3

output_cot_multiple = []
for _ in range(SELF_CONSISTENCY_ITERATIONS):
    output_lst_step = []
    for text in tqdm(dataset["sentence"]):
        prompt_formatted = prompt_financial_sentiment_cot.format(text=text)
        output = generate_text(
            prompt=prompt_formatted, generation_params=generation_params
        )
        output_dic = process_output_cot(output)
        output_lst_step.append(output_dic["label"])

    output_cot_multiple.append(output_lst_step)
    

100%|██████████| 8/8 [00:15<00:00,  1.94s/it]
100%|██████████| 8/8 [00:18<00:00,  2.29s/it]
100%|██████████| 8/8 [00:21<00:00,  2.68s/it]
100%|██████████| 8/8 [00:21<00:00,  2.67s/it]


In [17]:
from collections import Counter

def find_majority(row):
    # Count occurrences
    count = Counter(row)
    # Find majority
    majority = count.most_common(1)[0]
    # Check if it's a real majority or if all labels are equally frequent
    if majority[1] > 1:
        return majority[0]
    else: # in case all labels appear with equal frequency
        return random.choice(labels)

df_output = pd.DataFrame(data=output_cot_multiple).T

df_output['label_pred_cot_multiple'] = df_output.apply(find_majority, axis=1)
df_output

Unnamed: 0,0,1,2,label_pred_cot_multiple
0,neutral,neutral,neutral,neutral
1,positive,positive,positive,positive
2,neutral,neutral,neutral,neutral
3,positive,positive,positive,positive
4,positive,positive,positive,positive
5,neutral,positive,neutral,neutral
6,neutral,neutral,neutral,neutral
7,neutral,neutral,neutral,neutral


In [18]:
from sklearn.metrics import classification_report

def compute_metrics(label_experts, label_pred):
		# classification report gives us both aggregate and per-class metrics 
    metrics_report = classification_report(
        label_experts, label_pred, digits=2, output_dict=True, zero_division='warn'
    )

    return metrics_report

label_experts = dataset["label_text"]
label_pred = output_simple
label_pred_cot_multiple = df_output['label_pred_cot_multiple']

print(label_experts)
print(label_pred)

metrics_simple = compute_metrics(label_experts, label_pred)
metrics_cot_multiple = compute_metrics(label_experts, label_pred_cot_multiple)


['neutral', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral']
['neutral', 'positive', 'positive', 'positive', 'positive', 'positive', 'neutral', 'neutral']


In [23]:

df_train = pd.DataFrame({
    "text": dataset["sentence"],
    "labels": df_output['label_pred_cot_multiple']
})

df_train.to_csv("df_train.csv")
