In [None]:
# install requirements
!pip install --upgrade pip -q
!pip install transformers~=4.37.2
!pip install huggingface_hub~=0.20.3
!pip install datasets~=2.16.1
!pip install openai~=1.11.0
!pip install scikit-learn
!pip install pandas
!pip install tqdm
!pip install python-dotenv
!pip install aiohttp
!pip install certifi 

In [2]:
import os
from tqdm import tqdm
import ast
import numpy as np
import pandas as pd
import random
import json
from datetime import datetime

from datasets import load_dataset
from datasets import concatenate_datasets
from openai import OpenAI, AsyncOpenAI
import openai

import asyncio
from aiohttp import ClientSession, ClientTimeout, ClientError
from tqdm import tqdm
import random
import logging

print("Notebook running")

Notebook running


In [3]:
# set correct directory
# The default is the author's local checkout. Set the SYNTHETIC_DATA_BLOG_DIR
# environment variable to run the notebook from another location without editing it.
PROJECT_DIR = os.environ.get(
    "SYNTHETIC_DATA_BLOG_DIR",
    "/Users/moritzlaurer/huggingface/projects/blog-posts/synthetic-data-blog/",
)
os.chdir(PROJECT_DIR)
print(os.getcwd())

/Users/moritzlaurer/huggingface/projects/blog-posts/synthetic-data-blog


### Global variables

In [4]:
# log in to the Hugging Face Hub with your hf_token
# you need a Hugging Face account and a token, which you can create here: https://huggingface.co/settings/tokens
# after logging in, the token is read back with huggingface_hub.get_token() (see HEADERS below)
import huggingface_hub
huggingface_hub.login()

In [5]:
# global variables for APIs
MODEL = "Mixtral-8x7B-Instruct-v0.1"  #"gpt-3.5-turbo-0125", "gpt-4-0125-preview", "Mixtral-8x7B-Instruct-v0.1"
# choose one of these API providers: "HF" or "OAI"
API_PROVIDER = "HF"

API_URL = f"https://api-inference.huggingface.co/models/mistralai/{MODEL}"
HEADERS = {"Authorization": f"Bearer {huggingface_hub.get_token()}"}
# Only build the OpenAI client when it is actually used: the constructor raises
# when no API key can be found, which would otherwise break HF-only runs.
client_oai = AsyncOpenAI(api_key=os.getenv('OAI_TOKEN')) if API_PROVIDER == "OAI" else None

# for asynchronous API calls
BATCH_SIZE = 64  # number of texts sent concurrently per batch
SLEEP_TIME = 1   # seconds to pause between two batches

# global variables for experiment variations
SEED = 42
N_SAMPLE = False  # You can sample parts of the data for faster testing. False for run on full dataset, int for sampling
SELF_CONSISTENCY_ITERATIONS = 3  # How many times should the model try to predict the same text for self-consistency?
DATA_SUBSET = "sentences_allagree"  # "sentences_allagree", "sentences_66agree", "sentences_75agree"
FINAL_TEST_RUN = True  # True for final run on test set
SAVE_OUTPUTS = True


### Load and prepare dataset

In [6]:
# financial_phrasebank paper: https://arxiv.org/pdf/1307.5336.pdf
random.seed(SEED)

# load dataset
dataset = load_dataset(
    "financial_phrasebank", DATA_SUBSET, 
    split="train"  # note that the dataset does not have a default test split
)

# sample for faster testing (N_SAMPLE is either False or the number of rows to draw)
if N_SAMPLE: 
    dataset = dataset.select(random.sample(range(len(dataset)), N_SAMPLE))

# train-test-split
# Note: with 0-shot prompting you can sometimes skip holding out a test set, because you don't train a model.
# But: the prompt is a form of hyperparameter. Every time you adapt the prompt to get better performance,
# you are doing a form of hyperparameter search, and you do not know how well the prompt generalizes to unseen data.
# If you update your prompt, it is therefore good practice to do the final test on a separate test set
# on which the "prompt wording hyperparameter" was not tuned, to avoid overfitting your prompt to the data.
# Moreover: for our example, we need a separate test set because we will also train a small BERT model on the training data.

# "idx" records each row's position before the split, so rows can be assigned back to train/test later
dataset = dataset.add_column("idx", range(len(dataset)))
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, stratify_by_column="label", seed=SEED)

# determine ids for train and test set to split again after inference
# (these are values of the "idx" column, not row positions in a later, re-ordered dataset)
row_id_train = dataset["train"]["idx"] 
row_id_test = dataset["test"]["idx"] 

if FINAL_TEST_RUN and (API_PROVIDER != "OAI"):
    # merging splits again here for easier inference. Splitting again after inference based on row_ids
    # the merged dataset holds all train rows first, followed by all test rows
    dataset = concatenate_datasets([dataset["train"], dataset["test"]])
elif API_PROVIDER == "OAI":
    # for the run with OpenAI models, we only want labels for the testset to calculate metrics
    dataset = dataset["test"]
else:
    # for testing prompts
    dataset = dataset["train"]

print(dataset)

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 2264
})


In [7]:
# verbalise the numeric class ids: add a "label_text" column holding the class name
# (e.g. "positive") next to the integer stored in "label"
label_map = dict(enumerate(dataset.features["label"].names))

def add_label_text(example):
    """Attach the class name of ``example["label"]`` under the key ``label_text``."""
    label_id = example["label"]
    example["label_text"] = label_map[label_id]
    return example

dataset = dataset.map(add_label_text)

print(dataset)

Map:   0%|          | 0/2264 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'label', 'idx', 'label_text'],
    num_rows: 2264
})


### Prompts / Instructions

In [8]:
# prompt is inspired by the annotator instructions provided in section "Annotation task and instructions"
# in the financial_phrasebank paper: https://arxiv.org/pdf/1307.5336.pdf

prompt_financial_sentiment = """\
You are a highly qualified expert trained to annotate machine learning training data.

Your task is to analyze the sentiment in the TEXT below from an investor perspective and label it with only one the three labels:
positive, negative, or neutral.

Base your label decision only on the TEXT and do not speculate e.g. based on prior knowledge about a company. 

Do not provide any explanations and only respond with one of the labels as one word: negative, positive, or neutral

Examples:
Text: Operating profit increased, from EUR 7m to 9m compared to the previous reporting period.
Label: positive
Text: The company generated net sales of 11.3 million euro this year.
Label: neutral
Text: Profit before taxes decreased to EUR 14m, compared to EUR 19m in the previous period.	
Label: negative

Your TEXT to analyse:
TEXT: {text}
Label: """


prompt_financial_sentiment_cot = """\
You are a highly qualified expert trained to annotate machine learning training data.

Your task is to briefly analyze the sentiment in the TEXT below from an investor perspective and then label it with only one the three labels:
positive, negative, neutral.

Base your label decision only on the TEXT and do not speculate e.g. based on prior knowledge about a company. 

You first reason step by step about the correct label and then return your label.

You ALWAYS respond only in the following JSON format: {{"reason": "...", "label": "..."}}
You only respond with one single JSON response. 

Examples:
Text: Operating profit increased, from EUR 7m to 9m compared to the previous reporting period.
JSON response: {{"reason": "An increase in operating profit is positive for investors", "label": "positive"}}
Text: The company generated net sales of 11.3 million euro this year.
JSON response: {{"reason": "The text only mentions financials without indication if they are better or worse than before", "label": "neutral"}}
Text: Profit before taxes decreased to EUR 14m, compared to EUR 19m in the previous period.	
JSON response: {{"reason": "A decrease in profit is negative for investors", "label": "negative"}}

Your TEXT to analyse:
TEXT: {text}
JSON response: """



In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")

# wrap each prompt as a single user turn and render it with the model's chat template
# (tokenize=False returns the formatted string; the "{text}" placeholder is filled in later)

# simple labeling prompt
chat_financial_sentiment = [dict(role="user", content=prompt_financial_sentiment)]
prompt_financial_sentiment_formatted = tokenizer.apply_chat_template(
    chat_financial_sentiment, tokenize=False
)

# chain-of-thought prompt
chat_financial_sentiment_cot = [dict(role="user", content=prompt_financial_sentiment_cot)]
prompt_financial_sentiment_cot_formatted = tokenizer.apply_chat_template(
    chat_financial_sentiment_cot, tokenize=False
)


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [27]:
prompt_financial_sentiment_formatted

'<s>[INST] You are a highly qualified expert trained to annotate machine learning training data.\n\nYour task is to analyze the sentiment in the TEXT below from an investor perspective and label it with only one the three labels:\npositive, negative, or neutral.\n\nBase your label decision only on the TEXT and do not speculate e.g. based on prior knowledge about a company. \n\nDo not provide any explanations and only respond with one of the labels as one word: negative, positive, or neutral\n\nExamples:\nText: Operating profit increased, from EUR 7m to 9m compared to the previous reporting period.\nLabel: positive\nText: The company generated net sales of 11.3 million euro this year.\nLabel: neutral\nText: Profit before taxes decreased to EUR 14m, compared to EUR 19m in the previous period.\t\nLabel: negative\n\nYour TEXT to analyse:\nTEXT: {text}\nLabel:  [/INST]'

### Generation

In [10]:
# params for API: https://huggingface.co/docs/api-inference/detailed_parameters#text-generation-task
# alternative list for API: https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference/generate
# params for endpoints: https://huggingface.co/docs/huggingface_hub/v0.20.3/en/package_reference/inference_client#huggingface_hub.InferenceClient

# sampling settings sent with every generation request
generation_params = {
    "top_p": 0.90,
    "top_k": None,
    "temperature": 0.8,
    "repetition_penalty": 1.0,
    "do_sample": True,
    "max_new_tokens": 128,
    "return_full_text": False,
    # "seed": SEED,  # no seed, because we need randomness for self-consistency
    "max_time": None,
    "stream": False,
    "details": False,
    "use_cache": False,
    "wait_for_model": False,
}

In [11]:

# Build an SSL context that verifies certificates against certifi's CA bundle.
# This avoids the following error seen on the author's machine:
#   Request failed due to network error: Cannot connect to host api-inference.huggingface.co:443 ssl:True
#   [SSLCertVerificationError: (1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')]
# this might not be necessary on your machine
import ssl
import certifi
import aiohttp
ssl_context = ssl.create_default_context(cafile=certifi.where())

In [12]:
# asynchronous functions for efficiently calling on LLM APIs with batching
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


# functions for calling the HF API with retries, async and batch processing
async def request_with_retry_hf(session, url, headers, json, semaphore, retries=4, backoff_factor=3):
    """POST ``json`` to ``url`` and return the decoded JSON body, retrying transient failures.

    Args:
        session: object whose ``post(url, headers=..., json=...)`` returns an async
            context manager yielding a response (an aiohttp ``ClientSession`` here).
        url: endpoint to call.
        headers: HTTP headers sent with every attempt.
        json: request payload, passed through as the ``json=`` argument of ``post``.
        semaphore: async context manager limiting the number of in-flight requests.
        retries: maximum number of attempts.
        backoff_factor: base of the exponential backoff (``backoff_factor ** attempt`` seconds).

    Returns:
        The decoded JSON body of the first response with status 200 or 201.

    Raises:
        RuntimeError: immediately for any status other than 200, 201 or 429, and
            after ``retries`` unsuccessful attempts.
    """
    for attempt in range(retries):
        # default wait: exponential backoff; a 429 response overrides it below
        sleep_time = backoff_factor ** attempt
        # Hold the semaphore only while the request is in flight, so that a
        # coroutine waiting to retry does not occupy a concurrency slot.
        async with semaphore:
            try:
                async with session.post(url, headers=headers, json=json) as response:
                    if response.status in [200, 201]:
                        return await response.json()
                    if response.status != 429:
                        raise RuntimeError(f"API returned a non-200 status code: {response.status}")
                    # rate limited: honour the server's Retry-After hint
                    try:
                        sleep_time = int(response.headers.get("Retry-After", 60))
                    except (TypeError, ValueError):
                        # header present but not a plain number of seconds
                        sleep_time = 60
                    logging.warning(f"Rate limit exceeded. Retrying after {sleep_time} seconds.")
            except (ClientError, asyncio.TimeoutError) as e:
                logging.error(f"Request failed due to network error: {e}")
        # no point in waiting once the last attempt has been used up
        if attempt + 1 < retries:
            logging.info(f"Retrying in {sleep_time} seconds...")
            await asyncio.sleep(sleep_time)
    # After all retries, raise an exception to indicate the request has ultimately failed
    raise RuntimeError("Request failed after multiple retries.")


async def generate_text_async_hf(session, text, prompt, generation_params, semaphore):
    """Generate a completion for ``text`` via the HF endpoint configured in ``API_URL``.

    Args:
        session: HTTP session forwarded to ``request_with_retry_hf``.
        text: the text inserted into the ``{text}`` placeholder of ``prompt``.
        prompt: prompt template containing a ``{text}`` placeholder.
        generation_params: mapping sent as the ``parameters`` field of the payload.
        semaphore: concurrency limiter forwarded to ``request_with_retry_hf``.

    Returns:
        The ``generated_text`` of the first result, or ``"No text generated"`` if
        that key is missing.

    Raises:
        RuntimeError: if the API answers with an error object or with anything
            other than a non-empty list of results.
    """
    payload = {
        "inputs": prompt.format(text=text),
        "parameters": {**generation_params}
    }
    # Call the request_with_retry function to handle potential retries
    response_json = await request_with_retry_hf(session, API_URL, HEADERS, payload, semaphore)
    # Check for an error object BEFORE indexing: an error arrives as a dict,
    # for which response_json[0] would raise KeyError and hide the real message.
    if isinstance(response_json, dict) and "error" in response_json:
        raise RuntimeError(f"API returned an error: {response_json['error']}")
    if not isinstance(response_json, list) or not response_json:
        raise RuntimeError(f"API returned an unexpected response: {response_json!r}")
    return response_json[0].get("generated_text", "No text generated")


async def request_with_retry_oai(session, messages, generation_params, semaphore, retries=4, backoff_factor=3):
    """Request a chat completion from the OpenAI API, retrying rate limits and timeouts.

    Args:
        session: unused; kept so the signature mirrors ``request_with_retry_hf``.
        messages: chat messages passed to ``client_oai.chat.completions.create``.
        generation_params: keyword arguments forwarded to the completion call.
        semaphore: async context manager limiting the number of in-flight requests.
        retries: maximum number of attempts (rate-limit responses count as attempts,
            so the loop always terminates).
        backoff_factor: base of the exponential backoff used after timeouts.

    Returns:
        The message content of the first choice of the completion.

    Raises:
        RuntimeError: after ``retries`` unsuccessful attempts.
    """
    for attempt in range(retries):
        sleep_time = backoff_factor ** attempt
        # Hold the semaphore only for the request itself, not while waiting to retry.
        async with semaphore:
            try:
                completion = await client_oai.chat.completions.create(
                    model=MODEL,
                    messages=messages,
                    **generation_params
                )
                return completion.choices[0].message.content
            except openai.RateLimitError as e:
                # The HTTP headers live on the attached response object, not on the
                # exception itself; fall back to 60 seconds if no usable hint is found.
                try:
                    sleep_time = float(e.response.headers.get("retry-after", 60))
                except (AttributeError, TypeError, ValueError):
                    sleep_time = 60
                logging.warning(f"Rate limit exceeded. Retrying after {sleep_time} seconds.")
            except (openai.APITimeoutError, asyncio.TimeoutError) as e:
                logging.error(f"Request failed due to API or network error: {e}")
                logging.info(f"Retrying in {sleep_time} seconds...")
        # no point in waiting once the last attempt has been used up
        if attempt + 1 < retries:
            await asyncio.sleep(sleep_time)
    # After all retries, raise an exception to indicate the request has ultimately failed
    raise RuntimeError("Request failed after multiple retries.")


async def generate_text_async_oai(session, text, prompt, generation_params, semaphore):
    """Generate a completion for ``text`` via the OpenAI chat API.

    Args:
        session: forwarded unchanged to ``request_with_retry_oai``.
        text: the text inserted into the ``{text}`` placeholder of ``prompt``.
        prompt: prompt template containing a ``{text}`` placeholder.
        generation_params: generation settings; ``max_new_tokens`` is translated
            to ``max_tokens`` and keys the chat API call does not use are dropped.
            The mapping passed in is NOT modified.
        semaphore: concurrency limiter forwarded to ``request_with_retry_oai``.

    Returns:
        The generated text returned by ``request_with_retry_oai``.
    """
    messages = [{"role": "user", "content": prompt.format(text=text)}]
    # Work on a copy: the same dict is shared by every task of a batch (and by
    # later runs), so popping keys from it in place would corrupt the caller's settings.
    params = dict(generation_params)
    if "max_new_tokens" in params:
        params["max_tokens"] = params.pop("max_new_tokens")
    allowed_params = {"top_p", "temperature", "max_tokens", "stop"}
    params = {k: v for k, v in params.items() if k in allowed_params}

    # Call the request_with_retry_oai function to handle potential retries
    generated_text = await request_with_retry_oai(session, messages, params, semaphore)
    return generated_text


async def run_batch(dataset, prompt, generation_params, api_provider, batch_size, sleep_time):
    """Label every ``"sentence"`` of ``dataset`` with the chosen API, one batch at a time.

    The sentences are processed in consecutive slices of ``batch_size``. All
    requests of a slice run concurrently, and the coroutine pauses for
    ``sleep_time`` seconds after each slice.

    Args:
        dataset: sliceable collection whose slices expose a ``"sentence"`` list.
        prompt: prompt template handed to the generation function.
        generation_params: generation settings handed to the generation function.
        api_provider: ``"HF"`` or ``"OAI"``.
        batch_size: number of sentences sent concurrently.
        sleep_time: pause in seconds between two batches.

    Returns:
        A list with one generated text per sentence, in dataset order.

    Raises:
        ValueError: if ``api_provider`` is neither ``"HF"`` nor ``"OAI"``.
    """
    collected = []
    semaphore = asyncio.BoundedSemaphore(128)
    request_timeout = ClientTimeout(total=60)

    async with ClientSession(
        timeout=request_timeout,
        connector=aiohttp.TCPConnector(ssl=ssl_context),
    ) as session:
        batch_starts = range(0, len(dataset), batch_size)
        for start in tqdm(batch_starts, desc="Processing batches"):
            sentences = dataset[start:start + batch_size]["sentence"]
            # pick the provider-specific coroutine for this batch
            if api_provider == "HF":
                generate = generate_text_async_hf
            elif api_provider == "OAI":
                generate = generate_text_async_oai
            else:
                raise ValueError("Invalid API provider")
            pending = [
                generate(session, sentence, prompt, generation_params, semaphore)
                for sentence in sentences
            ]
            # gather preserves the order of `pending`, so results stay aligned with the dataset
            batch_results = await asyncio.gather(*pending)
            collected.extend(batch_results)
            await asyncio.sleep(sleep_time)

    return collected



In [13]:
# run batch processing for the simple prompt
# output_simple holds one generated string per row of `dataset`, in dataset order

# in a Jupyter notebook the coroutine can be awaited directly at the top level of a cell
output_simple = await run_batch(dataset, prompt_financial_sentiment_formatted, generation_params, API_PROVIDER, BATCH_SIZE, SLEEP_TIME)
# in a plain .py script, start an event loop explicitly instead:
#output_simple = asyncio.run(run_batch(dataset, prompt_financial_sentiment_formatted, generation_params, API_PROVIDER, BATCH_SIZE, SLEEP_TIME))

# quick look at the first three raw generations
print(output_simple[:3])

Processing batches: 100%|██████████| 36/36 [02:43<00:00,  4.55s/it]

[' neutral', ' Neutral', ' neutral']





In [28]:
# run batch processing for the chain-of-thought and self-consistency prompt
# output_cot_multiple collects one list of raw generations per self-consistency iteration
output_cot_multiple = []
for _ in range(SELF_CONSISTENCY_ITERATIONS):
    # in a Jupyter notebook the coroutine can be awaited directly at the top level of a cell
    output_cot = await run_batch(dataset, prompt_financial_sentiment_cot_formatted, generation_params, API_PROVIDER, BATCH_SIZE, SLEEP_TIME)
    # in a plain .py script, start an event loop explicitly instead:
    #output_cot = asyncio.run(run_batch(dataset, prompt_financial_sentiment_cot_formatted, generation_params, API_PROVIDER, BATCH_SIZE, SLEEP_TIME))
    output_cot_multiple.append(output_cot)

# quick look at the first three raw generations of the last iteration
print(output_cot[:3])

Processing batches: 100%|██████████| 36/36 [04:39<00:00,  7.77s/it]
Processing batches: 100%|██████████| 36/36 [04:28<00:00,  7.46s/it]
Processing batches: 100%|██████████| 36/36 [04:21<00:00,  7.27s/it]

[' {"reason": "The text mentions the company\'s consideration of starting production in Russia, but it does not provide enough information to determine a clear sentiment from an investor\'s perspective. The potential impact on investors would depend on various factors such as the cost-effectiveness, market potential, and risks associated with production in Russia.", "label": "neutral"}', ' {"reason": "The text does not contain any sentiment related to investments or financials, it is just a statement about the source of the text", "label": "neutral"}', ' {"reason": "The text does not provide information about the significance or impact of the deal, and does not reveal whether the deal is positive or negative for the company or its investors. Therefore, it is not possible to make a definitive sentiment judgment.", "label": "neutral"}']





In [None]:
# parse and clean outputs
random.seed(SEED)

labels = ["positive", "negative", "neutral"]

# function to map each label string to a discrete category (for simple prompt)
def clean_output(string, random_choice=True):
    """Map free-form model output to one of ``labels``.

    The label whose name occurs EARLIEST in the text (case-insensitive) wins, so
    the result reflects the model's first answer rather than the order of
    ``labels``. This matters when the model keeps generating after its answer.

    Args:
        string: raw model output; non-string values are converted with ``str``.
        random_choice: what to do when no label name occurs in the text.

    Returns:
        The matched label; otherwise a random element of ``labels`` if
        ``random_choice`` is true, else the sentinel ``"FAIL"``.
    """
    text = str(string).lower()
    positions = {category: text.find(category.lower()) for category in labels}
    found = [category for category in labels if positions[category] != -1]
    if found:
        return min(found, key=positions.get)
    if random_choice:
        return random.choice(labels)  # random category if no category is found
    else:
        return "FAIL"

# function to parse chain-of-thought JSON output
def process_output_cot(output):
    """Parse one chain-of-thought generation into ``{"reason": ..., "label": ...}``.

    Parsing strategy:
      1. decode the first JSON object found in the text, ignoring anything the
         model generated after it;
      2. otherwise try to read the whole text as a Python literal (covers
         single-quoted dicts);
      3. otherwise fall back to searching the raw text for a label name.

    A parsed label is normalised with ``clean_output`` so that e.g. "Positive"
    and "positive" count as the same vote. A parsed value that is not a dict
    with a recognisable ``"label"`` is treated as a parsing failure.

    Args:
        output: raw model output.

    Returns:
        A dict with at least the keys ``"reason"`` and ``"label"``. On failure
        ``"reason"`` is ``"FAIL"`` and ``"label"`` comes from
        ``clean_output(output, random_choice=True)``.
    """
    text = str(output)
    parsed = None
    error = None

    brace = text.find("{")
    if brace != -1:
        try:
            # strict=False tolerates raw newlines inside the "reason" string
            parsed, _ = json.JSONDecoder(strict=False).raw_decode(text, brace)
        except ValueError as e:
            error = e
    if parsed is None:
        try:
            parsed = ast.literal_eval(text.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            if error is None:
                error = e

    if isinstance(parsed, dict) and "label" in parsed:
        label = clean_output(parsed["label"], random_choice=False)
        if label != "FAIL":
            return {**parsed, "reason": parsed.get("reason", ""), "label": label}
        error = ValueError(f"unrecognised label: {parsed['label']!r}")
    elif error is None:
        error = ValueError("output is not a dict with a 'label' key")

    # if json/dict parse fails, do simple search for occurrence of first label term
    print(f"Parsing failed for output: {output}, Error: {error}")
    output_cl = clean_output(output, random_choice=True)
    output_dic = {"reason": "FAIL", "label": output_cl}
    return output_dic


# simple prompt: map every raw generation to one label
output_simple_cl = list(map(clean_output, output_simple))

# CoT + SC prompt: parse every generation of every self-consistency iteration
output_cot_multiple_cl = []  # one list of labels per iteration
output_dic_lst = []          # all parsed dicts, iteration after iteration (flat)
for sc_iter in range(SELF_CONSISTENCY_ITERATIONS):
    parsed_step = []
    for raw_output in output_cot_multiple[sc_iter]:
        parsed_step.append(process_output_cot(raw_output))
    output_cot_multiple_cl.append([parsed["label"] for parsed in parsed_step])
    output_dic_lst += parsed_step


In [None]:
# convert the CoT+SC output to dataframe for easier downstream processing
# (after the transpose: one row per text, one column per self-consistency iteration)
df_output = pd.DataFrame(data=output_cot_multiple_cl).T
# name the columns sc_iter1, sc_iter2, ... for however many iterations were run
df_output = df_output.rename(columns={col: f"sc_iter{col + 1}" for col in df_output.columns})
df_output.head(3)

In [None]:
# find majority from multiple self-consistency runs
from collections import Counter
random.seed(SEED)

def find_majority(row):
    """Return the most frequent label among the votes in ``row``.

    A label wins when it has strictly more votes than every other label. This
    includes the case of a single vote. When several labels share the top
    count (e.g. each of the three labels appears once), the winner is drawn at
    random among the tied labels that are members of ``labels``, or among all
    ``labels`` if none of the tied values is a known label.

    Args:
        row: iterable of label votes (a DataFrame row when used with ``apply``).

    Returns:
        The majority label, or a randomly chosen label on a tie.
    """
    ranked = Counter(row).most_common()
    top_count = ranked[0][1]
    tied = [label for label, count in ranked if count == top_count]
    if len(tied) == 1:
        return tied[0]
    # iterate over `labels` (not `tied`) so the candidate order is stable
    candidates = [label for label in labels if label in tied]
    return random.choice(candidates or labels)


# majority for multiple self-consistency + CoT runs
# Vote only over the per-iteration label columns. Without this selection, re-running
# the cell would also count the columns added below (text, reasons, other labels).
sc_columns = [
    col for col in df_output.columns
    if isinstance(col, (int, np.integer)) or str(col).startswith("sc_iter")
]
df_output['label_llm_cot_multiple'] = df_output[sc_columns].apply(find_majority, axis=1)
# single CoT run
df_output["label_llm_cot"] = [clean_output(output, random_choice=True) for output in df_output["sc_iter1"]]  # if parsing did not work, choose a random label
# simple labeling prompt
df_output["label_llm"] = output_simple_cl
# expert label
df_output["label_experts"] = dataset["label_text"]

df_output["text"] = dataset["sentence"]

# also add the reasoning of every iteration for inspection
# (output_dic_lst is flat: all rows of iteration 1, then all rows of iteration 2, ...)
n_rows = len(df_output)
for sc_iter in range(SELF_CONSISTENCY_ITERATIONS):
    reasons_iter = output_dic_lst[sc_iter * n_rows:(sc_iter + 1) * n_rows]
    df_output[f"reason_iter{sc_iter + 1}"] = [dic["reason"] for dic in reasons_iter]

df_output.head(3)


In [32]:
# save for debugging and in case something fails after API calls
os.makedirs('./data', exist_ok=True)  # to_csv does not create missing directories
df_output.to_csv(f'./data/df_{API_PROVIDER}_{MODEL}_backup.csv', index=False)

#### Calculate metrics

In [33]:
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, accuracy_score, classification_report

def compute_metrics(label_experts, label_pred):
    """Score predicted labels against the expert labels.

    Args:
        label_experts: reference labels.
        label_pred: predicted labels, aligned with ``label_experts``.

    Returns:
        A dict with macro/micro F1, precision and recall, balanced and plain
        accuracy, plus the scikit-learn classification report (as a dict) under
        the key ``"report"``.
    """
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    macro_scores = precision_recall_fscore_support(label_experts, label_pred, average='macro')
    micro_scores = precision_recall_fscore_support(label_experts, label_pred, average='micro')
    balanced_acc = balanced_accuracy_score(label_experts, label_pred)
    plain_acc = accuracy_score(label_experts, label_pred)

    # each score tuple is (precision, recall, f1, support)
    summary = dict(
        f1_macro=macro_scores[2],
        f1_micro=micro_scores[2],
        accuracy_balanced=balanced_acc,
        accuracy=plain_acc,
        precision_macro=macro_scores[0],
        recall_macro=macro_scores[1],
        precision_micro=micro_scores[0],
        recall_micro=micro_scores[1],
    )
    summary["report"] = classification_report(
        label_experts, label_pred, digits=2, output_dict=True, zero_division='warn'
    )
    return summary

In [None]:
# compute metrics
if FINAL_TEST_RUN:
    if "HF" in API_PROVIDER:
        # df_output has one row per row of `dataset`, in the same order (after the
        # concatenation: all train rows, then all test rows). row_id_test and
        # row_id_train hold values of the "idx" column, NOT positions in that order,
        # so the rows must be selected by matching on "idx" instead of with iloc.
        assert len(df_output) == len(dataset), "df_output must have one row per dataset row"
        idx_per_row = pd.Series(dataset["idx"])
        df_output_test = df_output[idx_per_row.isin(row_id_test).to_numpy()]
        df_output_train = df_output[idx_per_row.isin(row_id_train).to_numpy()]
    elif API_PROVIDER == "OAI":
        # the OpenAI run only labelled the test set
        df_output_test = df_output
        df_output_train = pd.DataFrame()
    else:
        raise NotImplementedError
    print(f"Length of testset: {len(df_output_test)}, and length of trainset: {len(df_output_train)}")
else:
    # prompt-testing run: evaluate on everything that was labelled
    df_output_test = df_output.copy(deep=True)
    df_output_train = pd.DataFrame()


label_experts = df_output_test["label_experts"]
label_llm = df_output_test["label_llm"]
label_llm_cot = df_output_test["label_llm_cot"]
label_llm_cot_multiple = df_output_test["label_llm_cot_multiple"]

metrics_single = compute_metrics(label_experts, label_llm)
metrics_single_cot = compute_metrics(label_experts, label_llm_cot)
metrics_multiple_cot =  compute_metrics(label_experts, label_llm_cot_multiple)

metrics = {"metrics_single": metrics_single, "metrics_single_cot": metrics_single_cot, "metrics_multiple_cot": metrics_multiple_cot}

metrics

### Save results

In [36]:
# save to disk
if SAVE_OUTPUTS:
    # neither open() nor to_csv() creates missing directories
    os.makedirs('./data', exist_ok=True)
    # the timestamp keeps the results of separate runs apart
    time_now = datetime.now().strftime("%Y-%m-%d-%H-%M")

    file_path_metrics = f'./data/metrics_{API_PROVIDER}_{time_now}_{MODEL}.json'

    # Writing the metrics dictionary to json
    with open(file_path_metrics, 'w') as file:
        json.dump(metrics, file, indent=4)

    # Write dfs to csv
    df_output_train.to_csv(f'./data/df_train_{API_PROVIDER}_{time_now}_{MODEL}.csv', index=False)
    df_output_test.to_csv(f'./data/df_test_{API_PROVIDER}_{time_now}_{MODEL}.csv', index=False)

In [None]:
# keep only the rows where the self-consistency majority label differs from the expert label
df_wrong = df_output.loc[df_output["label_llm_cot_multiple"].ne(df_output["label_experts"])]