# Prompt Optimisation

As your system built on AgentX goes to production, whether it uses the Agent class or publishes a Tool to the store, you will accumulate more and more feedback data. Using this data, you can tune the prompt and the inference hyperparameters to achieve better performance.

This notebook demonstrates how to tune the prompt and the inference hyperparameters to better predict financial news sentiment.

In [20]:
# Get a dataset with QQQ news and price data.
# sourced from https://www.kaggle.com/datasets/miguelaenlle/massive-stock-news-analysis-db-for-nlpbacktests
from pandas import read_csv
from rich import print as rich_print

# Load the QQQ news/price table from the local CSV file.
dataset = read_csv('data/qqq_sentiment.csv')

# Summary statistics of the dataset, converted to a plain nested dict so
# they can be pretty-printed.
stats = dataset.describe().to_dict()
rich_print(stats)

In [21]:
# We label the bottom 25% of one-week percentage changes as negative,
# the top 25% as positive, and everything in between as neutral.

# Summary statistics of the dataset; the '25%' and '75%' entries of the
# price_after_a_week column are the cut-offs used by classify_movement.
stats = dataset.describe()


def classify_movement(price):
    """Map a price_after_a_week value to a movement label.

    Returns 'negative' when the value is at or below the 25% quantile,
    'positive' when it is at or above the 75% quantile, and 'neutral'
    otherwise. The quantiles are read from the module-level ``stats`` at
    call time.
    """
    quartiles = stats['price_after_a_week']
    if price <= quartiles['25%']:
        return 'negative'
    if price >= quartiles['75%']:
        return 'positive'
    return 'neutral'


# Attach the label to every row as a new price_movement column.
dataset['price_movement'] = dataset['price_after_a_week'].apply(classify_movement)

In [22]:
# Use an easily defined sentiment classification agent to get a baseline

from agentx.agent import Agent
from agentx.schema import GenerationConfig, Message, Content
from dotenv import load_dotenv
from random import sample
from typing import List, Union, Literal
from pydantic import BaseModel
import os
import asyncio

load_dotenv()

# Structured value holding a single movement label. It is passed as the
# `output_model` when querying the agent and is also used to wrap the
# ground-truth labels when scoring.
class PricePrediction(BaseModel):
    # One of the three movement classes produced by classify_movement.
    movement:Literal['positive', 'negative', 'neutral']

# Azure OpenAI connection settings. The key and endpoint are read from the
# environment; a missing variable yields None.
generation_config = GenerationConfig(
    api_type='azure',
    api_key=os.getenv('AZURE_OPENAI_KEY'),
    base_url=os.getenv('AZURE_OPENAI_ENDPOINT'),
    azure_deployment='gpt-35',
)

# Hand-written baseline instructions for the classification agent.
baseline_system_prompt = '''You will predict the price movement of QQQ based on news headlines.
The price movement can be positive, negative, or neutral.
If the price movement is positive, the price of QQQ should go up substantially in a week.
Vice versa, if the price movement is negative, the price of QQQ should go down substantially in a week.'''

# Baseline agent whose predictions are scored in the cells below.
sentiment_classification_agent = Agent(
    name='financial_news_sentiment_classification',
    generation_config=generation_config,
    system_prompt=baseline_system_prompt,
)

# Draw 50 rows at random (without replacement) as a small test set; each row
# is a plain dict keyed by column name.
records = dataset.to_dict(orient='records')
test_set = sample(records, 50)
rich_print(test_set)

In [23]:

responses = await asyncio.gather(*[
    sentiment_classification_agent.a_generate_response(
        messages=[
            Message(
                role='user',
                content=Content(
                    text=datum['title'],
                )
            )
        ],
        output_model=PricePrediction
    ) for datum in test_set
])

responses = [response for response in responses if response != []]

In [26]:
# Let's check how well the agent did
import numpy as np

def metric(predicted:List[PricePrediction], actual:List[PricePrediction]):
    """Return the mean squared error between predicted and actual movements.

    Each movement label is mapped to a number (positive=1, neutral=0,
    negative=-1) and the squared differences are averaged, so 0 is a
    perfect score and lower is better.

    Raises:
        ValueError: if the two lists have different lengths.
    """
    if len(predicted) != len(actual):
        raise ValueError('predicted and actual must be the same length')

    scores = {'positive': 1, 'negative': -1, 'neutral': 0}

    predicted_scores = np.array([scores[item.movement] for item in predicted])
    actual_scores = np.array([scores[item.movement] for item in actual])

    residuals = predicted_scores - actual_scores
    return np.mean(np.square(residuals))

# Parse each agent reply into a PricePrediction and wrap the matching
# ground-truth label from the test set, then score the pair of lists.
predictions = [PricePrediction.model_validate_json(reply.content.text) for reply in responses]
ground_truth = [PricePrediction(movement=row.get('price_movement')) for row in test_set]
metric(predictions, ground_truth)

1.14

A mean squared error of 1.14 basically means the model is doing nothing useful at predicting the sentiment. The actual movement goes ever so slightly in the opposite direction to what the model predicted.

In [13]:
from agentx.optimisers import TextualGradientPromptTrainer

# Second agent, sharing the same generation_config as the classifier. It is
# called by textual_gradient to explain why a prompt produced wrong answers.
reviewer = Agent(
    name='reviewer',
    generation_config=generation_config,
    system_prompt='You are a prompt engineer. Review the given prompt, error samples and give reasons why the prompt have gotten these examples wrong.',
)

def textual_gradient(
    prompt:str,
    input:List[str],
    predicted:List[PricePrediction],
    truth:List[PricePrediction],
) -> List[Message]:
    """Ask the reviewer agent why `prompt` got some samples wrong.

    The three lists are paired by position; every sample whose prediction
    differs from the truth is collected and sent to the reviewer together
    with the current prompt.

    Args:
        prompt: the system prompt under review.
        input: the input texts, one per sample. (The name shadows the
            builtin but is kept because callers pass it by keyword.)
        predicted: the model's prediction for each sample.
        truth: the expected prediction for each sample.

    Returns:
        The request message followed by the reviewer's response.

    Raises:
        ValueError: if the three lists do not all have the same length.
    """
    # zip() would silently stop at the shortest list and pair samples with
    # the wrong predictions, so reject mismatched lengths up front.
    if not (len(input) == len(predicted) == len(truth)):
        raise ValueError('input, predicted and truth must be the same length')

    errors = [
        {
            'input': text,
            'predicted': guess.movement,
            'truth': expected.movement,
        }
        for text, guess, expected in zip(input, predicted, truth)
        if guess != expected
    ]

    messages = [
        Message(
            role='user',
            content=Content(
                text='''Current prompt: {prompt}

Errors: {errors}'''.format(prompt=prompt, errors=errors, )
            ),
        )
    ]

    response = reviewer.generate_response(
        messages=messages
    )

    return messages + [response]

In [15]:
# test the textual_gradient
# Parse each agent reply into a PricePrediction and wrap the matching
# price_movement label from the test set, the same way the metric cell
# builds its inputs, then ask the reviewer to explain the errors.
gradient = textual_gradient(
    prompt=sentiment_classification_agent.system_prompt,
    input=[datum['title'] for datum in test_set],
    predicted=[PricePrediction.model_validate_json(response.content.text) for response in responses],
    truth=[PricePrediction(movement=datum.get('price_movement')) for datum in test_set],
)

In [18]:
# The last message returned by textual_gradient is the reviewer's response;
# print its text.
rich_print(gradient[-1].content.text)

In [None]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler