In [None]:
%pip install --quiet arize-phoenix pandas pyarrow openai anthropic
%pip install --quiet openinference-instrumentation-openai opentelemetry-sdk opentelemetry-exporter-otlp

# Setup Variables & Keys

In [None]:
import nest_asyncio
nest_asyncio.apply()

In [None]:
from getpass import getpass
import os

if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass("🔑 Enter your OpenAI API key: ")
    
if not (PHOENIX_API_KEY := os.getenv("PHOENIX_API_KEY")):
    PHOENIX_API_KEY = getpass("🔑 Enter your Phoenix API key: ")

os.environ["PHOENIX_API_KEY"] = PHOENIX_API_KEY

if os.environ.get("ANTHROPIC_API_KEY") is None:
    os.environ["ANTHROPIC_API_KEY"] = getpass("🔑 Enter your Anthropic API key: ")

# Start Phoenix

In [None]:
import os
from opentelemetry import trace as trace_api
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
    OTLPSpanExporter as HTTPSpanExporter,
)
from openinference.instrumentation.openai import OpenAIInstrumentor
import phoenix as px

# Add Phoenix API Key for tracing
PHOENIX_API_KEY = os.environ["PHOENIX_API_KEY"]

os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"api_key={PHOENIX_API_KEY}"
os.environ["PHOENIX_CLIENT_HEADERS"] = f"api_key={PHOENIX_API_KEY}"
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "https://app.phoenix.arize.com"

# Add Phoenix
span_phoenix_processor = SimpleSpanProcessor(HTTPSpanExporter(endpoint="https://app.phoenix.arize.com/v1/traces"))

# Add them to the tracer
tracer_provider = trace_sdk.TracerProvider()
tracer_provider.add_span_processor(span_processor=span_phoenix_processor)
trace_api.set_tracer_provider(tracer_provider=tracer_provider)
OpenAIInstrumentor().instrument()

# Download Dataset

In [13]:
import pandas as pd

# Load the Parquet file into a DataFrame
# file_path = 'emotion_classification.parquet'
# df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/raw/train-00000-of-00001.parquet")

# df = pd.read_parquet(file_path)

data = {
    "prediction_id": [1, 2, 3],
    "text": ["I am so happy today!", "This is the worst day ever.", "I feel so calm and relaxed."],
    "predicted_emotion": ["happiness", "anger", "calm"]
}

df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

   prediction_id                         text predicted_emotion
0              1         I am so happy today!         happiness
1              2  This is the worst day ever.             anger
2              3  I feel so calm and relaxed.              calm


In [None]:
print(df.head())

In [14]:
df = df[:20]
df = df[["prediction_id", "text", "predicted_emotion"]]
df.set_index("prediction_id", inplace=True)
emotions= df['predicted_emotion'].unique()
df.head()

Unnamed: 0_level_0,text,predicted_emotion
prediction_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,I am so happy today!,happiness
2,This is the worst day ever.,anger
3,I feel so calm and relaxed.,calm


In [15]:
from datetime import datetime

now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
dataset = px.Client().upload_dataset(
    dataset_name=f"sentiment-analysis-{now}",
    dataframe=df,
    input_keys=["text"],
    output_keys=["predicted_emotion"],
)



📤 Uploading dataset...




💾 Examples uploaded: https://app.phoenix.arize.com/datasets/RGF0YXNldDoy/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246Mg==


# Prepare Experiment Task

In [16]:
template = """
Classify the emotion present in the text below. You should only respond with the name of the emotion, no other words.
The emotion must be one of the provided values.

Input
=======
[Text]: {text}
[Provided Values]: {emotions}
"""
model = "gpt-4-turbo"

In [17]:
from openai import OpenAI

openai_client = OpenAI()

def summarize_article_openai(input) -> str:
    formatted_prompt_template = template.format(text=input["text"], emotions=emotions)
    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "assistant", "content": formatted_prompt_template},
        ],
    )
    assert response.choices
    return response.choices[0].message.content

# Prepare Evaluators

In [18]:
openai_eval_client = OpenAI()
eval_prompt = """
Your task is to evaluate whether the predicted emotion below describes the supplied input text. 
We are also including the correct emotion as a piece of data.

Begin Data:
[input text]: {input}
[correct emotion]: {expected}
[predicted emotion]: {output}

It's possible that the predicted emotion is another word for the correct emotion, and the two are 
roughly equivalent. If the two emotions are equivalent, respond with the word 'correct'. If they
are note equivalent, respond with the word 'incorrect'. Do not include any other words in your 
response
"""

def llm_as_a_judge_eval(input: str, output: str, expected: str):
    formatted_prompt_template = eval_prompt.format(input=input["text"], 
                                                   output=output, 
                                                   expected=expected["predicted_emotion"])
    
    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "assistant", "content": formatted_prompt_template},
        ],
    )
    assert response.choices
    return 1 if response.choices[0].message.content == "correct" else 0

def exact_match_eval(output: str, expected: str):
    return 1 if output.lower() == expected["predicted_emotion"].lower() else 0

EVALUATORS = [exact_match_eval]

# Run Experiment

In [19]:
from phoenix.experiments import run_experiment

experiment_results = run_experiment(
    dataset,
    summarize_article_openai,
    experiment_name="initial-template",
    experiment_description="first experiment using a simple prompt template",
    experiment_metadata={"vendor": "openai", "model": "gpt4o"},
    evaluators=EVALUATORS,
)

🐌!! If running inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.


🧪 Experiment started.
📺 View dataset experiments: https://app.phoenix.arize.com/datasets/RGF0YXNldDoy/experiments
🔗 View this experiment: https://app.phoenix.arize.com/datasets/RGF0YXNldDoy/compare?experimentId=RXhwZXJpbWVudDox


running tasks |██████████| 3/3 (100.0%) | ⏳ 00:10<00:00 |  3.61s/it


✅ Task runs completed.


🐌!! If running inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.


🧠 Evaluation started.


running experiment evaluations |██████████| 3/3 (100.0%) | ⏳ 00:02<00:00 |  1.15it/s


🔗 View this experiment: https://app.phoenix.arize.com/datasets/RGF0YXNldDoy/compare?experimentId=RXhwZXJpbWVudDox

Experiment Summary (12/07/24 09:14 PM +0530)
--------------------------------------------
          evaluator  n  n_scores  avg_score
0  exact_match_eval  3         3        1.0

Tasks Summary (12/07/24 09:14 PM +0530)
---------------------------------------
   n_examples  n_runs  n_errors
0           3       3         0





# Modify Task to use a New Model

In [None]:
model = "gpt-4o-mini"

In [None]:
from openai import OpenAI
openai_client = OpenAI()

def summarize_article_openai(input) -> str:
    formatted_prompt_template = template.format(text=input["text"], emotions=emotions)
    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "assistant", "content": formatted_prompt_template},
        ],
    )
    assert response.choices
    return response.choices[0].message.content

# Re-Run Experiment

In [None]:
from phoenix.experiments import run_experiment

experiment_results = run_experiment(
    dataset,
    summarize_article_openai,
    experiment_name="new-model",
    experiment_description="second experiment using a new model",
    experiment_metadata={"vendor": "openai", "model": "gpt4o-mini"},
    evaluators=EVALUATORS,
)

# Modify Task to Use a Third Model

In [20]:
from anthropic import Anthropic
client = Anthropic()
anthropic_model = "claude-3-5-sonnet-20240620"

def summarize_article_anthropic(input: str):
    formatted_prompt_template = template.format(text=input["text"], emotions=emotions)
    message = client.messages.create(
        model=anthropic_model,
        max_tokens=1024,
        messages=[{"role": "user", "content": formatted_prompt_template}],
    )
    return message.content[0].text

# Re-run Experiment

In [21]:
from phoenix.experiments import run_experiment

experiment_results = run_experiment(
    dataset,
    summarize_article_anthropic,
    experiment_name="third-model",
    experiment_description="third experiment, with a new model",
    experiment_metadata={"vendor": "anthropic", "model": "claude-3-sonnet"},
    evaluators=EVALUATORS,
)

🐌!! If running inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.


🧪 Experiment started.
📺 View dataset experiments: https://app.phoenix.arize.com/datasets/RGF0YXNldDoy/experiments
🔗 View this experiment: https://app.phoenix.arize.com/datasets/RGF0YXNldDoy/compare?experimentId=RXhwZXJpbWVudDoy


running tasks |          | 0/3 (0.0%) | ⏳ 00:00<? | ?it/s

[91mTraceback (most recent call last):
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\phoenix\experiments\functions.py", line 238, in sync_run_experiment
    _output = task(*bound_task_args.args, **bound_task_args.kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\MSI KATANA\AppData\Local\Temp\ipykernel_48388\2276699426.py", line 7, in summarize_article_anthropic
    message = client.messages.create(
              ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\anthropic\_utils\_utils.py", line 275, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\anthropic\resources\messages.py", line 888, in create
    return self._post(
           ^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-pac

running tasks |███▎      | 1/3 (33.3%) | ⏳ 00:03<00:07 |  3.71s/it

[91mTraceback (most recent call last):
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\phoenix\experiments\functions.py", line 238, in sync_run_experiment
    _output = task(*bound_task_args.args, **bound_task_args.kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\MSI KATANA\AppData\Local\Temp\ipykernel_48388\2276699426.py", line 7, in summarize_article_anthropic
    message = client.messages.create(
              ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\anthropic\_utils\_utils.py", line 275, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\anthropic\resources\messages.py", line 888, in create
    return self._post(
           ^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-pac

running tasks |██████▋   | 2/3 (66.7%) | ⏳ 00:04<00:02 |  2.24s/it

[91mTraceback (most recent call last):
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\phoenix\experiments\functions.py", line 238, in sync_run_experiment
    _output = task(*bound_task_args.args, **bound_task_args.kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\MSI KATANA\AppData\Local\Temp\ipykernel_48388\2276699426.py", line 7, in summarize_article_anthropic
    message = client.messages.create(
              ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\anthropic\_utils\_utils.py", line 275, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\anthropic\resources\messages.py", line 888, in create
    return self._post(
           ^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-pac

running tasks |██████████| 3/3 (100.0%) | ⏳ 00:06<00:00 |  2.16s/it

✅ Task runs completed.


🐌!! If running inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.


🧠 Evaluation started.




[91mTraceback (most recent call last):
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\phoenix\experiments\functions.py", line 503, in sync_evaluate_run
    result = evaluator.evaluate(
             ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\phoenix\experiments\evaluators\utils.py", line 215, in evaluate
    result = func(*bound_signature.args, **bound_signature.kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\MSI KATANA\AppData\Local\Temp\ipykernel_48388\3847397930.py", line 32, in exact_match_eval
    return 1 if output.lower() == expected["predicted_emotion"].lower() else 0
                ^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'lower'

The above exception was the direct cause of the following exception:

RuntimeError: evaluator failed for example id 'RGF0YXNldEV4YW1wbGU6MTAx', repetition 1
[0m


                                                                    
running tasks |██████████| 3/3 (100.0%) | ⏳ 00:08<00:00 |  2.16s/it        

Retries exhausted after 1 attempts: Client error '422 unknown' for url 'https://app.phoenix.arize.com/v1/experiment_evaluations'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/422
[91mTraceback (most recent call last):
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\phoenix\experiments\functions.py", line 503, in sync_evaluate_run
    result = evaluator.evaluate(
             ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\phoenix\experiments\evaluators\utils.py", line 215, in evaluate
    result = func(*bound_signature.args, **bound_signature.kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\MSI KATANA\AppData\Local\Temp\ipykernel_48388\3847397930.py", line 32, in exact_match_eval
    return 1 if output.lower() == expected["predicted_emotion"].lower() else 0
                ^^^^^^^^^^^^
AttributeError: 'Non

                                                                    
running tasks |██████████| 3/3 (100.0%) | ⏳ 00:09<00:00 |  2.16s/it                 

Retries exhausted after 1 attempts: Client error '422 unknown' for url 'https://app.phoenix.arize.com/v1/experiment_evaluations'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/422
[91mTraceback (most recent call last):
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\phoenix\experiments\functions.py", line 503, in sync_evaluate_run
    result = evaluator.evaluate(
             ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\MSI KATANA\Documents\GITHUB_FULL_REPO\Phoenix\phonex\Lib\site-packages\phoenix\experiments\evaluators\utils.py", line 215, in evaluate
    result = func(*bound_signature.args, **bound_signature.kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\MSI KATANA\AppData\Local\Temp\ipykernel_48388\3847397930.py", line 32, in exact_match_eval
    return 1 if output.lower() == expected["predicted_emotion"].lower() else 0
                ^^^^^^^^^^^^
AttributeError: 'Non

                                                                    
running tasks |██████████| 3/3 (100.0%) | ⏳ 00:10<00:00 |  2.16s/it                 

Retries exhausted after 1 attempts: Client error '422 unknown' for url 'https://app.phoenix.arize.com/v1/experiment_evaluations'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/422

🔗 View this experiment: https://app.phoenix.arize.com/datasets/RGF0YXNldDoy/compare?experimentId=RXhwZXJpbWVudDoy

Experiment Summary (12/07/24 09:16 PM +0530)
--------------------------------------------
          evaluator  n
0  exact_match_eval  3

Tasks Summary (12/07/24 09:16 PM +0530)
---------------------------------------
   n_examples  n_runs  n_errors  \
0           3       3         3   

                                           top_error  
0  AuthenticationError("Error code: 401 - {'type'...  
