In [None]:
import base64
import time
from typing import Any

import openai
from openai import OpenAI
import numpy as np
import os
# Generic system prompt, used for the samplers created further down.
OPENAI_SYSTEM_MESSAGE_API = "You are a helpful assistant."

# ChatGPT-flavoured system prompt: an identity line followed by the
# knowledge-cutoff and current-date lines, separated by newlines.
OPENAI_SYSTEM_MESSAGE_CHATGPT = "\n".join(
    [
        "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.",
        "Knowledge cutoff: 2023-12",
        "Current date: 2024-04-01",
    ]
)

from google.auth import default
from google.auth.transport.requests import Request


# OpenAI-compatible Vertex AI endpoint for the us-central1 region.
base_url = (
    "https://us-central1-aiplatform.googleapis.com/v1beta1"
    "/projects/storied-channel-368910"
    "/locations/us-central1/endpoints/openapi"
)

# Load the ambient Google credentials (the second element of the returned
# pair is discarded) and refresh them so `credentials.token` is available
# to be passed as an API key.
credentials, _ = default()
auth_request = Request()
credentials.refresh(auth_request)

class ChatCompletionSampler():
    """
    Sample from OpenAI's chat completion API
    """

    def __init__(
        self,
        model: str = "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
        system_message: str | None = None,
        temperature: float = 0.5,
        max_tokens: int = 1024,
        base_url=None,
        api_key=None,
        use_logprobs = False
    ):
        self.api_key_name = os.environ["TOGETHER_API_KEY"]
        self.client = OpenAI(base_url=base_url, api_key=api_key)
        # using api_key=os.environ.get("OPENAI_API_KEY")  # please set your API_KEY
        self.model = model
        self.system_message = system_message
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.image_format = "url"
        self.use_logprobs = use_logprobs

        self.top_logprobs = None
        self.logit_perplexity = None
        self.log_probs = None

    def _handle_image(
        self, image: str, encoding: str = "base64", format: str = "png", fovea: int = 768
    ):
        new_image = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/{format};{encoding},{image}",
            },
        }
        return new_image

    def _handle_text(self, text: str):
        return {"type": "text", "text": text}

    def _pack_message(self, role: str, content: Any):
        return {"role": str(role), "content": content}

    def __call__(self, message_list) -> str:
        if self.system_message:
            message_list = [self._pack_message("system", self.system_message)] + message_list
        trial = 0
        while True:
            try:
                if self.logprobs:
                    # together
                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=message_list,
                        temperature=self.temperature,
                        max_tokens=self.max_tokens,
                        logprobs=5
                    )
                    self.top_logprobs = response.choices[0].logprobs.top_logprobs
                    return response.choices[0].message.content, float(np.exp(response.choices[0].logprobs.token_logprobs).mean()), response.choices[0].logprobs.token_logprobs, response
                else:
                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=message_list,
                        temperature=self.temperature,
                        max_tokens=self.max_tokens
                    )
                    return response.choices[0].message.content, None, None
                
            # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU
            except openai.BadRequestError as e:
                print("Bad Request Error", e)
                return ""
            except Exception as e:
                exception_backoff = min(2**trial, 60)  # expontial back off
                print(
                    f"Rate limit exception so wait and retry {trial} after {exception_backoff} sec",
                    e,
                )
                time.sleep(exception_backoff)
                trial += 1
            # unknown error shall throw exception


In [95]:
# Region used for both the Vertex AI host name and the `locations/...` path
# segment. The original URL mixed a us-east5 host with a us-central1 location
# and nested double quotes inside a double-quoted f-string, which is a
# SyntaxError before Python 3.12.
# NOTE(review): us-east5 is taken from the original host name; confirm it is
# the region where this model is served for the project.
VERTEX_REGION = "us-east5"

# Sampler backed by the Vertex AI OpenAI-compatible endpoint, authenticated
# with the refreshed Google credentials token.
google = ChatCompletionSampler(
    model="meta/llama-4-maverick-17b-128e-instruct-maas",
    system_message=OPENAI_SYSTEM_MESSAGE_API,
    max_tokens=2048,
    base_url=(
        f"https://{VERTEX_REGION}-aiplatform.googleapis.com/v1beta1"
        f"/projects/storied-channel-368910/locations/{VERTEX_REGION}/endpoints/openapi"
    ),
    api_key=credentials.token,
)

# Sampler backed by the Together API, authenticated from the environment.
to = ChatCompletionSampler(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ['TOGETHER_API_KEY'],
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
    system_message=OPENAI_SYSTEM_MESSAGE_API,
    max_tokens=2048,
)

In [96]:
# Turn on the log-probability branch of ChatCompletionSampler.__call__,
# which checks this attribute on the instance.
to.logprobs=True

In [97]:
# Smoke-test the Together sampler with a single user message. With
# `to.logprobs` truthy the call returns a 4-tuple:
# (text, mean token probability, token logprobs, raw response).
response = to([{"role": "user", "content": "hi"}])

In [98]:
# Display the full tuple returned by the sampler, including the raw response object.
response

("It's nice to meet you. Is there something I can help you with or would you like to chat?",
 0.9703521864702449,
 [-1.140625,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  -3.5762787e-07,
  0,
  0,
  0,
  0,
  0,
  -0.0015029907,
  0,
  0,
  0,
  0,
  0,
  0,
  -1.66893e-05],
 ChatCompletion(id='nrEJLr9-4Yz4kd-934c95111c4ae7cc', choices=[Choice(finish_reason='stop', index=0, logprobs=ChoiceLogprobs(content=None, refusal=None, token_ids=[2181, 596, 6555, 311, 3449, 499, 13, 2209, 1070, 2555, 358, 649, 1520, 499, 449, 477, 1053, 499, 1093, 311, 6369, 30, 128009], tokens=['It', "'s", ' nice', ' to', ' meet', ' you', '.', ' Is', ' there', ' something', ' I', ' can', ' help', ' you', ' with', ' or', ' would', ' you', ' like', ' to', ' chat', '?', '<|eot_id|>'], token_logprobs=[-1.140625, 0, 0, 0, 0, 0, 0, 0, 0, -3.5762787e-07, 0, 0, 0, 0, 0, -0.0015029907, 0, 0, 0, 0, 0, 0, -1.66893e-05], top_logprobs=[{'Hello': -0.38867188, 'It': -1.140625, 'How': -7.625, 'Hi': -7.875, 'I': -17.625}, {"'s": 0