# **Chat Completions**

This notebook demonstrates how to implement **chat completions** using the **Inflection AI API**. 

With the new OpenAI API compatibility support, you can use either the OpenAI client or direct HTTP requests.

By default, the Inference API service generates the entire completion before sending it back as a single response, which can result in significant delays for longer completions.
To minimize waiting time, you can opt for 'streaming' completions, allowing you to receive and process partial completions as they're generated. This enables you to begin working with the initial part of the completion without waiting for the entire response.

Enable streaming by setting *`stream=True`* when using the chat completions or completions endpoints. This returns an object that delivers data-only server-sent events in chunks, accessible through the delta field instead of the standard message field.

In [None]:
import os
from dotenv import load_dotenv

# Load settings from a local .env file (if one is present) into the process
# environment, so the BASE_URL and INFLECTION_API_KEY lookups can find them.
load_dotenv()

In [None]:
# Connection settings come from the environment; each is None when unset.
base_url = os.environ.get("BASE_URL")
inflection_api_key = os.environ.get("INFLECTION_API_KEY")

# Model used by every request in this notebook.
# Alternative: "inflection_3_productivity"
model = "inflection_3_pi"


In [None]:
class color:
    """ANSI escape sequences used to emphasise labels in printed output."""

    BOLD = '\033[1m'  # start bold text
    END = '\033[0m'   # reset text formatting

### **Test Message** ###
We'll use the simple test message below, but modify it as needed to experiment with the responses.

In [None]:
# Conversation sent with every request in this notebook: a single user turn.
# Edit the content to try other prompts.
test_messages = [
    dict(role="user", content="What's the weather in Palo Alto?"),
]

## **Using OpenAI Client** ##

In [None]:
from openai import OpenAI

# Fail early with a readable message instead of the opaque
# "unsupported operand type(s) for +: 'NoneType' and 'str'" raised when
# BASE_URL is missing.
if not base_url:
    raise ValueError("BASE_URL is not set; define it in your environment or .env file.")

# Drop any trailing slash so the joined path never contains "//".
url = base_url.rstrip("/") + "/external/api/inference/openai/v1/"

# Point the OpenAI client at the OpenAI-compatible endpoint root built above.
client = OpenAI(base_url=url, api_key=inflection_api_key)

### **Simple Message Completion Without Streaming** ###


In [None]:
print(f"Starting test: chat completion without streaming using {model}")
print("+*" * 20)

# No `stream` argument: the call returns a single, complete response object.
response = client.chat.completions.create(messages=test_messages, model=model)

# The generated text is on the first choice's message.
first_choice = response.choices[0]
response_content = first_choice.message.content

print(f"{color.BOLD} Response: {color.END} {response_content}")
print("+*" * 20)

### **Simple Message Completion With Streaming** ###


In [None]:
print(f"Starting test: chat completion through OpenAI Client with streaming using {model}")
print("+*"*20)

# With stream=True the call returns an iterable of chunks rather than one
# finished response.
response = client.chat.completions.create(
    model=model,
    messages=test_messages,
    stream=True,  # Enable streaming
)

for chunk in response:
    # OpenAI-compatible streams can emit chunks whose `choices` list is empty
    # (e.g. a metadata-only chunk); indexing choices[0] on one would raise
    # IndexError, so skip it.
    if not chunk.choices:
        continue

    # Streamed text arrives in `delta` rather than `message`; it may be
    # empty/None on chunks that carry no text.
    piece = chunk.choices[0].delta.content
    if piece:
        print(f"{color.BOLD} Chunk: {color.END} {piece}")


print("+*"*20)

## **Using HTTP Requests** ##

In [None]:
import json
import requests

# Fail early with a readable message instead of the opaque TypeError raised by
# `None + str` when BASE_URL is missing.
if not base_url:
    raise ValueError("BASE_URL is not set; define it in your environment or .env file.")

# Full chat-completions endpoint; drop any trailing slash on BASE_URL so the
# joined path never contains "//".
url = base_url.rstrip("/") + "/external/api/inference/openai/v1/chat/completions"

# Bearer-token auth plus a JSON body, sent with every raw HTTP request.
headers = {
    "Authorization": f"Bearer {inflection_api_key}",
    "Content-Type": "application/json",
}

### **Simple Message Completion Without Streaming** ###

In [None]:
# Request body for a single, non-streamed chat completion.
json_payload = {
    "model": model, 
    "messages": test_messages,
    "temperature": 0.8,
    "top_p": 1,
    "web_search": True,
}

# Convert the payload to a JSON string
payload = json.dumps(json_payload)

# Make the post request
print(f"Starting test: chat completion through an HTTP POST without streaming using {model}")
print("+*"*20)

response = requests.post(url, headers=headers, data=payload)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    json_response = response.json()
    # Print the response
    print(f"{color.BOLD} Response: {color.END} {json_response['choices'][0]['message']['content']}")
else:
    # Surface failures instead of silently printing nothing between the separators.
    print(f"{color.BOLD} Request failed: {color.END} HTTP {response.status_code} - {response.text}")

print("+*"*20)

### **Simple Message Completion With Streaming** ###

In [None]:
# Request body for a streamed chat completion.
json_payload = {
    "model": model, 
    "messages": test_messages,
    "temperature": 0.8,
    "top_p": 1,
    "web_search": True,
    "stream": True # Enable streaming
}

# Convert the payload to a JSON string
payload = json.dumps(json_payload)

# Make the post request
print(f"Starting test: chat completion through an HTTP POST with streaming using {model}")
print("+*"*20)

# Send the request exactly once, with stream=True so the body is consumed
# incrementally rather than buffered in full. The `with` block releases the
# connection even when the loop exits early on the [DONE] sentinel.
with requests.post(url, headers=headers, data=payload, stream=True) as response:
    # Check if the request was successful
    if response.status_code == 200:
        # Process the streaming response line by line (server-sent events)
        for line in response.iter_lines():
            if not line:
                continue  # blank keep-alive / event separator line

            line = line.decode('utf-8')
            if not line.startswith('data: '):
                continue  # only "data: ..." lines carry payloads

            # The stream ends with a literal [DONE] sentinel rather than JSON.
            if line == 'data: [DONE]':
                break

            json_data = json.loads(line[6:])  # Skip the 'data: ' prefix

            # Extract the content from the delta if it exists
            choices = json_data.get('choices')
            if choices and 'delta' in choices[0]:
                content = choices[0]['delta'].get('content')
                if content:
                    print(f"{color.BOLD} Chunk: {color.END} {content}")
    else:
        # Surface failures instead of silently printing nothing between the separators.
        print(f"{color.BOLD} Request failed: {color.END} HTTP {response.status_code} - {response.text}")

print("+*"*20)