- get chatgpt to accept prompts
- get chatgpt to output prompts

In [1]:
!pip install cohere
!pip install tiktoken
!pip install openai

Collecting cohere
  Downloading cohere-4.34-py3-none-any.whl (48 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting backoff<3.0,>=2.0 (from cohere)
  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting fastavro==1.8.2 (from cohere)
  Downloading fastavro-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fastavro, backoff, cohere
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires openai, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not instal

In [2]:
import os
from openai import OpenAI
from google.colab import userdata
import pandas as pd
import re

client = OpenAI(api_key=userdata.get('API_KEY'))
MODEL = "gpt-3.5-turbo-16k"


In [30]:
#Initial call to GPT to understand the trials
init_sys_call = [
    {"role": "system",
    "content": """Your task is to give judgement about where you were based on the time we give. You will give 4 alternative options. You MUST select one.

                  Example:

                  Presented:
                  1 at 6PM 11/08/2019
                  2 at 7PM 12/08/2019
                  ...

                  Question:
                  Where were you at 6PM 11/09/2019:
                  A. 2
                  B. 3
                  C. 1
                  D. 30

                  Response:
                  C

                  Where were you at 7PM 12/09/2019:
                  A. 2
                  B. 3
                  C. 9
                  D. 30

                  Response:
                  A
                  ..."""
              }]

In [31]:
events = pd.read_csv(userdata.get('EVENTS_FILE'))
experiments = pd.read_csv(userdata.get('EXPERIMENTS_FILE'))

In [32]:
# Helpful constants
EVENT_DF = 0
EVENT_OFFSET = 1
EXPERIMENT_DF = 2
EXPERIMENT_OFFSET = 3


def create_event_experiment_dfs(uuid):
  """
  create_event_experiment_dfs() creates the appropriate dataframes and index offsets to reference events and experiments.

  :param USER_ID: the user's unique identification string
  :return: user's events, event offset index, user's experiment, experiment offset index
  """

  events_temp = events[['ID', 'StartDateTime Local', 'GPS Cluster Original']][events['USER ID']==uuid]
  experiments_temp = experiments[['ID', 'Target', 'A', 'B', 'C', 'D']][experiments['USER ID']==uuid]

  # offsets
  events_offset = events_temp.index.min()
  experiments_offset = experiments_temp.index.min()
  return (events_temp, events_offset, experiments_temp, experiments_offset)

temp = create_event_experiment_dfs("ap-northeast-1:e7c916fc-736e-47e6-979a-c1c937ebe094")


In [33]:
# Let GPT know where it was

def gen_location(Event_Exp_df):
  """
  gen_location() turns each entry of event_df as a context for GPT

  :param Event_Exp_df: the list of the event_df, event_offset, exp_df, exp_offset
  :return: the structured msg containing where a particular participant was located
  """

  events_df = Event_Exp_df[EVENT_DF]
  events_offset = Event_Exp_df[EVENT_OFFSET]
  location_data = []

  # Each entry is a context for GPT
  for i in range(len(events_df)):
    datetime = events_df.loc[i+events_offset, "StartDateTime Local"]
    date_pattern = r"(\d{4}-\d{2}-\d{2})T"
    time_pattern = r"\d{4}-\d{2}-\d{2}T(\d{2}:\d{2}:\d{2})Z"
    date = re.search(date_pattern, datetime).group(1)
    time = re.search(time_pattern, datetime).group(1)

    gps_cluster = events_df.loc[i+events_offset, "GPS Cluster Original"]
    location_data.append({
        "role": "user",
        "content": f"{gps_cluster} at {time} {date}"
    })

  return location_data


  # 2019-08-11T06:00:00Z
  #  print(events_1.loc[i, "ID"], events_1.loc[i, "GPS Cluster Original"])
temp_loc = gen_location(temp)

In [34]:
# Form Questions for GPT

def gen_questions(Event_Exp_df):
  """
  gen_questions() generates the questions that will be fed to GPT by using the
  experiment_df

  :param Event_Exp_df: the list of the event_df, event_offset, exp_df, exp_offset
  :return: the structured msg containing questions from the experiment for a
           given participant
  """

  events_df = Event_Exp_df[EVENT_DF]
  experiments_df = Event_Exp_df[EXPERIMENT_DF]
  experiments_offset = Event_Exp_df[EXPERIMENT_OFFSET]
  question_data = []

  # Turn each entry into an input for GPT
  for i in range(len(experiments_df)):
    question = ""
    response_id = experiments_df.loc[i+experiments_offset, "Target"]
    correct_event_id = experiments_df.loc[i+experiments_offset, response_id]

    # Perform lookup for experiments_df from events_df
    cluster_question = []
    for alpha in ['A', 'B', 'C', 'D']:
      alpha_response_id = experiments_df.loc[i+experiments_offset, alpha]
      cluster_question.append(events_df.loc[alpha_response_id, "GPS Cluster Original"])

    # Datetime processing
    datetime = events_df.loc[correct_event_id, "StartDateTime Local"]
    date_pattern = r"(\d{4}-\d{2}-\d{2})T"
    time_pattern = r"\d{4}-\d{2}-\d{2}T(\d{2}:\d{2}:\d{2})Z"
    date = re.search(date_pattern, datetime).group(1)
    time = re.search(time_pattern, datetime).group(1)

    # Question forming
    question += f"Where were you at {time} {date}:\n"
    question += f"A. {cluster_question[0]}\n"
    question += f"B. {cluster_question[1]}\n"
    question += f"C. {cluster_question[2]}\n"
    question += f"D. {cluster_question[3]}"

    question_data.append({
        "role": "user",
        "content": question
    })

  return question_data

temp_ques = gen_questions(temp)

In [35]:
# Pricing (per 1k token) is specific for this model: GPT 3.5 Turbo (16k)
gpt_pricing = {"input": 0.001, "output": 0.002}

def run_gpt(Event_Exp_df, location_data, question_data):
  """
  run_gpt() is the main code for interacting with the OpenAI API to obtain expected output

  :param Event_Exp_df: the list of the event_df, event_offset, exp_df, exp_offset
  :param location_data: the structured location information for GPT
  :param question_data: the structured questions for GPT
  :return: an array of output as specified by the experiment
  """

  experiments_df = Event_Exp_df[EXPERIMENT_DF]
  output_msg = []
  message = {}
  total_cost = 0

  for i in range(len(experiments_df)):
    message = init_sys_call #+ location_data

    # Append previous questions as context
    if (i==0):
      message.append(question_data[0])
    else:
      for j in range(i+1):
        message.append(question_data[j])
        if (j<i):
          message.append({'role': 'assistant', 'content': output_msg[j]})

    # Generate response body for API
    response = client.chat.completions.create(
      model=MODEL,
      messages=message
    )

    # Notify if it's too large for current model
    if (response.choices[0].finish_reason == "length"):
      print("ERROR: Too Long")
      return output_msg

    # Obtain results and save it as context for future questions
    output_msg.append(response.choices[0].message.content)

    # Cost Calculations
    input_cost = gpt_pricing["input"] * response.usage.prompt_tokens
    output_cost = gpt_pricing["output"] * response.usage.completion_tokens
    total_cost = input_cost + output_cost

    break

  print(total_cost)

  return output_msg



In [22]:
output = run_gpt(temp, temp_loc, temp_ques)
output

# ChatCompletion(id='chatcmpl-8LiuiS7iuwFLoZaG86qaBk54NHPDT',
#                choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='B. 14', role='assistant', function_call=None, tool_calls=None))],
#                created=1700188228, model='gpt-3.5-turbo-16k-0613',
#                object='chat.completion',
#                system_fingerprint=None,
#                usage=CompletionUsage(completion_tokens=4, prompt_tokens=7661, total_tokens=7665))


0.262


['A. 27']

In [None]:
df = pd.DataFrame(output_msg, columns=["Response"])
df.to_csv('/content/data/responseFULLparticipant7.csv', index=True)

# TODOs
- [x] Change all the code into functions
- [x] Migrate code for new version of GPT API
- [ ] Amend all temporal information to be more human friendly

# Prev Minutes
- compare human against gpt
- gpt not nearly as accurate as human

# Model tuning
- Question & content:
  - convert dates to day (frame questions in terms on weeks)
  - x weeks ago, on monday at this time
  - change time to e.g., 6am
  - change cluster to location 0 (and include the word in questions) (sidebar - diff between place & location)

- sequential dependencies: what happens if you dont give it the prev questions
  - llms have tendency to repeat itself

- Assume midnight observations are at home

Then
- clustering:
  - clusters don't seem like locations
  - can we get gps to decode?
    - pro: semantically meaningful name insteaad of a cluster num
    - cons: error in conversion since it's not accurate


