- Get ChatGPT to accept prompts
- Get ChatGPT to output prompts

In [58]:
!pip install cohere
!pip install tiktoken
!pip install openai



In [92]:
import os
from openai import OpenAI
from google.colab import userdata
import pandas as pd
import re
import tiktoken
import copy
from datetime import datetime, timedelta
# OpenAI client; the API key is read from the Colab userdata entry 'API_KEY'
# instead of being hardcoded in the notebook.
client = OpenAI(api_key=userdata.get('API_KEY'))
# Model name passed to client.chat.completions.create() in run_gpt().
MODEL = "gpt-3.5-turbo-16k"


In [60]:
#Initial call to GPT to understand the trials
# init_sys_call = [
#     {"role": "system",
#     "content": """Your task is to give judgement about where you were based on the time we give. You will give 4 alternative options. You MUST select one.

#                   Example:

#                   Presented:
#                   Location 1 at 6PM 11/08/2019
#                   Location 2 at 7PM 12/08/2019
#                   ...

#                   Question:
#                   Where were you at 6PM 11/09/2019:
#                   A. Location 2
#                   B. Location 3
#                   C. Location 1
#                   D. Location 30

#                   Response:
#                   C

#                   Where were you at 7PM 12/09/2019:
#                   A. Location 2
#                   B. Location 3
#                   C. Location 9
#                   D. Location 30

#                   Response:
#                   A
#                   ..."""
#               }]

In [61]:
# Load the events and experiments tables; both CSV paths are read from Colab
# userdata entries ('EVENTS_FILE' and 'EXPERIMENTS_FILE').
events = pd.read_csv(userdata.get('EVENTS_FILE'))
experiments = pd.read_csv(userdata.get('EXPERIMENTS_FILE'))

In [62]:
# Helpful constants: positions of each item inside the tuple returned by
# create_event_experiment_dfs().
EVENT_DF = 0            # the user's events DataFrame
EVENT_OFFSET = 1        # smallest index label of the user's events
EXPERIMENT_DF = 2       # the user's experiments DataFrame
EXPERIMENT_OFFSET = 3   # smallest index label of the user's experiments


def create_event_experiment_dfs(uuid, events_df=None, experiments_df=None):
  """
  create_event_experiment_dfs() creates the appropriate dataframes and index offsets to reference events and experiments.

  :param uuid: the user's unique identification string, matched against the 'USER ID' column
  :param events_df: events table to filter; defaults to the notebook-level `events`
  :param experiments_df: experiments table to filter; defaults to the notebook-level `experiments`
  :return: user's events, event offset index, user's experiment, experiment offset index
  """

  # Fall back to the notebook globals so existing single-argument calls keep working.
  if events_df is None:
    events_df = events
  if experiments_df is None:
    experiments_df = experiments

  # Select rows and columns in one .loc call instead of chained indexing.
  event_columns = ['ID', 'StartDateTime Local', 'GPS Cluster Original']
  experiment_columns = ['ID', 'Target', 'A', 'B', 'C', 'D']
  events_temp = events_df.loc[events_df['USER ID'] == uuid, event_columns]
  experiments_temp = experiments_df.loc[experiments_df['USER ID'] == uuid, experiment_columns]

  # offsets: the smallest index label of each filtered frame (NaN when the
  # user has no rows in that table)
  events_offset = events_temp.index.min()
  experiments_offset = experiments_temp.index.min()
  return (events_temp, events_offset, experiments_temp, experiments_offset)

# Frames and offsets for one sample participant; reused by the preview cells
# that call gen_location_and_instruct() and gen_questions().
temp = create_event_experiment_dfs("ap-northeast-1:e7c916fc-736e-47e6-979a-c1c937ebe094")


In [107]:
# Let GPT know what to do and where it was

def gen_location_and_instruct(Event_Exp_df):
  """
  gen_location_and_instruct() turns each entry of event_df as a context for GPT and also
  adds the initial context to allow the model to understand what it is to do

  Rows are read in DataFrame order, so the index labels do not need to be
  contiguous (the event offset stored in Event_Exp_df is not used here).

  :param Event_Exp_df: the list of the event_df, event_offset, exp_df, exp_offset
  :return: the structured msg containing where a particular participant was located;
           the first entry is the system instruction, followed by one user
           message per event
  :raises ValueError: if the participant has no events (the "today" date in the
                      instruction is derived from the last event)
  """

  events_df = Event_Exp_df[EVENT_DF]
  if len(events_df) == 0:
    raise ValueError("gen_location_and_instruct() needs at least one event")

  location_data = []
  last_event_dt = None

  # Each entry is a context for GPT
  for raw_datetime, gps_cluster in zip(events_df["StartDateTime Local"],
                                       events_df["GPS Cluster Original"]):
    # Timestamps look like 2019-08-11T06:00:00Z; drop the trailing "Z" so that
    # fromisoformat() accepts them on every supported Python version.
    last_event_dt = datetime.fromisoformat(raw_datetime.rstrip("Z"))
    event_date = last_event_dt.strftime("%d %B")
    event_day = last_event_dt.strftime("%A")
    event_time = last_event_dt.strftime("%I%p").lstrip('0')  # "01AM" -> "1AM"

    location_data.append({
        "role": "user",
        "content": f"{event_day}, {event_date} {event_time} you were at location {gps_cluster}"
    })

  # it's known that questioning took place a week after the latest event
  today_dt = last_event_dt + timedelta(days=7)
  date = today_dt.strftime("%d %B").lstrip('0')
  day = today_dt.strftime("%A")

  # NOTE(review): the few-shot example below presents "5 PM" but then asks about
  # "5AM", and writes options as "A. 2" while gen_questions() emits "A 2".
  # The text is left untouched so existing prompts stay reproducible - confirm
  # whether the example should be aligned.
  system_msg = {
      "role": "system",
      "content": """Today is {}, {}. Your task is to judge your location based on the given time. You'll have 4 options to choose from. ONLY choose 1 option.\n\nExample:\nSunday, August 11 at 5 PM, you were at location 3.\nFriday, August 16 at 12AM, you were at location 0.\n...\n\nQuestion:\nFriday, 16 August 12AM you were at location\nA. 2\nB. 3\nC. 0\nD. 30\n\nResponse:\nC\n\nSunday, 11 August 5AM you were at location\nA. 2\nB. 3\nC. 9\nD. 30\n\nResponse:\nB\n...""".format(day, date)
  }

  return [system_msg] + location_data

  # 2019-08-11T06:00:00Z
  #  print(events_1.loc[i, "ID"], events_1.loc[i, "GPS Cluster Original"])
# Preview the context messages generated for the sample participant.
temp_loc = gen_location_and_instruct(temp)
temp_loc

[{'role': 'system',
  'content': "Today is Saturday, 7 September. Your task is to judge your location based on the given time. You'll have 4 options to choose from. ONLY choose 1 option.\n\nExample:\\Sunday, August 11 at 5 PM, you were at location 3.\nFriday, August 16 at 12AM, you were at location 0.\n...\n\nQuestion:\nFriday, 16 August 12AM you were at location\nA. 2\nB. 3\nC. 0\nD. 30\n\nResponse:\nC\n\nSunday, 11 August 5AM you were at location\nA. 2\nB. 3\nC. 9\nD. 30\n\nResponse:\nB\n..."},
 {'role': 'user', 'content': 'Sunday, 11 August 1AM you were at location 0'},
 {'role': 'user', 'content': 'Sunday, 11 August 2AM you were at location 0'},
 {'role': 'user', 'content': 'Sunday, 11 August 3AM you were at location 0'},
 {'role': 'user', 'content': 'Sunday, 11 August 4AM you were at location 0'},
 {'role': 'user', 'content': 'Sunday, 11 August 5AM you were at location 0'},
 {'role': 'user', 'content': 'Sunday, 11 August 6AM you were at location 0'},
 {'role': 'user', 'content': '

In [98]:
# Form Questions for GPT

def gen_questions(Event_Exp_df):
  """
  gen_questions() generates the questions that will be fed to GPT by using the
  experiment_df

  Experiment rows are visited by their own index labels, so the labels do not
  need to be contiguous (the experiment offset in Event_Exp_df is not used here).

  :param Event_Exp_df: the list of the event_df, event_offset, exp_df, exp_offset
  :return: the structured msg containing questions from the experiment for a
           given participant (one user message per experiment row)
  """

  events_df = Event_Exp_df[EVENT_DF]
  experiments_df = Event_Exp_df[EXPERIMENT_DF]
  question_data = []

  # Turn each entry into an input for GPT
  for row_label in experiments_df.index:
    # "Target" names the column (A-D) that holds the correct event.
    response_id = experiments_df.loc[row_label, "Target"]
    correct_event_id = experiments_df.loc[row_label, response_id]

    # Perform lookup for experiments_df from events_df
    # NOTE(review): the A-D values are used as events_df *index labels*, not
    # matched against its 'ID' column - confirm the two coincide.
    cluster_question = [
        events_df.loc[experiments_df.loc[row_label, alpha], "GPS Cluster Original"]
        for alpha in ('A', 'B', 'C', 'D')
    ]

    # Datetime processing: the question asks about the time of the correct event.
    curr_datetime = events_df.loc[correct_event_id, "StartDateTime Local"]
    datetime_obj = datetime.fromisoformat(curr_datetime.rstrip("Z"))
    date = datetime_obj.strftime("%d %B")
    day = datetime_obj.strftime("%A")
    time = datetime_obj.strftime("%I%p").lstrip('0')  # "02PM" -> "2PM"

    # Question forming
    question = "\n".join([
        f"{day}, {date} {time} you were at location",
        f"A {cluster_question[0]}",
        f"B {cluster_question[1]}",
        f"C {cluster_question[2]}",
        f"D {cluster_question[3]}",
    ])

    question_data.append({
        "role": "user",
        "content": question
    })

  return question_data

# Preview the question messages generated for the sample participant.
temp_ques = gen_questions(temp)
temp_ques

[{'role': 'user',
  'content': 'Thursday, 22 August 2PM you were at location\nA 27\nB 14\nC 9\nD 20'},
 {'role': 'user',
  'content': 'Friday, 23 August 12PM you were at location\nA 22\nB 24\nC 27\nD 14'},
 {'role': 'user',
  'content': 'Friday, 23 August 10AM you were at location\nA 20\nB 14\nC 9\nD 15'},
 {'role': 'user',
  'content': 'Monday, 19 August 3PM you were at location\nA 11\nB 8\nC 14\nD 9'},
 {'role': 'user',
  'content': 'Friday, 16 August 9AM you were at location\nA 15\nB 22\nC 14\nD 24'},
 {'role': 'user',
  'content': 'Friday, 23 August 6PM you were at location\nA 9\nB 14\nC 11\nD 8'},
 {'role': 'user',
  'content': 'Thursday, 29 August 7PM you were at location\nA 14\nB 27\nC 9\nD 15'},
 {'role': 'user',
  'content': 'Monday, 19 August 8PM you were at location\nA 11\nB 8\nC 14\nD 9'},
 {'role': 'user',
  'content': 'Monday, 26 August 10AM you were at location\nA 14\nB 16\nC 8\nD 24'},
 {'role': 'user',
  'content': 'Saturday, 24 August 5PM you were at location\nA 9\nB 

In [117]:
# Pricing (per 1k token) is specific for this model: GPT 3.5 Turbo (16k)

def run_gpt(Event_Exp_df, location_data, question_data, max_questions=27):
  """
  run_gpt() is the main code for interacting with the OpenAI API to obtain expected output

  Questions are asked one at a time. Every request carries the location context,
  all previous questions with the model's answers, and the current question.

  :param Event_Exp_df: the list of the event_df, event_offset, exp_df, exp_offset
  :param location_data: the structured location information for GPT (not modified)
  :param question_data: the structured questions for GPT
  :param max_questions: maximum number of questions to send; None sends all of
                        them. Defaults to 27, the cap that used to be hard-coded
                        inside the loop.
  :return: an array of output as specified by the experiment; shorter than the
           number of questions asked if a reply was cut off for length
  """
  # USD per 1k tokens.
  # NOTE(review): rates are hard-coded - verify they match MODEL's pricing.
  gpt_pricing = {"input": 0.001, "output": 0.002}
  experiments_df = Event_Exp_df[EXPERIMENT_DF]
  total_questions = len(experiments_df)
  if max_questions is None:
    limit = total_questions
  else:
    limit = min(total_questions, max_questions)

  output_msg = []
  cost = 0

  # Grow a single conversation instead of rebuilding it for every question;
  # start from a copy so the caller's location_data list is left untouched.
  conversation = list(location_data)

  for i in range(limit):
    conversation.append(question_data[i])

    # Generate response body for API
    response = client.chat.completions.create(
      model=MODEL,
      messages=conversation
    )
    choice = response.choices[0]

    # Notify if it's too large for current model
    if (choice.finish_reason == "length"):
      print("ERROR: Too Long")
      return output_msg

    # Obtain results and save it as context for future questions
    answer = choice.message.content
    output_msg.append(answer)
    conversation.append({'role': 'assistant', 'content': answer})

    # Cost Calculations (token counts come from the API response itself)
    input_cost = gpt_pricing["input"] * (response.usage.prompt_tokens / 1000)
    output_cost = gpt_pricing["output"] * (response.usage.completion_tokens / 1000)
    cost += input_cost + output_cost
    print(input_cost+output_cost)

  print("TOTAL:",cost)
  print(len(experiments_df))

  return output_msg



In [66]:
# output = run_gpt(temp, temp_loc, temp_ques)
# output

# ChatCompletion(id='chatcmpl-8LiuiS7iuwFLoZaG86qaBk54NHPDT',
#                choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='B. 14', role='assistant', function_call=None, tool_calls=None))],
#                created=1700188228, model='gpt-3.5-turbo-16k-0613',
#                object='chat.completion',
#                system_fingerprint=None,
#                usage=CompletionUsage(completion_tokens=4, prompt_tokens=7661, total_tokens=7665))


# Main Code Entrypoint


In [113]:
# Distinct participant identifiers found in the experiments table.
all_uuid = experiments['USER ID'].unique()
len(all_uuid)

66

In [118]:
OUTPUT_PATH = userdata.get('OUTPUT_FILE')

# Number of participants to process in this run; None processes all of them.
# Kept at 1 because the loop previously ended with an unconditional `break`
# after the first participant - raise it deliberately, every participant
# triggers paid API calls.
MAX_PARTICIPANTS = 1

count = 0
for count, uuid in enumerate(all_uuid[:MAX_PARTICIPANTS], start=1):

  # Generate Output
  ev_exp_df = create_event_experiment_dfs(uuid)
  location_and_instruction = gen_location_and_instruct(ev_exp_df)
  question_data = gen_questions(ev_exp_df)
  output = run_gpt(ev_exp_df, location_and_instruction, question_data)

  # Save output: one CSV per participant, numbered in processing order.
  # OUTPUT_PATH is used as a plain string prefix of the file name.
  df = pd.DataFrame(output, columns=["Response"])
  df.to_csv(f'{OUTPUT_PATH}responseFULLparticipant{count}.csv', index=True)

  print(f"COMPLETED M{count}")


0.008291999999999999
0.008329
0.008366000000000002
0.008403
0.008440000000000001
0.008477
0.008514
0.008551
0.008588
0.008624999999999999
0.008662000000000001
0.008699
0.008736
0.008773000000000001
0.00881
0.008847
0.008884
0.008921
0.008958
0.008995000000000001
0.009032
0.009069
0.009106
0.009143
0.00918
0.009217
0.009254000000000002
TOTAL: 0.23687100000000005
92
COMPLETED M1


# Token Calculation Code

In [71]:
# Tokenizer that tiktoken associates with "gpt-3.5-turbo"; used by the cells
# below to count tokens offline.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [72]:
# Total number of tokens across the 'content' of every context message
# (system instruction plus location statements) for the sample participant.
token = sum(len(encoding.encode(msg['content'])) for msg in temp_loc)

token

5639

In [91]:
# Tokens contributed by the questions when the whole question history is in the
# prompt, i.e. at the final question. The previous version looped over every
# (i, j) pair but only counted when i == 91, a hard-coded last index.
# The "+ 1" per question is kept from the original count - presumably an
# allowance for the one-token answer that follows each question; confirm.
token = sum(len(encoding.encode(ques['content'])) + 1 for ques in temp_ques)
print(token)

2668


In [90]:
# Number of questions generated for the sample participant.
len(temp_ques)

92