### Import dataset to dataframe

Note that the dataset might be coming from BigQuery or from a query on the local database (created from the SE Data Dump). The two data sources should be interchangeable in the code.

In [None]:
import json
import os
import pandas as pd
from pathlib import Path

DATASET_FILE = "saved_dataset.csv"      # The file name of the saved dataset (saved on / loaded from local disk)
cwd = Path().absolute()                 # Current working directory (note: possibly different from execution directory)

# Load a saved copy of the dataset from local disk (if it exists)
try:
    dataset_path = os.path.join(cwd, DATASET_FILE)
    results = pd.read_csv(dataset_path)
    results = results.astype({"creation_date": "datetime64[ns]"})
    print("Saved copy of dataset loaded from local disk.")
except FileNotFoundError:
    print("Saved dataset not found!")

### Cull / filter dataset

For the demo we're just arbitrarily culling the size of the dataset to make it more manageable, but you could also filter for other reasons such as focusing on a specific tag, or sampling based on answer and/or question scores.

A pandas dataframe can be sampled either:
* Using a fractional value: e.g., ``.sample(frac=0.01)`` will result in a number of samples equivalent to 1% of the dataset.
* Using an integer value: e.g., ``.sample(n=1000)`` will result in 1000 samples from the dataset.

In [16]:
# Randomly sample dataset
fd_tiny = results.sample(frac=0.0001)
fd_nano = results.sample(n=10)
fd_1k   = results.sample(n=1000)
fd_10k  = results.sample(n=10000)
fd_100k = results.sample(n=100000)

# Convenience: alias the filtered data so we can change it easily for later code
wd = fd_nano

# Dump info about the filtered result
print("Number of questions in currently selected filtered dataset:", len(wd))
wd.head()

Number of questions in currently selected filtered dataset: 10


Unnamed: 0,id,title,body,accepted_answer_id,view_count,tags,answer_count,question_score,answer_score,stackoverflow_answer
75793,73545119,Unzip .zip File without Writing to Disc from R...,<p>Let me preface by stating that I' somewhat ...,73545325,60,c#|asp.net|api,2,1,3,<p>Assuming you actually have a <code>.zip</co...
151748,70934250,All self-hosted agents cannot connect to devop...,<p>All of our self-hosted agents cannot connec...,70937402,948,azure-devops,1,0,3,<p>This should be caused by TLS1.2.</p>\n<bloc...
169258,70835504,display original list after shuffle function,<p>See my code snipped\nI've an html list and ...,70835746,37,javascript,2,1,1,<p>A short trick would be to store the <code>i...
291164,72270744,Get sum of column values for specific rows onl...,<p>I am super new to power bi and Dax and i ne...,72279794,2107,powerbi|dax|calculated-columns,2,0,1,<p>This is simple to do in DAX. Add the column...
331925,71434362,How to check which row in producing LangDetect...,<p>I have a dataset of tweets that contains tw...,71434382,131,python-3.x|pandas|dataframe|twitter,1,1,1,<p>Use custom function for return <code>True</...


### Strip HTML from filtered dataset

We decided we would strip HTML and use this "stripped" version as our default for evaluations. The stripped text is appended as a separate column in our dataframe.

In [19]:
from bs4 import BeautifulSoup as soup

# Separate out the text columns we want for convenience
titles  = wd["title"]
bodies  = wd["body"]
answers = wd["stackoverflow_answer"]

# Sanity check (surely this will always be true, but *just in case*)
if len(titles) == len(bodies) and len(titles) == len(answers):
    pass
else:
    raise ValueError("columns are different lengths!")

# Create new lists to store HTML-stripped versions of text
s_titles  = []
s_bodies  = []
s_answers = []

# Iterating is slow(!) but comparatively easy to understand (modified Harvey approach)
for idx, *row in wd.itertuples():
    
    # Strip HTML from title, question, and answer; save to lists
    s_titles.append(soup(titles[idx], "html.parser").get_text())
    s_bodies.append(soup(bodies[idx], "html.parser").get_text())
    s_answers.append(soup(answers[idx], "html.parser").get_text())

# Add the populated lists into our dataframe
wd["stripped_title"] = s_titles
wd["stripped_body"] = s_bodies
wd["stripped_stackoverflow_answer"] = s_answers

### Install OpenAI library and Configure OpenAI API Key

Currently configured using secrets.json located at the root directory. An alternative method (which would require code changes) would be to read the system's environment variable.

Key can be generated from: https://platform.openai.com/account/api-keys

In [20]:
# Install OpenAI
%pip install openai
import openai

# Function to load OpenAI API key from file
# https://stackoverflow.com/a/76148268
def load_api_key(secrets_file="secrets.json"):
    with open(secrets_file) as f:
        secrets = json.load(f)
    return secrets["OPENAI_API_KEY"]

# Read and set our OpenAI API key
api_key = load_api_key()
openai.api_key = api_key

Note: you may need to restart the kernel to use updated packages.


### Split dataframe into chunks for batching

In [21]:
# Split into n parts (in our case 10) in a list
wd_list = np.array_split(wd, 10)

# Check the result of the first chunk
wd_list[0].head()

Unnamed: 0,id,title,body,accepted_answer_id,view_count,tags,answer_count,question_score,answer_score,stackoverflow_answer,stripped_title,stripped_body,stripped_stackoverflow_answer
75793,73545119,Unzip .zip File without Writing to Disc from R...,<p>Let me preface by stating that I' somewhat ...,73545325,60,c#|asp.net|api,2,1,3,<p>Assuming you actually have a <code>.zip</co...,Unzip .zip File without Writing to Disc from R...,Let me preface by stating that I' somewhat new...,"Assuming you actually have a .zip file, you do..."


### Get GPT answers to SO questions

In [None]:
# Used for rate limit handling with OpenAI API
%pip install tenacity

In [None]:
# Used to estimate token counts
%pip install tiktoken

In [22]:
import tiktoken
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

MODEL_NAME = "gpt-4"
ENCODING = tiktoken.encoding_for_model(MODEL_NAME)

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(**kwargs):
    response = openai.ChatCompletion.create(**kwargs)
    return response

def chat_format(question):
    """Insert the full prompt into chat format."""
    messages = [
        {"role": "user", "content": question},
    ]
    return messages

for chunk in wd_list:
    GPT_answers = []
    GPT_finished = []
    full_responses = []

    for idx, row in chunk.iterrows():

        title_and_question = row["stripped_title"] + "\n\n" + row["stripped_body"]
        SO_text = title_and_question + row["stripped_stackoverflow_answer"]

        # Estimate tokens for SO T+Q+A, and skip anything too long
        if len(ENCODING.encode(SO_text)) < 4000:

            # Get the response from GPT
            prompt = chat_format(title_and_question)
            GPT_answer = completion_with_backoff(model=MODEL_NAME, messages=prompt, temperature=0, max_tokens=2000)
            extracted_answer = GPT_answer.choices[0].message.content

            # Check if the GPT response completed or terminated early (because e.g. hit token limit)
            if GPT_answer.choices[0].finish_reason == "stop":
                finished = True
            else:
                finished = False

        else:
            extracted_answer = None
            GPT_answer = None
            finished = False

        # Add to our lists
        GPT_answers.append(extracted_answer)
        GPT_finished.append(finished)
        full_responses.append(GPT_answer)

    # Add answers back into the chunk dataframe
    COL_NAME = f"{MODEL_NAME}_answer"
    chunk[COL_NAME] = GPT_answers
    chunk["GPT_finished"] = GPT_finished
    chunk["full_GPT_response"] = full_responses



In [10]:
# Check result of first chunk
wd_list[0].head()

Unnamed: 0,id,title,body,accepted_answer_id,view_count,tags,answer_count,question_score,answer_score,stackoverflow_answer,stripped_title,stripped_body,stripped_stackoverflow_answer,gpt-3.5-turbo-16k_answer,full_GPT_response
301250,71540756,419 Page Expired In Laravel Even after adding ...,"<p>I am working on a Laravel 8 Framework,\nI h...",71542495,2560,php|laravel|post|laravel-8|csrf,2,2,2,<h3>Laravel &quot;419 Page Expired&quot; Error...,419 Page Expired In Laravel Even after adding ...,"I am working on a Laravel 8 Framework,\nI have...","Laravel ""419 Page Expired"" Error Troubleshooti...",If you have already checked that the CSRF toke...,{'id': 'chatcmpl-80Stg7rYBYPh9T65RGSwBtQNbTHYv...


In [11]:
# Check result of last chun
wd_list[9].head()

Unnamed: 0,id,title,body,accepted_answer_id,view_count,tags,answer_count,question_score,answer_score,stackoverflow_answer,stripped_title,stripped_body,stripped_stackoverflow_answer,gpt-3.5-turbo-16k_answer,full_GPT_response
153788,70546619,Why Typescript return type void in interface d...,<p>I'm trying two different classes in Typescr...,70546721,886,javascript|typescript,1,5,7,<p>The inferred type of the method without the...,Why Typescript return type void in interface d...,I'm trying two different classes in Typescript...,The inferred type of the method without the ex...,"In TypeScript, when a class implements an inte...",{'id': 'chatcmpl-80Sven16rn48fpj47iEBNv7b10lOl...


In [None]:
# temporary fix to column name TODO: REMOVE

# MODEL_NAME = "gpt-3.5-turbo-16k"
# col_name = f"{MODEL_NAME}_answer"

# wd.drop(columns=["GPT_3.5_answer", "gpt-3.5-turbo-16k_answer"], inplace=True)

# wd.rename(columns={"gpt-3.5-turbo-16k": "gpt-3.5-turbo-16k_answer"}, inplace=True)

# wd

### Prepare JSONL file for evals

In [23]:
# Strip and save a copy of the GPT answer
#   (so that the eval is on fair footing, with HTML tags removed from both human and AI)

for index, chunk in enumerate(wd_list):

    # Separate out the text columns we want for convenience
    titles  = chunk["title"]
    bodies  = chunk["body"]
    answers = chunk["stackoverflow_answer"]

    # Create a new list to store HTML-stripped versions of text
    s_gpt_answers  = []

    # Iterating is slow(!) but comparatively easy to understand (modified Harvey approach)
    for idx, *row in chunk.itertuples():
        
        # Strip HTML from GPT's answer; save to list
        s_gpt_answers.append(soup(chunk[COL_NAME][idx], "html.parser").get_text())

    # Add the populated lists into our dataframe
    stripped_col_name = "stripped_" + COL_NAME
    chunk[stripped_col_name] = s_gpt_answers

    # Preview result
    #chunk.head()

In [13]:
# Check first chunk
wd_list[0].head()

Unnamed: 0,id,title,body,accepted_answer_id,view_count,tags,answer_count,question_score,answer_score,stackoverflow_answer,stripped_title,stripped_body,stripped_stackoverflow_answer,gpt-3.5-turbo-16k_answer,full_GPT_response,stripped_gpt-3.5-turbo-16k_answer
301250,71540756,419 Page Expired In Laravel Even after adding ...,"<p>I am working on a Laravel 8 Framework,\nI h...",71542495,2560,php|laravel|post|laravel-8|csrf,2,2,2,<h3>Laravel &quot;419 Page Expired&quot; Error...,419 Page Expired In Laravel Even after adding ...,"I am working on a Laravel 8 Framework,\nI have...","Laravel ""419 Page Expired"" Error Troubleshooti...",If you have already checked that the CSRF toke...,{'id': 'chatcmpl-80Stg7rYBYPh9T65RGSwBtQNbTHYv...,If you have already checked that the CSRF toke...


In [14]:
# Check last chunk
wd_list[9].head()

Unnamed: 0,id,title,body,accepted_answer_id,view_count,tags,answer_count,question_score,answer_score,stackoverflow_answer,stripped_title,stripped_body,stripped_stackoverflow_answer,gpt-3.5-turbo-16k_answer,full_GPT_response,stripped_gpt-3.5-turbo-16k_answer
153788,70546619,Why Typescript return type void in interface d...,<p>I'm trying two different classes in Typescr...,70546721,886,javascript|typescript,1,5,7,<p>The inferred type of the method without the...,Why Typescript return type void in interface d...,I'm trying two different classes in Typescript...,The inferred type of the method without the ex...,"In TypeScript, when a class implements an inte...",{'id': 'chatcmpl-80Sven16rn48fpj47iEBNv7b10lOl...,"In TypeScript, when a class implements an inte..."


In [25]:
for index, chunk in enumerate(wd_list):

    json_list = []
    for idx, row in chunk.iterrows():
        
        # Strip and chat-format the text as appropriate
        title_and_question = row["stripped_title"] + "\n\n" + row["stripped_body"]
        prompt = chat_format(title_and_question)

        # Create the JSON object using the stripped and chat-formatted text
        json_object = {
            "input": prompt,
            "ideal": row["stripped_stackoverflow_answer"],
            "completion": row["stripped_gpt-3.5-turbo-16k_answer"]
        }

        # TODO: update to skip this json object if there's no GPT answer

        # Add the object to our list
        json_list.append(json.dumps(json_object))

    # because re-running the notebook changes the sampling we DON'T want a persistent jsonl file
    #   this step relies on the directories existing; they are not created here
    #   so this will fail with e.g. FileNotFoundError if they don't exist
    JSONL_FILENAME = f"new_samples_{index}.jsonl"
    JSONL_FILEPATH = os.path.join(cwd, "temp", "samples", JSONL_FILENAME)
    with open(JSONL_FILEPATH, "w") as outfile:
        outfile.write("\n".join(json_list))
        print("Saved JSONL file: " + JSONL_FILEPATH)

Saved JSONL file: c:\Users\Mark\Documents\A2I2 T2 2023\temp\samples\new_samples_0.jsonl
Saved JSONL file: c:\Users\Mark\Documents\A2I2 T2 2023\temp\samples\new_samples_1.jsonl
Saved JSONL file: c:\Users\Mark\Documents\A2I2 T2 2023\temp\samples\new_samples_2.jsonl
Saved JSONL file: c:\Users\Mark\Documents\A2I2 T2 2023\temp\samples\new_samples_3.jsonl
Saved JSONL file: c:\Users\Mark\Documents\A2I2 T2 2023\temp\samples\new_samples_4.jsonl
Saved JSONL file: c:\Users\Mark\Documents\A2I2 T2 2023\temp\samples\new_samples_5.jsonl
Saved JSONL file: c:\Users\Mark\Documents\A2I2 T2 2023\temp\samples\new_samples_6.jsonl
Saved JSONL file: c:\Users\Mark\Documents\A2I2 T2 2023\temp\samples\new_samples_7.jsonl
Saved JSONL file: c:\Users\Mark\Documents\A2I2 T2 2023\temp\samples\new_samples_8.jsonl
Saved JSONL file: c:\Users\Mark\Documents\A2I2 T2 2023\temp\samples\new_samples_9.jsonl


TODO: remove/update old stuff below (from pre-evals-solved)

### Install OpenAI evals

The T1 intern used their own modified evals (speculated to be for efficiency reasons?). For T2 we're trying the unmodified evals. They used the `-e` flag in their installation because they were modifying evals - since we're not doing that it's not needed.

In [49]:
import shutil

# openai evals uses git-lfs, but installation may be system-specific
#!git lfs install

# get a local copy of evals (and if one already exists, nuke it first)
try:
    !rm -rf evals
finally:
    !git clone https://github.com/openai/evals.git

# TODO: the -e thing needs to be added back, and markdown cell updated accordingly
# complete the remaining lfs setup steps
!cd evals
!git lfs fetch --all
!git lfs pull
!cd evals
%pip install evals -e

Cloning into 'evals'...
Updating files:  38% (443/1136)
Updating files:  39% (444/1136)
Updating files:  40% (455/1136)
Updating files:  41% (466/1136)
Updating files:  42% (478/1136)
Updating files:  43% (489/1136)
Updating files:  44% (500/1136)
Updating files:  45% (512/1136)
Updating files:  46% (523/1136)
Updating files:  47% (534/1136)
Updating files:  48% (546/1136)
Updating files:  49% (557/1136)
Updating files:  50% (568/1136)
Updating files:  51% (580/1136)
Updating files:  52% (591/1136)
Updating files:  53% (603/1136)
Updating files:  54% (614/1136)
Updating files:  55% (625/1136)
Updating files:  56% (637/1136)
Updating files:  57% (648/1136)
Updating files:  58% (659/1136)
Updating files:  59% (671/1136)
Updating files:  60% (682/1136)
Updating files:  61% (693/1136)
Updating files:  62% (705/1136)
Updating files:  63% (716/1136)
Updating files:  64% (728/1136)
Updating files:  65% (739/1136)
Updating files:  66% (750/1136)
Updating files:  67% (762/1136)
Updating files: 

fetch: Fetching all references...
Collecting evals
  Using cached evals-1.0.3.post1-py3-none-any.whl (7.8 MB)
Collecting mypy (from evals)
  Obtaining dependency information for mypy from https://files.pythonhosted.org/packages/4e/11/ac861ca5d9b16fd5b781c1941254d4e382e8eaab90e11f41f193d9222b7e/mypy-1.5.1-cp311-cp311-win_amd64.whl.metadata
  Using cached mypy-1.5.1-cp311-cp311-win_amd64.whl.metadata (1.8 kB)
Collecting tiktoken (from evals)
  Using cached tiktoken-0.4.0-cp311-cp311-win_amd64.whl (635 kB)
Collecting blobfile (from evals)
  Using cached blobfile-2.0.2-py3-none-any.whl (74 kB)
Collecting backoff (from evals)
  Using cached backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting snowflake-connector-python[pandas] (from evals)
  Obtaining dependency information for snowflake-connector-python[pandas] from https://files.pythonhosted.org/packages/f1/78/adcecaf1ea13bfaf68662d218b25157c96d060709be63efc98158bad408b/snowflake_connector_python-3.1.1-cp311-cp311-win_amd64.whl.metadata
  Us

### Use/Run evals

This also simultaneously queries OpenAI's models for answers (they're not generated beforehand and then fed into the evaluator).

(As an aside, while using evals without both magic commands and manual file creation is possible (see https://medium.com/@sergioli/evaluating-chatgpt-using-openai-evals-7ca85c0ad139), it's comparatively more complex.)

In [26]:
# Note how the lists here are appended with an extra set of quotes
#   this is being done because we're running shell commands that need quotes

# Construct list of sample file paths
sample_paths = []
for index, chunk in enumerate(wd_list):
    sample_path = os.path.join(cwd, "eval_samples", f"new_samples_{index}.jsonl")
    sample_paths.append(f"{sample_path}")

# Construct list of record file paths
record_paths = []
for index, chunk in enumerate(wd_list):
    record_path = os.path.join(cwd, "eval_records", f"eval_record_{index}.jsonl")
    record_paths.append(f'\"{record_path}\"')

# Construct list of log file paths
log_paths = []
for index, chunk in enumerate(wd_list):
    log_path = os.path.join(cwd, "eval_logs", f"eval_log_{index}.jsonl")
    log_paths.append(f'\"{log_path}\"')

In [28]:
import shutil

samples_file_path = os.path.join(cwd, "evals", "evals", "registry", "data", "coqa", "samples.jsonl")

# Run chunked evals
for index, chunk in enumerate(wd_list):

    # Update the samples file programmatically each iteration
    shutil.copy(sample_paths[index], samples_file_path)

    # Run the evaluation and save the results (records) and log file as specified
    record = record_paths[index]
    log = log_paths[index]
    !oaieval gpt-4 coqa-fact --record_path $record --log_to_file $log


  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:07<00:00,  7.95s/it]
100%|██████████| 1/1 [00:07<00:00,  7.95s/it]

  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:08<00:00,  8.88s/it]
100%|██████████| 1/1 [00:08<00:00,  8.88s/it]

  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:08<00:00,  8.05s/it]
100%|██████████| 1/1 [00:08<00:00,  8.05s/it]

  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:07<00:00,  7.35s/it]
100%|██████████| 1/1 [00:07<00:00,  7.35s/it]

  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:11<00:00, 11.80s/it]
100%|██████████| 1/1 [00:11<00:00, 11.80s/it]

  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:08<00:00,  8.41s/it]
100%|██████████| 1/1 [00:08<00:00,  8.42s/it]

  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:08<00:00,  8.34s/it]
100%|██████████| 1/1 [00:08<00:00,  8.34s/it]

  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:10<00:00, 10.14s/it

In [70]:
# obed's answer reading code
#   (because by his evaluation, the metric disagrees with the written response ~10% of the time)
import re
def get_answer_from_response(text):
    """Parses the output text for the evaluation choice."""
    
    last_letter = text[-1]
    if last_letter not in ['A', 'B', 'C', 'D', 'E']:
        matches = re.findall('\((.*?)\)', text)
        return matches[-1] if matches else None      
    return last_letter


# Iterate through our evals results
for index, chunk in enumerate(wd_list):

    # Update our record path each iteration
    record_path = record_paths[index]

    # Empty lists for us to store stuff in, to later add to the chunk's dataframe
    answer_list = []
    provided_answer_list = []
    sampled_list = []
    metric_list = []

    # Open the record (.jsonl) for this iteration
    with open(record_path) as f:
        # Skip the first two lines (they're just metadata about the query and response)
        for _ in range(2):
            next(f)

        # Iterate through the rest of the file line-by-line
        for line in f:
            eval_line = json.loads(line)

            # process the "sampling" half of each evaluation response
            if eval_line["type"] == "sampling":

                # Extract the response from the answer received
                answer = eval_line["data"]["sampled"][0]
                extr_choice = get_answer_from_response(answer)

                # Add the extracted response and the raw response to our lists
                answer_list.append(extr_choice)
                sampled_list.append(eval_line)

            # Process the "metric" half of each evaluation response
            elif eval_line["type"] == "metric":

                # Also pull evals' self-reported response
                og_choice = eval_line["data"]["choice"]

                # Add that and the raw response to our lists
                provided_answer_list.append(og_choice)
                metric_list.append(eval_line)

    # Add the populated lists into our chunk's dataframe
    chunk["original_eval_choice"] = provided_answer_list
    chunk["extracted_eval_choice"] = answer_list
    chunk["eval_full_sampled"] = sampled_list
    chunk["eval_full_metric"] = metric_list

Unnamed: 0,index,id,title,body,accepted_answer_id,view_count,tags,answer_count,question_score,answer_score,stackoverflow_answer,stripped_title,stripped_body,stripped_stackoverflow_answer,gpt-3.5-turbo-16k_answer,stripped_gpt-3.5-turbo-16k_answer,eval_choice
0,196461,71067715,CRA error. Html Webpack Plugin: Error: Child c...,<p>I create a default react app with <code>npx...,71086154,221,node.js|reactjs|webpack|create-react-app,1,0,0,<p>Solution: install <strong>react-scripts@4.0...,CRA error. Html Webpack Plugin: Error: Child c...,I create a default react app with npx create-r...,Solution: install react-scripts@4.0.3. If you ...,It seems like you are encountering an error re...,It seems like you are encountering an error re...,A
1,243249,71098325,Hyperledger Fabric | Orderer PODs keeps restar...,<p>I'm running Hyperledger Fabric network in A...,71168925,239,azure|kubernetes|hyperledger-fabric|raft,1,0,0,<p>Turns out my WAL logs directory was deleted...,Hyperledger Fabric | Orderer PODs keeps restar...,I'm running Hyperledger Fabric network in Azur...,Turns out my WAL logs directory was deleted. A...,The error message suggests that the raft log o...,The error message suggests that the raft log o...,C
2,375295,73155785,Grouping character strings,<p>I have 3 text :</p>\n<ul>\n<li>Simple test ...,73155978,21,php|regex,1,0,2,"<p>If you want to match the square brackets, y...",Grouping character strings,I have 3 text :\n\nSimple test 1 [https://www....,"If you want to match the square brackets, you ...","To achieve the desired grouping, you can use t...","To achieve the desired grouping, you can use t...",A
3,254120,71900912,Convert array from spreadsheet into associativ...,<p>I've been having difficulty visualizing how...,71901148,96,php|arrays|multidimensional-array,1,0,1,<p>This is a common issue with spreadsheets an...,Convert array from spreadsheet into associativ...,I've been having difficulty visualizing how to...,This is a common issue with spreadsheets and C...,"Yes, the `array_combine` function can be used ...","Yes, the `array_combine` function can be used ...",A
4,361337,71269489,ID components may not include unresolved token...,<p>I am trying to use a CfnParameter in the AW...,71313692,1213,python|amazon-web-services|amazon-cloudformati...,1,0,1,<p>Managed to resolve by using --context inste...,ID components may not include unresolved token...,I am trying to use a CfnParameter in the AWS P...,Managed to resolve by using --context instead ...,The error message suggests that you are trying...,The error message suggests that you are trying...,B


In [None]:
# Combine each of the chunked dataframes, export to CSV
combined = pd.concat(wd_list)
combined.to_csv("dataset_results.csv")

# Preview the result
combined.head()

### THE TOKEN ESTIMATION IS STILL BEING INTEGRATED

In [None]:
%pip install tiktoken

In [1]:
# temp 2023-09-27
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4")

text = "\n\nCompare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.\nThe submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:\n(A) The submitted answer is a subset of the expert answer and is fully consistent with it.\n(B) The submitted answer is a superset of the expert answer and is fully consistent with it.\n(C) The submitted answer contains all the same details as the expert answer.\n(D) There is a disagreement between the submitted answer and the expert answer.\n(E) The answers differ, but these differences don't matter from the perspective of factuality.\n\nFirst, write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset. Then print only a single choice from \"A\" or \"B\" or \"C\" or \"D\" or \"E\" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the answer by itself on a new line.\n\nReasoning:"

tokens = len(encoding.encode(text, disallowed_special=()))

print(tokens)

243


In [26]:
# temp thing for token estimation TODO: update and incorporate into main thing

import tiktoken
import pandas as pd

samples = 100000

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
ans_tokens = 0
que_tokens = 0
row_tokens = 0
rows_over_3k = 0
rows_over_6k = 0
rows_over_12k = 0

for idx, row in wd.iterrows():
    ans_tokens = len(encoding.encode(row["stripped_title"], disallowed_special=()) + encoding.encode(row["stripped_body"], disallowed_special=()))
    que_tokens = len(encoding.encode(row["stripped_stackoverflow_answer"], disallowed_special=()))
    row_tokens = ans_tokens + que_tokens
    if row_tokens > 3000:
        rows_over_3k += 1
    if row_tokens > 6000:
        rows_over_6k += 1
    if row_tokens > 12000:
        rows_over_12k += 1

print(rows_over_3k, f"over 3000 tokens in {samples} samples")
print(rows_over_6k, f"over 6000 tokens in {samples} samples")
print(rows_over_12k, f"over 12000 tokens in {samples} samples")

1511 over 3000 tokens in 100000 samples
328 over 6000 tokens in 100000 samples
59 over 12000 tokens in 100000 samples


In [None]:
#obed's token counter (unmodified code)
import json
import tiktoken
import pprint

tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
INPUT_COST_PER_1K = 0.0015
OUTPUT_COST_PER_1K = 0.002
jsonl_list = [
    "29_08_2023_jsonl/bioasq_3_gpt-3.5-turbo.jsonl",
]

total_prompt_tokens = 0
total_completion_tokens = 0
for jsonl in jsonl_list:
    with open(jsonl) as f:
        for line in f:
            json_object = json.loads(line)
            try:
                prompt = json_object['data']['prompt'][0]['content']
                completion = json_object['data']['sampled'][0]
            except:
                continue
            total_prompt_tokens += len(encoding.encode(prompt))
            total_completion_tokens += len(encoding.encode(completion))
        print(f"Total prompt tokens so far: {total_prompt_tokens}")
        print(f"Total completion tokens so far: {total_completion_tokens}\n")

    
total_prompt_cost = (total_prompt_tokens / 1000) * INPUT_COST_PER_1K
total_completion_cost = (total_completion_tokens / 1000) * OUTPUT_COST_PER_1K
total_cost = total_prompt_cost + total_completion_cost

print(f"Total tokens: {total_prompt_tokens + total_completion_tokens}")
print(f"Total cost: {total_cost}")