In [15]:
service_account_path = "creds/google__sa.json"
tracking_sheet_id = "1qBU7Kvuuij2fxbqPxebReKMxWgIBmOIE5Gi4ZuX0j_4"

In [16]:
import pandas as pd

from src.sheets_utils import download_sheet_as_df


contributors_df = download_sheet_as_df(
    service_account_path,
    tracking_sheet_id,
    "Contributors"
)

tasks_df = pd.concat(
    [
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_2"
        ),
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_3"
        ),
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_4"
        ),
        download_sheet_as_df(
            service_account_path,
            tracking_sheet_id,
            "Conversations_Batch_5"
        ),
    ],
    ignore_index=True
)
tasks_df = tasks_df[tasks_df["completion_status"] == "Done"].reset_index()


reviews_df = download_sheet_as_df(
    service_account_path,
    tracking_sheet_id,
    "Reviews"
)

In [17]:
# Parse Conversations into list of dicts

import io
import threading

import nbformat
from fuzzywuzzy import fuzz

from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload


def get_closest_match(query, choices):
    """
    Get the closest match(es) to a query string from a list of choices.

    :param query: The query string.
    :param choices: A list of strings to match against.
    :param limit: The maximum number of matches to return.
    """
    best_role = None
    best_score = 0
    for choice in choices:
        score = fuzz.ratio(query, choice)
        if score > best_score and score > 25:
            best_score = score
            best_role = choice

    return best_role, best_score


def notebook_parser(notebook):
    """
    Parse a notebook and extract the message objects.

    :param notebook: The notebook object.
    """
    messages = []
    for cell in notebook.cells[2:]:
        if cell["cell_type"] == "markdown":
            markdown_headers = ["**User**", "**Assistant**"]
            lines = cell["source"].split("\n")
            first_line = lines[0]
            role, score = get_closest_match(first_line, markdown_headers)
            if score>25:
                message = {
                    "role": role.replace("*", "").strip(),
                    "content": "\n".join(lines[1:]).strip("\n"),
                    "type": "markdown"
                }
                messages.append(message)

        elif cell["cell_type"] == "code":
            code_headers = ["# User", "# Assistant"]
            lines = cell["source"].split("\n")
            first_line = lines[0]
            role, score = get_closest_match(first_line, code_headers)
            if score>25:
                message = {
                    "role": role.replace("#", "").strip(),
                    "content": "\n".join(lines[1:]).strip("\n"),
                    "type": "code"
                }
                messages.append(message)
    return messages


def download_and_parse_notebook(service_account_file, file_id):
    # Authenticate with the service account
    credentials = service_account.Credentials.from_service_account_file(
        service_account_file, scopes=['https://www.googleapis.com/auth/drive'])
    service = build('drive', 'v3', credentials=credentials)

    # Request to download the file
    request = service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)

    # Download the file
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download progress: %d%%." % int(status.progress() * 100))

    # Move the buffer's pointer to the beginning
    fh.seek(0)

    # Open the notebook
    notebook = nbformat.read(fh, as_version=4)

    # Parse the notebook
    messages = notebook_parser(notebook)

    # # Extract the first cell
    first_cell = notebook.cells[0]
    lines = first_cell["source"].split("\n")
    metadata = {}
    for line in lines:
        if "**Python Topics**" in line:
            metadata["topic"] = line.split(" - ")[1]
        if "**Type**" in line:
            metadata["type"] = line.split(" - ")[1]
        if "**Target Number of Turns (User + Assistant)**" in line:
            metadata["target_turns"] = line.split(" - ")[1]

    return {
        "id": file_id,
        "metadata": metadata,
        "messages":messages
    }


def threading_processor(service_account_path, file_id, results):
    results.append(download_and_parse_notebook(service_account_path, file_id))


threads = []
parsed_conversations = []
for i in range(tasks_df.shape[0]):
    file_id = tasks_df["task_link"][i].split("/")[-1]
    thread = threading.Thread(target=threading_processor, args=(service_account_path, file_id, parsed_conversations))
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()


Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


Exception in thread Thread-2184:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_525/2184826214.py", line 115, in threading_processor
  File "/tmp/ipykernel_525/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 780, in next_chunk
    raise HttpError(resp, content, uri=self._uri)
googleapiclient.errors.HttpError: <HttpError 404 when requesting https://www.googleapis.com/drive/v3/files/1rfNQU__74pEdovonm_-u6yrhF0UsAa2C?alt=media 

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


Exception in thread Thread-2182:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_525/2184826214.py", line 115, in threading_processor
  File "/tmp/ipykernel_525/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 780, in next_chunk
    raise HttpError(resp, content, uri=self._uri)
googleapiclient.errors.HttpError: <HttpError 404 when requesting https://www.googleapis.com/drive/v3/files/1lHYB-8JiU67LlaqjvaRuLYUbetxWbnD5%23scrollTo

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


Exception in thread Thread-3273:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_525/2184826214.py", line 115, in threading_processor
  File "/tmp/ipykernel_525/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 741, in next_chunk
    resp, content = _retry_request(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 222, in _retry_request


Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


    Exception in thread Thread-3314:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
raise exception
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 191, in _retry_request
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_525/2184826214.py", line 115, in threading_processor
    resp, content = http.request(uri, method, *args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/google_auth_httplib2.py", line 209, in request
  File "/tmp/ipykernel_525/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    s

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


    response, data = self.http.request(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/httplib2/__init__.py", line 1724, in request
    response_data = _token_endpoint_request(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/google/oauth2/_client.py", line 268, in _token_endpoint_request
    response_status_ok, response_data, retryable_error = _token_endpoint_request_no_throw(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/google/oauth2/_client.py", line 215, in _token_endpoint_request_no_throw
    request_succeeded, response_data, retryable_error = _perform_request()
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/google/oauth2/_client.py", line 191, in _perform_request
    response = request(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packa

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


    self.sock.sendall(data)
  File "/home/joe96/miniconda3/lib/python3.9/ssl.py", line 1238, in sendall
    v = self.send(byte_view[count:])
  File "/home/joe96/miniconda3/lib/python3.9/ssl.py", line 1207, in send
    v = self.send(byte_view[count:])
  File "/home/joe96/miniconda3/lib/python3.9/ssl.py", line 1207, in send
    return self._sslobj.write(data)
BrokenPipeError: [Errno 32] Broken pipe
    return self._sslobj.write(data)
BrokenPipeError: [Errno 32] Broken pipe


Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


Exception in thread Thread-3441:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_525/2184826214.py", line 115, in threading_processor
  File "/tmp/ipykernel_525/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 741, in next_chunk
    resp, content = _retry_request(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 222, in _retry_request
    raise exc

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


    self.credentials.before_request(self._request, method, uri, request_headers)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/google/auth/credentials.py", line 175, in before_request
    self.refresh(request)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/google/oauth2/service_account.py", line 449, in refresh
    access_token, expiry, _ = _client.jwt_grant(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/google/oauth2/_client.py", line 308, in jwt_grant
    response_data = _token_endpoint_request(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/google/oauth2/_client.py", line 268, in _token_endpoint_request
    response_status_ok, response_data, retryable_error = _token_endpoint_request_no_throw(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-

Download progress: 100%.
Download progress: 100%.


    response, data = self.http.request(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/httplib2/__init__.py", line 1724, in request
    (response, content) = self._request(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/httplib2/__init__.py", line 1444, in _request


Download progress: 100%.
Download progress: 100%.


    (response, content) = self._conn_request(conn, request_uri, method, body, headers)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/httplib2/__init__.py", line 1367, in _conn_request
    conn.request(method, request_uri, body, headers)
  File "/home/joe96/miniconda3/lib/python3.9/http/client.py", line 1285, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/home/joe96/miniconda3/lib/python3.9/http/client.py", line 1331, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/home/joe96/miniconda3/lib/python3.9/http/client.py", line 1280, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/home/joe96/miniconda3/lib/python3.9/http/client.py", line 1040, in _send_output
    self.send(msg)
  File "/home/joe96/miniconda3/lib/python3.9/http/client.py", line 1001, in send
    self.sock.sendall(data)
  File "/home/joe96/miniconda3/lib/python3

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


Exception in thread Thread-3435:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_525/2184826214.py", line 115, in threading_processor
  File "/tmp/ipykernel_525/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 741, in next_chunk


Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


    resp, content = _retry_request(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 222, in _retry_request
    raise exception
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 191, in _retry_request
    resp, content = http.request(uri, method, *args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/google_auth_httplib2.py", line 209, in request
    self.credentials.before_request(self._request, method, uri, request_headers)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/google/auth/credentials.py", line 175, in before_request
    self.refresh(request)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/google/oauth2/service_account.py", line 449, in refresh
    access_token

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


Exception in thread Thread-3276:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_525/2184826214.py", line 115, in threading_processor
  File "/tmp/ipykernel_525/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 741, in next_chunk
    resp, content = _retry_request(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 222, in _retry_request
    raise exc

Download progress: 100%.
Download progress: 100%.
Download progress: 100%.
Download progress: 100%.


Exception in thread Thread-3359:
Traceback (most recent call last):
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/joe96/miniconda3/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_525/2184826214.py", line 115, in threading_processor
  File "/tmp/ipykernel_525/2184826214.py", line 83, in download_and_parse_notebook
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 741, in next_chunk
    resp, content = _retry_request(
  File "/home/joe96/projects/turing/character.ai/character_tasks/venv/lib/python3.9/site-packages/googleapiclient/http.py", line 222, in _retry_request
    raise exc

In [18]:
from datetime import datetime

def get_number_of_turns(messages):

    initial_role = messages[0]["role"]

    count = 0
    for message in messages:
        if message["role"] == initial_role:
            count += 1
    return count


def standardize_date_format(date):
    """
    Given a date string, standardize the date format to YYYY/MM/DD.
    """
    if date is None:
        return ""
    try:
        # Parse the date string into a datetime object
        standardized_date = datetime.strptime(date, "%Y/%m/%d")
    except ValueError:
        try:
            # Attempt to parse other common formats here
            # Example: MM/DD/YYYY
            standardized_date = datetime.strptime(date, "%m/%d/%Y")
        except ValueError:
            return "Invalid date format"

    # Format the datetime object into the desired string format
    return standardized_date.strftime("%Y/%m/%d")

not_found_emails = set()
metadata_only = []
for conversation in parsed_conversations:
    
    # Extract actual number of turns
    conversation["metadata"]["actual_turns"] = get_number_of_turns(conversation["messages"])

    try:
        tracking_record = tasks_df[tasks_df["task_link"].str.contains(conversation["id"])].iloc[0].to_dict()
    except IndexError:
        print("IndexError for id:", conversation["id"])
        continue

    # Get Author email
    conversation["metadata"]["assigned_to_email"] = tracking_record["assigned_to_email"]

    # Get duration
    conversation["metadata"]["duration_mins"] = tracking_record["duration_mins"]

    # Get Completion Date
    conversation["metadata"]["completion_date"] = standardize_date_format(tracking_record["completion_date"])

    try:
        contrib_entry = contributors_df[contributors_df["Email"] == tracking_record["assigned_to_email"]].iloc[0]
    except IndexError:
        not_found_emails.add(tracking_record["assigned_to_email"])
        print("IndexError for email:", tracking_record["assigned_to_email"])
        continue

    # Get Join Date
    conversation["metadata"]["joined_on"] = contrib_entry["Joined on"]

    # Get Team
    try:
        conversation["metadata"]["team"] = contrib_entry["Source"]
    except IndexError:
        print("IndexError for email:", tracking_record["assigned_to_email"])
        conversation["metadata"]["team"] = "Unknown"

    metadata_only.append(conversation["metadata"])

metadata_only_df = pd.DataFrame(metadata_only)

IndexError for email: https://colab.research.google.com/drive/1s6QrFchojtSInYl0xrwJ-Dcv6gqqL8lB
IndexError for email: toh.y@turing,com
IndexError for email: satya.s@turing.com
IndexError for email: satya.s@turing.com
IndexError for email: 
IndexError for email: raman.k@turing.com
IndexError for email: raman.k@turing.com
IndexError for email: raman.k@turing.com
IndexError for email: shaharyar.t@turing.com
IndexError for email: toh.y@turing,com
IndexError for email: andranik.g@gmail.com
IndexError for email: 
IndexError for email: andranik.g@gmail.com


## Filter data on full timers who joined on 22/12/2023

In [19]:
metadata_only_df = metadata_only_df[metadata_only_df["team"]=="Vetting"]
metadata_only_df = metadata_only_df[metadata_only_df["joined_on"]=="12/28/2023"]
metadata_only_df

Unnamed: 0,topic,type,target_turns,actual_turns,assigned_to_email,duration_mins,completion_date,joined_on,team
152,algorithms > by_topic > game_theory,query,1,1,joseph.d@turing.com,45,2023/12/29,12/28/2023,Vetting
189,python_language_and_scripting > virtual_enviro...,modification,1,1,martinho.h@turing.com,15,2023/12/28,12/28/2023,Vetting
247,unit_testing_methodology > performance_testing,query,1,1,joseph.d@turing.com,20,2023/12/29,12/28/2023,Vetting
284,web_development > web_development_trends,query,2,2,khalid.s@turing.com,45,2024/01/01,12/28/2023,Vetting
287,unit_testing_methodology > test_ai_and_ml_models,query,2,2,cedric.l@turing.com,65,2023/12/28,12/28/2023,Vetting
...,...,...,...,...,...,...,...,...,...
2090,,query,,2,pawan.s@turing.com,40,2024/01/05,12/28/2023,Vetting
2093,,query,,1,pawan.s@turing.com,30,2024/01/05,12/28/2023,Vetting
2094,algorithms > by_data_structure > trees,modification,4,4,santiago.c@turing.com,40,2024/01/02,12/28/2023,Vetting
2095,algorithms > by_topic > sampling,modification,1-4,1,pawan.s@turing.com,70,2024/01/04,12/28/2023,Vetting


In [20]:
# Process the duration_mins column to make into int

def process_duration_mins(duration_mins):
    if duration_mins == "" or duration_mins is None:
        return 15
    elif isinstance(duration_mins, int):
        return duration_mins
    else:
        return int(duration_mins.split(" ")[0])
    
metadata_only_df["duration_mins"] = metadata_only_df["duration_mins"].apply(process_duration_mins)

trainer_avg_turn_duration = metadata_only_df.groupby("assigned_to_email").agg({"duration_mins": "mean", "actual_turns": "mean"}).reset_index()
trainer_avg_turn_duration["avg_turn_duration"] = trainer_avg_turn_duration["duration_mins"] / trainer_avg_turn_duration["actual_turns"]
trainer_avg_turn_duration = trainer_avg_turn_duration.sort_values("avg_turn_duration", ascending=True)
trainer_avg_turn_duration = trainer_avg_turn_duration.rename(columns={
    "duration_mins": "avg_mins_per_convo",
    "actual_turns": "avg_turns_per_convo",
    "avg_turn_duration": "avg_mins_per_turn"
})
trainer_avg_turn_duration

Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn
7,jha.r@turing.com,23.4,2.6,9.0
12,santiago.c@turing.com,43.130435,3.826087,11.272727
13,singh.r@turing.com,37.692308,3.051282,12.352941
1,abdul.r@turing.com,35.189189,2.72973,12.891089
4,andranik.g@turing.com,41.805556,3.083333,13.558559
0,aarunik.g@turing.com,42.093023,3.0,14.031008
6,daniel.i@turing.com,45.5,3.2,14.21875
5,cedric.l@turing.com,48.0,3.133333,15.319149
8,joseph.d@turing.com,35.0,2.2,15.909091
14,zubair.m@turing.com,33.4375,2.0625,16.212121


In [21]:
reviews = reviews_df[reviews_df["Author Email"].isin(trainer_avg_turn_duration["assigned_to_email"].tolist())]
reviews = reviews.astype({"Code Quality": "int32", "Language Quality": "int32"})
reviews["avg_quality_score"] = (reviews["Code Quality"] + reviews["Language Quality"]) / 2

trainer_avg_quality = reviews.groupby("Author Email").agg({"avg_quality_score": "mean", "Timestamp":"count"}).reset_index()
trainer_avg_quality = trainer_avg_quality.rename(columns={"Timestamp": "total_reviews"})
trainer_avg_quality = trainer_avg_quality.sort_values("avg_quality_score", ascending=False)
trainer_avg_quality

Unnamed: 0,Author Email,avg_quality_score,total_reviews
4,andranik.g@turing.com,4.666667,6
10,martinho.h@turing.com,4.666667,6
7,jha.r@turing.com,4.642857,7
9,khalid.s@turing.com,4.6,5
12,santiago.c@turing.com,4.5625,8
1,abdul.r@turing.com,4.5,6
13,singh.r@turing.com,4.5,5
3,aman.s@turing.com,4.4,5
5,cedric.l@turing.com,4.166667,3
0,aarunik.g@turing.com,4.142857,7


In [22]:
trainer_throughput = metadata_only_df.groupby("assigned_to_email").agg({"topic": "count", "duration_mins":"sum", "actual_turns":"sum"}).reset_index()
trainer_throughput = trainer_throughput.sort_values("topic", ascending=False)
trainer_throughput = trainer_throughput.rename(columns={
    "topic": "total_convos",
    "duration_mins": "total_mins",
    "actual_turns": "total_turns"
})
trainer_throughput.sort_values("total_turns", ascending=False)

Unnamed: 0,assigned_to_email,total_convos,total_mins,total_turns
12,santiago.c@turing.com,32,1984,176
0,aarunik.g@turing.com,39,1810,129
13,singh.r@turing.com,33,1470,119
4,andranik.g@turing.com,28,1505,111
1,abdul.r@turing.com,28,1302,101
3,aman.s@turing.com,21,1817,94
7,jha.r@turing.com,27,819,91
9,khalid.s@turing.com,33,1455,79
10,martinho.h@turing.com,18,1610,54
5,cedric.l@turing.com,14,720,47


In [23]:
import numpy as np
import pandas as pd


def extract_top_level_topic(topic):
    if isinstance(topic, str):
        return topic.split(" > ")[0]
    else:
        return topic


def calculate_diversity(df, group_by="assigned_to_email", count_column="count", entropy_column="diversity"):
    """
    Calculate the diversity of a dataframe.

    :param df: The dataframe to calculate diversity on.
    :param group_by: The column to group by.
    :param count_column: The column to count.
    :param entropy_column: The column to store the entropy in.
    """
    # Step 1: Normalize counts
    total_counts = df.groupby(group_by)[count_column].transform('sum')
    df['normalized_count'] = df[count_column] / total_counts

    # Step 2: Calculate entropy
    df['entropy_component'] = -df['normalized_count'] * np.log2(df['normalized_count'])
    entropy = df.groupby(group_by)['entropy_component'].sum().reset_index()

    entropy.columns = [group_by, entropy_column]
    return entropy


metadata_only_df["top_level_topic"] = metadata_only_df["topic"].apply(extract_top_level_topic)

# Aggregation of Count of tasks per trainer per top level topic in a new variable trainer_diversity
trainer_diversity = metadata_only_df.groupby(["assigned_to_email", "top_level_topic"]).agg({"topic": "count"}).reset_index()
trainer_diversity = trainer_diversity.rename(columns={"topic": "count"})
trainer_diversity = trainer_diversity.sort_values("count", ascending=False)

# Calculate entropy
topleveltopic_diversity = calculate_diversity(trainer_diversity, entropy_column="topic_diversity")
topleveltopic_diversity

Unnamed: 0,assigned_to_email,topic_diversity
0,aarunik.g@turing.com,1.11501
1,abdul.r@turing.com,2.106802
2,alexei.v@turing.com,1.0
3,aman.s@turing.com,1.973556
4,andranik.g@turing.com,0.905928
5,cedric.l@turing.com,1.835238
6,daniel.i@turing.com,0.881291
7,jha.r@turing.com,1.698246
8,joseph.d@turing.com,1.370951
9,khalid.s@turing.com,2.314752


In [24]:
# Hour Tracking Utilization



In [25]:
dpi_df = trainer_avg_turn_duration.merge(trainer_avg_quality, left_on="assigned_to_email", right_on="Author Email", how="inner")
dpi_df = dpi_df.merge(trainer_throughput, on="assigned_to_email", how="inner")
dpi_df = dpi_df.merge(topleveltopic_diversity, on="assigned_to_email", how="inner")
dpi_df = dpi_df.drop(columns=["Author Email"])
dpi_df

Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity
0,jha.r@turing.com,23.4,2.6,9.0,4.642857,7,27,819,91,1.698246
1,santiago.c@turing.com,43.130435,3.826087,11.272727,4.5625,8,32,1984,176,1.602793
2,singh.r@turing.com,37.692308,3.051282,12.352941,4.5,5,33,1470,119,2.023499
3,abdul.r@turing.com,35.189189,2.72973,12.891089,4.5,6,28,1302,101,2.106802
4,andranik.g@turing.com,41.805556,3.083333,13.558559,4.666667,6,28,1505,111,0.905928
5,aarunik.g@turing.com,42.093023,3.0,14.031008,4.142857,7,39,1810,129,1.11501
6,daniel.i@turing.com,45.5,3.2,14.21875,3.833333,3,10,455,32,0.881291
7,cedric.l@turing.com,48.0,3.133333,15.319149,4.166667,3,14,720,47,1.835238
8,joseph.d@turing.com,35.0,2.2,15.909091,3.75,2,5,175,11,1.370951
9,zubair.m@turing.com,33.4375,2.0625,16.212121,4.071429,7,15,535,33,2.173557


In [26]:
def transform_to_zscore(sequence):
    """
    Calculate the z-score of a column.

    :param df: The dataframe to calculate z-score on.
    :param column: The column to calculate z-score on.
    """
    return (sequence - sequence.mean()) / sequence.std(ddof=0)


dpi_normalized_df = dpi_df.copy()

# Loop on all numerical columns and transform to z-score
for column in dpi_normalized_df.columns:
    if column != "assigned_to_email":
        dpi_normalized_df[column] = transform_to_zscore(dpi_normalized_df[column])

dpi_normalized_df

Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity
0,jha.r@turing.com,-1.791947,-0.171306,-1.274892,1.098286,1.054093,0.535083,-0.514607,0.352699,0.155177
1,santiago.c@turing.com,-0.087092,1.801586,-0.923396,0.848815,1.581139,0.996362,1.437369,2.158688,-0.042242
2,singh.r@turing.com,-0.556986,0.55485,-0.756332,0.654783,0.0,1.088618,0.576154,0.947613,0.827878
3,abdul.r@turing.com,-0.773274,0.037441,-0.673102,0.654783,0.527046,0.627339,0.294667,0.565168,1.000167
4,andranik.g@turing.com,-0.201571,0.606424,-0.569873,1.172203,0.527046,0.627339,0.634797,0.777638,-1.483523
5,aarunik.g@turing.com,-0.176732,0.472332,-0.496805,-0.453975,1.054093,1.642152,1.145829,1.160082,-1.051092
6,daniel.i@turing.com,0.117656,0.794151,-0.467769,-1.414897,-1.054093,-1.033264,-1.124494,-0.90087,-1.534479
7,cedric.l@turing.com,0.333675,0.686878,-0.297583,-0.380057,-1.054093,-0.664241,-0.680483,-0.582166,0.438508
8,joseph.d@turing.com,-0.789621,-0.814945,-0.206343,-1.673607,-1.581139,-1.494543,-1.593638,-1.347055,-0.521747
9,zubair.m@turing.com,-0.924633,-1.036196,-0.159477,-0.675726,1.054093,-0.571986,-0.990453,-0.879623,1.138233


In [27]:
# Create a final score column as a weighted average of all the columns

weights = {
    # Utilization = 1
    "avg_tracked_mins_per_convo": 0.5,
    "avg_tracked_mins_per_turn": 0.5,

    # Efficiency = 1
    "avg_mins_per_convo": 0.3,
    "avg_mins_per_turn": 0.7  ,

    # Quality = 2
    "avg_quality_score": 2,

    # Throughput = 2
    "total_convos": 0.6,
    "total_turns": 1.4,

    # Diversity = 1
    "topic_diversity": 0.7,
    "turns_diversity": 0.3
}


greater_is_better = {
    # Utilization = 1
    "avg_tracked_mins_per_convo": False,
    "avg_tracked_mins_per_turn": False,

    # Efficiency = 1
    "avg_mins_per_convo": False,
    "avg_mins_per_turn": False,

    # Quality = 2
    "avg_quality_score": True,

    # Throughput = 2
    "total_convos": True,
    "total_turns": True,

    # Diversity = 1
    "topic_diversity": True,
    "turns_diversity": True
}


dpi_normalized_df["final_score"] = 0
count_cols = 0
for column in weights.keys():
    try:
        if greater_is_better[column]:
            dpi_normalized_df["final_score"] += dpi_normalized_df[column] * weights[column]
        else:
            dpi_normalized_df["final_score"] += (dpi_normalized_df[column]*(-1)) * weights[column]
        count_cols += 1
    except KeyError:
        print("KeyError for column:", column)
        continue


dpi_normalized_df["final_score"] = dpi_normalized_df["final_score"] / count_cols
dpi_normalized_df = dpi_normalized_df.sort_values("final_score", ascending=False)
dpi_normalized_df

KeyError for column: avg_tracked_mins_per_convo
KeyError for column: avg_tracked_mins_per_turn


Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity,final_score
1,santiago.c@turing.com,-0.087092,1.801586,-0.923396,0.848815,1.581139,0.996362,1.437369,2.158688,-0.042242,0.991312
2,singh.r@turing.com,-0.556986,0.55485,-0.756332,0.654783,0.0,1.088618,0.576154,0.947613,0.827878,0.8023
0,jha.r@turing.com,-1.791947,-0.171306,-1.274892,1.098286,1.054093,0.535083,-0.514607,0.352699,0.155177,0.766098
3,abdul.r@turing.com,-0.773274,0.037441,-0.673102,0.654783,0.527046,0.627339,0.294667,0.565168,1.000167,0.696754
10,khalid.s@turing.com,-0.116152,-0.616174,0.181637,0.965235,0.0,1.088618,0.551021,0.097736,1.430257,0.676404
4,andranik.g@turing.com,-0.201571,0.606424,-0.569873,1.172203,0.527046,0.627339,0.634797,0.777638,-1.483523,0.464227
11,aman.s@turing.com,2.224667,1.462545,0.322696,0.344331,0.0,-0.018451,1.157558,0.41644,0.724584,0.181984
5,aarunik.g@turing.com,-0.176732,0.472332,-0.496805,-0.453975,1.054093,1.642152,1.145829,1.160082,-1.051092,0.175191
13,martinho.h@turing.com,1.982616,-0.73449,1.944293,1.172203,0.527046,-0.295218,0.810726,-0.433437,1.002098,0.101128
9,zubair.m@turing.com,-0.924633,-1.036196,-0.159477,-0.675726,1.054093,-0.571986,-0.990453,-0.879623,1.138233,-0.233143


In [28]:
# Add final score to the dpi_df
dpi_df = dpi_df.merge(dpi_normalized_df[["assigned_to_email", "final_score"]], on="assigned_to_email", how="inner")
dpi_df = dpi_df.sort_values("final_score", ascending=False)
dpi_df

Unnamed: 0,assigned_to_email,avg_mins_per_convo,avg_turns_per_convo,avg_mins_per_turn,avg_quality_score,total_reviews,total_convos,total_mins,total_turns,topic_diversity,final_score
1,santiago.c@turing.com,43.130435,3.826087,11.272727,4.5625,8,32,1984,176,1.602793,0.991312
2,singh.r@turing.com,37.692308,3.051282,12.352941,4.5,5,33,1470,119,2.023499,0.8023
0,jha.r@turing.com,23.4,2.6,9.0,4.642857,7,27,819,91,1.698246,0.766098
3,abdul.r@turing.com,35.189189,2.72973,12.891089,4.5,6,28,1302,101,2.106802,0.696754
10,khalid.s@turing.com,42.794118,2.323529,18.417722,4.6,5,33,1455,79,2.314752,0.676404
4,andranik.g@turing.com,41.805556,3.083333,13.558559,4.666667,6,28,1505,111,0.905928,0.464227
11,aman.s@turing.com,69.884615,3.615385,19.329787,4.4,5,21,1817,94,1.973556,0.181984
5,aarunik.g@turing.com,42.093023,3.0,14.031008,4.142857,7,39,1810,129,1.11501,0.175191
13,martinho.h@turing.com,67.083333,2.25,29.814815,4.666667,6,18,1610,54,2.107735,0.101128
9,zubair.m@turing.com,33.4375,2.0625,16.212121,4.071429,7,15,535,33,2.173557,-0.233143


In [92]:
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI

load_dotenv(find_dotenv())
opanai = OpenAI()


def fix_missing_roles(messages):
    """
    Fix missing roles in a list of messages.

    :param messages: The list of messages.
    """
    def predict_role(messages_subsequence):
        try:
            response = opanai.chat.completions.create(
                model="gpt-4-1106-preview",
                messages=[
                    {"role":"system", "content": "Your task is to accurately predict whether the empty role is a User or an Assistant. You are only allowed to reply with a single word: 'User' or 'Assistant'."},
                    {"role":"user", "content": f"Here's a part of the conversation including an empty role:\n\n{messages_subsequence}"}
                ],
                temperature=0,
                seed=42
            )
            print(response.choices[0])
            missing_role = response.choices[0].message.content
            assert missing_role in ["User", "Assistant"]
            return missing_role, None
        except Exception as e:
            return None, e

    errors = []
    for i in range(len(messages)):
        if messages[i]["role"] == "":
            subsequence = messages[max(0, i-2):min(len(messages), i+3)]
            messages[i]["role"], error = predict_role(subsequence)
            if error is not None:
                errors.append(error)
    return messages, errors



test = [
    {'role': 'user', 'content': 'Hello'},
    {'role': '', 'content': 'How can I help you?'},
    {'role': 'user', 'content': 'I have a question'},
    {'role': 'assistant', 'content': 'Sure, what is it?'},
    {'role': '', 'content': 'Can you explain this concept to me?'},
    {'role': 'assistant', 'content': 'Of course, here is a brief explanation'},
    {"role": "assistant", "content": "print('Hello World')"},
    {"role": "", "content": "This print should explain the concept for you"},
    {'role': 'user', 'content': 'Thank you!'}
]

fix_missing_roles(test)

Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='User', role='assistant', function_call=None, tool_calls=None), logprobs=None)
Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Assistant', role='assistant', function_call=None, tool_calls=None), logprobs=None)


([{'role': 'user', 'content': 'Hello'},
  {'role': 'Assistant', 'content': 'How can I help you?'},
  {'role': 'user', 'content': 'I have a question'},
  {'role': 'assistant', 'content': 'Sure, what is it?'},
  {'role': 'User', 'content': 'Can you explain this concept to me?'},
  {'role': 'assistant', 'content': 'Of course, here is a brief explanation'},
  {'role': 'assistant', 'content': "print('Hello World')"},
  {'role': 'Assistant',
   'content': 'This print should explain the concept for you'},
  {'role': 'user', 'content': 'Thank you!'}],
 [])