# PIPELINE TO GET DATA READY FOR TRAINING

Global variables:

In [1]:
import os
from transformers import AutoTokenizer, LlamaTokenizer
import pandas as pd
import json
import numpy as np
import random
from beartype import beartype
import torch
import csv
import ast

version_name = "improved"
MODEL_NAME = "GPTJ"

TOOL_DIRS = {
    "Calculator": "/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/definite_horizon/augmented_prompttrick/calculator_LLAMA",
    "WikiSearch": "/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/definite_horizon/augmented_prompttrick/wikiSearch_LLAMA",
    "Calendar": "/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/definite_horizon/augmented_standard/calendar_LLAMA"
}

CONSTRUCTION_DIRS = {
    "origin":"/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/origin",
    "white space removal":"/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/white_space_removal",
    "tool token substitution":"/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/tool_token_substitution",
    "token type masking":"/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/token_type_masking",
    "calculator subtype":"/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/calculator_subtype",  
    "relevance score":"/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/relevance_score",    
    "duplicity rankings":"/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/duplicity_rankings",
    "curated":"/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/curated",
}

def construction_dir(part):
    return os.path.join(CONSTRUCTION_DIRS[part], MODEL_NAME + "_" + version_name)

def recurse_dict_create_dir():
    for key, value in CONSTRUCTION_DIRS.items():
        os.makedirs(construction_dir(key), exist_ok=True)

recurse_dict_create_dir()

TRAIN_FIELDS = ["tokenized_start_text", "tool_name", "tokenized_text", "token_type","tokenized_end_text", "start_token_type", "end_token_type", "start_method_A_train_mask", "end_method_A_train_mask"]
OLD_FIELDS = ["url", "text", "API_calls_text", "API_call_response_text", "position", "loss_improvement", "arg_cohort", "raw_arg", "processed_arg", "title", "date_download", "digest", "length", "nlines", "source_domain", "cc_segment", "original_nlines", "original_length", "language", "language_score", "perplexity", "bucket"]
DATA_SCIENCE_FIELDS = ["duplicity_count_tool", "duplicity_count_global", "duplicity_ranking_tool", "duplicity_ranking_global", "duplicity_count_stats", "duplicity_ranking_stats"]

CALC_SUBTYPES = ["add", "subtract", "multiply", "divide", "add_subtract", "mult_divide", "mix"]

TOOL_NAMES = ["Calculator", "WikiSearch", "Calendar"]

# Copy csv files into TOOL_DIRS to CONSTRUCTION "origin" into a MODEL_NAME + "_" + version_name folder
for tool_name, tool_dir in TOOL_DIRS.items():
    if tool_dir is not None:
        for file in os.listdir(tool_dir):
            if file.endswith(".csv") and "stat" not in file:
                with open(os.path.join(tool_dir, file), "r") as f:
                    lines = f.readlines()
                with open(os.path.join(construction_dir("origin"), tool_name + "_" + file), "w") as f:
                    f.writelines(lines)

cache_dir = "/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/toolformer/cache"

tokenizers = {"GPTJ": AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", truncate=True, max_length=270, cache_dir=cache_dir),
              "LLAMA2": LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf",
                                                   token="***REMOVED***",
                                                   cache_dir=cache_dir),}




  from .autonotebook import tqdm as notebook_tqdm


### Empty experiment dirs if necessary:

In [19]:
import shutil

# Empty dirs for new experiment
def recurse_dict_delete_data():
    for key, value in CONSTRUCTION_DIRS.items():
        if value is not None and key != "origin":
            if os.path.exists(construction_dir(key)):
                print(f"Deleting {construction_dir(key)}")
                shutil.rmtree(construction_dir(key))

if False:
    recurse_dict_delete_data()
    recurse_dict_create_dir()


## Substitutes "[" for tool tokens.

func: tool_token_substitution

In [2]:
import os
from csv import DictWriter
import re
from datasets import load_dataset

tools = ["Calculator", "WikiSearch", "Calendar"]

def tool_tokenize(row, tool):

    text = row.API_call_response_text
    text_no_resp = row.API_calls_text

    if ("<TOOL>" in text):
        return row
                
    # regex expression that returns the text until where it matches the API start token followed by the tool name
    text = re.sub(rf"\[(?={tool[:4]})", '<TOOL>', text)
    text_no_resp = re.sub(rf"\[(?={tool[:4]})", '<TOOL>', text_no_resp)
    end_text = re.split(f"\)]", text_no_resp, maxsplit=1)
    if len(end_text) != 2:
        # If text includes "<TOOL>" substring, raise a stub exception
        raise Exception(f"Text does not end with a response token: {text}")
    end_text = end_text[1]
    text = text[:-len(end_text)-1] + "</TOOL>" + end_text
    text_no_resp = text_no_resp[:-len(end_text)-1] + "</TOOL>" + end_text

    row["API_call_response_text"] = text
    row["API_calls_text"] = text_no_resp

    return row

def tool_token_substitution(input):
    input_dir = construction_dir(input)
    output_dir = construction_dir("tool token substitution")
    print("Substituting tokens in files in directory: ", input_dir)
    for tool in tools:
        file_list = [file for file in os.listdir(input_dir) if file.endswith('.csv') and tool.lower() in file.lower()]

        # Create an output directory by adding "processed" to the input directory
        # output_dir = os.path.join(TOOL_DIRS[tool], "tool_tokenized")
        # os.makedirs(output_dir, exist_ok=True)

        for file in file_list:
            print("Processing: ", file)
            # Load dataset
            df = pd.read_csv(os.path.join(input_dir, file))
            # dataset = load_dataset(TOOL_DIRS[tool], split="train", data_files = file, cache_dir = cache_dir)
            # columns = list(dataset.column_names)
            df = df.apply(tool_tokenize, args=(tool,), axis=1)

            # Save dataset
            df.to_csv(os.path.join(output_dir, file), index=False)


## Removes long whitespaces

Requires \<TOOL\> tokens

white_space_removal

In [3]:
# This script removes long white spaces from the specified fields in the csv files of the "white space removal" "input" dir.
# The output is saved in "white space removal" "output" dir.
# We substitute 5 or more white spaces with 2 white spaces.

import os
import pandas as pd
import re

def remove_spaces_token_avoider(row):
    text = row.text
    api_resp = row.API_call_response_text
    api_call = row.API_calls_text
    
    start_sentences = re.findall(r"(.*?)(\s{5,})", text)
    resp_start_sentences = re.findall(r"(.*?)(\s{5,})", api_resp)
    call_start_sentences = re.findall(r"(.*?)(\s{5,})", api_call)
    if len(start_sentences) == 0:
        return row

    # Else, loop through
    # Assumes the 5 white spaces are not inside the API call
    else:
        if len(start_sentences) != len(resp_start_sentences) or len(start_sentences) != len(call_start_sentences):
            api_resp = re.sub(r"(\s{0,})(<TOOL>.*?</TOOL>)(\s{0,})", r"  \2 ", api_resp)
            api_call = re.sub(r"(\s{0,})(<TOOL>.*?</TOOL>)(\s{0,})", r"  \2 ", api_call)
        
        # Substitute all 5 or more white spaces with 2 white spaces:
        text = re.sub(r"\s{5,}", "  ", text)
        api_resp = re.sub(r"\s{5,}", "  ", api_resp)
        api_call = re.sub(r"\s{5,}", "  ", api_call)
        # Assumes the 5 white spaces are not inside the API call

        row.text = text
        row.API_call_response_text = api_resp
        row.API_calls_text = api_call

        return row

# TEST
#data = {"text": "Hello this        is my      name", "API_call_response_text": "Hello this    <TOOL>LOOOOOL hwen        welcome to the jungle</TOOL>    is my      name", "API_calls_text": "Hello this<TOOL>LOOOOOL hwen        welcome to the jungle</TOOL>        is my      name"}
#df = pd.DataFrame(data, index=[0])
#print(remove_spaces_token_avoider(df.iloc[0]))

def white_space_removal(input):
    input_dir = construction_dir(input)
    output_dir = construction_dir("white space removal")
    print(f"Removing long white spaces from {input_dir} and saving to {output_dir}")
    file_list = [f for f in os.listdir(input_dir) if f.endswith(".csv")]
    for file in file_list:
        print(f"Processing {file}")
        df = pd.read_csv(os.path.join(input_dir, file))

        df = df.apply(remove_spaces_token_avoider, axis=1)
        df.to_csv(os.path.join(output_dir, file), index=False)



##  Identify and tag text with API calls into token types (0,...9)


Require that data has \<TOOL\> tokens instead of "["

Func token_type_masking

In [4]:
# This script readies the data for training. It takes the raw data from the dataset and converts it into a format that can be used by the model.

from csv import DictWriter
import json
import os
import torch
import re
from datasets import load_dataset

from transformers import AutoTokenizer, LlamaTokenizer

from torch.utils.data import DataLoader, Dataset

from beartype import beartype
from typing import List, Dict, Tuple, Union, Optional, Any


def TOOL_START_TOKEN(tool_name):
    return rf"\s\[(?={re.escape(tool_name)})"            #  " <TOOL>"

TOOL_START_TOKEN = " <TOOL>"
TOOL_END_TOKEN = "</TOOL>" 

tokenizers[MODEL_NAME].add_tokens([TOOL_START_TOKEN, TOOL_END_TOKEN, "[PAD]"])



@beartype
def mask_tokenize_data(
        call_response_texts:List[str],
        tool_name:str,
        tokenizer
):
    global TOOL_END_TOKEN, TRAIN_FIELDS

    output = []
    for i, text in enumerate(call_response_texts):
        new_row = {}

        # Substitute "Calendar()" in text for "Calendar ( )"
        text = text.replace("Calendar()", "Calendar( )")
        
        tokenized_text = tokenizer.encode(text, truncation=True, max_length=1000)
        #for token in tokenized_text[model_names[0]]:
        #    print(f"{token:<5}: {tokenizers['GPTJ'].decode([token])}")

        # Find index where tokenized_text matches the tool start token:
        try:
            tool_token_index = tokenized_text.index(tokenizer.encode(TOOL_START_TOKEN)[0])
            index_arrow = tokenized_text[tool_token_index:].index(tokenizer.encode("→")[0]) + tool_token_index
            index_end = tokenized_text[index_arrow:].index(tokenizer.encode(TOOL_END_TOKEN)[0]) + index_arrow
        except ValueError as e:
            print("ValueError", flush=True)
            print(text, flush=True)
            raise e


        len_toolname = len(tokenizer.encode(tool_name))

        # Find number of ocurrences of →
        occurrences = len(re.findall(r'(\)\→)', text))

        if occurrences != 1:
            print("More than one occurrence of →", flush=True)
            print(text, flush=True)
            raise Exception("More than one occurrence of →")

        # Create token type mask
        token_type = torch.zeros(len(tokenized_text))
                                                        # 0 for data...
        token_type[tool_token_index] += 1                         # 1 for <TOOL>
        token_type[tool_token_index+1] += 1                       # 2 for Toolname
        token_type[tool_token_index+1+len_toolname] += 1          # 3 for (
        token_type[tool_token_index+1 + len_toolname + 1] += 1    # 4 for args
        token_type[index_arrow - 1] += 1                   # 5 for )
        token_type[index_arrow] += 1                       # 6 for →
        token_type[index_arrow+1] += 1                     # 7 for response
        token_type[index_end] += 1                         # 8 for </TOOL>
        token_type[index_end+1] += 1                       # 9 for ...Data
        token_type = token_type.cumsum(dim=0)

        new_row["token_type"] = token_type.view(-1).long().tolist()
        new_row["tokenized_text"] = tokenized_text

        new_row["start_method_A_train_mask"] = (token_type[:tool_token_index+1] < 2).view(-1).long().tolist()
        new_row["end_method_A_train_mask"] = (token_type[tool_token_index+1+len_toolname] > 2).view(-1).long().tolist()
        new_row["tokenized_start_text"] = tokenized_text[:tool_token_index+1]
        new_row["tokenized_end_text"] = tokenized_text[index_end+1:]
        new_row["start_token_type"] = token_type[:tool_token_index+1].view(-1).long().tolist()
        new_row["end_token_type"] = token_type[tool_token_index+1+len_toolname].view(-1).long().tolist()
    
        new_row["tool_name"] = tool_name

        output.append(new_row)

    return output

tool_names = ["Calendar", "WikiSearch", "Calculator"]

def token_type_masking(input):
    input_dir = construction_dir(input)
    output_dir = construction_dir("token type masking")

    print(f"Token type masking {input_dir} to {output_dir}", flush=True)

    # We want to output: tokenized_start_text, tool_name, tokenized_end_text, token_type, start_method_A_train_mask, end_method_A_train_mask

    def null_permitting_collate_fn(batch):
        dict_of_lists = {key: [] for key in batch[0].keys()}

        for d in batch:
            for key, value in d.items():
                dict_of_lists[key].append(value)
        
        return dict_of_lists
        

    output_data = []

    for tool in tool_names:
        file_list = [file for file in os.listdir(input_dir) if tool in file]
        if tool == "Calendar":
            file_list = file_list[:4]
        
        print(f"Processing {tool} with {len(file_list)} files", flush=True)
        
        dataset = load_dataset(input_dir, data_files = file_list, split="train", cache_dir=cache_dir)

        dl = DataLoader(dataset, batch_size=1000, collate_fn=null_permitting_collate_fn, shuffle=False)

        data_iter = iter(dl)
        data = next(data_iter, None)
        
        while data is not None:
            train_data = mask_tokenize_data(data["API_call_response_text"], 
                                            tool, 
                                            tokenizers[MODEL_NAME],)

            for i, output_row in enumerate(train_data):
                new_row = {key: output_row[key] for key in TRAIN_FIELDS}
                for key in OLD_FIELDS:
                    new_row[key] = data[key][i]
                # This key is the decoded sentence from tokens of type 0 and 9
                # Extract this with masked select. Decode with tokenizers["GPTJ"].decode()

                output_data.append(new_row)

            data = next(data_iter, None)


    # Create output file:
    with open(f'{output_dir}/train.csv', 'w') as f:
        writer = DictWriter(f, fieldnames=OLD_FIELDS+TRAIN_FIELDS+DATA_SCIENCE_FIELDS)
        writer.writeheader()

        for row in output_data:
            if row["tool_name"] == "Calendar":
                #row["arg"] = " "
                row["arg_cohort"] = []
                #del row["date"]
            writer.writerow(row)


## Script to tag training data (tagged with token types) from tool type Calculator with operation type labels

In [5]:
# tag tool type from calculator

import ast

from csv import DictWriter

from datasets import Value, Features


tokenizer = tokenizers[MODEL_NAME]

def calculator_subtype(input):

    print("Starting calculator subtype......")
    input_dir = construction_dir(input)
    output_dir = construction_dir("calculator subtype")
    # Files in input_dir that end with .csv
    data_files = [f for f in os.listdir(input_dir) if f.endswith(".csv")]  #if f.endswith(f"train_{version_name}.csv")
    print(data_files)

    feat_dict = {'url': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'API_calls_text': Value(dtype='string', id=None), 'API_call_response_text': Value(dtype='string', id=None), 'position': Value(dtype='float64', id=None), 'loss_improvement': Value(dtype='float64', id=None), 'processed_arg': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'date_download': Value(dtype='string', id=None), 'digest': Value(dtype='string', id=None), 'length': Value(dtype='int64', id=None), 'nlines': Value(dtype='int64', id=None), 'source_domain': Value(dtype='string', id=None), 'cc_segment': Value(dtype='string', id=None), 'original_nlines': Value(dtype='int64', id=None), 'original_length': Value(dtype='int64', id=None), 'language': Value(dtype='string', id=None), 'language_score': Value(dtype='float64', id=None), 'perplexity': Value(dtype='float64', id=None), 'bucket': Value(dtype='string', id=None)}
    for key in TRAIN_FIELDS + DATA_SCIENCE_FIELDS:
        feat_dict[key] = Value(dtype='string', id=None)
    features = Features(feat_dict)

    for file in data_files:
        # Load data
        print(f"Loading {file}")
        
        dataset = load_dataset(input_dir, data_files=file, split="train", cache_dir=cache_dir, features=features)

        header = dataset.column_names + ["op_label", "ops_used"]

        with open(os.path.join(output_dir, file), "w") as f:
            # Dict writer
            writer = DictWriter(f, fieldnames=header)

            # Write header
            writer.writeheader()

            for row in dataset:
                if row["tool_name"] != "Calculator":
                    row["op_label"] = "NotApplicable"
                    row["ops_used"] = "NotApplicable"
                    writer.writerow(row)
                    continue
                token_type = ast.literal_eval(row["token_type"])
                tokenized_text = ast.literal_eval(row["tokenized_text"])

                # Args are tokens marked with token_type 4
                args = [tokenized_text[i] for i, t in enumerate(token_type) if t == 4]
                args = tokenizer.decode(args)

                ops = [0,0,0,0]

                for character in args:
                    if character == "+":
                        ops[0] += 1
                    elif character == "-":
                        ops[1] += 1
                    elif character == "*":
                        ops[2] += 1
                    elif character == "/":
                        ops[3] += 1

                # Labels:
                # add: if ops[0] > 0 and ops[1] == 0 and ops[2] == 0 and ops[3] == 0
                # subtract: if ops[0] == 0 and ops[1] > 0 and ops[2] == 0 and ops[3] == 0
                # multiply: if ops[0] == 0 and ops[1] == 0 and ops[2] > 0 and ops[3] == 0
                # divide: if ops[0] == 0 and ops[1] == 0 and ops[2] == 0 and ops[3] > 0
                # add_subtract: if ops[0] > 0 and ops[1] > 0 and ops[2] == 0 and ops[3] == 0
                # mult_divide: if ops[0] == 0 and ops[1] == 0 and ops[2] > 0 and ops[3] > 0
                # mix: else

                if ops[0] > 0 and ops[1] == 0 and ops[2] == 0 and ops[3] == 0:
                    label = "add"
                elif ops[0] == 0 and ops[1] > 0 and ops[2] == 0 and ops[3] == 0:
                    label = "subtract"
                elif ops[0] == 0 and ops[1] == 0 and ops[2] > 0 and ops[3] == 0:
                    label = "multiply"
                elif ops[0] == 0 and ops[1] == 0 and ops[2] == 0 and ops[3] > 0:
                    label = "divide"
                elif ops[0] > 0 and ops[1] > 0 and ops[2] == 0 and ops[3] == 0:
                    label = "add_subtract"
                elif ops[0] == 0 and ops[1] == 0 and ops[2] > 0 and ops[3] > 0:
                    label = "mult_divide"
                else:
                    label = "mix"
                
                row["op_label"] = label
                row["ops_used"] = ops
                
                writer.writerow(row)



## Data relevance score

This script assigns a relevance score to the data points based on a custom metric

Requires calc_subtype

In [7]:
# Visualize training data.
# Open csv files in construction_dir("origin") into one dataframe per tool. For each tool, the filenames will contain the tool name.

import matplotlib.pyplot as plt
import numpy as np

def relevance_score(input, plots = False):
    tools = ["calculator", "calendar", "wikisearch"]

    print("Starting relevance score calculation...")

    input_dir = construction_dir(input)

    # Read csv file into a dataframe
    file = [f for f in os.listdir(input_dir) if f.endswith(".csv")]
    assert len(file) == 1
    df = pd.read_csv(os.path.join(input_dir, file[0]))

    calc_df = df[df["tool_name"] == "Calculator"]
    calendar_df = df[df["tool_name"] == "Calendar"]
    wikisearch_df = df[df["tool_name"] == "WikiSearch"]

    # Len of each df:
    print(f"Len of calc_df: {len(calc_df)}, len of calendar_df: {len(calendar_df)}, len of wikisearch_df: {len(wikisearch_df)}")
    
    calc_m = 0.003
    calc_c = 0.6
    calend_m = 0.013
    calend_c = 1.3
    wiki_m = 0.015
    wiki_c = -0.7
    calc_bounds = (1.3, 0.5)
    calend_bounds = (1.8, 2)
    wiki_bounds = (3.8, 2.2)


    # Define a relevance metric for each tool. It is based on the line in the scatter plot. 
    # The metric assigns to each point the distance to the line.
    # Points above the line have a positive sign, points below the line have a negative sign.

    # Create a new column in the dataframe for the relevance metric
    calc_theta = np.arctan(calc_m)
    calend_theta = np.arctan(calend_m)
    wiki_theta = np.arctan(wiki_m)

    #calc_df.loc[:,"x"] = np.cos(calc_theta) * calc_df.perplexity + np.sin(calc_theta) * calc_df.loss_improvement
    calc_df.loc[:,"relevance"] = - np.sin(calc_theta) * calc_df.perplexity + np.cos(calc_theta) * (calc_df.loss_improvement  - calc_c)
    calendar_df.loc[:,"relevance"] = - np.sin(calend_theta) * calendar_df["perplexity"] + np.cos(calend_theta) * (calendar_df["loss_improvement"]  - calend_c)
    wikisearch_df.loc[:,"relevance"] = - np.sin(wiki_theta) * wikisearch_df["perplexity"] + np.cos(wiki_theta) * (wikisearch_df["loss_improvement"]  - wiki_c)

    # Normalize relevance by putting the minimum value to 0 and the maximum value to 1:
    # df.loc[:,"relevance"] = (df.relevance - df.relevance.min()) / (df.relevance.max() - df.relevance.min())

    # A more sophisticated normalization:
    indices = (calc_df.relevance > -calc_bounds[1]) & (calc_df.relevance < calc_bounds[0])
    calc_averg_thr = np.mean(calc_df[indices].relevance)
    indices = (calendar_df.relevance > -calend_bounds[1]) & (calendar_df.relevance < calend_bounds[0])
    calend_averg_thr = np.mean(calendar_df[indices].relevance)
    indices = (wikisearch_df.relevance > -wiki_bounds[1]) & (wikisearch_df.relevance < wiki_bounds[0])
    wiki_averg_thr = np.mean(wikisearch_df[indices].relevance)

    # Normalize relevance by putting their averages to 1
    #calc_df.loc[:,"relevance"] = (calc_df.relevance+calc_averg_thr-abs(calc_averg_thr)) / abs(calc_averg_thr)
    #calendar_df.loc[:,"relevance"] = (calendar_df.relevance+calend_averg_thr-abs(calend_averg_thr)) / abs(calend_averg_thr)
    #wikisearch_df.loc[:,"relevance"] = (wikisearch_df.relevance+wiki_averg_thr-abs(wiki_averg_thr)) / abs(wiki_averg_thr)

    # Normalize by bringing minimum to -1:
    calc_extreme_max = np.mean(calc_df[calc_df.relevance > calc_bounds[0]].relevance)
    calc_extreme_min = np.mean(calc_df[calc_df.relevance < calc_bounds[1]].relevance)
    calc_df.loc[:,"relevance"] = (calc_df.relevance - calc_extreme_min) / (calc_extreme_max - calc_extreme_min) * 2 - 1
    calend_extreme_max = np.mean(calendar_df[calendar_df.relevance > calend_bounds[0]].relevance)
    calend_extreme_min = np.mean(calendar_df[calendar_df.relevance < calend_bounds[1]].relevance)
    calendar_df.loc[:,"relevance"] = (calendar_df.relevance - calend_extreme_min) / (calend_extreme_max - calend_extreme_min) * 2 - 1
    wiki_extreme_max = np.mean(wikisearch_df[wikisearch_df.relevance > wiki_bounds[0]].relevance)
    wiki_extreme_min = np.mean(wikisearch_df[wikisearch_df.relevance < wiki_bounds[1]].relevance)
    wikisearch_df.loc[:,"relevance"] = (wikisearch_df.relevance - wiki_extreme_min) / (wiki_extreme_max - wiki_extreme_min) * 2 - 0.8 

    df = pd.concat([calc_df, calendar_df, wikisearch_df])

    output_dir = construction_dir("relevance score")
    # Save data frame into a csv file
    df.to_csv(os.path.join(output_dir, file[0]), index=False)

    if plots:

        # Scatter colour depends on relevance score
        #df["scatter colour"] = plt.cm.cool(df["relevance"])

        calendar_df = df[df["tool_name"] == "Calendar"]
        wikisearch_df = df[df["tool_name"] == "WikiSearch"]
        calc_df = df[df["tool_name"] == "Calculator"]

        calc_fig, calc_ax = plt.subplots(figsize=(8,5))
        calc_sub_fig, calc_sub_ax = plt.subplots(figsize=(8,5))
        calend_fig, calend_ax = plt.subplots(figsize=(8,5))
        wiki_fig, wiki_ax = plt.subplots(figsize=(8,5))
        joint_fig, joint_ax = plt.subplots(figsize=(8,5))

        # Give title to each plot
        calc_ax.set_title("Calculator augmented data")
        calend_ax.set_title("Calendar augmented data")
        wiki_ax.set_title("Wikisearch augmented data")


        #You pass the wanted axis to the ax argument
        # df.plot(kind='scatter', x='x', y='y',title="Nice title", grid=True,fontsize=10, ax=ax) 

        # Plot a scatter plot for each tool, where the x axis is the data perplexity and the y axis is the loss_improvement.
        # Modify size of the scatter dots and colour and shape
        calendar_df.plot.scatter(x="perplexity", y="loss_improvement", ax=joint_ax, s=2, c="relevance", label="Calendar", colormap="cool", marker="x")
        wikisearch_df.plot.scatter(x="perplexity", y="loss_improvement", ax=joint_ax, s=2, c="relevance", label="Wikisearch", colormap="cool", marker="o")
        calc_df.plot.scatter(x="perplexity", y="loss_improvement", ax=joint_ax, s=2, label="Calculator", c="relevance", colormap="cool", marker="*")
        # Legend
        joint_ax.legend()
        # title
        joint_ax.set_title("Augmented data")


        # Corresponding values for the colormap
        # Here, we generate values between 0 and 1 based on class_labels
        normalized_class_values = np.linspace(0, 1, len(CALC_SUBTYPES))
        colormap_name = 'tab10'
        color_map = plt.colormaps[colormap_name]

        # Generate a color for each class based on the colormap
        class_colors = color_map(normalized_class_values)

        # Lines of filtering criteria
        x_line = np.linspace(0, 350, 100)
        calc_line = calc_m*x_line + calc_c
        calend_line = calend_m*x_line + calend_c
        wiki_line = wiki_m*x_line + wiki_c
        # The calculator scatter plot should have different ops_labels with different colours:
        for i, label in enumerate(CALC_SUBTYPES):
            calc_df[calc_df["op_label"] == label].plot.scatter(x="perplexity", y="loss_improvement", ax=calc_sub_ax, s=4, label=label, color=class_colors[i])
        calendar_df.plot.scatter(x="perplexity", y="loss_improvement", ax=calend_ax, s=10, c="relevance", colormap="cool",)
        wikisearch_df.plot.scatter(x="perplexity", y="loss_improvement", ax=wiki_ax, s=10, c="relevance", colormap="cool",)
        calc_df.plot.scatter(x="perplexity", y="loss_improvement", ax=calc_ax, s=10, c="relevance", colormap="cool",)
        # Set line width
        calc_sub_ax.plot(x_line, calc_line, color=class_colors[2], label="Filtering criteria", linewidth=2)
        calc_sub_ax.legend()
        calend_ax.plot(x_line, calend_line, color=class_colors[2], label="Filtering criteria", linewidth=2)
        calend_ax.plot(x_line, calend_line+calend_bounds[0], color=class_colors[2], label="Filtering criteria", linewidth=2, linestyle="--")
        calend_ax.plot(x_line, calend_line-calend_bounds[1], color=class_colors[2], label="Filtering criteria", linewidth=2, linestyle="--")
        wiki_ax.plot(x_line, wiki_line, color=class_colors[2], label="Filtering criteria", linewidth=2)
        wiki_ax.plot(x_line, wiki_line+wiki_bounds[0], color=class_colors[2], label="Filtering criteria", linewidth=2, linestyle="--")
        wiki_ax.plot(x_line, wiki_line-wiki_bounds[1], color=class_colors[2], label="Filtering criteria", linewidth=2, linestyle="--")
        calc_ax.plot(x_line, calc_line, color=class_colors[2], label="Filtering criteria", linewidth=2)
        calc_ax.plot(x_line, calc_line+calc_bounds[0], color=class_colors[2], label="Filtering criteria", linewidth=2, linestyle="--")
        calc_ax.plot(x_line, calc_line-calc_bounds[1], color=class_colors[2], label="Filtering criteria", linewidth=2, linestyle="--")

if __name__ == "__main__":
    print("MAIIIIIIIIIIN")
    relevance_score("calculator subtype", plots=True)


MAIIIIIIIIIIN
Starting relevance score calculation...


AssertionError: 

### Relevance score stats

In [25]:
def relevance_score_stats(input):
    input_dir =  construction_dir(input)

    # Load the data
    files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]
    assert len(files) == 1
    data = pd.read_csv(os.path.join(input_dir, files[0]))

    # Get the stats
    print(data['relevance'].describe())
    for tool in TOOL_NAMES:
        print(tool)
        print(data[data.tool_name == tool]['relevance'].describe())
        print()

relevance_score_stats("relevance score")

count    25855.000000
mean        -0.897221
std          0.350527
min         -1.495099
25%         -1.124908
50%         -0.943493
75%         -0.735932
max          2.954931
Name: relevance, dtype: float64
Calculator
count    5880.000000
mean       -0.945068
std         0.388455
min        -1.466302
25%        -1.213272
50%        -1.031171
75%        -0.790287
max         2.626328
Name: relevance, dtype: float64

WikiSearch
count    10290.000000
mean        -0.773997
std          0.312015
min         -1.174287
25%         -0.968750
50%         -0.838849
75%         -0.661947
max          2.954931
Name: relevance, dtype: float64

Calendar
count    9685.000000
mean       -0.999094
std         0.324171
min        -1.495099
25%        -1.235086
50%        -1.064829
75%        -0.836967
max         2.084497
Name: relevance, dtype: float64



## Duplicity Rankings:

In [8]:
def duplicity_rankings(input, metric = "relevance"):

    input_dir = construction_dir(input)
    output_dir = construction_dir("duplicity rankings")

    print(f"Starting duplicity rankings for {input_dir} with metric {metric} to {output_dir}")

    tool_names = ["Calculator", "Calendar", "WikiSearch"]

    if not os.path.exists(os.path.join(output_dir, "stats")):
        os.mkdir(os.path.join(output_dir, "stats"))

    files = os.listdir(input_dir)
    assert len(files) == 1
    df = pd.read_csv(os.path.join(input_dir, files[0]))

    global_duplicity = {}
    tool_specific_duplicity = {
        tool: {} for tool in tool_names
    }
    # Get unique types of op_label and add them to tool_specific_duplicity
    for subset in df.op_label.unique():
        tool_specific_duplicity[subset] = {}
        if subset not in CALC_SUBTYPES:
            print(f"Warning: {subset} not in CALC_SUBTYPES")

    for i, row in df.iterrows():
        tool = row.tool_name
        text = row.text
        score = row[metric]
        subset = row.op_label
        
        try:
            tool_specific_duplicity[tool][text] = tool_specific_duplicity[tool].get(text, []) + [score]
            tool_specific_duplicity[subset][text] = tool_specific_duplicity[subset].get(text, []) + [score]
            global_duplicity[text] = global_duplicity.get(text, []) + [score]
        except KeyError as e:
            print(f"KeyError: {e}")
            print(f"Row: {row}")
            print(f"Iteration: {i}")
            raise e
        
    # Save the duplicity count in a json file
    for tool in tool_names:
        with open(f'{output_dir}/stats/duplicity_improvs_{tool}.json', 'w') as f:
            json.dump(tool_specific_duplicity[tool], f)

    # Save the duplicity count in a json file
    with open(f'{output_dir}/stats/duplicity_count_global_{version_name}.json', 'w') as f:
        json.dump(global_duplicity, f)

    for i, row in df.iterrows():
        text = row.text
        subset = row.op_label
        score = row[metric]
        df.at[i, "duplicity_count_global"] = len(global_duplicity[text])
        df.at[i, "duplicity_ranking_global"] = sorted(global_duplicity[text], reverse= True).index(score)
        df.at[i, "duplicity_ranking_subset"] = sorted(tool_specific_duplicity[subset][text], reverse= True).index(score)

        # ranking value is the index of the loss improvement in the sorted list of loss improvements
        duplicity_ranking_tool = {}
        duplicity_count_tool = {}
        duplicity_ranking_subset = {}
        for tool in tool_names:
            count = 0
            rank = 10000
            try:
                count = len(tool_specific_duplicity[tool][text])
                rank = sorted(tool_specific_duplicity[tool][text], reverse= True).index(score)
            except (ValueError, KeyError):
                count = 0
                rank = -1
            duplicity_count_tool[tool] = count
            duplicity_ranking_tool[tool] = rank
            duplicity_ranking_subset[subset] = rank
        
        df.at[i, "duplicity_ranking_stats"] = str(duplicity_ranking_tool)
        df.at[i, "duplicity_count_stats"] = str(duplicity_count_tool)
        df.at[i, "duplicity_ranking_tool"] = duplicity_ranking_tool[row.tool_name]
        df.at[i, "duplicity_count_tool"] = duplicity_count_tool[row.tool_name]

    df.to_csv(os.path.join(output_dir, files[0]), index = False)


#duplicity_rankings("relevance score")

## Checker. Checks if any text is used in two tools

In [9]:
def check_multi_tool_duplicity(input):
    input_dir = construction_dir(input)

    files = [f for f in os.listdir(input_dir) if f.endswith(".csv")]
    print(files)

    df = pd.read_csv(os.path.join(input_dir, files[0]))

    # find rows where duplicity_count_tool != duplicity_count_global AND duplicity_ranking_global == 0
    df = df[(df["duplicity_count_tool"] != df["duplicity_count_global"]) & (df["duplicity_ranking_global"] == 0)]

    if len(df) == 0:
        print("No duplicity found. CHECK PASSED")
    else:
        print("Duplicity found. CHECK FAILED:")
        print(f"Length of df: {len(df)}")

        for i, row in df.head(20).iterrows():
            print(row.API_call_response_text)
            print(row.duplicity_count_stats)

## Curate training data (subset dataset)
This script creates a training dataset file to feed the train.py

In [58]:
def subset_data(input, p_displaced = 0.0):
    import random
    from beartype import beartype
    import torch
    import csv
    import ast
    random.seed(42)

    input_dir = construction_dir(input)
    output_dir = construction_dir("curated")

    file_list = [f for f in os.listdir(input_dir) if f.endswith(".csv")]
    assert len(file_list) == 1, "There should only be one csv file in the input directory"

    df = pd.read_csv(os.path.join(input_dir,file_list[0]))
    permitted_op_labels = ["multiply","divide","mult_divide", "add", "subtract", "add_subtract", "mix"]

    # Remove from the df all rows that have an op_label that is not in the permitted_op_labels list:
    indices = (df.tool_name != "Calculator") | (df["op_label"].isin(permitted_op_labels))
    print(f"Dropping {sum(~indices)} rows with an op_label that is not in the permitted_op_labels list")
    df = df[indices]
    
    # Remove from the df rows such that: tool_name != "Calculator" and a tool ranking higher than 2:
    # Leaving calc check out as there is no mult_tool_duplicity
    indices_rm = df["duplicity_ranking_tool"] >= 2 # & df["tool_name"] != "Calculator" 
    print(f"Dropping {indices_rm.shape[0]} rows with a tool ranking higher than 1 where tool_name != Calculator")
    df = df[~indices_rm]

    calc_count = sum(indices & (df.tool_name == "Calculator"))
    print(f"Leaving {calc_count} calc examples")

    calc_df = df[df["tool_name"] == "Calculator"].sort_values(by=["relevance"], ascending=True)
    calendar_df = df[df["tool_name"] == "Calendar"][0:len(calc_df*3)].sort_values(by=["relevance"], ascending=True)
    wiki_df = df[df["tool_name"] == "WikiSearch"][0:len(calc_df*10)].sort_values(by=["relevance"], ascending=True)
    
    # Shuffle a list of indices of length len(df):
    indices = list(range(len(df)))
    random.shuffle(indices)

    # Concat the dfs:
    df[indices[:len(calc_df)]] = calc_df
    df[indices[len(calc_df):len(calc_df)+len(calendar_df)]] = calendar_df
    df[indices[len(calc_df)+len(calendar_df):len(calc_df)+len(calendar_df)+len(wiki_df)]] = wiki_df

    # Sort df by decreasing relevance:
    # df = df.sort_values(by=["relevance"], ascending=True)

    # Sample 20% of the indices:
    indices = random.sample(range(df.shape[0]), int(df.shape[0]*p_displaced))
    shuffled_indices = random.sample(indices, len(indices))
    # Insert shuffled rows into the indices position:
    df.iloc[indices] = df.iloc[shuffled_indices]

    # Save the df to a csv file:
    # The csv file should contain the columns: text, token_type, tool_name, calc_subtype
    df[["text", "API_call_response_text", "tokenized_text", "token_type", "tool_name", "op_label", "relevance", "perplexity", "loss_improvement"]].to_csv(os.path.join(output_dir, "train_short.csv"), index=False)
    # Now we save a file with just the texts:
    df[["text", "tool_name"]].to_csv(os.path.join(output_dir, "train_short_texts.csv"), index=False)

    print(f"Final set has {df.shape[0]} rows")

    print(f"There are {df[df.duplicity_ranking_tool == 1].shape[0]} rows with duplicity_ranking_tool == 1")


    return df
fields_to_analyze = ["loss_improvement", "perplexity", "relevance","duplicity_ranking_tool", "duplicity_ranking_subset", "duplicity_ranking_global"]
df = subset_data("duplicity rankings")

print(df[fields_to_analyze].describe())
print("")
print("")

for tool in TOOL_NAMES:
    print(f"Tool: {tool}")
    print(df[df.tool_name == tool][fields_to_analyze].describe())
    print("")

Dropping 3018 rows with an op_label that is not in the permitted_op_labels list
Dropping 23276 rows with a tool ranking higher than 1 where tool_name != Calculator
Leaving 2421 calc examples
Final set has 7263 rows
There are 2285 rows with duplicity_ranking_tool == 1
       loss_improvement   perplexity    relevance  duplicity_ranking_tool  \
count       7263.000000  7263.000000  7263.000000             7263.000000   
mean           1.284702   263.968278    -0.886763                0.314608   
std            0.860120    56.877593     0.363300                0.464392   
min            0.500078    11.900000    -1.495099                0.000000   
25%            0.784509   233.900000    -1.125895                0.000000   
50%            1.038763   276.200000    -0.940349                0.000000   
75%            1.495171   308.050000    -0.723915                1.000000   
max           12.549429   339.900000     2.599937                1.000000   

       duplicity_ranking_subset  dupli

In [9]:
input_dir = construction_dir("curated")
input_2 = construction_dir("duplicity rankings")

files = [f for f in os.listdir(input_dir ) if f.endswith(".csv")]
files2 = [f for f in os.listdir(input_2 ) if f.endswith(".csv")]

df = pd.read_csv(os.path.join(input_dir, files[0]))

df2 = pd.read_csv(os.path.join(input_2, files2[0]))

print(len(df))
print(df.columns)

train_rows = []

for i, row in df[df.tool_name == "Calculator"].iterrows():
    new_row = df2[df2["API_call_response_text"] == row["API_call_response_text"]]

    train_rows.append(new_row)

df3 = pd.concat(train_rows)
print(len(df3))

for i, row in df3.sort_values(by="relevance", ascending=False).head(10).iterrows():
    print(row.API_call_response_text)

7263
Index(['text', 'API_call_response_text', 'tokenized_text', 'token_type',
       'tool_name', 'op_label'],
      dtype='object')
The product of Q5, Q22 and Q23 is the perfect square 2050 · 4633 · 226 = <TOOL>Calculator(4633*4633)→ 21464689.0</TOOL> 2146468900 = 463302 = (2 · 5 ·
The product of Q5, Q22 and Q23 is the perfect square 2050 · 4633 · 226 = <TOOL>Calculator(2050*4633*226)→ 2146468900.0</TOOL> 2146468900 = 463302 = (2 · 5 ·
d 00:05h Aug 13, 2018 6:45:35 pm - 0% 31 13 <TOOL>Calculator(31/13)→ 2.38</TOOL> 2.38 100% 33% RIDE WIT DA MOB 0
Torque is basically proportional to current, so 550A/300A * 35 ft-lb = <TOOL>Calculator(550/300*35)→ 64.05</TOOL> 64 ft-lb.
 then divide by 2 (16 miles /2) / 60 minutes (16 miles/2)/ 1 minute <TOOL>Calculator(16/2)→ 8.0</TOOL> 8 miles/ 1 minute 1 minute/ 8 miles Thank you for the explanation.
Keweenaw County, Michigan's gender ratio is higher than the Michigan State average of 97 men to 100 women (97:100) or <TOOL>Calculator(97/100)→ 0.97</TO

In [18]:
import random
# Mock dataset to convert to pandas dataframe
data = {
    "num_return_sequences": list(range(100)),
    "max_new_tokens": [10,]*100,
    "top_k": list(reversed(range(100))),
}

# Convert to pandas dataframe
df = pd.DataFrame(data)


# Sample 20% of the indices:
indices = random.sample(range(df.shape[0]), int(df.shape[0]*0.15))
shuffled_indices = random.sample(indices, len(indices))
# Insert shuffled rows into the indices position:
df.iloc[indices] = df.iloc[shuffled_indices]

print(df.head(20))

    num_return_sequences  max_new_tokens  top_k
0                      0              10     99
1                      1              10     98
2                      2              10     97
3                      3              10     96
4                      4              10     95
5                      5              10     94
6                      6              10     93
7                      7              10     92
8                      8              10     91
9                      9              10     90
10                    10              10     89
11                    68              10     31
12                    12              10     87
13                    13              10     86
14                    14              10     85
15                    97              10      2
16                    16              10     83
17                    17              10     82
18                    18              10     81
19                    46              10

## Pipeline:

In [29]:
tool_token_substitution("origin")
white_space_removal("tool token substitution")
token_type_masking("white space removal")
calculator_subtype("token type masking")
relevance_score("calculator subtype")
relevance_score_stats("relevance score")
duplicity_rankings("relevance score")
check_multi_tool_duplicity("duplicity rankings")
df = subset_data("duplicity rankings")

fields_to_analyze = ["loss_improvement", "perplexity", "relevance","duplicity_ranking_tool", "duplicity_ranking_subset", "duplicity_ranking_global"]
print(df[fields_to_analyze].describe())
print("")
print("")

for tool in TOOL_NAMES:
    print(f"Tool: {tool}")
    print(df[df.tool_name == tool][fields_to_analyze].describe())
    print("")


Substituting tokens in files in directory:  /vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/origin/GPTJ_shiny_new
Processing:  Calculator_2.csv
Processing:  Calculator_1.csv
Processing:  Calculator_0.csv
Processing:  WikiSearch_0.csv
Processing:  WikiSearch_1.csv
Processing:  WikiSearch_2.csv
Processing:  Calendar_6.csv
Processing:  Calendar_1.csv
Processing:  Calendar_8.csv
Processing:  Calendar_0.csv
Processing:  Calendar_7.csv
Processing:  Calendar_4.csv
Processing:  Calendar_3.csv
Processing:  Calendar_2.csv
Processing:  Calendar_5.csv
Removing long white spaces from /vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/tool_token_substitution/GPTJ_shiny_new and saving to /vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/white_space_removal/GPTJ_shiny_new
Processing Calculator_1.csv
Processing WikiSearch_2.csv
Processing Calendar_2.csv
Processing Calendar_5.csv
Processing Calen

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1151.65it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 35.94it/s]
                                                                  

Dataset csv downloaded and prepared to /vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/toolformer/cache/csv/GPTJ_shiny_new-a68e88b4610ff761/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.
Processing WikiSearch with 3 files
Downloading and preparing dataset csv/GPTJ_shiny_new to /vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/toolformer/cache/csv/GPTJ_shiny_new-17032c3810bf4d50/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1640.96it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 37.26it/s]
                                                                  

Dataset csv downloaded and prepared to /vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/toolformer/cache/csv/GPTJ_shiny_new-17032c3810bf4d50/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.
Processing Calculator with 3 files
Downloading and preparing dataset csv/GPTJ_shiny_new to /vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/toolformer/cache/csv/GPTJ_shiny_new-6c4ae4d6f525f09f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1596.01it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 40.70it/s]
                                                                  

Dataset csv downloaded and prepared to /vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/toolformer/cache/csv/GPTJ_shiny_new-6c4ae4d6f525f09f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.
Starting calculator subtype......
['train.csv']
Loading train.csv
Downloading and preparing dataset csv/GPTJ_shiny_new to /vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/toolformer/cache/csv/GPTJ_shiny_new-ef4e8e77050d060f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 2931.03it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 108.11it/s]
                                                                   

Dataset csv downloaded and prepared to /vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/toolformer/cache/csv/GPTJ_shiny_new-ef4e8e77050d060f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.
Starting relevance score calculation...
Len of calc_df: 6319, len of calendar_df: 9685, len of wikisearch_df: 10290


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calc_df.loc[:,"relevance"] = - np.sin(calc_theta) * calc_df.perplexity + np.cos(calc_theta) * (calc_df.loss_improvement  - calc_c)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calendar_df.loc[:,"relevance"] = - np.sin(calend_theta) * calendar_df["perplexity"] + np.cos(calend_theta) * (calendar_df["loss_improvement"]  - calend_c)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.

count    26294.000000
mean        -0.898189
std          0.351025
min         -1.495099
25%         -1.126576
50%         -0.945268
75%         -0.736922
max          2.954931
Name: relevance, dtype: float64
Calculator
count    6319.000000
mean       -0.945774
std         0.387528
min        -1.463070
25%        -1.212637
50%        -1.032465
75%        -0.793384
max         2.599937
Name: relevance, dtype: float64

WikiSearch
count    10290.000000
mean        -0.773997
std          0.312015
min         -1.174287
25%         -0.968750
50%         -0.838849
75%         -0.661947
max          2.954931
Name: relevance, dtype: float64

Calendar
count    9685.000000
mean       -0.999094
std         0.324171
min        -1.495099
25%        -1.235086
50%        -1.064829
75%        -0.836967
max         2.084497
Name: relevance, dtype: float64

Starting duplicity rankings for /vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/relevance_score/GPTJ_shiny_new w

KeyError: 'subset'

In [16]:
# Open file texts_used_train_resplandor.txt
import pandas as pd


short_train_df = pd.read_csv("texts_used_train_resplandor.csv")
print(short_train_df.head())

# Open training data at path: os.path.join(CONSTRUCTION_DIRS["calculator subtype"]["output"], MODEL_NAME + "_" + version_name)
train_data_dir = os.path.join(CONSTRUCTION_DIRS["calculator subtype"]["output"], MODEL_NAME + "_" + version_name)
files = [f for f in os.listdir(train_data_dir) if f.endswith(".csv")]
train_df = pd.concat([pd.read_csv(os.path.join(train_data_dir, f)) for f in files])

raw_texts = []

# Print columns in short_train_df:
print(f"Columns in short_train_df: {short_train_df.columns}")

for i, row in short_train_df.iterrows():
    raw_texts.append(train_df.iloc[row["data_id"]]["text"])

    # Assert that tokenized text in train_df is the same as the tokenized text in short_train_df
    # assert tokenizer.encode(train_df.iloc[row["data_id"]]["API_call_response_text"]) == row["text"], f"Tokenized text in train_df is not the same as the tokenized text in short_train_df: {train_df.iloc[row['data_id']]['API_call_response_text']} != {tokenizer.decode(ast.literal_eval(row['text']))}"


print(f"Len of raw_texts: {len(raw_texts)}")

# Save raw_texts to csv file with the same name as the short_train_df file but adding _raw
raw_texts_df = pd.DataFrame(raw_texts)
raw_texts_df.to_csv("texts_used_train_resplandor_raw.csv", index=False)
print(f"Saved raw_texts to file: texts_used_train_resplandor_raw.csv")


                                                text  \
0  [2061, 345, 714, 466, 351, 257, 1597, 318, 923...   
1  [1544, 373, 50400, 9771, 9239, 7, 1267, 39310,...   
2  [464, 1074, 468, 4504, 284, 4505, 475, 612, 31...   
3  [37844, 389, 24495, 319, 262, 50400, 9771, 923...   
4  [1026, 973, 262, 14037, 284, 262, 4495, 284, 2...   

                                          token_type   tool_name calc_subtype  \
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  WikiSearch          NaN   
1  [0, 0, 1, 2, 2, 3, 5, 6, 7, 7, 7, 7, 7, 7, 7, ...    Calendar          NaN   
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, ...  WikiSearch          NaN   
3  [0, 0, 0, 0, 0, 1, 2, 2, 3, 5, 6, 7, 7, 7, 7, ...    Calendar          NaN   
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, ...    Calendar          NaN   

   data_id  
0    19343  
1    11057  
2    18943  
3    11238  
4    10203  
Columns in short_train_df: Index(['text', 'token_type', 'tool_name', 'calc_subtype', 'data_id'], d

In [24]:
# Go through values of tool_specific_duplicity and find max min average and std
from statistics import mean, stdev
import numpy as np

global_duplicity # Dictioray whose values we want to analyze

# Get max, min, average, and std
print(max(global_duplicity.values()))
print(min(global_duplicity.values()))
print(mean(global_duplicity.values()))
print(stdev(global_duplicity.values()))

30
2
2.8113050706566916
1.5150422577922704


In [9]:
# Open train file and study lower relevance data points:

dir_file = "/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/train/construction/duplicity_rankings/GPTJ_shiny_new/train.csv"

df = pd.read_csv(dir_file)

print(df.columns)

for _,row in df.sort_values(by="relevance", ascending=True).head(50).iterrows():
    print(row.API_call_response_text)


Index(['url', 'text', 'API_calls_text', 'API_call_response_text', 'position',
       'loss_improvement', 'processed_arg', 'title', 'date_download', 'digest',
       'length', 'nlines', 'source_domain', 'cc_segment', 'original_nlines',
       'original_length', 'language', 'language_score', 'perplexity', 'bucket',
       'tokenized_start_text', 'tool_name', 'tokenized_text', 'token_type',
       'tokenized_end_text', 'start_token_type', 'end_token_type',
       'start_method_A_train_mask', 'end_method_A_train_mask',
       'duplicity_count_tool', 'duplicity_count_global',
       'duplicity_ranking_tool', 'duplicity_ranking_global',
       'duplicity_count_stats', 'duplicity_ranking_stats', 'op_label',
       'ops_used', 'relevance', 'duplicity_ranking_subset'],
      dtype='object')
Even on a dreary winter day <TOOL>Calendar()→ Today is Sunday, January 11, 2015</TOOL> in New Jersey.
You will both appreciate having some alone time together after <TOOL>Calendar()→ Today is Saturday, Augus