In [48]:
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import pandas as pd
import json
import wandb
import os
import hashlib
tqdm.pandas()

## Upload Local Cache to HF

In [65]:
cache_dir = "../cached_rewrites"
cache_files = os.listdir(cache_dir)
combined_dataset_dict = DatasetDict()
for cache_file in tqdm(cache_files):
    frame = pd.read_csv(os.path.join(cache_dir, cache_file), engine="python", on_bad_lines="skip")
    dataset = Dataset.from_pandas(frame)
    cache_split_name = cache_file.replace(".csv", "").replace(".", "dot").replace("=", "equals").replace("-", "_")
    combined_dataset_dict[cache_split_name] = dataset

combined_dataset_dict

100%|██████████| 13/13 [00:19<00:00,  1.46s/it]


DatasetDict({
    boss_sentiment_aug_back_translate: Dataset({
        features: ['prompt_hash', 'prompt', 'rewrites'],
        num_rows: 61580
    })
    boss_toxicity_aug_back_translate: Dataset({
        features: ['prompt_hash', 'prompt', 'rewrites'],
        num_rows: 116807
    })
    ag_news_twitter_aug_back_translate: Dataset({
        features: ['prompt_hash', 'prompt', 'rewrites'],
        num_rows: 15200
    })
    boss_sentiment_aug_substitute: Dataset({
        features: ['prompt_hash', 'prompt', 'rewrites'],
        num_rows: 61580
    })
    boss_sentiment_stabilityai_StableBeluga_13B_tempequals0dot0: Dataset({
        features: ['prompt_hash', 'prompt', 'rewrites'],
        num_rows: 2132
    })
    ag_news_twitter_aug_insert: Dataset({
        features: ['prompt_hash', 'prompt', 'rewrites'],
        num_rows: 15200
    })
    boss_toxicity_aug_substitute: Dataset({
        features: ['prompt_hash', 'prompt', 'rewrites'],
        num_rows: 120032
    })
    ag_news_twit

In [67]:
combined_dataset_dict.push_to_hub("Kyle1668/LLM-TTA-Cached-Rewrites")

Creating parquet from Arrow format: 100%|██████████| 62/62 [00:00<00:00, 143.14ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.97s/it]
Creating parquet from Arrow format: 100%|██████████| 117/117 [00:00<00:00, 139.39ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.01s/it]
Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 138.21ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s]
Creating parquet from Arrow format: 100%|██████████| 62/62 [00:00<00:00, 125.65ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.27s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 108.17ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 122.38ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
Creating parquet from Arrow format

## Update Local ID Toxicity Paraphrase Cache

In [42]:
local_paths = [
    "data/BOSS_Toxicity_ID_BERT_Paraphrase/validation_stabilityai/StableBeluga-7b_random_0_Kyle1668/boss-toxicity-bert-base-uncased_style_logs.table.json"
    ]
with open(local_paths[0], "r") as f:
    logs = json.load(f)

frame = pd.DataFrame(data=logs["data"], columns=logs["columns"])
frame.head(3)

Unnamed: 0,entropy,mean probs,all probs,all entropies,latency,input,original_input,style prompt,mean exemplar distance,judgment,prompt,label,original judgment,original entropy,entropy decrease,entropy decreased,original probs,outcome
0,0.000379,"[0.9999665021896362, 3.351096893311478e-05]","[[0.9999673366546631, 3.262787504354492e-05], ...","[0.0003697199572343379, 0.0003697199572343379,...",12.172549,[<aug>As long as the children are at least 1/8...,As long as the children are at least 1/8 Nativ...,### System:\n### Instructions ###\nThe assista...,,0,,0,0,0.00037,-9e-06,False,"[[0.9999673366546631, 3.262787504354492e-05], ...",Unchanged Correct
1,7.1e-05,"[0.9999946355819702, 5.39638949703658e-06]","[[0.9999946355819702, 5.372834948502714e-06], ...","[7.055921741994098e-05, 6.757902883691713e-05,...",8.861567,[<aug>Begin by eliminating the Jones Act to al...,"Start by flushing the Jones Act, in order to c...",### System:\n### Instructions ###\nThe assista...,,0,,0,0,7.5e-05,5e-06,True,"[[0.9999946355819702, 5.372834948502714e-06], ...",Unchanged Correct
2,7e-05,"[0.9999946355819702, 5.326296559360344e-06]","[[0.9999945163726807, 5.43204214409343e-06], [...","[7.133732287911698e-05, 7.105202530510724e-05,...",10.273449,[<aug>There are usually only a few individuals...,There are usually very few people who run for ...,### System:\n### Instructions ###\nThe assista...,,0,,0,0,6.8e-05,-2e-06,False,"[[0.9999945163726807, 5.43204214409343e-06], [...",Unchanged Correct


In [52]:
def parse_augmentations(augmentations):
    if augmentations is None or len(augmentations) == 0:
        return []

    return [aug.replace("<aug>", "").replace("</aug>", "") for aug in augmentations[:4]]

cache_frame = frame[["style prompt", "input"]]
cache_frame = cache_frame.rename(columns={"style prompt": "prompt", "input": "rewrites"})
cache_frame["rewrites"] = cache_frame["rewrites"].progress_apply(parse_augmentations)
cache_frame["prompt_hash"] = cache_frame["prompt"].progress_apply(lambda prompt: hashlib.sha256(prompt.encode()).hexdigest())
cache_frame = cache_frame[["prompt_hash", "prompt", "rewrites"]]
cache_frame

100%|██████████| 97320/97320 [00:00<00:00, 112407.73it/s]
100%|██████████| 97320/97320 [00:00<00:00, 214786.25it/s]


Unnamed: 0,prompt_hash,prompt,rewrites
0,d1206639cb51475f51c28ef43f1b25ff0f2db0d974ac6c...,### System:\n### Instructions ###\nThe assista...,[As long as the children are at least 1/8 Nati...
1,fbeaf936c124e90d6264efa2639c102331529aa6c3355f...,### System:\n### Instructions ###\nThe assista...,[Begin by eliminating the Jones Act to allevia...
2,cde3bf7c31dd80958e36415bb747f1c146d9c01b2129ce...,### System:\n### Instructions ###\nThe assista...,[There are usually only a few individuals who ...
3,9c6b0c50babdd4c7faf3e791d67cabe6f27d0d0437ee5e...,### System:\n### Instructions ###\nThe assista...,[People who disrespect Canada should consider ...
4,43b49cdd4de5087254158d300715eaf1a79d1fcd5c83ff...,### System:\n### Instructions ###\nThe assista...,[If Trump has any sense of decency and persona...
...,...,...,...
97315,5ae75a6b84dbf7e7dcbd636be61bb7204f1a27796ea986...,### System:\n### Instructions ###\nThe assista...,[Who would have thought that the media is fill...
97316,3bd4d0541489de0a8d2dea5a2adff3bf1272b69b7a60f6...,### System:\n### Instructions ###\nThe assista...,"[Paraphased Text., Paraphased Text., A precede..."
97317,ace9dad5250b6d5ec01a6c1c41b6273b38088839b74583...,### System:\n### Instructions ###\nThe assista...,[There are many individuals who were aware of ...
97318,4b7f32ca2e68b7070a316aec9d1a8666885e58715ffc3a...,### System:\n### Instructions ###\nThe assista...,[Attrition can be a tool to alleviate some of ...


In [53]:
local_cache = "../cached_rewrites/boss_toxicity_stabilityai_StableBeluga-7b_temp=0.0_old.csv"
locale_cache_frame = pd.read_csv(local_cache)
locale_cache_frame

Unnamed: 0,prompt_hash,prompt,rewrites
0,168011d03c4817b55e74ff37c024cdbc10ec0d28cf765b...,### System:\n### Instructions ###\nThe assista...,['While some members of Congress maintain stro...
1,d7b591156972ab00b7243e114fefbb92f5a15725742062...,### System:\n### Instructions ###\nThe assista...,['The original inhabitants of the land have a ...
2,1595eef0d43e070a5484019b69aba7936de404813c65f8...,### System:\n### Instructions ###\nThe assista...,"[""It's not about who you sleep with, but rathe..."
3,d30a5d45e551f91df05e769a51c837b06a42d82ebf58d1...,### System:\n### Instructions ###\nThe assista...,"[""I support women's rights, but I'm not entire..."
4,76a7edfe87a60ae2cc1bc4fce2a4dc245e3664b0c9fb4a...,### System:\n### Instructions ###\nThe assista...,['It is a significant matter in the Bible when...
...,...,...,...
90335,39c26f4a80526d1ece3b8c7551ebbddae82ec66b7992dc...,### System:\n### Instructions ###\nThe assista...,"['The text is already paraphrased.', 'The text..."
90336,a07c6e187a9fcba58a5d975aebdc0a4ee749dc35aa9472...,### System:\n### Instructions ###\nThe assista...,"[""Sounds like Butts provided the Health Minist..."
90337,e71aaef2eb5da5896dc74c3b7587bb0aadda66fb233596...,### System:\n### Instructions ###\nThe assista...,"['I agree, so establish free clinics where tho..."
90338,ffaa0562187c550dcc153e02b21b3d0908859a956e8639...,### System:\n### Instructions ###\nThe assista...,"[""A deal that will make us uncompetitive and f..."


In [56]:
# find records between local_cache and cache_frame where the prompt_hash are the same
new_cache = pd.concat([locale_cache_frame, cache_frame]).drop_duplicates(subset=["prompt_hash"], keep="last")
new_cache.to_csv(local_cache.replace("_old", ""), index=False)
new_cache

Unnamed: 0,prompt_hash,prompt,rewrites
0,168011d03c4817b55e74ff37c024cdbc10ec0d28cf765b...,### System:\n### Instructions ###\nThe assista...,['While some members of Congress maintain stro...
1,d7b591156972ab00b7243e114fefbb92f5a15725742062...,### System:\n### Instructions ###\nThe assista...,['The original inhabitants of the land have a ...
2,1595eef0d43e070a5484019b69aba7936de404813c65f8...,### System:\n### Instructions ###\nThe assista...,"[""It's not about who you sleep with, but rathe..."
3,d30a5d45e551f91df05e769a51c837b06a42d82ebf58d1...,### System:\n### Instructions ###\nThe assista...,"[""I support women's rights, but I'm not entire..."
4,76a7edfe87a60ae2cc1bc4fce2a4dc245e3664b0c9fb4a...,### System:\n### Instructions ###\nThe assista...,['It is a significant matter in the Bible when...
...,...,...,...
97315,5ae75a6b84dbf7e7dcbd636be61bb7204f1a27796ea986...,### System:\n### Instructions ###\nThe assista...,[Who would have thought that the media is fill...
97316,3bd4d0541489de0a8d2dea5a2adff3bf1272b69b7a60f6...,### System:\n### Instructions ###\nThe assista...,"[Paraphased Text., Paraphased Text., A precede..."
97317,ace9dad5250b6d5ec01a6c1c41b6273b38088839b74583...,### System:\n### Instructions ###\nThe assista...,[There are many individuals who were aware of ...
97318,4b7f32ca2e68b7070a316aec9d1a8666885e58715ffc3a...,### System:\n### Instructions ###\nThe assista...,[Attrition can be a tool to alleviate some of ...
