In [17]:
import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [18]:
# Directory holding the "healthy" copy of the cached rewrite CSVs.
healthy_cache_path = "../healthy_cache/cached_rewrites/"
# Keep only regular *.csv files: os.listdir also returns sub-directories and
# stray entries (e.g. editor/notebook checkpoint folders), which cannot be
# read as cache tables.
healthy_cache_files = [
    entry
    for entry in os.listdir(healthy_cache_path)
    if entry.endswith(".csv")
    and os.path.isfile(os.path.join(healthy_cache_path, entry))
]

In [19]:
# Directory holding the Azure copy of the cached rewrite CSVs.
azure_cache_path = "../azure_cache/"
# Keep only regular *.csv files: os.listdir also returns sub-directories and
# stray entries (e.g. editor/notebook checkpoint folders), which cannot be
# read as cache tables.
azure_cache_files = [
    entry
    for entry in os.listdir(azure_cache_path)
    if entry.endswith(".csv")
    and os.path.isfile(os.path.join(azure_cache_path, entry))
]

In [20]:
# Every file name that appears in at least one of the two caches, with names
# present in both listed only once.
combines_files_set = set(healthy_cache_files) | set(azure_cache_files)
combines_files_set

{'ag_news_twitter_aug_back-translate.csv',
 'ag_news_twitter_aug_insert.csv',
 'ag_news_twitter_aug_substitute.csv',
 'ag_news_twitter_stabilityai_StableBeluga-7b_temp=0.0.csv',
 'boss_sentiment_aug_back-translate.csv',
 'boss_sentiment_aug_insert.csv',
 'boss_sentiment_aug_substitute.csv',
 'boss_sentiment_stabilityai_StableBeluga-13B_temp=0.0.csv',
 'boss_sentiment_stabilityai_StableBeluga-7b_temp=0.0.csv',
 'boss_toxicity_aug_back-translate.csv',
 'boss_toxicity_aug_insert.csv',
 'boss_toxicity_aug_substitute.csv',
 'boss_toxicity_stabilityai_StableBeluga-7b_temp=0.0.csv'}

In [30]:
def _read_cache_if_present(csv_path):
    """Load one cache CSV, or return None when the file does not exist.

    Malformed rows are skipped with a warning (``on_bad_lines="warn"``), so a
    truncated or partially written cache file still yields its readable rows.
    """
    if not os.path.exists(csv_path):
        return None
    return pd.read_csv(csv_path, on_bad_lines="warn", engine="python")


# Output directory for the merged caches. Created once, up front: it does not
# depend on the file being processed, and exist_ok keeps the cell re-runnable.
cached_rewrites_path = "../cached_rewrites/"
os.makedirs(cached_rewrites_path, exist_ok=True)

# Sorted so the processing order (and the printed log) is the same on every
# run; iterating the set directly yields an arbitrary order.
for file_name in tqdm(sorted(combines_files_set)):
    print(file_name)
    healthy_cache_file = os.path.join(healthy_cache_path, file_name)
    azure_cache_file = os.path.join(azure_cache_path, file_name)

    # read the frames and skip bad data
    healthy_cache_df = _read_cache_if_present(healthy_cache_file)
    azure_cache_df = _read_cache_if_present(azure_cache_file)
    healthy_cache_len = len(healthy_cache_df) if healthy_cache_df is not None else 0
    azure_cache_len = len(azure_cache_df) if azure_cache_df is not None else 0

    # The healthy frame is listed first so that, with keep="first" below, its
    # row is the one retained whenever both caches contain the same prompt_hash.
    available_frames = [
        frame for frame in (healthy_cache_df, azure_cache_df) if frame is not None
    ]
    if not available_frames:
        raise Exception("Both files are None")

    # ignore_index avoids duplicated index labels in the concatenated frame.
    combined_df = pd.concat(available_frames, ignore_index=True)
    combined_df = combined_df.drop_duplicates(subset=["prompt_hash"], keep="first")
    combined_len = len(combined_df)
    print(f"Healthy cache len: {healthy_cache_len}, Azure cache len: {azure_cache_len}, Combined len: {combined_len}")

    # Write the merged cache under the same file name, without the index column.
    combined_df.to_csv(os.path.join(cached_rewrites_path, file_name), index=False)
    

  0%|          | 0/13 [00:00<?, ?it/s]

boss_toxicity_aug_substitute.csv
Healthy cache len: 120032, Azure cache len: 34531, Combined len: 120032


  8%|▊         | 1/13 [00:06<01:14,  6.23s/it]

boss_toxicity_aug_back-translate.csv
Healthy cache len: 116771, Azure cache len: 23299, Combined len: 116801


 15%|█▌        | 2/13 [00:11<01:02,  5.72s/it]

ag_news_twitter_aug_insert.csv
Healthy cache len: 15200, Azure cache len: 15200, Combined len: 15200


 23%|██▎       | 3/13 [00:12<00:35,  3.59s/it]

boss_sentiment_aug_back-translate.csv
Healthy cache len: 61580, Azure cache len: 61580, Combined len: 61580


 31%|███       | 4/13 [00:15<00:30,  3.38s/it]

ag_news_twitter_aug_substitute.csv
Healthy cache len: 15200, Azure cache len: 15200, Combined len: 15200


 46%|████▌     | 6/13 [00:16<00:11,  1.70s/it]

boss_sentiment_stabilityai_StableBeluga-13B_temp=0.0.csv
Healthy cache len: 2132, Azure cache len: 2132, Combined len: 2132
boss_sentiment_stabilityai_StableBeluga-7b_temp=0.0.csv
Healthy cache len: 123133, Azure cache len: 91071, Combined len: 123133


 54%|█████▍    | 7/13 [00:26<00:26,  4.48s/it]

boss_sentiment_aug_insert.csv
Healthy cache len: 61580, Azure cache len: 61580, Combined len: 61580


 62%|██████▏   | 8/13 [00:30<00:21,  4.30s/it]

boss_sentiment_aug_substitute.csv
Healthy cache len: 61580, Azure cache len: 61580, Combined len: 61580


 69%|██████▉   | 9/13 [00:34<00:16,  4.04s/it]

boss_toxicity_stabilityai_StableBeluga-7b_temp=0.0.csv


Skipping line 88066: unexpected end of data


Healthy cache len: 88064, Azure cache len: 122214, Combined len: 133512


 77%|███████▋  | 10/13 [00:49<00:22,  7.55s/it]

boss_toxicity_aug_insert.csv
Healthy cache len: 120032, Azure cache len: 120032, Combined len: 120032


 85%|████████▍ | 11/13 [00:58<00:15,  7.87s/it]

ag_news_twitter_stabilityai_StableBeluga-7b_temp=0.0.csv
Healthy cache len: 30400, Azure cache len: 30400, Combined len: 30400


 92%|█████████▏| 12/13 [01:01<00:06,  6.51s/it]

ag_news_twitter_aug_back-translate.csv
Healthy cache len: 15200, Azure cache len: 15200, Combined len: 15200


100%|██████████| 13/13 [01:02<00:00,  4.82s/it]
