In [1]:
import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
healthy_cache_path = "../healthyml_cache/"
healthy_cache_files = os.listdir(healthy_cache_path)

In [3]:
azure_cache_path = "../cached_rewrites_old/"
azure_cache_files = os.listdir(azure_cache_path)

In [4]:
# files without overlap
combines_files_set = set(healthy_cache_files).union(set(azure_cache_files))
combines_files_set

{'ag_news_twitter_aug_back-translate.csv',
 'ag_news_twitter_aug_insert.csv',
 'ag_news_twitter_aug_substitute.csv',
 'ag_news_twitter_stabilityai_StableBeluga-7b_temp=0.0.csv',
 'boss_sentiment_aug_back-translate.csv',
 'boss_sentiment_aug_insert.csv',
 'boss_sentiment_aug_substitute.csv',
 'boss_sentiment_stabilityai_StableBeluga-13B_temp=0.0.csv',
 'boss_sentiment_stabilityai_StableBeluga-7b_temp=0.0.csv',
 'boss_toxicity_aug_back-translate.csv',
 'boss_toxicity_aug_insert.csv',
 'boss_toxicity_aug_substitute.csv',
 'boss_toxicity_stabilityai_StableBeluga-7b_temp=0.0.csv'}

In [5]:
for file_name in tqdm(combines_files_set):
    print(file_name)
    healthy_cache_file = healthy_cache_path + file_name
    azure_cache_file = azure_cache_path + file_name
    
    # read the frames and skip bad data
    healthy_cache_df = pd.read_csv(healthy_cache_file, on_bad_lines="warn", engine="python") if os.path.exists(healthy_cache_file) else None
    azure_cache_df = pd.read_csv(azure_cache_file, on_bad_lines="warn", engine="python") if os.path.exists(azure_cache_file) else None
    healthy_cache_len = len(healthy_cache_df) if healthy_cache_df is not None else 0
    azure_cache_len = len(azure_cache_df) if azure_cache_df is not None else 0
    
    combined_df = None
    if healthy_cache_df is not None and azure_cache_df is not None:
        combined_df = pd.concat([healthy_cache_df, azure_cache_df])
    elif healthy_cache_df is None and azure_cache_df is not None:
        combined_df = azure_cache_df
    elif healthy_cache_df is not None and azure_cache_df is None:
        combined_df = healthy_cache_df
    else:
        raise Exception("Both files are None")
    
    combined_df = combined_df.drop_duplicates(subset=["prompt_hash"], keep="first")
    combined_len = len(combined_df)
    print(f"Healthy cache len: {healthy_cache_len}, Azure cache len: {azure_cache_len}, Combined len: {combined_len}")
    
    cached_rewrites_path = "../cached_rewrites/"
    if os.path.exists(cached_rewrites_path) is False:
        os.mkdir(cached_rewrites_path)
    
    combined_df.to_csv(cached_rewrites_path + file_name, index=False)
    

  0%|          | 0/13 [00:00<?, ?it/s]

boss_sentiment_aug_insert.csv
Healthy cache len: 61580, Azure cache len: 61580, Combined len: 61580


  8%|▊         | 1/13 [00:01<00:18,  1.54s/it]

ag_news_twitter_aug_insert.csv


 15%|█▌        | 2/13 [00:02<00:09,  1.10it/s]

Healthy cache len: 15200, Azure cache len: 15200, Combined len: 15200
boss_sentiment_stabilityai_StableBeluga-7b_temp=0.0.csv
Healthy cache len: 123133, Azure cache len: 123133, Combined len: 123133


 23%|██▎       | 3/13 [00:06<00:24,  2.47s/it]

boss_toxicity_aug_back-translate.csv
Healthy cache len: 120032, Azure cache len: 116807, Combined len: 120032


 31%|███       | 4/13 [00:09<00:23,  2.64s/it]

boss_sentiment_aug_substitute.csv
Healthy cache len: 61580, Azure cache len: 61580, Combined len: 61580


 38%|███▊      | 5/13 [00:10<00:17,  2.19s/it]

ag_news_twitter_aug_substitute.csv


 46%|████▌     | 6/13 [00:11<00:11,  1.57s/it]

Healthy cache len: 15200, Azure cache len: 15200, Combined len: 15200
ag_news_twitter_aug_back-translate.csv


 54%|█████▍    | 7/13 [00:11<00:07,  1.18s/it]

Healthy cache len: 15200, Azure cache len: 15200, Combined len: 15200
boss_toxicity_stabilityai_StableBeluga-7b_temp=0.0.csv
Healthy cache len: 240019, Azure cache len: 190596, Combined len: 240032


 62%|██████▏   | 8/13 [00:20<00:18,  3.73s/it]

boss_sentiment_aug_back-translate.csv
Healthy cache len: 61580, Azure cache len: 61580, Combined len: 61580


 69%|██████▉   | 9/13 [00:21<00:11,  2.97s/it]

boss_toxicity_aug_substitute.csv
Healthy cache len: 120032, Azure cache len: 120032, Combined len: 120032


 77%|███████▋  | 10/13 [00:24<00:08,  3.00s/it]

ag_news_twitter_stabilityai_StableBeluga-7b_temp=0.0.csv
Healthy cache len: 30400, Azure cache len: 30400, Combined len: 30400


 85%|████████▍ | 11/13 [00:26<00:05,  2.52s/it]

boss_toxicity_aug_insert.csv
Healthy cache len: 120032, Azure cache len: 120032, Combined len: 120032


100%|██████████| 13/13 [00:29<00:00,  2.30s/it]

boss_sentiment_stabilityai_StableBeluga-13B_temp=0.0.csv
Healthy cache len: 2132, Azure cache len: 2132, Combined len: 2132



