```This notebook shows how we compiled the n-gram counts for the QA datasets from Dolma.```

# Country-Capital

## Get counts

In [None]:

#Counting frequencies of countries and capitals on Dolma
import requests
import pandas as pd
from tqdm.auto import tqdm
# Load countries and capitals from GitHub.
country_capitals = pd.read_csv("https://raw.githubusercontent.com/icyrockcom/country-capitals/refs/heads/master/data/country-list.csv")[['country','capital']]

# Remove rows where country and capital are the same string, if any exist.
# .copy() makes the filtered frame independent, so the .loc writes below do not
# target a slice of the original frame (avoids SettingWithCopyWarning).
country_capitals = country_capitals[country_capitals['country'] != country_capitals['capital']].copy()

# Create the column up front so the sort below still works when every request
# fails or the loop is interrupted before the first successful row.
country_capitals['count'] = float('nan')

# Per-request timeout in seconds; without one a stalled connection blocks the loop forever.
REQUEST_TIMEOUT_S = 120

# Go through each row and count co-occurrences of the country and capital with infini-gram.
for row in tqdm(country_capitals.itertuples(), total=len(country_capitals)):
    country = row.country
    capital = row.capital
    count = 0
    try:
        # Cased variations of country and capital and their combinations.
        # dict.fromkeys keeps the order but drops identical queries, so a pair
        # whose variants coincide is not counted twice.
        cases = dict.fromkeys([
            f'{country.title()} AND {capital.title()}',
            f'{country.lower()} AND {capital.lower()}',
            f'{country} AND {capital.lower()}',
            f'{country.lower()} AND {capital}',
        ])
        for case in cases:
            payload = {
                'index': 'v4_dolma-v1_7_llama',
                'query_type': 'count',
                'query': f'{case}'}
            response = requests.post('https://api.infini-gram.io/', json=payload, timeout=REQUEST_TIMEOUT_S)
            response.raise_for_status()
            result = response.json()
            # Surface an API-level error message instead of an opaque KeyError on 'count'.
            if 'error' in result:
                raise RuntimeError(result['error'])
            count += result['count']
        # Only written when all variants succeeded; failed rows keep NaN.
        country_capitals.loc[row.Index, 'count'] = count
    # Stop early on a manual interrupt but still save what was collected so far.
    except KeyboardInterrupt:
        break

    # Report the failure and move on to the next country.
    except Exception as e:
        print(f"Error for {country} and {capital}: {e}")
        continue

# Sort by count (rows without a count end up last) and reset the index.
country_capitals = country_capitals.sort_values(by='count', ascending=False).reset_index(drop=True)

country_capitals.to_csv("country-capitals-freq.csv", index=False)

  0%|          | 0/241 [00:00<?, ?it/s]

  0%|          | 1/241 [00:33<2:13:37, 33.41s/it]


## Making QA data and paraphrases with ChatGPT (requires an OpenAI API key)

In [None]:
import os
import json
from openai import OpenAI
import pandas as pd
from tqdm.auto import tqdm

# Read the API key from the environment rather than hardcoding it in the notebook.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# Read the country capitals data
country_capitals = pd.read_csv("country-capitals-freq.csv")

# Shuffle the dataframe (seeded so the processing order is reproducible)
country_capitals = country_capitals.sample(frac=1, random_state=42).reset_index(drop=True)

client = OpenAI(
    api_key=OPENAI_API_KEY,
)

dataset = []
# Load existing data if any, so an interrupted run can be resumed
if os.path.exists("capital_questions_dataset.json"):
    with open("capital_questions_dataset.json", "r") as f:
        dataset = json.load(f)

processed_countries = {item["country"] for item in dataset}

for row in tqdm(country_capitals.itertuples(), total=len(country_capitals)):
    country = row.country
    capital = row.capital
    count = row.count

    # Skip if already processed
    if country in processed_countries:
        continue

    print(f"Processing {country} with capital {capital} (frequency count: {count})")

    prompt = f"""For the country {country} with capital {capital}, generate:
    1. A direct question asking what the capital is and its answer
    2. A paraphrased version of the capital question that tests the same knowledge
    
    Format the output as a JSON with the following structure:
    {{
        "direct_question": "question",
        "direct_answer": "answer",
        "paraphrased_question": "question",
        "paraphrased_answer": "answer"
    }}"""

    try:
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="gpt-4",
            temperature=0.7
        )

        # Raises if the model did not return bare JSON; the entry is then skipped below.
        result = json.loads(response.choices[0].message.content)
        result["country"] = country
        result["capital"] = capital
        # Keep the n-gram count with the entry: the bucketing step reads a 'count'
        # field from this file. A missing count is stored as null, not a bare NaN token.
        result["count"] = None if pd.isna(count) else int(count)
        dataset.append(result)

        # Save after each successful addition
        with open("capital_questions_dataset.json", "w") as f:
            json.dump(dataset, f, indent=2)

    except Exception as e:
        print(f"Error processing {country}: {str(e)}")
        continue

print(f"Generated dataset with {len(dataset)} entries")

## Making count buckets, pushing to HF 

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
import os
import json
import datasets
hf_token = os.getenv("HUGGINGFACE_TOKEN")
# Target Hub repository in "<user-or-org>/<dataset-name>" form. Set the
# HF_DATASET_NAME environment variable or replace the placeholder string.
HF_DATASET_NAME = os.getenv("HF_DATASET_NAME", "<your-username>/<your-dataset-name>")

# Load and process your custom dataset
with open("capital_questions_dataset.json", "r") as f:
    dataset = json.load(f)

# Convert to pandas DataFrame
df = pd.DataFrame(dataset)

# The generated QA file may lack the n-gram count (entirely or for some entries).
# Recover missing values from the frequency table, matching on (country, capital).
if 'count' not in df.columns or df['count'].isna().any():
    freq = (
        pd.read_csv("country-capitals-freq.csv")[['country', 'capital', 'count']]
        .drop_duplicates(['country', 'capital'])
        .rename(columns={'count': 'count_from_freq'})
    )
    df = df.merge(freq, on=['country', 'capital'], how='left')
    if 'count' in df.columns:
        df['count'] = df['count'].fillna(df['count_from_freq'])
        df = df.drop(columns='count_from_freq')
    else:
        df = df.rename(columns={'count_from_freq': 'count'})

# Calculate count thresholds for three roughly equal-sized buckets
count_thresholds = df['count'].quantile([0.33, 0.67])

# Create three buckets based on count.
# Rows whose count is still NaN match none of the comparisons and fall in no bucket.
low_count = df[df['count'] <= count_thresholds[0.33]]
medium_count = df[(df['count'] > count_thresholds[0.33]) & (df['count'] <= count_thresholds[0.67])]
high_count = df[df['count'] > count_thresholds[0.67]]

# Convert each bucket to a HuggingFace dataset.
# NOTE(review): these three objects are not referenced again in the visible code.
low_count_dataset = Dataset.from_pandas(low_count)
medium_count_dataset = Dataset.from_pandas(medium_count)
high_count_dataset = Dataset.from_pandas(high_count)

print(f"Low count bucket (count <= {count_thresholds[0.33]:.1f}): {len(low_count)} samples")
print(f"Medium count bucket ({count_thresholds[0.33]:.1f} < count <= {count_thresholds[0.67]:.1f}): {len(medium_count)} samples") 
print(f"High count bucket (count > {count_thresholds[0.67]:.1f}): {len(high_count)} samples")

def create_count_specific_datasets(data, count_range):
    """Build the direct-question and paraphrased-question datasets for one bucket.

    Args:
        data: List of record dicts. Each record must provide 'country',
            'capital', 'count' and 'paraphrased_question'. The direct question
            and answer are read from 'question'/'answer' when present, and
            otherwise from 'direct_question'/'direct_answer' (the keys written
            by the QA-generation cell).
        count_range: Label of the bucket ('low', 'medium', 'high', 'all').
            Currently unused; kept so existing calls continue to work.

    Returns:
        Tuple (direct_dataset, paraphrase_dataset) of datasets with the columns
        question, answer, country, capital and count. Both use the same answer.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    def _pick(record, preferred, fallback):
        # The legacy key wins when present; otherwise use the generated-file key.
        return record[preferred] if preferred in record else record[fallback]

    answers = [_pick(d, 'answer', 'direct_answer') for d in data]
    countries = [d['country'] for d in data]
    capitals = [d['capital'] for d in data]
    counts = [d['count'] for d in data]

    # Create DataFrames for each subset
    direct_df = pd.DataFrame({
        'question': [_pick(d, 'question', 'direct_question') for d in data],
        'answer': answers,
        'country': countries,
        'capital': capitals,
        'count': counts
    })

    paraphrase_df = pd.DataFrame({
        'question': [d['paraphrased_question'] for d in data],
        'answer': answers,
        'country': countries,
        'capital': capitals,
        'count': counts
    })

    # Convert DataFrames to Datasets
    direct_dataset = Dataset.from_pandas(direct_df)
    paraphrase_dataset = Dataset.from_pandas(paraphrase_df)

    return direct_dataset, paraphrase_dataset

# Create datasets for each count bucket
low_direct, low_paraphrase = create_count_specific_datasets(low_count.to_dict('records'), 'low')
med_direct, med_paraphrase = create_count_specific_datasets(medium_count.to_dict('records'), 'medium')
high_direct, high_paraphrase  = create_count_specific_datasets(high_count.to_dict('records'), 'high')

# Create datasets for all data (I.E HIGH + MEDIUM + LOW)
all_direct, all_paraphrase = create_count_specific_datasets(df.to_dict('records'), 'all')


def _load_tofu_split(config_name):
    """Load the train split of one TOFU config with any multiple-choice option columns removed."""
    split = datasets.load_dataset('locuslab/TOFU', config_name)["train"]
    option_columns = [
        col for col in ('option1', 'option2', 'option3', 'option4')
        if col in split.column_names
    ]
    return split.remove_columns(option_columns)


# Load auxiliary datasets. real_authors was referenced below without ever being
# defined; it is loaded here from the TOFU config of the same name.
# NOTE(review): confirm that the "real_authors" config is the intended source.
real_authors = _load_tofu_split("real_authors")
world_facts = _load_tofu_split("world_facts")

dataset_dict = DatasetDict({
    'forget_low_count': low_direct,
    'forget_low_count_paraphrased': low_paraphrase,
    'forget_medium_count': med_direct,
    'forget_medium_count_paraphrased': med_paraphrase,
    'forget_high_count': high_direct,
    'forget_high_count_paraphrased': high_paraphrase,
    'forget_all': all_direct,
    'forget_all_paraphrased': all_paraphrase, 
    'real_authors': real_authors,
    'world_facts': world_facts
})

# Each entry is pushed as its own config of the same Hub repository.
# token=None (HUGGINGFACE_TOKEN unset) falls back to the locally cached login.
for config_name, dataset_ in dataset_dict.items():
    dataset_.push_to_hub(HF_DATASET_NAME, config_name=config_name, token=hf_token)
