In [29]:
# use anthropic api to generate rationales for dataset
from datasets import load_dataset, Dataset
from anthropic import Anthropic as Anthropic
from dotenv import load_dotenv
from tqdm import tqdm
import pandas as pd
from transformers import AutoTokenizer

# Set the maximum column width to display the full content
pd.set_option('display.max_colwidth', None)
# # Optionally, you can also set other display options:
# pd.set_option('display.max_rows', None)  # Show all rows
# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.width', None)  # Auto-detect the display width
# Enable the `progress_apply` method on pandas objects (used below to show a
# progress bar while rationales are generated row by row)
tqdm.pandas()
# Load variables from a local .env file into the environment
# (presumably where the Anthropic API key is kept — confirm)
load_dotenv()

True

In [34]:
# Load the task collection; the printed result further down shows a DatasetDict with 'train' and 'test' splits
dataset = load_dataset("jeggers/CoT-Collection")
# API client; no key is passed explicitly, so it is presumably picked up from the environment — confirm
client = Anthropic()

In [4]:
# Tokenizer used in this notebook to count tokens (input-length filter and rationale length)
tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-125m")

In [35]:
# add index column to dataset (train split only): each row's position before any filtering
# NOTE(review): this mutates dataset["train"] in place, so re-running the cell will try to
# add 'index' a second time and likely fail — reload the dataset first
dataset["train"] = dataset["train"].add_column('index', range(0, len(dataset["train"])))

# filter dataset to only include inputs <= 150 tokens
def batch_filter(batch, max_tokens=150):
    """Return a keep/drop mask for a batch of examples based on input length.

    Args:
        batch: mapping with a 'final_input' entry holding the batch's input texts.
        max_tokens: longest tokenized input (inclusive) that is kept; defaults
            to 150, the limit this notebook filters on.

    Returns:
        A list with one bool per example, True where the tokenized
        'final_input' has at most `max_tokens` token ids.
    """
    # one tokenizer call for the whole batch; 'input_ids' holds one id list per text
    tokenized = tokenizer(batch['final_input'])
    return [len(ids) <= max_tokens for ids in tokenized['input_ids']]

# batch_size=-1: the datasets docs describe batch_size <= 0 as "whole dataset in one batch",
# so batch_filter tokenizes each split in a single call — confirm for the installed version
dataset = dataset.filter(batch_filter, batched=True, batch_size=-1)
print(dataset)
# spot check: original (pre-filter) position of the row that now sits at position 1000
print(dataset["train"]["index"][1000])

Filter: 100%|██████████| 50000/50000 [00:06<00:00, 8045.28 examples/s]

DatasetDict({
    train: Dataset({
        features: ['final_input', 'final_target', 'Canary String', 'category', 'original_dataset', 'evaluation_method', 'dataset', 'index'],
        num_rows: 33010
    })
    test: Dataset({
        features: ['final_input', 'final_target', 'Canary String', 'category', 'original_dataset', 'evaluation_method', 'dataset'],
        num_rows: 5176
    })
})
1482





In [8]:
# keep results in a list in case of interruption or failure
# (generate_rationale appends every raw generation here; re-running this cell discards them)
rationales = []

In [15]:
# Marker the model is told to put in front of its final answer; the same
# string is used later to split a generation into rationale and answer.
anwser_trigger = "ANSWER: "
def generate_rationale(task, model="claude-3-5-sonnet-20240620", max_tokens=250, temperature=0):
    """Ask the model for a short step-by-step rationale for one task.

    The task text is sent as a single user message together with a fixed
    system prompt that asks for a concise rationale ending in
    `anwser_trigger` followed by the answer.

    Args:
        task: the task text to solve.
        model: model identifier passed to the API.
        max_tokens: generation cap passed to the API. A reply cut off at this
            cap may never reach the trigger and then has no parsable answer.
        temperature: sampling temperature passed to the API.

    Returns:
        The text of the first content block of the reply, or "" when the
        reply has no content blocks.

    Side effects:
        Appends the returned text to the module-level `rationales` list so
        finished generations survive an interrupted run.
    """
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        # NOTE: the continuation lines' leading spaces are part of the prompt
        # string; they are kept as-is so the prompt sent to the API is unchanged.
        system=f"""Think step-by-step to solve the given task.
        Make your rationale as short and concise as possible.
        Do not summarize or repeat the task, part of it or your own solution.
        Don't use 'Let's approach this problem step by step.' or similar phrases.
        At the end, put '{anwser_trigger}' followed by the answer to the task.
        Follow the specified format in the task description.""",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": task
                    }
                ]
            }
        ]
    )
    # An empty content list would otherwise raise IndexError and abort the
    # whole apply() run; record an empty generation for that row instead.
    text = message.content[0].text if message.content else ""
    rationales.append(text)
    return text

In [23]:

# generate rationales for 500 samples of the dataset (rows 500-999 of dataset['train']);
# progress_apply calls generate_rationale once per row, i.e. one API request each
df = dataset['train'].select(range(500, 1000)).to_pandas()
df['rationale'] = df['final_input'].progress_apply(generate_rationale)

100%|██████████| 500/500 [17:49<00:00,  2.14s/it]


In [24]:
# split generation into rationales and answers
def split_rationale_answer(x):
    """Split one raw generation into (rationale, answer).

    The answer is whatever follows the LAST occurrence of `anwser_trigger`,
    with surrounding whitespace removed so it can be compared to the
    reference target by exact string match. The rationale is everything
    before that occurrence, returned unmodified.

    Args:
        x: the raw generated text.

    Returns:
        (rationale, answer) when the trigger is present, otherwise (x, None).
    """
    # rpartition splits on the last occurrence; an earlier mention of the
    # trigger inside the reasoning therefore stays part of the rationale.
    rationale, trigger, answer = x.rpartition(anwser_trigger)
    if not trigger:
        return x, None
    return rationale, answer.strip()
    
# Overwrites 'rationale' with the text before the trigger and adds 'answer'.
# NOTE(review): not idempotent — this reads and replaces the same column, so running
# the cell a second time re-splits already-split text and clobbers 'answer'.
df['rationale'], df['answer'] = zip(*df['rationale'].apply(split_rationale_answer))

In [25]:
# add the token count of each rationale (per `tokenizer`) as a new column
df['rationale_length'] = [len(tokenizer(text)['input_ids']) for text in df['rationale']]

In [26]:
# inspect the generated rationales, parsed answers and token counts
df

Unnamed: 0,final_input,final_target,Canary String,category,original_dataset,evaluation_method,dataset,rationale,answer,rationale_length
0,Determine the best option and answer with only the corresponding letter.\n\nQuestion:\nWhich is true about the Sun and Earth?\n\nOptions:\nA: Earth gives light and heat energy to the Sun.\nB: Earth gives only heat energy to the Sun.\nC: The Sun gives light and heat energy to Earth.\nD: The Sun gives only light energy to Earth.,C,,,,string_match,ARC challenge,1. Consider the relationship between the Sun and Earth.\n2. Recall that the Sun is a star that produces its own energy.\n3. Earth is a planet that receives energy from the Sun.\n4. The Sun emits both light and heat energy.\n5. Earth doesn't produce energy for the Sun.\n\n,C,68
1,Determine the best choice to complete the sentence: \n Cynthia didn't know anything about physics but Laura did because _ was less smart at science. \n A: Cynthia\nB: Laura \n Answer using just the letter.,A,,,,string_match,winogrande,"1. The sentence implies a contrast between Cynthia and Laura's knowledge of physics.\n2. Cynthia didn't know anything about physics.\n3. Laura did know about physics.\n4. The word ""because"" introduces a reason for this difference.\n5. The blank should be filled with the person who is ""less smart at science.""\n6. Being less smart at science aligns with not knowing about physics.\n\n",A,94
2,"A baseball is 1/4 the radius of a basketball. When calculating the volume of a baseball and basketball, how many times bigger is the volume of a basketball compared to that of a baseball?",64,,Arithmetic calculation,,string_match,DMath,1. Volume of a sphere: V = (4/3)πr³\n\n2. Let r be the radius of the baseball\n Radius of basketball = 4r\n\n3. Volume of baseball: Vb = (4/3)πr³\n\n4. Volume of basketball: Vb = (4/3)π(4r)³ = (4/3)π(64r³) = 64(4/3)πr³\n\n5. Ratio of volumes: 64(4/3)πr³ : (4/3)πr³ = 64 : 1\n\n,The volume of a basketball is 64 times bigger than the volume of a baseball.,155
3,Answer the question with the best option from the given options. Answer with only the letter of the correct option.\n\nQuestion:\nAll plants and animals have mechanisms that\n\nOptions:\n1: transport nutrients\n2: perform photosynthesis\n3: regulate nerves\n4: produce flowers,1,,,,string_match,ARC easy,"Let's analyze each option:\n\n1. Transport nutrients: This is true for all plants and animals. Both have systems to move nutrients throughout their bodies.\n\n2. Perform photosynthesis: While this is true for plants, it's not true for animals.\n\n3. Regulate nerves: This is true for most animals, but not for plants or simple animals like sponges.\n\n4. Produce flowers: This is only true for flowering plants, not for all plants or any animals.\n\nThe only option that applies universally to both plants and animals is the transport of nutrients.\n\n",A,124
4,Reverse the letters in 'vitamins'.,snimativ,,,,string_match,reverse words,"1. Identify the word: 'vitamins'\n2. Start from the last letter, moving backwards:\n s - n - i - m - a - t - i - v\n\n",snimativ,49
...,...,...,...,...,...,...,...,...,...,...
495,"Work out the math problem and present only the final result. Problem: Oliver had sixteen cherries. After eating some, he had six left. What's the difference between the number of cherries that Oliver had before eating and the left after eating?",10,,,,string_match,asdiv,1. Initial cherries: 16\n2. Remaining cherries: 6\n3. Difference: 16 - 6 = 10\n\n,10,38
496,"Given the following sentences, pinpoint the word that appears in both. Judge if this word carries the same meaning in both sentences. Reply 'True' or 'False'.\n\nSentence 1: What did you get at the toy store?\n\nSentence 2: She didn't get his name when they met the first time.",False,,,,string_match,SuperGLUE wic,"1. Identify the common word: ""get""\n\n2. Analyze the meaning of ""get"" in each sentence:\n Sentence 1: ""get"" means to obtain or purchase\n Sentence 2: ""get"" means to acquire information or understand\n\n3. Compare the meanings:\n The meanings are different. In the first sentence, it refers to obtaining a physical object, while in the second, it refers to acquiring information.\n\n",False,100
497,"Does the premise support the hypothesis? Answer with 'entailment' or 'not_entailment'.\n\nPremise:\nThe PMDB is also the party led by Senator Jader Barbalho, the recently elected Senate President heaped with serious corruption charges.\n\nHypothesis:\nJader Barbalho is the Senate President.",entailment,,,,string_match,SuperGLUE rte,"1. Identify key information in the premise:\n - PMDB is a party\n - Senator Jader Barbalho leads PMDB\n - Jader Barbalho is the recently elected Senate President\n\n2. Compare the hypothesis to the premise:\n - Hypothesis states: ""Jader Barbalho is the Senate President""\n - Premise confirms: ""Jader Barbalho, the recently elected Senate President""\n\n3. Check if the hypothesis is directly supported by the premise:\n - The premise explicitly states that Jader Barbalho is the recently elected Senate President\n - This directly supports the hypothesis\n\n",entailment,139
498,"This year, Uncle Wang’s oranges had a good harvest. When all the oranges were harvested (3/8), 3 bags were filled with 24 kilograms more. When the rest were harvested, 6 bags were just filled. How many kilograms.\n\nAnswer should be a number, with no commas and rounded to 4 decimal places if applicable.",576,,,,string_match,ape210k,"1. Let's denote the total harvest as x kg.\n\n2. 3/8 of the harvest plus 24 kg filled 3 bags:\n (3/8)x + 24 = 3y, where y is the weight of one bag\n\n3. The remaining 5/8 of the harvest filled 6 bags:\n (5/8)x = 6y\n\n4. From step 3, we can say that y = (5/8)x / 6\n\n5. Substituting this into the equation from step 2:\n (3/8)x + 24 = 3 * ((5/8)x / 6)\n\n6. Simplifying:\n (3/8)x + 24 = (5/16)x\n\n7. Subtracting (3/8)x from both sides:\n 24 = (5/16)x - (3/8)x = (5/16 - 6/16)x = -(1/16)x\n\n8. Multiplying both sides by -16",,277


In [27]:
# keep only the rows whose parsed answer exactly equals the reference target
filtered_df_2 = df.loc[df['answer'].eq(df['final_target'])]
filtered_df_2

Unnamed: 0,final_input,final_target,Canary String,category,original_dataset,evaluation_method,dataset,rationale,answer,rationale_length
0,Determine the best option and answer with only the corresponding letter.\n\nQuestion:\nWhich is true about the Sun and Earth?\n\nOptions:\nA: Earth gives light and heat energy to the Sun.\nB: Earth gives only heat energy to the Sun.\nC: The Sun gives light and heat energy to Earth.\nD: The Sun gives only light energy to Earth.,C,,,,string_match,ARC challenge,1. Consider the relationship between the Sun and Earth.\n2. Recall that the Sun is a star that produces its own energy.\n3. Earth is a planet that receives energy from the Sun.\n4. The Sun emits both light and heat energy.\n5. Earth doesn't produce energy for the Sun.\n\n,C,68
1,Determine the best choice to complete the sentence: \n Cynthia didn't know anything about physics but Laura did because _ was less smart at science. \n A: Cynthia\nB: Laura \n Answer using just the letter.,A,,,,string_match,winogrande,"1. The sentence implies a contrast between Cynthia and Laura's knowledge of physics.\n2. Cynthia didn't know anything about physics.\n3. Laura did know about physics.\n4. The word ""because"" introduces a reason for this difference.\n5. The blank should be filled with the person who is ""less smart at science.""\n6. Being less smart at science aligns with not knowing about physics.\n\n",A,94
4,Reverse the letters in 'vitamins'.,snimativ,,,,string_match,reverse words,"1. Identify the word: 'vitamins'\n2. Start from the last letter, moving backwards:\n s - n - i - m - a - t - i - v\n\n",snimativ,49
6,"The age difference between Yoongi's aunt and Yoongi is 23 years, and Hoseok is 4 years younger than Yoongi. If his aunt is 38 years old, find the sum of the ages of Yoongi and Hoseok.",26,,Arithmetic calculation,,string_match,DMath,1. Aunt's age: 38 years\n2. Age difference between aunt and Yoongi: 23 years\n3. Yoongi's age: 38 - 23 = 15 years\n4. Hoseok is 4 years younger than Yoongi\n5. Hoseok's age: 15 - 4 = 11 years\n6. Sum of Yoongi and Hoseok's ages: 15 + 11 = 26 years\n\n,26,123
8,A and B are single-digit numbers. What is the sum of the number of all A's and the number of all B's such that 3/12<A/12<7/12 and 1/10<1/B<1/3?,9,,Correspondence,,string_match,DMath,"1. For A/12:\n 3/12 < A/12 < 7/12\n 3 < A < 7\n A can be 4, 5, or 6\n\n2. For 1/B:\n 1/10 < 1/B < 1/3\n 3 < B < 10\n B can be 4, 5, 6, 7, 8, or 9\n\n3. Count the valid values:\n A has 3 valid values\n B has 6 valid values\n\n4. Sum the counts:\n 3 + 6 = 9\n\n",9,159
...,...,...,...,...,...,...,...,...,...,...
494,How old is Barack Obama?\n\nA: Barack Obama is 58 years old.\nB: Barack Obama was born in 1961.\nC: Barack Obama is is 44 years old.\nD: Barack Obama is 48 years old.\nE: Barack Obama is 47 years old.\n\nPlease reply with only the letter of your choice.,B,,,,string_match,truthful qa,"Barack Obama was born on August 4, 1961. To determine his current age, we need to calculate the years from his birth year to the present year, 2023.\n\n2023 - 1961 = 62 years\n\nNone of the given options match his current age of 62. The closest option is B, which states his birth year.\n\n",B,92
495,"Work out the math problem and present only the final result. Problem: Oliver had sixteen cherries. After eating some, he had six left. What's the difference between the number of cherries that Oliver had before eating and the left after eating?",10,,,,string_match,asdiv,1. Initial cherries: 16\n2. Remaining cherries: 6\n3. Difference: 16 - 6 = 10\n\n,10,38
496,"Given the following sentences, pinpoint the word that appears in both. Judge if this word carries the same meaning in both sentences. Reply 'True' or 'False'.\n\nSentence 1: What did you get at the toy store?\n\nSentence 2: She didn't get his name when they met the first time.",False,,,,string_match,SuperGLUE wic,"1. Identify the common word: ""get""\n\n2. Analyze the meaning of ""get"" in each sentence:\n Sentence 1: ""get"" means to obtain or purchase\n Sentence 2: ""get"" means to acquire information or understand\n\n3. Compare the meanings:\n The meanings are different. In the first sentence, it refers to obtaining a physical object, while in the second, it refers to acquiring information.\n\n",False,100
497,"Does the premise support the hypothesis? Answer with 'entailment' or 'not_entailment'.\n\nPremise:\nThe PMDB is also the party led by Senator Jader Barbalho, the recently elected Senate President heaped with serious corruption charges.\n\nHypothesis:\nJader Barbalho is the Senate President.",entailment,,,,string_match,SuperGLUE rte,"1. Identify key information in the premise:\n - PMDB is a party\n - Senator Jader Barbalho leads PMDB\n - Jader Barbalho is the recently elected Senate President\n\n2. Compare the hypothesis to the premise:\n - Hypothesis states: ""Jader Barbalho is the Senate President""\n - Premise confirms: ""Jader Barbalho, the recently elected Senate President""\n\n3. Check if the hypothesis is directly supported by the premise:\n - The premise explicitly states that Jader Barbalho is the recently elected Senate President\n - This directly supports the hypothesis\n\n",entailment,139


In [22]:
# keep only the rows whose parsed answer exactly equals the reference target
# NOTE(review): this cell's execution count (22) precedes the generation cell (23), so
# filtered_df_1 presumably captured an earlier `df`; on a fresh top-to-bottom run it would
# hold the same rows as filtered_df_2 — confirm before concatenating the two
filtered_df_1 = df.loc[df['answer'].eq(df['final_target'])]
filtered_df_1

Unnamed: 0,final_input,final_target,Canary String,category,original_dataset,evaluation_method,dataset,rationale,answer,rationale_length
0,Please respond with the letter only. The rate law may be written using stoichiometric coefficients for which of the following?\n\nA: Redox reactions\nB: Elementary processes\nC: Dissolution reactions\nD: Polymerization reactions\nE: Complexation reactions\nF: Solubility reactions\nG: Precipitation reactions\nH: Combustion reactions\nI: Acid-base reactions\nJ: Substitution reactions,B,,chemistry,,string_match,MMLU-Pro,"The rate law can be directly written using stoichiometric coefficients only for elementary processes. These are reactions that occur in a single step, exactly as written in the balanced equation. For all other types of reactions, including complex multi-step processes, the rate law must be determined experimentally and cannot be inferred from the stoichiometry alone.\n\n",B,65
1,"Given the premise below, is the hypothesis true? Respond with 'entailment' or 'not_entailment'.\n\nPremise:\nThe so-called ""grandmother hypothesis"", based on studies of African hunter-gatherer groups, suggests that infertile women are vital for successful child-rearing despite being unable to produce children themselves.\n\nHypothesis:\nThe ""grandmother hypothesis"" suggests that infertile women are very important for raising children.",entailment,,,,string_match,SuperGLUE rte,"1. Analyze the premise:\n - It mentions the ""grandmother hypothesis""\n - Based on studies of African hunter-gatherer groups\n - Suggests infertile women are vital for successful child-rearing\n - These women cannot produce children themselves\n\n2. Compare the hypothesis to the premise:\n - The hypothesis also mentions the ""grandmother hypothesis""\n - It states that infertile women are very important for raising children\n\n3. Check for consistency:\n - The premise uses ""vital for successful child-rearing""\n - The hypothesis uses ""very important for raising children""\n - These phrases convey essentially the same meaning\n\n4. Verify if the hypothesis is supported by the premise:\n - The premise provides more context, but the core idea is the same\n - The hypothesis accurately represents the main point of the premise\n\n",entailment,192
2,"Is the hypothesis logically supported by the premise? Answer with 'entailment' or 'not_entailment'.\n\nPremise:\nBut according to Dr. Williams, other members of the panel have championed other ideas, for example, that planets should be larger than 2,000 kilometers (or about 1,250 miles) in diameter (Pluto is about 1,500 miles).\n\nHypothesis:\nThe diameter of Pluto measures 1,500 miles.",entailment,,,,string_match,SuperGLUE rte,"1. Identify the key information in the premise:\n - Some panel members suggest planets should be larger than 2,000 km (1,250 miles) in diameter.\n - The premise states ""Pluto is about 1,500 miles"".\n\n2. Compare the information in the hypothesis to the premise:\n - The hypothesis states ""The diameter of Pluto measures 1,500 miles"".\n - This matches exactly with the information given in the premise.\n\n3. Determine if the hypothesis is directly supported by the premise:\n - The premise explicitly provides the same information as the hypothesis.\n - There's no need for additional inference or assumptions.\n\n",entailment,158
3,"If you follow these instructions, do you return to the starting point? Take 5 steps. Take 5 steps. Take 2 steps. Turn around. Take 8 steps. Take 4 steps.\nOptions:\n- Yes\n- No",Yes,,,navigate,string_match,big bench hard,"1. Start at point A\n2. Take 5 steps forward to point B\n3. Take 5 more steps forward to point C\n4. Take 2 steps forward to point D\n5. Turn around, now facing point A\n6. Take 8 steps back to point B\n7. Take 4 more steps back, ending at point A\n\n",Yes,77
4,"Premise given: How to cook ribs in the oven<br>Prepare the ribs. Remove ribs from the packaging and pull off the membrane, which is usually attached to the underside of the rib cage. Combine all dry ingredients in a medium bowl.\n\nHypothesis: The tutorial does not specify if the directions are for pork or beef ribs. \n\nDoes the hypothesis follow the premise? State 'entailment', 'neutral', or 'contradiction'.",entailment,,,,string_match,ANLI,1. Analyze the premise:\n - It's about cooking ribs in the oven\n - Instructions for preparing ribs are given\n - No specific mention of pork or beef ribs\n\n2. Examine the hypothesis:\n - States that the tutorial doesn't specify if it's for pork or beef ribs\n\n3. Compare premise and hypothesis:\n - The premise indeed doesn't mention pork or beef ribs\n - The hypothesis accurately points out this lack of specification\n\n4. Determine the relationship:\n - The hypothesis is supported by the information in the premise\n - There's no contradiction or additional information\n\n,entailment,138
...,...,...,...,...,...,...,...,...,...,...
493,Choose the best option.\nmap is to navigate\nlike\n\nA: manuscript is to submit\nB: license is to revoke\nC: writing is to erase\nD: blueprint is to build\nE: receipt is to pay\n\nAnswer with the letter only.,D,,,,string_match,SAT Analogies,"1. Consider the relationship between ""map"" and ""navigate"":\n - A map is a tool used to navigate or find one's way.\n\n2. Look for a similar relationship in the given options:\n - A: Manuscript and submit are related, but not in the same way.\n - B: License and revoke are opposites, not a tool-use relationship.\n - C: Writing and erase are opposites, not a tool-use relationship.\n - D: Blueprint is a tool used to build, similar to map and navigate.\n - E: Receipt is a result of paying, not a tool for it.\n\n3. Identify the option that best matches the relationship:\n Option D provides the closest match, as a blueprint is a tool used for building, just as a map is a tool used for navigating.\n\n",D,194
494,How does 'immediately' look reversed?,yletaidemmi,,,,string_match,reverse words,1. Identify the word: 'immediately'\n2. Reverse the order of letters:\n y l e t a i d e m m i\n\n,yletaidemmi,35
496,Examine the premise and hypothesis. Determine if it is 'entailment' or 'not_entailment'.\n\nPremise:\nAbout 50 percent of all British party members are female (in contrast to only about 25 percent in Germany) but only 9.2 percent are represented in parliament.\n\nHypothesis:\nWomen are poorly represented in parliament.,entailment,,,,string_match,SuperGLUE rte,"1. Analyze the premise:\n - 50% of British party members are female\n - Only 9.2% of women are represented in parliament\n\n2. Compare the percentages:\n - 50% party membership vs. 9.2% parliamentary representation\n - This shows a significant disparity\n\n3. Evaluate the hypothesis:\n - It states that women are poorly represented in parliament\n - The low percentage (9.2%) supports this claim\n\n4. Consider the context:\n - The premise provides data specifically about British parliament\n - The hypothesis doesn't specify a country, but aligns with the given information\n\n5. Conclusion:\n The premise supports the hypothesis, as the data shows a clear underrepresentation of women in parliament compared to their party membership.\n\n",entailment,176
497,"Solve the math problem. Choose the best answer from the given options. Answer with the single letter corresponding to the correct option.\n\nProblem:\na row of ants comprised of 102 ants . a man walked upon it and all except 42 were crushed beneath his foot . how many ants are left alive ?\n\nOptions:\na ) 41 , b ) 42 , c ) 43 , d ) 44 , e ) 45",b,,general,,string_match,MathQA,"The problem states that there were 102 ants initially in the row.\nAfter the man walked on them, all except 42 were crushed.\nThis means that 42 ants survived.\nThe question asks how many ants are left alive.\nThe number of ants left alive is exactly 42.\n\n",b,68


In [30]:
# concatenate the two dataframes; ignore_index gives the combined frame a fresh
# 0..n-1 index instead of repeating row labels from both source frames
filtered_df = pd.concat([filtered_df_1, filtered_df_2], ignore_index=True)

# to huggingface dataset; preserve_index=False keeps the pandas index from being
# uploaded as an extra '__index_level_0__' column
filtered_dataset = Dataset.from_pandas(filtered_df, preserve_index=False)
filtered_dataset.push_to_hub("jeggers/CoT-Collection-Rationales")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 49.26ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.96s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/jeggers/CoT-Collection-Rationales/commit/36f8915bd8199bdb5e2b653b0038812bed966874', commit_message='Upload dataset', commit_description='', oid='36f8915bd8199bdb5e2b653b0038812bed966874', pr_url=None, pr_revision=None, pr_num=None)