### Import Libraries

In [3]:
from groq import Groq
import os
import re
import json
import time
import faiss
import requests
import random
from datasets import load_dataset
import numpy as np
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Load GSM8K dataset

In [4]:
# GSM8K contains grade-school math problems
# We load both the training and testing splits of the dataset
ds = load_dataset("openai/gsm8k", "main")
gsm8k_questions = ds['train']['question'] + ds['test']['question']
gsm8k_answers = ds['train']['answer'] + ds['test']['answer']

In [5]:
# Create a list of GSM8K question-answer pairs
gsm8k_question_answer_pair = [{'x': question, 'y': answer} for question, answer in zip(gsm8k_questions, gsm8k_answers)]
print(f"Number of GSM8K question-answer pairs: {len(gsm8k_question_answer_pair)}")

Number of GSM8K question-answer pairs: 8792


### Load Robust Math dataset

In [6]:
def load_qa_data(filename):
    """Load JSON file containing question-answer pairs."""
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

In [7]:
# Load RobustMath data for comparison
rm_base_file = '../RobustMath/robustBase_fixed.json'
robustmath_base = load_qa_data(rm_base_file)
print(f"Total question answer pair in base dataset:{len(robustmath_base)}")

#Attacked json file
rm_attacked_file = '../RobustMath/robustMath.json'
robustmath_attacked = load_qa_data(rm_attacked_file)
print(f"Total question answer pair in attacked dataset:{len(robustmath_base)}")


Total question answer pair in base dataset:300
Total question answer pair in attacked dataset:300


In [8]:
# Extract questions from RobustMath data
robustmath_questions = [item['x'] for item in robustmath_base]

### Exclude RobustMath from GSM8k

In [9]:
# --- Exclude robust math questions from GSM8K ---
# Creating two lists, one with and one without RobustMath questions
gsm8k_without_robust = [pair for pair in gsm8k_question_answer_pair if pair['x'] not in robustmath_questions]
gsm8k_in_robust = [pair for pair in gsm8k_question_answer_pair if pair['x'] in robustmath_questions]

print(f"Number of GSM8K questions in RobustMath: {len(gsm8k_in_robust)}")
print(f"Number of GSM8K questions not in RobustMath: {len(gsm8k_without_robust)}")

Number of GSM8K questions in RobustMath: 150
Number of GSM8K questions not in RobustMath: 8642


### Extracting operators

In [91]:
### Code to get operators
# Regular expression to extract mathematical expressions marked by <<...>>
math_expr_pattern = r'<<([^>>]+)>>'

# Function to extract operators from the 'y' field and append them to 'z' key.
def extract_operators_and_append(data):
    for entry in data:
        y_text = entry['y']
        matches = re.findall(math_expr_pattern, y_text)
        ops = []
        for match in matches:
            ops += re.findall(r'[\+\-\*/]', match)
        entry['w'] = ops
    return data

# Function to extract operators from the 'y' field and append them to 'z' key.
def extract_operators_and_append_z(data):
    for entry in data:
        y_text = entry['z']
        matches = re.findall(math_expr_pattern, y_text)
        ops = []
        for match in matches:
            ops += re.findall(r'[\+\-\*/]', match)
        entry['w'] = ops
    return data

# Function to extract operators from the 'y' field and append them to 'z' key.
def extract_operators(data):
    matches = re.findall(math_expr_pattern, data)
    ops = []
    for match in matches:
        ops += re.findall(r'[\+\-\*/]', match)
    return tuple(ops)

In [92]:
exp_gsm8k_without_robust = extract_operators_and_append(gsm8k_without_robust)

In [94]:
exp_gsm8k_without_robust[:3]

[{'x': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
  'y': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
  'w': ['/', '+']},
 {'x': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
  'y': 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10',
  'w': ['/', '*']},
 {'x': 'Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?',
  'y': "In the beginning, Betty has only 100 / 2 = $<<100/2=50>>50.\nBetty's grandparents gave her 15 * 2 = $<<15*2=30>>30.\nThis m

### Installing a Python library that converts solution into equation

In [None]:
# https://pypi.org/project/template-from-equation/0.1/

In [39]:
# pip install template-from-equation==0.1

Collecting template-from-equation==0.1
  Downloading template_from_equation-0.1-py3-none-any.whl.metadata (3.0 kB)
Downloading template_from_equation-0.1-py3-none-any.whl (4.8 kB)
Installing collected packages: template-from-equation
Successfully installed template-from-equation-0.1
Note: you may need to restart the kernel to use updated packages.


In [62]:
def extract_equations(text):
    # Regex pattern to match anything between << and >>
    pattern = r'<<(.*?)>>'
    equations = re.findall(pattern, text)
    return equations
    
def create_template_with_shared_variables(equations):
    num_map = {}  # A shared map to track which number corresponds to which variable
    current_var_index = 0
    templates = []

    for equation in equations:
        # Finding all unique numbers in the equation
        numbers = re.findall(r'\d+\.?\d*', equation)  # captures integers and floats
        template = equation

        for num in numbers:
            if num not in num_map:
                # Mapping the number to a new placeholder if not already mapped
                num_map[num] = f'n{current_var_index}'
                current_var_index += 1
            # Replacing the number in the equation with its corresponding placeholder
            template = re.sub(rf'\b{num}\b', num_map[num], template)
        
        templates.append(template)

    return templates

['n0/n1=n2', 'n0+n2=n3']


### Testing with 10 example solution

In [116]:
for entry in gsm8k_without_robust[:10]:
    y_value = entry['y']
    # Step 1: Extract equations from the y_value
    equations = extract_equations(y_value)
    
    # Step 2: Convert each equation to its template with shared variables
    templates = create_template_with_shared_variables(equations)
    
    # Display the resulting templates
    print("Solution: ", y_value, "\nEquations: ", equations, "\nTemplate:", templates, end='\n\n')

Solution:  Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72 
Equations:  ['48/2=24', '48+24=72'] 
Template: ['n0/n1=n2', 'n0+n2=n3']

Solution:  Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.
Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.
#### 10 
Equations:  ['12/60=0.2', '0.2*50=10'] 
Template: ['n0/n1=n2', 'n2*n3=n4']

Solution:  In the beginning, Betty has only 100 / 2 = $<<100/2=50>>50.
Betty's grandparents gave her 15 * 2 = $<<15*2=30>>30.
This means, Betty needs 100 - 50 - 30 - 15 = $<<100-50-30-15=5>>5 more.
#### 5 
Equations:  ['100/2=50', '15*2=30', '100-50-30-15=5'] 
Template: ['n0/n1=n2', 'n3*n1=n4', 'n0-n2-n4-n3=n5']

Solution:  Maila read 12 x 2 = <<12*2=24>>24 pages today.
So she was able to read a total of 12 + 24 = <<12+24=36>>36 pages since yesterday.
There are 120 - 36 = <<120-36=84>>84 pages left to be read.
Since she wants to read half of the remaining pages tomorrow, then 

In [100]:
# Iterate through each entry and add templates to the dataset
for entry in gsm8k_without_robust:
    y_value = entry['y']
    
    # Step 1: Extracting equations from the y_value
    equations = extract_equations(y_value)
    
    # Step 2: Converting each equation to its template with shared variables
    templates = create_template_with_shared_variables(equations)
    
    # Step 3: Saving the templates in the 'z' key of the entry
    entry['z'] = templates

In [99]:
gsm8k_without_robust[:3]

[{'x': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
  'y': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
  'w': ['/', '+'],
  'z': ['n0/n1=n2', 'n0+n2=n3']},
 {'x': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
  'y': 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10',
  'w': ['/', '*'],
  'z': ['n0/n1=n2', 'n2*n3=n4']},
 {'x': 'Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?',
  'y': "In the beginning, Betty has only 100 / 2 = $<<100/2=50>>5

**Our, GSM8K has now two new keys. 'w' represents the list of operators used to solve the equation. 'z' represents the equations used to solve the problem.**

### Grouping unique Template and operator

In [121]:
grouped_by_template = {}
for question in exp_gsm8k_without_robust:
    z_tuple = tuple(question['z'])
    if z_tuple not in grouped_by_template:
        grouped_by_template[z_tuple] = []
    grouped_by_template[z_tuple].append({
        'x': question['x'],
        'y': question['y']
    })

In [145]:
# Create a dictionary to hold template counts
template_counts = {}

# Iterate through the grouped_by_template dictionary
for template, entries in grouped_by_template.items():
    template_counts[template] = len(entries)

# Print the counts for each template
for template, count in template_counts.items():
    print(f"Template: {template}, Count: {count}")

sorted_template_counts = sorted(template_counts.items(), key=lambda x: x[1], reverse=True)

print("\nSorted Template Counts:")
for template, count in sorted_template_counts:
    print(f"Template: {template}, Count: {count}")

Template: ('n0/n1=n2', 'n0+n2=n3'), Count: 13
Template: ('n0/n1=n2', 'n2*n3=n4'), Count: 79
Template: ('n0/n1=n2', 'n3*n1=n4', 'n0-n2-n4-n3=n5'), Count: 1
Template: ('n0*n1=n2', 'n0+n2=n3', 'n4-n3=n5', 'n5/n1=n6'), Count: 1
Template: ('n0*n1=n2', 'n2*n1=n3', 'n3*n4=n5'), Count: 1
Template: ('n0/n1*n2=n3', 'n2+n3=n4', 'n2+n4=n5', 'n6/n1*n5=n7', 'n5+n7=n8'), Count: 1
Template: ('n0*n1=n2', 'n0*n3=n1', 'n2+n1=n4'), Count: 1
Template: ('n0*n1=n2', 'n2+n0=n3', 'n3*n0=n4'), Count: 1
Template: ('+n0+n1+n2+n3+n4=n5', 'n6-n5=n7'), Count: 1
Template: ('n0*n1=n2', 'n3-n0=n4', 'n1*.n5=n6', 'n1+n7=n8', 'n9*n5=n10', 'n11*n4=n12', 'n13+n14=n15'), Count: 1
Template: ('n0*n0=n1', 'n2/n3=n4'), Count: 1
Template: ('n0+n1=n2', 'n3*n4=n1', 'n5*n1=n6', 'n2-n6-n1=n7', 'n7/n8=n4'), Count: 1
Template: ('n0/n1=n2', 'n2-n3=n4', 'n0+n4=n5'), Count: 1
Template: ('n0/n1=n2', 'n0*n1=n3', 'n0+n2+n3=n4'), Count: 1
Template: ('n0*n1=n2', 'n3/n2=n4'), Count: 21
Template: ('n0*n1=n2', 'n2-n0=n3', 'n3*n4=n5', 'n5*n6=n7', 

In [144]:
# Get templates with a count of 1 only
templates_with_one_count = {template: count for template, count in template_counts.items() if count == 1}
len(templates_with_one_count)

4739

In [122]:
print("Total unique template list in GSM8k :", len(grouped_by_template))

Total unique template list in GSM8k : 5332


In [150]:
grouped_by_expression = {}
for question in exp_gsm8k_without_robust:
    w_tuple = tuple(question['w'])
    if w_tuple not in grouped_by_expression:
        grouped_by_expression[w_tuple] = []
    grouped_by_expression[w_tuple].append({
        'x': question['x'],
        'y': question['y']
    })

In [115]:
print("Total unique operator list in GSM8k :", len(grouped_by_expression))

Total unique operator list in GSM8k : 1665


In [174]:
expression_counts = {}
count_ones = 0
for expression, questions in grouped_by_expression.items():
    expression_counts[expression] = len(questions)
    if len(questions) == 1:
        count_ones+=1

for expression, count in expression_counts.items():
    print(f"Expression: {expression}, Count: {count}")

Expression: ('/', '+'), Count: 69
Expression: ('/', '*'), Count: 156
Expression: ('/', '*', '-', '-', '-'), Count: 5
Expression: ('*', '+', '-', '/'), Count: 12
Expression: ('*', '*', '*'), Count: 97
Expression: ('/', '*', '+', '+', '/', '*', '+'), Count: 1
Expression: ('*', '*', '+'), Count: 248
Expression: ('*', '+', '*'), Count: 45
Expression: ('+', '+', '+', '+', '+', '-'), Count: 3
Expression: ('*', '-', '*', '+', '*', '*', '+'), Count: 1
Expression: ('*', '/'), Count: 211
Expression: ('+', '*', '*', '-', '-', '/'), Count: 1
Expression: ('/', '-', '+'), Count: 8
Expression: ('/', '*', '+', '+'), Count: 19
Expression: ('*', '-', '*', '*', '*', '-'), Count: 1
Expression: ('/', '*', '-'), Count: 62
Expression: ('-', '-', '*', '-', '+', '+'), Count: 1
Expression: ('*', '*', '*', '*', '+'), Count: 24
Expression: ('*', '*', '+', '/'), Count: 37
Expression: ('+', '-', '+', '+'), Count: 14
Expression: ('*', '+', '-'), Count: 72
Expression: ('*', '*', '+', '-'), Count: 64
Expression: ('+',

In [176]:
count_ones

1034

**Upon inspection, using only template matching reveals that 5332 entries do not match with any pattern, leading to a lower number of positive equations for the given problem. Since the equation template is unique, I also examined the operator matching. It was found that 1034 entries are unique. Hence, following code tries to match the exact template and if does not match then it checks template of exact operator to use as a positive equation.**

In [255]:
gsm8k_dataset = []

def get_random_negative_equation(question):
    # Get the actual equation template to avoid
    actual_template = tuple(question['z']) 
    # Filter out any question with the same template
    candidates = [q for q in gsm8k_without_robust if tuple(q['z']) != actual_template]
    if not candidates:
        return None
    # Randomly select a candidate and generate a template from it
    random_question = random.choice(candidates)
    equations = extract_equations(random_question['y'])
    template = create_template_with_shared_variables(equations)
    return template
              
for question in gsm8k_without_robust:
    gsm8k_item={}
    #make question
    gsm8k_item['question'] = question['x']

    #make positive_equation
    if len(grouped_by_template.get(tuple(question['z'])))>1:
        #check for same template
        similar_questions =  grouped_by_template.get(tuple(question['z']))
        first_item = similar_questions[1]
        # print(first_item)
        equations = extract_equations(first_item['y'])
        template = create_template_with_shared_variables(equations)
        # print(f"Acutal equation: {question['z']} \n Generated:{template}")
        # print('\n')
        gsm8k_item['positive_equation'] = template
    else:
        #Check for same expression
        if len(grouped_by_expression.get(tuple(question['w'])))>1:
            similar_questions = grouped_by_expression.get(tuple(question['w']))
            first_item = similar_questions[1]
            # print(first_item)
            equations = extract_equations(first_item['y'])
            template = create_template_with_shared_variables(equations)
            # print(f"Acutal equation: {question['z']} \n Generated:{template}")
            # print('\n')
            gsm8k_item['positive_equation'] = template
        else:
            #None if both are not found.
            gsm8k_item['positive_equation'] = None

    #make negative
    gsm8k_item['negative_question'] = get_random_negative_equation(question)
    gsm8k_dataset.append(gsm8k_item)

In [256]:
# File path where you want to save the dataset
file_path = 'gsm8k_dataset_new.json'

# Save the dataset to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(gsm8k_dataset, json_file)

print(f"Dataset saved successfully at {file_path}")

Dataset saved successfully at gsm8k_dataset_new.json


In [257]:
gsm8k_dataset[:10]

[{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
  'positive_equation': ['n0/n1=n2', 'n0+n2=n3'],
  'negative_question': ['n0*n1=n2', 'n2+n1=n3']},
 {'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
  'positive_equation': ['n0/n1=n2', 'n2*n3=n4'],
  'negative_question': ['n0+n1+n2=n3', 'n3-n4=n5']},
 {'question': 'Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?',
  'positive_equation': ['n0/n1*n2=n3', 'n3-n4=n5', 'n2-n3-n5=n6'],
  'negative_question': ['n0*n1=n2',
   'n0*.n3=n4',
   'n0-n4=n5',
   'n5*n1=n6',
   'n7+n6=n8',
   'n2-n8=n7']},
 {'question': 'Julie is reading a 12