In [1]:
# credits:
# https://www.kaggle.com/code/olyatsimboy/aimo-openmath-mistral-baseline
# https://www.kaggle.com/code/aatiffraz/prompt-prediction-w-mixtral-mistral7b-gemma-llama
# https://www.kaggle.com/code/thedrcat/aimo-mixtral-baseline

# Zero-shot MMOS-DeepSeekMath-7B with self-consistency and generated code reasoning evaluation

Self-consistency is a modification of the standard greedy decoding in reasoning pipelines via sampling several diverse answers followed by aggregation, e.g., most common answer ([SC-CoT paper](https://arxiv.org/pdf/2203.11171.pdf)).

In this kernel, we will consider MMOS-DeepSeekMath-7B RL-tuned backbone; in my experiments, this model produces more consistent code reasoning and the code block execution will allow us to decrease arithmetic hallucinations.

In [2]:
!pip install -U /kaggle/input/bitsandbytes-0-42-0-py3-none-any-whl/bitsandbytes-0.42.0-py3-none-any.whl -qq

In [3]:
import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    AutoConfig,
    set_seed
)

set_seed(42)

MODEL_PATH = "/kaggle/input/deepseek-math"

quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

config = AutoConfig.from_pretrained(MODEL_PATH)
config.gradient_checkpointing = True


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
#     quantization_config=quantization_config,
    config=config
)

2024-04-13 05:11:23.452225: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-13 05:11:23.452361: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-13 05:11:23.585056: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
model.dtype

torch.bfloat16

In [5]:
import pandas as pd
from tqdm import tqdm
PRIVATE = True

df = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/test.csv')
df.head()

Unnamed: 0,id,problem
0,000aaa,What is $1-1$?
1,111bbb,What is $0\times10$?
2,222ccc,Solve $4+x=4$ for $x$.


In [6]:
if len(df) < 5:
    df = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')
    PRIVATE = False
df.head()

Unnamed: 0,id,problem,answer
0,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",52
1,246d26,Each of the three-digits numbers $111$ to $999...,250
2,2fc4ad,Let the `sparkle' operation on positive intege...,702
3,430b63,What is the minimum value of $5x^2+5y^2-8xy$ w...,800
4,5277ed,There exists a unique increasing geometric seq...,211


In [7]:
import gc
device = 'cuda'

In [8]:
def naive_parse(answer):
    out = []
    start = False
    end = False
    for l in reversed(list(answer)):
        if l in '0123456789' and not end:
            start = True
            out.append(l)
        else:
            if start:
                end = True
        
    out = reversed(out)
    return ''.join(out)

In [9]:
import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype='auto',
    device_map="auto",
)

In [10]:
print(f"Transformers Version: {transformers.__version__}")

Transformers Version: 4.38.2


In [11]:
import torch

torch.backends.cuda.enable_mem_efficient_sdp(False)

In [12]:
import re
import sys
import subprocess


def process_output(output):
    result = output
    
    try:
        code = output.split('```')[1][7:]

        with open('code.py', 'w') as fout:
            fout.write(code)

        batcmd = 'timeout 7 ' + sys.executable + ' code.py'
        try:
            shell_output = subprocess.check_output(batcmd, shell=True).decode('utf8')
            print(shell_output)
            code_output = round(float(eval(shell_output))) % 1000
        except:
            code_output = -1

        print('CODE RESULTS', code_output)
    
    except Exception as e:
        print(e)
        print('ERROR PARSING')
        code_output = -1
    
    try:
        result_output = re.findall(r'\\boxed\{(.*)\}', result)

        print('BOXED', result_output)
        if not len(result_output):
            result_output = naive_parse(result)
        else:
            result_output = result_output[-1]

        print('BOXED', result_output)
        if not len(result_output):
            result_output = -1
        
        else:
            result_output = round(float(eval(result_output))) % 1000
    
    except Exception as e:
        print(e)
        print('ERROR PARSING')
        result_output = -1
    
    return result_output, code_output

In [13]:
import re
from collections import defaultdict


tool_instruction0 = " The answer should be given as a non-negative modulo 1000."
tool_instruction0 += '\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'
tool_instruction1 = "\nThe answer should be given as a non-negative modulo 1000."
tool_instruction1 += '\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'
tool_instruction2 = " The answer should be given as a non-negative modulo 1000."
tool_instruction2 += '\n\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'
tool_instruction3 = "\nThe answer should be given as a non-negative modulo 1000."
tool_instruction3 += '\n\nPlease integrate natural language reasoning with programs to solve the problem above, and put your final answer within \\boxed{}.'
tool_instruction4 = " The answer should be given as a non-negative modulo 1000."
tool_instruction4 += '\nPlease integrate natural language reasoning with python programs to solve the problem above, and put your final answer within \\boxed{}.'

tool_instructions = [tool_instruction0, tool_instruction1, tool_instruction2, tool_instruction3, tool_instruction4]

n_repetitions = 5 if PRIVATE else 2

total_results = []
total_answers = []

for i in tqdm(range(len(df))):
    id_ = df['id'].loc[i]
    problem = df['problem'].loc[i]
    
    results = []
    answers = []
    
    for i in tqdm(range(n_repetitions)):
        try:
            tool_instruction = tool_instructions[i]
            
            messages = [
                {
                    "role": "user", 
                    "content": problem + tool_instruction
                }
            ]
    
            query_prompt = tokenizer.apply_chat_template(
                messages,
                tokenize=False
            )
            
            raw_output = pipeline(
                query_prompt, 
                max_new_tokens=2048, 
                do_sample=True, 
                temperature=0.9,
                return_full_text=False
            )
            raw_output = raw_output[0]['generated_text']

            result_output, code_output = process_output(raw_output)

            torch.cuda.empty_cache()
            gc.collect()

        except Exception as e:
            print(e)
            result_output, code_output = -1, -1
        
        results.append(result_output)
        answers.append(code_output)
    
    total_results.append(results)
    total_answers.append(answers)

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


77/2

CODE RESULTS 38
BOXED ['988']
BOXED 988



 50%|█████     | 1/2 [01:33<01:33, 93.12s/it][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


36

CODE RESULTS 36
BOXED ['36']
BOXED 36



100%|██████████| 2/2 [03:47<00:00, 113.91s/it]
 10%|█         | 1/10 [03:47<34:10, 227.82s/it]
  0%|          | 0/2 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


0

CODE RESULTS 0
BOXED ['10']
BOXED 10



 50%|█████     | 1/2 [02:24<02:24, 144.64s/it][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


list index out of range
ERROR PARSING
BOXED ['499}$. This means that the maximum possible number of yellow numbers there can be is 499. The answer is $\\boxed{499']
BOXED 499}$. This means that the maximum possible number of yellow numbers there can be is 499. The answer is $\boxed{499
unmatched '}' (<string>, line 1)
ERROR PARSING



100%|██████████| 2/2 [03:29<00:00, 104.77s/it]
 20%|██        | 2/10 [07:17<28:56, 217.07s/it]
  0%|          | 0/2 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Traceback (most recent call last):
  File "/kaggle/working/code.py", line 20, in <module>
    result = sparkle_of_number(36)
  File "/kaggle/working/code.py", line 12, in sparkle_of_number
    n = sum(int(digit) for digit in str(n))
  File "/opt/conda/lib/python3.10/site-packages/sympy/core/_print_helpers.py", line 29, in __str__
    return sstr(self, order=None)
  File "/opt/conda/lib/python3.10/site-packages/sympy/printing/printer.py", line 372, in __call__
    return self.__wrapped__(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sympy/printing/str.py", line 998, in sstr
    s = p.doprint(expr)
  File "/opt/conda/lib/python3.10/site-packages/sympy/printing/printer.py", line 292, in doprint
    return self._str(self._print(expr))
  File "/opt/conda/lib/python3.

CODE RESULTS -1
BOXED ['415', '415']
BOXED 415



 50%|█████     | 1/2 [01:04<01:04, 64.71s/it][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.



CODE RESULTS -1
BOXED ['156']
BOXED 156



100%|██████████| 2/2 [02:06<00:00, 63.36s/it]
 30%|███       | 3/10 [09:24<20:30, 175.81s/it]
  0%|          | 0/2 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


800.0

CODE RESULTS 800
BOXED ['400']
BOXED 400



 50%|█████     | 1/2 [00:45<00:45, 45.32s/it][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


800.0

CODE RESULTS 800
BOXED ['311']
BOXED 311



100%|██████████| 2/2 [02:01<00:00, 60.84s/it]
 40%|████      | 4/10 [11:25<15:26, 154.44s/it]
  0%|          | 0/2 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Traceback (most recent call last):
  File "/kaggle/working/code.py", line 26, in <module>
    result = geometric_sequence_sum()
  File "/kaggle/working/code.py", line 15, in geometric_sequence_sum
    for r_value in symbols('r'):
TypeError: 'Symbol' object is not iterable


CODE RESULTS -1
BOXED []
BOXED 715



 50%|█████     | 1/2 [01:22<01:22, 82.50s/it][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


11*sqrt(10) + 111

CODE RESULTS -1
BOXED ['889']
BOXED 889



100%|██████████| 2/2 [02:27<00:00, 73.93s/it]
 50%|█████     | 5/10 [13:53<12:40, 152.07s/it]
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


100

CODE RESULTS 100
BOXED []
BOXED 1



 50%|█████     | 1/2 [01:21<01:21, 81.44s/it][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


100

CODE RESULTS 100
BOXED ['1']
BOXED 1



100%|██████████| 2/2 [02:06<00:00, 63.33s/it]
 60%|██████    | 6/10 [16:00<09:33, 143.44s/it]
  0%|          | 0/2 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


list index out of range
ERROR PARSING
BOXED []
BOXED 97



 50%|█████     | 1/2 [00:56<00:56, 56.59s/it][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


list index out of range
ERROR PARSING
BOXED []
BOXED 921



100%|██████████| 2/2 [01:27<00:00, 43.96s/it]
 70%|███████   | 7/10 [17:28<06:15, 125.29s/it]
  0%|          | 0/2 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


256

CODE RESULTS 256
BOXED ['256', '256']
BOXED 256



 50%|█████     | 1/2 [00:59<00:59, 59.25s/it][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.

100%|██████████| 2/2 [03:34<00:00, 107.37s/it]
 80%|████████  | 8/10 [21:02<05:07, 153.77s/it]

652.0

CODE RESULTS 652
BOXED ['652', '652']
BOXED 652



  0%|          | 0/2 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


list index out of range
ERROR PARSING
BOXED ['58', '58']
BOXED 58



 50%|█████     | 1/2 [01:08<01:08, 68.00s/it][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


58/3

CODE RESULTS 19
BOXED ['801', '801']
BOXED 801



100%|██████████| 2/2 [02:25<00:00, 72.82s/it]
 90%|█████████ | 9/10 [23:28<02:31, 151.23s/it]
  0%|          | 0/2 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


list index out of range
ERROR PARSING
BOXED ['71']
BOXED 71



 50%|█████     | 1/2 [02:47<02:47, 167.67s/it][ASetting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


387

CODE RESULTS 387
BOXED ['3']
BOXED 3



100%|██████████| 2/2 [03:34<00:00, 107.15s/it]
100%|██████████| 10/10 [27:02<00:00, 162.30s/it]


In [14]:
import numpy as np
from collections import Counter

final_answers = []

for a, b in zip(total_answers, total_results):
    a = np.array(a)
    b = np.array(b)
    a[a < 0] = b[a < 0]
    
    pred = Counter(a.tolist()).most_common(2)

    ans = pred[0][0] if not pred[0][0] < 0 else pred[1][0]

    final_answers.append(ans)
    print(ans)


38
0
415
800
715
100
97
256
58
71


In [15]:
df['answer'] = final_answers

In [16]:
df

Unnamed: 0,id,problem,answer
0,229ee8,"Let $k, l > 0$ be parameters. The parabola $y ...",38
1,246d26,Each of the three-digits numbers $111$ to $999...,0
2,2fc4ad,Let the `sparkle' operation on positive intege...,415
3,430b63,What is the minimum value of $5x^2+5y^2-8xy$ w...,800
4,5277ed,There exists a unique increasing geometric seq...,715
5,739bc9,For how many positive integers $m$ does the eq...,100
6,82e2a0,Suppose that we roll four 6-sided fair dice wi...,97
7,8ee6f3,"The points $\left(x, y\right)$ satisfying $((\...",256
8,bedda4,Let $ABCD$ be a unit square. Let $P$ be the po...,58
9,d7e9c9,A function $f: \mathbb N \to \mathbb N$ satisf...,71


In [17]:
df[['id','answer']].to_csv("submission.csv", header=True, index=False)

In [18]:
df[['id','answer']].head()

Unnamed: 0,id,answer
0,229ee8,38
1,246d26,0
2,2fc4ad,415
3,430b63,800
4,5277ed,715


In [19]:
if not PRIVATE:
    df = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')
    df['model_answer'] = final_answers
    df['match'] = df.answer == df.model_answer
    print(f'{df.match.sum()} matches in {len(df)} examples')

1 matches in 10 examples
