In [1]:
# credits:
# https://www.kaggle.com/code/aatiffraz/prompt-prediction-w-mixtral-mistral7b-gemma-llama
# https://www.kaggle.com/code/thedrcat/aimo-mixtral-baseline

# OpenMath models

OpenMath models were trained on a synthetic dataset created using PoT-like prompting of the Mixtral model.

The key feature is that the training instructions were designed to reduce parsing difficulties at the post-processing stage, i.e. the final answer is placed inside a `\boxed{}` block.

For more details, please refer to the [paper](https://arxiv.org/pdf/2402.10176.pdf) and the [HF training dataset card](https://huggingface.co/datasets/nvidia/OpenMathInstruct-1).

In this baseline we will consider [OpenMath-Mistral-7B-v0.1](https://huggingface.co/nvidia/OpenMath-Mistral-7B-v0.1-hf)

In [2]:
!pip install -U /kaggle/input/bitsandbytes-0-42-0-py3-none-any-whl/bitsandbytes-0.42.0-py3-none-any.whl -qq

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig

MODEL_PATH = "/kaggle/input/open-math-mistral"

# 4-bit NF4 quantization settings (double quantization, bfloat16 compute).
# NOTE: this object is built but NOT handed to `from_pretrained` below
# (the `quantization_config=quantization_config` argument is disabled),
# so the checkpoint is loaded without bitsandbytes quantization.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Model configuration read from the checkpoint directory, with the
# gradient-checkpointing flag switched on.
config = AutoConfig.from_pretrained(MODEL_PATH)
config.gradient_checkpointing = True

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# device_map="auto" leaves weight placement to the library;
# torch_dtype="auto" leaves the choice of dtype to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    config=config,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Show the dtype the weights ended up in after loading with torch_dtype="auto".
model.dtype

torch.bfloat16

In [5]:
import pandas as pd
from tqdm import tqdm
# Starts as True; the next cell flips it to False when it falls back to train.csv.
PRIVATE = True

# Competition test set.
df = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/test.csv')
df.head()

Unnamed: 0,id,problem
0,000aaa,What is $1-1$?
1,111bbb,What is $0\times10$?
2,222ccc,Solve $4+x=4$ for $x$.


In [6]:
# A test file with fewer than 5 rows is swapped for train.csv, which also
# carries an `answer` column, and the run is marked as non-private.
# NOTE(review): presumably the threshold targets the small placeholder
# test.csv that is visible outside of scoring runs - confirm.
if len(df) < 5:
    df = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')
    PRIVATE = False
df.head()

Unnamed: 0,id,problem,answer
0,208387,"The points $\left(x, y\right)$ satisfying $((\...",320
1,2cda49,For how many positive integers $m$ does the eq...,199
2,68704f,There exists a unique increasing geometric seq...,211
3,7543ec,A function $f: \mathbb N \to \mathbb N$ satisf...,199
4,7b58de,Let the `sparkle' operation on positive intege...,702


In [7]:
# Prompt layout: a system section, the user turn, then the start of the
# assistant turn that the model is asked to continue.
prompt_template = (
    "System:\n{system}\n\n"
    "User:\n{user}"
    "Assistant:\n{generated_solution}"
)

system = (
    "You're an expert Python programmer and mathematician. "
    "Help the user to solve this problem using code when necessary. "
    "Make sure to put the answer (and only answer) inside \\boxed{}."
)

# With an empty context the user turn ends in a blank line, which is what
# separates the question from "Assistant:" in the template above.
user = "{question}\n\n{context}"

# No extra context and no pre-filled solution text.
context = generated_solution = ""

print(system)

You're an expert Python programmer and mathematician. Help the user to solve this problem using code when necessary. Make sure to put the answer (and only answer) inside \boxed{}.


In [8]:
import gc
# NOTE(review): `device` is not referenced by any cell visible in this
# notebook; the model and pipeline are created with device_map="auto".
device = 'cuda'

In [9]:
def naive_parse(answer):
    """Return the last run of consecutive ASCII digits in ``answer`` as an int.

    The text is scanned from its end: trailing non-digit characters are
    skipped, the first digit run encountered is collected, and the scan stops
    as soon as that run ends.

    Args:
        answer: Text to search (a string, or any iterable of characters).

    Returns:
        The integer value of the last digit run,
        e.g. ``"x = 12, y = 34 units"`` -> ``34``.

    Raises:
        ValueError: If ``answer`` contains no ASCII digit.
    """
    digits = []
    for ch in reversed(list(answer)):
        if ch in '0123456789':
            digits.append(ch)
        elif digits:
            # The digit run closest to the end is complete; nothing earlier
            # in the text can change the result, so stop scanning.
            break

    if not digits:
        raise ValueError("naive_parse: no digits found in answer")

    # Digits were collected back-to-front; restore reading order.
    digits.reverse()
    return int(''.join(digits))

In [10]:
import transformers

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipeline = transformers.pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
    torch_dtype='auto',
)

2024-04-02 20:33:59.428906: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-02 20:33:59.429053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-02 20:33:59.591236: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [11]:
# Record the installed transformers version next to the results.
print(f"Transformers Version: {transformers.__version__}")

Transformers Version: 4.38.2


In [12]:
import torch

# Turn off PyTorch's memory-efficient scaled-dot-product-attention backend.
# NOTE(review): presumably a workaround for a failure of that kernel on the
# GPU used here - the reason is not recorded in this notebook; confirm.
torch.backends.cuda.enable_mem_efficient_sdp(False)

In [13]:
import re

def _extract_answer(text):
    r"""Turn generated text into an integer answer in the range 0..999.

    Uses the content of the last ``\boxed{...}`` found in ``text``. When there
    is no boxed expression, or the last one is empty, falls back to
    ``naive_parse`` (defined in an earlier cell), i.e. the last run of digits
    in ``text``.

    Args:
        text: Text produced by the model (without the prompt).

    Returns:
        The parsed value rounded to the nearest integer, modulo 1000.

    Raises:
        ValueError: If no number can be obtained from ``text``.
        OverflowError: If the parsed value is infinite.
    """
    boxed = re.findall(r'\\boxed\{(.*?)\}', text)
    candidate = boxed[-1] if boxed else ''
    if not candidate:
        candidate = naive_parse(text)
    return round(float(candidate)) % 1000


def _release_memory():
    """Best-effort memory cleanup between generations; never raises."""
    try:
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as e:
        print(e)


answers = []

for i in tqdm(range(len(df))):
    # Submitted when generation or parsing fails. Assigning the default up
    # front and appending exactly once below keeps `answers` aligned with the
    # rows of `df` whatever happens inside the `try`.
    answer = 0
    try:
        problem = df['problem'].iloc[i]

        user_p = user.format(question=problem, context=context)
        query_prompt = prompt_template.format(
            system=system,
            user=user_p,
            generated_solution=generated_solution
        )

        generation = pipeline(
            query_prompt,
            # Greedy decoding; `temperature` only matters when sampling, so it
            # is not passed.
            do_sample=False,
            num_return_sequences=1,
            max_new_tokens=768,
            # Return only the completion. The prompt itself contains a literal
            # "\boxed{}" and the digits of the problem statement, neither of
            # which must be mistaken for the model's answer.
            return_full_text=False,
        )

        answer = _extract_answer(generation[0]['generated_text'])

    except Exception as e:
        # Deliberate best-effort: one failing problem must not abort the run;
        # the default answer is submitted for it instead.
        print(e)

    answers.append(answer)
    # Runs after failures too (e.g. out-of-memory), so the next problem
    # starts with as much free GPU memory as possible.
    _release_memory()

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 10%|█         | 1/10 [00:22<03:22, 22.54s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 20%|██        | 2/10 [00:44<02:59, 22.49s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 30%|███       | 3/10 [01:16<03:07, 26.76s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 40%|████      | 4/10 [01:26<02:00, 20.14s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 50%|█████     | 5/10 [01:41<01:31, 18.25s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 60%|██████    | 6/10 [02:02<01:16, 19.09s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 70%|███████   | 7/10 [02:38<01:14, 24.74s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 80%|████████  | 8/10 [03:00<00:47, 23.79s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 90%|█████████ | 9/1

In [14]:
# Attach the predictions. When `df` is the train set (PRIVATE is False) this
# overwrites the ground-truth `answer` column of the in-memory frame.
df['answer'] = answers

In [15]:
# Display the problems together with the predicted answers.
df

Unnamed: 0,id,problem,answer
0,208387,"The points $\left(x, y\right)$ satisfying $((\...",400
1,2cda49,For how many positive integers $m$ does the eq...,31
2,68704f,There exists a unique increasing geometric seq...,0
3,7543ec,A function $f: \mathbb N \to \mathbb N$ satisf...,503
4,7b58de,Let the `sparkle' operation on positive intege...,4
5,84308e,What is the minimum value of $5x^2+5y^2-8xy$ w...,1
6,939609,"Let $k, l > 0$ be parameters. The parabola $y ...",100
7,a6b0bc,Let $ABCD$ be a unit square. Let $P$ be the po...,1
8,bb8f98,Suppose that we roll four 6-sided fair dice wi...,21
9,dca542,Each of the three-digits numbers $111$ to $999...,322


In [16]:
# Write the submission file: only the `id` and `answer` columns, with a
# header row and without the DataFrame index.
df.to_csv(
    "submission.csv",
    columns=['id', 'answer'],
    header=True,
    index=False,
)

In [17]:
# Preview the columns that were written to submission.csv.
df[['id','answer']].head()

Unnamed: 0,id,answer
0,208387,400
1,2cda49,31
2,68704f,0
3,7543ec,503
4,7b58de,4


In [18]:
# Local evaluation, only when the predictions were made on train.csv.
if not PRIVATE:
    # Reload the labelled data so that `answer` holds the ground truth again
    # (the in-memory frame had that column replaced by the predictions).
    df = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')
    df['model_answer'] = answers
    # Row-wise comparison of ground truth and prediction.
    df['match'] = df['answer'] == df['model_answer']
    print(f'{df.match.sum()} matches in {len(df)} examples')

0 matches in 10 examples
