<a href="https://colab.research.google.com/github/M1croZavr/CoTResearch/blob/master/CoT_greedy_research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install -q petals

In [None]:
!git clone https://github.com/M1croZavr/CoTResearch.git

In [None]:
!python --version

In [None]:
import torch
import numpy as np
from tqdm.auto import tqdm
from pathlib import Path
from transformers import BloomTokenizerFast, set_seed
from petals import DistributedBloomForCausalLM
from CoTResearch.data_preprocessing import FormattedPrompts, FormattedInputs
from CoTResearch.data_postprocessing import AnswersList

In [None]:
# Pick the torch device for the locally held tensors: GPU when CUDA is
# available, CPU otherwise.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f'Running device: {DEVICE}')

In [None]:
# Checkpoint identifier shared by the tokenizer and the distributed model.
MODEL_NAME = "bigscience/bloom-petals"

# Tokenizer matching the checkpoint.
tokenizer = BloomTokenizerFast.from_pretrained(MODEL_NAME)

# Load the Petals distributed BLOOM client and move it to the selected device
# in one step.
model = DistributedBloomForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

In [None]:
# Preview: build one formatted input from the first test question so the
# prompt layout can be inspected before running the full evaluation.
# NOTE(review): unlike the evaluation cell below, sample_prompts() is not
# called here before FormattedInputs is built - confirm this is intentional.
train_path = Path('CoTResearch/GSM8K_data/train_data.jsonl')
test_path = Path('CoTResearch/GSM8K_data/test_data.jsonl')

example_prompts = FormattedPrompts(train_path, 3, 123)
example_inputs = FormattedInputs(example_prompts)

# Only the first line of the test file is needed for the preview.
with test_path.open() as test_file:
    first_test_line = test_file.readline()

example_prompt = example_inputs.sample_input(first_test_line)
print(example_prompt)

In [None]:
# Few-shot exemplars drawn from the training split (8 exemplars, fixed seed).
prompts = FormattedPrompts(
    data_path=Path('CoTResearch/GSM8K_data/train_data.jsonl'),
    n_exemplars=8,
    random_seed=123
    )
prompts.sample_prompts()
inputs = FormattedInputs(prompts)

# Build few-shot prompting mini-sample dataset
N_DATA_POINTS = 50
SAMPLING_SEED = 123  # fixed so the same test questions are picked on every run

with open(Path('CoTResearch/GSM8K_data/test_data.jsonl')) as file:
    # Skip blank lines so every sampled line is an actual record.
    lines = [line for line in file if line.strip()]

# Seeded generator + replace=False: the sample is reproducible and no test
# question is evaluated twice (the previous unseeded np.random.randint drew
# with replacement, so duplicates were possible and every run differed).
rng = np.random.default_rng(SAMPLING_SEED)
data_points_indices = rng.choice(
    len(lines),
    size=min(N_DATA_POINTS, len(lines)),  # never ask for more points than exist
    replace=False,
)
for data_point_index in data_points_indices:
    inputs.sample_input(lines[data_point_index])

In [None]:
# Collector for (predicted answer, ground truth) pairs; it is filled through
# add_answer() in the generation loop below.
# NOTE(review): presumably add_answer() appends, so re-run this cell before
# re-running the loop to start from an empty collector - confirm.
answers_list = AnswersList()

In [None]:
# Sanity check: number of built inputs and of ground truths. The generation
# loop reads both lists pairwise, so the two lengths should match.
len(inputs.inputs), len(inputs.ground_truths)

In [None]:
# Generation is cut before the first of these sequences so that an answer does
# not run on into a new, model-invented question.
STOP_SEQUENCES = ['\n\n', 'Q:']
MAX_NEW_TOKENS = 128


def truncate_at_stop(text, stop_sequences):
    """Return `text` cut off just before the earliest stop sequence it contains.

    Args:
        text: Decoded model continuation.
        stop_sequences: Iterable of strings; empty strings are ignored.

    Returns:
        The prefix of `text` that precedes the first occurrence of any stop
        sequence, or `text` unchanged when none of them occurs.
    """
    cut = len(text)
    for stop in stop_sequences:
        if not stop:
            continue  # str.find('') is 0 and would wipe out the whole text
        position = text.find(stop)
        if position != -1:
            cut = min(cut, position)
    return text[:cut]


# Greedy decoding of every prepared input.
# NOTE(review): `return_full_text` and `stop` are parameters of hosted
# text-generation HTTP APIs, not of generate(); they are therefore no longer
# passed and both effects are applied explicitly below. Confirm against the
# installed Petals version.
for prompt, gt_answer in tqdm(
    zip(inputs.inputs, inputs.ground_truths),
    total=len(inputs.inputs),
):
    tokenized_prompt = tokenizer(prompt, return_tensors="pt")["input_ids"].to(DEVICE)
    outputs = model.generate(
        tokenized_prompt,
        max_new_tokens=MAX_NEW_TOKENS,
        # num_return_sequences=1  # number of paths for ensembling
    )
    # generate() is expected to return the prompt tokens followed by the
    # continuation (as in the Petals README example); keep only the new tokens
    # so the few-shot exemplars are not part of the predicted answer.
    generated_tokens = outputs[0, tokenized_prompt.shape[1]:]
    predicted_answer = truncate_at_stop(
        tokenizer.decode(generated_tokens),
        STOP_SEQUENCES,
    )
    answers_list.add_answer(predicted_answer, gt_answer)

# Reference (not executed): sketch of an equivalent request payload for a
# hosted text-generation API, with sampling values still to be chosen:
#   {"inputs": prompt,
#    "parameters": {"do_sample": True, "top_p": X, "temperature": X,
#                   "max_new_tokens": 150, "stop": ['.', 'The next day']}}

In [None]:
# Inspect the answer text produced in the last iteration of the generation loop.
predicted_answer

In [None]:
# Inspect the prompt that was used in the last iteration of the generation loop.
prompt

In [None]:
# First prepared input of the evaluation sample (used as a prompt by the generation loop).
inputs.inputs[0]

In [None]:
# Ground-truth answer paired with the first prepared input.
inputs.ground_truths[0]

In [None]:
# Contents of prompts.prompts - presumably the exemplars selected by
# sample_prompts(); confirm in CoTResearch.data_preprocessing.
prompts.prompts