In [1]:
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=UserWarning, module='tqdm')
warnings.filterwarnings("ignore", category=UserWarning, module='tqdm.notebook')

import transformers
from tqdm import tqdm
import torch
import json
import numpy as np
import pandas as pd

from databench_eval import Evaluator
from databench_eval.utils import load_qa, load_table

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [2]:
device = torch.device("cuda:0")  # Select GPU 1
torch.cuda.set_device(device) 

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"#meta-llama/Llama-3.1-8B-Instruct"

In [3]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model_name,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.13it/s]
Device set to use cuda:0


In [4]:
qa_dev = load_qa(lang="ES", name="iberlef", split="dev")

In [5]:
def prompt_generator(row: dict) -> str:
    dataset = row["dataset"]
    question = row["question"]
    #df = pd.read_parquet(f"./databenchSPA/{dataset}/all.parquet")
    df = load_table(row["dataset"], lang="ES")
    return f'''
Follow the following instructions
* You must not write any more code apart from that.
* You only have access to pandas and numpy.
* Pay attention to the type formatting .
* You cannot read files from disk.
* The answer should be short and concise, in the format I specify.
* DO NOT do anything else other than filling the function provided.
* DO NOT wrap your code in ```python``` marks. Answer in plain text.
* Answer in one of the following formats, depending on the question
1. True/False (do not answer with np.True_ or np.False_, but rather True or False)
2. with a value from the dataframe, (category/number)
3. with a list of values (either categories or numbers)

import pandas as pd
import numpy as np
import openai
import os
import asyncio

# This is an example
def example(df: pd.DataFrame):
"""Returns the answer to the question: How many rows are there in the dataframe? """
df.columns = {list(df.columns)}
return df.shape[0]

# This is the question you have to answer
def answer(df: pd.DataFrame):
"""Returns the answer to the question: {question} """
df.columns = {list(df.columns)}
return '''

In [6]:
def get_full_prompt(prompt):
    full_prompt = [
        {"role": "system", "content": "You are a pandas code generator. Your goal is to complete the function provided."},
        {"role": "user", "content": prompt}
    ]
    return full_prompt

In [7]:
def example_postprocess(response: str, row: dict):
    try:
        df = load_table(row["dataset"], lang="ES")
        
        global ans
        lead = """
def answer(df):
    return """
        exec_string = (
            lead
            + response
            + "\nans = answer(df)"
        )
        local_vars = {"df": df, "pd": pd, "np": np}
        exec(exec_string, local_vars)

        ans = local_vars["ans"]
        if isinstance(ans, pd.Series):
            ans = ans.tolist()
        elif isinstance(ans, pd.DataFrame):
            ans = ans.iloc[:, 0].tolist()
        return ans.split("\n")[0] if "\n" in str(ans) else ans
    except Exception as e:
        return f"__CODE_ERROR__: {e}"

In [8]:
def save_responses(responses, save_path: str) -> None:
    with open(save_path, "w") as f:
        for response in responses:
            f.write(str(response).replace("\n", " ") + "\n")

In [9]:
responses = []
for instance in tqdm(qa_dev):
    full_prompt = get_full_prompt(prompt_generator(instance))
    outputs = pipeline(
        full_prompt,
        max_new_tokens=1000,
        pad_token_id=pipeline.tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
    )
    ans = example_postprocess(outputs[0]["generated_text"][-1]['content'], instance)
    responses.append([str(ans)])

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset                 | 10/100 [00:56<08:10,  5.46s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [11:25<00:00,  6.85s/it]


In [10]:
evaluator = Evaluator(qa=qa_dev)
print(f"DataBenchSPA accuracy is {evaluator.eval(responses)}")
save_responses(responses, "predictions.txt")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 7395.41it/s]

DataBenchSPA accuracy is 0.24





In [11]:
# 3.1-8B-Instruct   - DataBenchSPA accuracy is 0.24
# 3.3-70B-Instruct  - DataBenchSPA accuracy is 0.22