In [2]:
!pip3 install google-genai databench-eval

Collecting databench-eval
  Downloading databench_eval-4.0.1-py3-none-any.whl.metadata (8.1 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets->databench-eval)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading databench_eval-4.0.1-py3-none-any.whl (8.4 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: fsspec, databench-eval
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 whic

In [28]:
import pandas as pd
from google import genai
from kaggle_secrets import UserSecretsClient

# Root directory of the competition data; defined first so every path below
# (and in later cells) is derived from this single location.
base_data_path = '/kaggle/input/semvaltask8/competition/'

# The API key is read from Kaggle's secret store rather than hard-coded.
user_secrets = UserSecretsClient()
key = user_secrets.get_secret("API_KEY")

client = genai.Client(api_key=key)

# Question/answer table for the test split.
qa_df = pd.read_csv(base_data_path + 'test_qa.csv')

In [4]:
import textwrap
import numpy as np 

def _strip_code_fences(response: str) -> str:
    """Remove a leading ```python / ``` fence and a trailing ``` fence from a reply."""
    code = response.strip()
    if code.startswith("```python"):
        code = code[len("```python"):].strip()
    if code.startswith("```"):
        code = code[len("```"):].strip()
    if code.endswith("```"):
        code = code[:-len("```")].strip()
    return code


def _format_answer(final_answer) -> str:
    """Convert the value returned by the generated function to a one-line string."""
    if isinstance(final_answer, pd.Series):
        formatted_answer = str(final_answer.tolist())
    elif isinstance(final_answer, pd.DataFrame):
        formatted_answer = "__CODE_ERROR__: Returned DataFrame, expected scalar/list/bool"
    elif isinstance(final_answer, (list, int, float, bool, str, np.generic)):
        formatted_answer = str(final_answer)
    elif final_answer is None:
        formatted_answer = "__CODE_ERROR__: Returned None"
    else:
        try:
            formatted_answer = str(final_answer)
        except Exception as fmt_e:
            formatted_answer = f"__FORMAT_ERROR__: Could not convert {type(final_answer)} to string: {fmt_e}"

    # Only the first line is kept so the stored result is always single-line.
    return formatted_answer.split("\n")[0]


def postprocess_response(response: str, provided_dataset: str) -> str:
    """Execute LLM-generated code against a dataset sample and return its answer as text.

    The reply is stripped of markdown code fences, executed with ``pd``, ``np``
    and ``df`` (the dataset's ``sample.parquet``) in scope, and the function
    ``generated_answer_func`` it is expected to define is called with ``df``.

    Args:
        response: Raw model reply containing Python source.
        provided_dataset: Dataset directory name under ``base_data_path``.

    Returns:
        The stringified answer, or a marker string describing the failure:
        ``__DATA_LOAD_ERROR__`` (sample could not be read),
        ``__CODE_SYNTAX_ERROR__`` / ``__EXEC_ERROR__`` (the code did not run),
        ``__CODE_DEF_ERROR__`` (no callable ``generated_answer_func``),
        ``__CODE_EXEC_ERROR__`` (the function raised), ``__CODE_ERROR__`` /
        ``__FORMAT_ERROR__`` (unusable return value). This function does not
        raise for any of these cases.
    """
    code_from_llm = _strip_code_fences(response)

    # The parquet read is the step that can actually fail while loading data,
    # so it is the one guarded here.
    try:
        provided_df = pd.read_parquet(base_data_path + provided_dataset + '/sample.parquet')
    except Exception as e:
        print('data load error')
        return f"__DATA_LOAD_ERROR__: {e}"

    exec_scope = {'pd': pd, 'np': np, 'df': provided_df}

    # NOTE(security): this runs model-generated code with full interpreter
    # privileges; only use it in a disposable/sandboxed environment.
    try:
        exec(code_from_llm, exec_scope)  # run the generated code with necessary scopes
    except SyntaxError as syn_e:
        return f"__CODE_SYNTAX_ERROR__: {syn_e}"
    except Exception as e:
        return f"__EXEC_ERROR__: {e}"

    answer_func = exec_scope.get('generated_answer_func')
    if not callable(answer_func):
        print('not callable')
        return "__CODE_DEF_ERROR__: generated_answer_func not callable"

    try:
        formatted_answer = _format_answer(answer_func(exec_scope['df']))
    except Exception as exec_e:
        print('exec error', exec_e)
        return f"__CODE_EXEC_ERROR__: {exec_e}"

    print('ans: ', formatted_answer)
    return formatted_answer

In [5]:
def generate_prompt_messages(row: dict, pred_format: str) -> str:
    """Build the prompt that asks the model to write ``generated_answer_func``.

    Args:
        row: Question record; only ``row['question']`` is read.
        pred_format: Description of the expected answer format, embedded
            verbatim in the prompt.

    Returns:
        The prompt text, starting and ending with a newline.
    """
    prompt_lines = [
        "",
        "Based on your steps for solution, write the function below to return the answer to the question.",
        "",
        "def generated_answer_func(df: pd.DataFrame):",
        f"# function to answer the question: {row['question']}",
        f"# answer format: {pred_format}",
        "",
    ]
    return "\n".join(prompt_lines)

In [6]:
def generate_review_prompt(dataset_item: dict, previous_code: str, previous_output: str) -> str:
    """Build the prompt asking the model to review and score its previous solution.

    Args:
        dataset_item: Question record; only ``dataset_item["question"]`` is read.
        previous_code: Source code produced in the previous iteration.
        previous_output: Result obtained by running ``previous_code``.

    Returns:
        The review prompt text.
    """
    # Take the question from the record that was passed in, so the function
    # does not depend on a module-level `question` variable being set.
    question = dataset_item["question"]
    messages = f"""
Now, review the full function below thoroughly, identify any errors, inefficiencies or areas for improvement. 
Check if the answer format is appropriate and if it precisely what is asked in the question.
Pay attention the wording of the question.
Based on your review, assign a score out of 100 to the solution for the question provided.
You need to be very harsh and mean in calculating grades, and never give full marks to ensure that the marks are authoritative.
Return your evaluation in json format: {{ score: number, comment: brief string }}

# Question: {question}
# Previous Code output: {previous_output}
# Previous Code:
{previous_code}
"""
    return messages

In [7]:
def generate_improve_prompt(dataset_item: dict, previous_code: str, previous_output: str, review: str) -> str:
    """Build the prompt asking the model to fix its previous solution using a review.

    Args:
        dataset_item: Question record; only ``dataset_item["question"]`` is read.
        previous_code: Source code produced in the previous iteration.
        previous_output: Result obtained by running ``previous_code``.
        review: Text of the review of ``previous_code``.

    Returns:
        The improvement prompt text.
    """
    # Take the question from the record that was passed in, so the function
    # does not depend on a module-level `question` variable being set.
    question = dataset_item["question"]
    messages = f"""
Based on your review, improve and fix the previous function. If no change is needed, just return the previous code.

# Question: {question}
# Previous Code output: {previous_output}
# Previous Code:
{previous_code}
# Previous code review:
{review}
"""
    return messages

In [8]:
from google.genai import types
from time import sleep

def gemini_model_call(
    message: str,
    model: str = "gemini-2.0-flash",
    temperature: float = 0.1,
    delay_seconds: float = 5,
) -> str:
    """Send a single prompt to a Gemini model and return the reply text.

    Args:
        message: Prompt sent as the request contents.
        model: Model name passed to ``generate_content``.
        temperature: Sampling temperature for the request.
        delay_seconds: Pause after a successful request (presumably to stay
            under the API rate limit -- confirm against the quota in use).

    Returns:
        The reply text. If the request raises, or the reply carries no text,
        a string starting with ``__CODE_GEN_ERROR__`` is returned instead, so
        the result is always a ``str``.
    """
    try:
        response = client.models.generate_content(
            model=model,
            config=types.GenerateContentConfig(
                system_instruction="You are a python programmer who takes a pandas dataset, a question and returns the code to answer that question based on solely the dataset.",
                temperature=temperature,
            ),
            contents=message,
        )
        text = response.text
        sleep(delay_seconds)
    except Exception as e:
        print(f"Error during model generation: {e}")
        return f"__CODE_GEN_ERROR__: {e}"

    # `response.text` may be None when the reply has no text part; surface
    # that as an error marker instead of returning None from a `-> str` function.
    if text is None:
        print("Error during model generation: response contained no text")
        return "__CODE_GEN_ERROR__: response contained no text"
    return text

In [26]:
from datasets import Dataset
from tqdm.auto import tqdm
from time import sleep

# Number of generation rounds per question: one initial draft followed by
# (NUM_ITERATIONS - 1) review-and-improve rounds.
NUM_ITERATIONS = 3

# Rows of qa_df (by position) processed in this run.
qa_dataset = Dataset.from_pandas(qa_df.iloc[[144, 145, 146, 147, 148, 149]])
print(f"Loaded {len(qa_dataset)} questions.")


# Maps the position of each question within qa_dataset to its final answer string.
final_results = {}

for i, data_item in enumerate(tqdm(qa_dataset, desc="Processing Questions")):
    print(data_item)

    # Kept as a module-level name because the prompt context below is built from it.
    question = data_item["question"]

    # Describe the dataset sample to the model: schema plus the sample rows as CSV.
    df_sample = pd.read_parquet(base_data_path + data_item["dataset"] + '/sample.parquet')
    columns = df_sample.columns.tolist()
    dtypes = df_sample.dtypes.to_dict()
    head_csv = df_sample.to_csv(index=False)
    del df_sample

    instruction = f"""# Table schema
Dataset Name: {data_item["dataset"]}
Dataset columns: {columns}
Data Types: {dtypes}
Full Dataset: {head_csv}
# Output format:
Boolean - if asking a yes-no, true-false question
Cell Value - if the answer should be a string value from dataframe. 
Number - if the answer should be a numerical computed value or just a numerical value from dataframe
List[cell values] - if asked for a list of string values from cells of the dataframe
List[number] - if asked for a list of numerical values either computed or directly from cells
# Rules:
- You have access to pandas and numpy only
- Be careful about types and do type conversion when necessary
- Don't return any explanation or formatting outside code
- You should write short, concise and efficient code
- Pay attention to return type and python indentation
- Don't return a dictionary string. Only the exact value from that dictionary which is asked
- Don't make assumptions about the question or dataset that are not explicitly mentioned
# Example:
def generated_answer_func(df: pd.DataFrame):
    # Finds the EducationField with the fewest employees.

    # Args: df: Pandas DataFrame containing employee data.

    # Returns: The EducationField with the minimum number of employees.
    education_field_counts = df['EducationField'].value_counts()
    least_employed_field = education_field_counts.idxmin()
    return least_employed_field
# Question:
{question}"""

    # One chat per question so every later prompt shares the dataset context.
    chat = client.chats.create(model="gemini-2.5-flash-preview-04-17", config=types.GenerateContentConfig(
        temperature=0.1
    ))
    chat.send_message(instruction)
    res = chat.send_message("To answer the question, describe how the answer should be provided, what should be the format and what specific information should be extracted.")
    # A reply without text is treated as an empty description rather than None.
    pred_format = res.text or ""

    current_code = None
    current_postprocess_res = None
    for it in range(NUM_ITERATIONS):
        if it == 0:
            # First round: ask for an initial implementation.
            prompt = generate_prompt_messages(data_item, pred_format)
            pause_seconds = 5
        else:
            # Later rounds: have the model review its last attempt, then revise it.
            print(current_code)
            review = chat.send_message(
                generate_review_prompt(data_item, current_code, current_postprocess_res)
            )

            sleep(1)

            prompt = generate_improve_prompt(data_item, current_code, current_postprocess_res, review.text)
            pause_seconds = 4

        res = chat.send_message(prompt)
        # Fall back to an empty string so a text-less reply is reported by
        # postprocess_response as a marker instead of crashing on None.
        current_code = res.text or ""
        current_postprocess_res = postprocess_response(current_code, data_item['dataset'])

        # Pause between requests (presumably for API rate limiting -- confirm).
        sleep(pause_seconds)

    final_results[i] = current_postprocess_res
    print(f"  Finished processing Question ID: {i}. Final result stored.")
    print("-" * 20)
    sleep(10)


Loaded 6 questions.


Processing Questions:   0%|          | 0/6 [00:00<?, ?it/s]

{'question': "Are there products from the 'Hacendado' brand in more than one country?", 'dataset': '070_OpenFoodFacts', '__index_level_0__': 144}
ans:  True
```python
import pandas as pd
import numpy as np

def generated_answer_func(df: pd.DataFrame):
    # Filter the dataset to include only products where the 'brands' column contains 'Hacendado'.
    hacendado_products = df[df['brands'].str.contains('Hacendado', na=False)]

    # Extract the unique values from the 'countries_en' column for these filtered products.
    # Drop NaN values before getting unique countries.
    unique_countries = hacendado_products['countries_en'].dropna().unique()

    # Count the number of unique countries.
    # The answer is True if the count of unique countries is greater than 1, and False otherwise.
    return len(unique_countries) > 1
```
ans:  True
```python
import pandas as pd
import numpy as np

def generated_answer_func(df: pd.DataFrame):
    # Filter the dataset to include only products where th

In [27]:
final_results

{0: 'True', 1: 'Hacendado', 2: '[Spain]', 3: 'Green Dot', 4: '1', 5: '[]'}

###### **Evaluation:** 84.67% accuracy