<a href="https://colab.research.google.com/github/nicolaCirillo/ate-it/blob/main/baseline/subtask_a_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import codecs
import json

def load_data(file_path):
  """
  Loads sentences from a CSV or JSON file and returns a dictionary
  where keys are (doc_id, par_id, sent_id) tuples and values are sentences.

  CSV files are read positionally: the first four columns are taken to be
  doc_id, par_id, sent_id and the sentence text, and any further columns are
  ignored. Missing values are replaced with empty strings.

  JSON files must hold a top-level "data" list of objects. Each object gives
  the sentence under "sentence_text" and its identifiers under either the
  short keys ("doc_id", "par_id", "sent_id") or the long keys
  ("document_id", "paragraph_id", "sentence_id").

  Args:
    file_path: The path to the input file (CSV or JSON). The extension is
      matched case-insensitively.

  Returns:
    A dictionary containing the loaded data, in file order. If the same key
    occurs more than once, the last sentence wins.

  Raises:
    ValueError: If the file format is not supported, or if a CSV file has
      fewer than four columns.
    KeyError: If a JSON row lacks an identifier or "sentence_text".
  """
  def _field(row, short_key, long_key):
    # Prefer the short key; fall back to the long spelling of the same field.
    return row[short_key] if short_key in row else row[long_key]

  # str() lets path-like objects through; lower() makes '.CSV' / '.JSON' work
  lowered_path = str(file_path).lower()
  if lowered_path.endswith('.csv'):
    # Load data from CSV file; NaN cells (e.g. empty sentences) become ''
    df = pd.read_csv(file_path).fillna('')
    if df.shape[1] < 4:
      raise ValueError("CSV file must have at least four columns: "
                       "doc_id, par_id, sent_id and the sentence text.")
    data = {}
    # name=None yields plain tuples; *_ swallows any columns after the fourth
    for doc_id, par_id, sent_id, sentence_text, *_ in df.itertuples(index=False, name=None):
      data[(doc_id, par_id, sent_id)] = sentence_text
  elif lowered_path.endswith('.json'):
    # Load data from JSON file
    with open(file_path, 'r', encoding='utf-8') as f:
      json_data = json.load(f)
    # Extract sentences from JSON data
    data = {}
    for row in json_data["data"]:
      key = (_field(row, "doc_id", "document_id"),
             _field(row, "par_id", "paragraph_id"),
             _field(row, "sent_id", "sentence_id"))
      data[key] = row["sentence_text"]
  else:
    # Raise error for unsupported file formats
    raise ValueError("Unsupported file format. Only CSV and JSON files are supported.")
  return data

In [2]:
# Read the development split through load_data (defined in the cell above)
data = load_data(file_path='../data/subtask_a_dev.csv')

In [4]:
import getpass

import google.generativeai as genai

# Get the API key from the user. getpass does not echo what is typed, so the
# key does not end up in the notebook's saved output as it would with input().
api_key = getpass.getpass("Please enter your Gemini API key: ")
genai.configure(api_key=api_key)

# Define the model
model = genai.GenerativeModel('gemini-2.5-flash')

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Test the model with a single short prompt (a quick check that the model
# object created in the previous cell returns a reply)
response = model.generate_content("Hello, how are you?")

# Print the model's response (the reply text exposed as response.text)
print(response.text)

Hello! I'm doing well, thank you for asking.

How about you?


In [6]:
# Define the batch size (maximum number of sentences sent in one request;
# the last request holds fewer when the sentence count is not a multiple of it)

batch_size = 20

# Define system prompt
# The last instruction asks for one list per input sentence instead of a fixed
# count of batch_size lists, so it stays true for a shorter final batch.
system_prompt = f"""You are an automatic term extraction agent. You will receive a list of sentences as input.
Your role is to extract waste management terms from the sentences. Output a list of terms for each sentence.

strictly adhere to the Example Output Format:

Example Output Format:
Sentence 1: [term1; term2; term3; term4]
Sentence 2: [term5; term6]
Sentence 3: []
Sentence 4: []
Sentence 5: [term7]


Instructions:
* Extract only terms, ignore named entities;
* Do not extract nested terms;
* Extract only terms related to waste management, ignoring other domains;
* If a sentence contains no terms, output an empty list for that sentence;
* You must output one list of terms for each input sentence (a request contains at most {batch_size} sentences), reusing the sentence numbers given in the input.

"""

In [9]:
from tqdm import tqdm

# Work on a list so the sentences can be sliced into batches
sentences = list(data.values())
response_list = []
user_prompt = ""  # stays empty when there are no sentences at all

# The progress bar counts sentences; it advances once per completed batch
with tqdm(total=len(sentences)) as progress:
  # Walk over the sentences one batch at a time. The last batch may be shorter
  # than batch_size; no request is sent when nothing is left, so a sentence
  # count that is an exact multiple of batch_size does not trigger an empty call.
  for start in range(0, len(sentences), batch_size):
    batch = sentences[start:start + batch_size]
    # Number each sentence by its 1-based position in the whole dataset and
    # separate consecutive sentences with a blank line
    user_prompt = "\n\n".join(
        f"Sentence {start + offset + 1}:\n {sent}"
        for offset, sent in enumerate(batch)
    )
    # Send the prompts to the model and collect response
    response = model.generate_content(
        f"System: {system_prompt}\nUser: {user_prompt}"
        )
    response_list.append(response.text)
    progress.update(len(batch))


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 577/577 [11:52<00:00,  1.23s/it]


In [10]:
# Format and check the output
import re
import warnings


def _parse_response(response_text):
  """Extracts (sentence number, terms) pairs from one model response.

  A line is used only if it contains both a "Sentence <n>:" label and a
  bracketed list; blank lines and any other text are skipped. Terms are split
  on ';', stripped, lower-cased, and empty entries are dropped.

  Args:
    response_text: The text of a single model response.

  Returns:
    A list of (sentence_number, terms) tuples in the order they appear.
  """
  parsed = []
  for line in response_text.split('\n'):
    number_match = re.search(r'Sentence (\d+):', line)
    list_match = re.search(r'\[(.*?)\]', line)
    # Skip lines that are not a complete "Sentence n: [...]" entry
    if number_match is None or list_match is None:
      continue
    terms = [term.strip().lower() for term in list_match.group(1).split(';')
             if term.strip() != '']
    parsed.append((int(number_match.group(1)), terms))
  return parsed


out_data = []
for response in response_list:
  out_data.extend(_parse_response(response))

if len(out_data) != len(data):
  raise ValueError("Output data is not the same length as input data.")
else:
  print("Output data is the same length as input data.")

# Sentences were numbered 1..len(data) in the prompts and the term lists are
# later matched to sentences by position, so flag any numbering mismatch.
# A warning (not an error) keeps runs with renumbered replies usable.
mismatched = [(position, sentence_number)
              for position, (sentence_number, _) in enumerate(out_data, start=1)
              if sentence_number != position]
if mismatched:
  warnings.warn(
      f"{len(mismatched)} parsed sentence numbers differ from their position "
      f"(first: position {mismatched[0][0]} is labelled {mismatched[0][1]}); "
      "term lists may be misaligned with the input sentences.")

Output data is the same length as input data.


In [11]:
import codecs
import json

# Create json output file: one record per input sentence, carrying the term
# list stored at the same position in out_data
json_data = {
    "data": [
        {
            "document_id": document_id,
            "paragraph_id": paragraph_id,
            "sentence_id": sentence_id,
            "sentence_text": text,
            "term_list": out_data[position][1],
        }
        for position, ((document_id, paragraph_id, sentence_id), text)
        in enumerate(data.items())
    ]
}

# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file
with codecs.open('baseline_a_1.json', 'w', 'utf-8') as out_file:
  json.dump(json_data, out_file, ensure_ascii=False, indent=4)