# Batch creation
These are auxiliary functions for creating the JSONL files needed for dataset preparation with OpenAI's batch processing.

In [1]:
import re

# .TXT Processing functions

In [2]:
def get_only_text(line):
    '''
    Splits a string into the text before its first digit and the remainder.
    Args:
        line (str): The input string from which to extract text.
    Returns:
        tuple: A pair ``(text, numbers)`` where ``text`` is everything before
            the first digit and ``numbers`` is everything from that digit on,
            both stripped of surrounding whitespace. If ``line`` is not a
            string or contains no digit, the pair is ``(NaN, NaN)``.
    '''
    nan = float('nan')

    try:
        line = line.strip()
    except AttributeError:
        # Non-string input: return a pair so callers can always unpack
        # two values, exactly as in the "no digit found" case below.
        return nan, nan

    match = re.search(r'([^\d]*)(\d.*)', line)
    if match is None:
        return nan, nan  # No digit anywhere in the line

    return match.group(1).strip(), match.group(2).strip()

# JSONL generation Functions

In [5]:
# This is for title generation.
def make_message(category, science=False):
    '''
    Builds the chat messages that ask the model for a single title in a given category.
    Args:
        category (str): The category the requested title should belong to.
        science (bool): When True, a scientific-article title is requested instead of a book title. Defaults to False.
    Returns:
        list: Two message dictionaries (a "developer" instruction followed by the "user" request), in the OpenAI chat completion format.
    '''
    if science:
        request = f'Write a scientific title for a scientific article classified as {category}: \\n'
    else:
        request = f'Write a title for a book classified as {category}: \\n'

    instruction = 'Answer only with the title. Do not include any other text.'
    return [
        dict(role="developer", content=instruction),
        dict(role="user", content=request),
    ]


# This is for description generation.
def make_description_message(category, title, science=False):
    '''
    Creates a message structure for generating a description based on a given category and title.
    
    Args:
        category (str): The category of the book or article.
        title (str): The title for which a description is to be generated.
        science (bool): If True, a scientific description of a scientific article is requested
            instead of a book description. Defaults to False.
        
    Returns:
        list: A list of two dictionaries (a "developer" instruction followed by the "user" request).
            The format is OpenAI chat completion API compatible.
    '''
    dev_prompt = 'Answer only with the description. Do not include any other text.'
    if science:
        # The noun "description" was missing from this prompt, leaving the
        # request grammatically incomplete ("Write a scientific for ...").
        user_prompt = f'Write a scientific description for the scientific article titled "{title}" classified as {category}: \\n'
    else:
        user_prompt = f'Write a description for the book titled "{title}" classified as {category}: \\n'
    messages = [
        {"role": "developer", "content": dev_prompt},
        {"role": "user", "content": user_prompt}
    ]
    return messages


# This creates a single JSONL entry
def make_jsonl_entry(custom_id, model_id, body_messages:list, max_tokens=50, temps=1.4, num_responses=4):
    '''
    Creates a JSONL entry for OpenAI API requests.
    
    Args:
        custom_id (str or int): A unique identifier for the request. Strings are used as-is;
            any other value must be convertible to an integer and is stored as that
            integer's string form.
        model_id (str): The model name placed in the request body.
        body_messages (list): A list of message dictionaries to be included in the request body.
        max_tokens (int): The maximum number of tokens to generate in the response.
        temps (float): The sampling temperature placed in the request body.
        num_responses (int): The number of completions requested (the "n" field).
        
    Returns:
        dict: A dictionary representing the JSONL entry.

    Raises:
        ValueError: If custom_id is not a string and cannot be converted to an integer.
    '''
    if not isinstance(custom_id, str):
        try:
            # Validate that the id is integer-like, then store it as a string:
            # the id is emitted as a string in the request line.
            custom_id = str(int(custom_id))
        except (TypeError, ValueError) as err:
            # int(None) and similar raise TypeError rather than ValueError;
            # report both the same way.
            raise ValueError("custom_id must be a string or an integer convertible to a string.") from err

    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model_id,
            "messages": body_messages,
            "max_tokens": max_tokens,
            "stop": ["\n", "User:", "Developer:", "\\\\n"],
            "temperature": temps,
            "n": num_responses
        }
    }

# .TXT Processing

We expect a .txt file with the following format:

```
<category> <udc_codes>
Example 123.456.789
Lorem Ipsum 123.123
```

Where the text before the first digit is the category and the rest of the line is the UDC code(s).

In [None]:
# Only the categories starting with "A" are processed: the source .txt file
# and the CSV file it is converted into.
input_path, output_path = 'A.txt', 'A_output.csv'

In [None]:
# Read every raw line of the input .txt file.
with open(input_path, 'r') as file:
    lines = file.readlines()

# Number of UDC code columns written per row: shorter rows are padded with
# empty fields, longer rows are truncated.
max_classifications = 5

# Opening in 'w' mode creates the file or truncates an existing one, so no
# separate clean-up step is needed. The file is opened once for the header
# and all rows instead of being re-opened in append mode for every row.
with open(output_path, 'w') as output_file:
    output_file.write("uid,text, udc_1, udc_2, udc_3, udc_4, udc_5\n")

    for unique_index, line in enumerate(lines):
        text, numbers = get_only_text(line.strip())

        # get_only_text returns NaN placeholders when a line contains no
        # digit (e.g. a blank line or a header line). Such lines carry no UDC
        # code, so skip them instead of crashing on float.replace below.
        # unique_index still counts them, so uid stays the input line index.
        if not isinstance(text, str) or not isinstance(numbers, str):
            continue

        # Remove all quotes from the text, replacing with `
        text = text.replace('\"', '`').replace('\'', '`')
        numbers = numbers.replace('\"', '`').replace('\'', '`')

        # Pad with empty fields up to max_classifications, then truncate so
        # every row has exactly max_classifications code columns.
        numbers_list = numbers.split(',')
        numbers_list += [''] * (max_classifications - len(numbers_list))
        numbers = ','.join(numbers_list[:max_classifications])

        # We then write this to a .csv file for later
        output_file.write(f"{unique_index},\"{text}\",{numbers}\n")



# Jsonl file creation
We'll use a different custom_id system for later uses:


custom_id: cat_num-request-iteration_num


As we will generate 5 titles for each category. Example:


Category: Cats, zoology (line 303 on the .csv file)  
303-request-1 ... "Exploration of the life of cats"  
303-request-2 ... "How cats came to be"  
303-request-3 ... "Why are cats so cute?"  


In [3]:
# We'll use pandas to read the CSV file
import pandas as pd
# Load the CSV file located at output_path into a DataFrame.
df = pd.read_csv(output_path)
# Bare expression as the last statement of the cell; presumably there to
# display the DataFrame as the notebook cell output.
df

Unnamed: 0,uid,text,udc_1,udc_2,udc_3,udc_4,udc_5
0,0,A Beida,1¢(533.22),,,,
1,1,A-C horizons,631.472.6,,,,
2,2,a cappella music,784.1,,,,
3,3,a la carte menus,642.53,,,,
4,4,a posteriori (natural language) systems,1¢=929.3,,,,
...,...,...,...,...,...,...,...
6355,6355,Azua de Compostela,1e(729.331),,,,
6356,6356,Azuay,1e(866.18),,,,
6357,6357,Azul,1¢(821.2),,,,
6358,6358,azulene,547.913.5,,,,


In [7]:
# Imported here so this cell is self-contained: each request is serialised
# with json.dumps so that every line of the file is valid JSON.
import json

json_file_path = 'A.jsonl'

# Opening in 'w' mode creates the file or empties an existing one, so no
# separate clearing step is needed. The file is opened once for all entries.
with open(json_file_path, 'w') as json_file:
    for index, row in df.iterrows():
        category = row['text']

        # Request for book titles in this category.
        cust_id = str(row['uid']) + '-request-book'
        temperature = 1.3  # Fixed temperature for book title requests
        title_messages = make_message(category)
        title_entry = make_jsonl_entry(
            custom_id=cust_id,
            model_id='gpt-4.1-mini-2025-04-14',
            body_messages=title_messages,
            max_tokens=50,
            temps=temperature
        )
        # json.dumps handles quoting and escaping. Writing the dict's repr and
        # swapping ' for " afterwards corrupts any entry whose text contains
        # an apostrophe or a double quote.
        json_file.write(json.dumps(title_entry) + "\n")

        # Also request scientific paper titles for the same category.
        cust_id = str(row['uid']) + '-request-sci'
        temperature = 1.4  # Fixed temperature for scientific paper request
        title_messages = make_message(category, science=True)
        title_entry = make_jsonl_entry(
            custom_id=cust_id,
            model_id='gpt-4.1-mini-2025-04-14',
            body_messages=title_messages,
            max_tokens=50,
            temps=temperature,
            num_responses=2
        )
        json_file.write(json.dumps(title_entry) + "\n")
