## Step 2 (still being implemented)

This step processes each specific interest group. By this stage, each
individual submission is expected to have been correctly classified into one
of the following categories:

- News
- Platform
- Civil
- Academic
- Individual

We are interested in the first four. Where the AI classified a submission into
one of the first four categories that we had not placed there in our original
sweep, we will have checked it as the last part of step_1 and corrected the
list in the file `list.json`.


In [None]:
import json
import os
from datetime import datetime
import pytz
from collections import Counter

promt_file = 'prompt_no_guidance.txt'


def prompt_formatted() -> str:
    """Assemble the system prompt that accompanies every request.

    The template named by ``promt_file`` is read and its ``|issues|`` and
    ``|fact_sheet|`` placeholders are replaced with the contents of
    ``prompt_issues.md`` and ``prompt_fact_sheet.md`` respectively. All
    paths are resolved relative to the current working directory.

    The ``|guidance_note|`` placeholder is not substituted: the guidance-note
    step is currently disabled, so any such marker in the template is
    returned unchanged.

    Returns:
        The template text with both placeholders filled in.
    """
    def _slurp(path):
        # Return the whole file as one string.
        with open(path, 'r') as handle:
            return handle.read()

    template = _slurp(promt_file)

    # Insertion order matters: issues are substituted before the fact sheet.
    substitutions = {
        '|issues|': _slurp('prompt_issues.md'),
        '|fact_sheet|': _slurp('prompt_fact_sheet.md'),
    }
    for placeholder, text in substitutions.items():
        template = template.replace(placeholder, text)

    return template

def get_function(type: str):
    """Load the tool (function-call) definition for a submission group.

    Args:
        type: The submission group. ``'academic'`` and ``'civil'`` share
            ``function_civil_academic.json``; ``'platform'`` and
            ``'industry'`` share ``function_digital.json``; ``'news'`` uses
            ``function_news.json``. (The parameter name shadows the builtin
            but is kept so existing keyword callers keep working.)

    Returns:
        The parsed JSON content of the matching definition file, read
        relative to the current working directory.

    Raises:
        ValueError: If ``type`` is not one of the groups listed above.
            Previously this case fell through and returned ``None``, which
            ended up as ``"tools": [null]`` in the request payload.
        OSError: If the definition file cannot be opened.
        json.JSONDecodeError: If the definition file is not valid JSON.
    """
    definition_files = {
        'academic': 'function_civil_academic.json',
        'civil': 'function_civil_academic.json',
        'platform': 'function_digital.json',
        'industry': 'function_digital.json',
        'news': 'function_news.json',
    }

    try:
        path = definition_files[type]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments; both mean "no such group".
        raise ValueError(
            f"No function definition for group {type!r}; "
            f"expected one of {sorted(definition_files)}"
        ) from None

    # JSON text is UTF-8 by specification, so do not rely on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def get_file_path(doc_id, folder_path = './data/files'):
    """Locate the stored file whose name begins with ``doc_id``.

    Args:
        doc_id: Prefix to look for at the start of each file name.
        folder_path: Directory to scan (not recursive).

    Returns:
        ``folder_path`` joined with the first matching entry, in the order
        ``os.listdir`` yields entries, or ``None`` when nothing matches.
    """
    # Lazy generator: scanning stops at the first hit, like an early return.
    candidates = (entry for entry in os.listdir(folder_path) if entry.startswith(doc_id))
    first_hit = next(candidates, None)
    if first_hit is None:
        return None
    return os.path.join(folder_path, first_hit)

# Load original JSON list
with open('./data/step1/list.json', 'r') as f:
    list_data = json.load(f)

# Keep only the items of interest: drop individual submissions and any item
# whose group is empty/None (those were ignored in step 1).
to_process = [item for item in list_data["data"] if item['group'] and item['group'] != 'individual']

# Create step 2 directories
os.makedirs('./data/step2', exist_ok=True)
os.makedirs('./data/step2/toProcess', exist_ok=True)
os.makedirs('./data/step2/output', exist_ok=True)


def _batch_request(custom_id, system_prompt, user_content, tool):
    """Build one chat-completions request entry for the batch JSONL file.

    The same structure is stored (with placeholder id/content) in the run
    metadata, so it is built in one place to keep the two in sync.
    """
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-2024-05-13",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
            "max_tokens": 4096,
            "temperature": 1e-9,
            "frequency_penalty": 0,
            "presence_penalty": 0,
            "top_p": 0,
            "tools": [tool],
            "tool_choice": {"type": "function", "function": {"name": "submission_eval"}},
        },
    }


file_counter = 0
jsonl_file = f"./data/step2/toProcess/jsonl_{file_counter}.jsonl"

# Count occurrences of each group among the items selected for processing
# (this includes items that are later skipped because of an error).
group_counts_dict = dict(Counter(item['group'] for item in to_process))

# The system prompt is identical for every item, so build it once. Binding it
# here also keeps the metadata block below valid when nothing is processed.
prompt = prompt_formatted()

# Tool definition of the most recently processed item; it is recorded in the
# run metadata below. Stays None if no item gets that far.
function = None

# NOTE(review): the JSONL files are opened in append mode, so re-running this
# cell adds to files left over from a previous run — confirm that is intended.
for i in to_process:
    try:
        md_file_path = get_file_path(i.get("uniqueId"))
        # NOTE(review): the file content and the submitter are read but not
        # used in the request; the user message comes from the item metadata
        # instead. The reads are kept because a missing file or submitter
        # causes the item to be skipped — confirm which source is intended.
        with open(md_file_path, 'r') as file:
            submission = file.read()
        sub_author = i["submitter"]
        submission_formatted = i["metadata"]["SUBMISSION_CONTENT"]
        function = get_function(i["group"])

        ldata = _batch_request(i["uniqueId"], prompt, submission_formatted, function)

        # Roll over to a new file once the current one reaches 85 MiB
        # (presumably to stay under the batch upload size limit — confirm).
        if os.path.exists(jsonl_file) and os.path.getsize(jsonl_file) >= 85 * 1024 * 1024:
            file_counter += 1
            jsonl_file = f"./data/step2/toProcess/jsonl_{file_counter}.jsonl"

        with open(jsonl_file, 'a') as f:
            json.dump(ldata, f)
            f.write('\n')

        # Tag the item with its batch only after the line was written.
        i["metadata"]["step_2"] = {"batch": f'jsonl_{file_counter}.jsonl'}

    except Exception as e:
        # Best effort: report which item failed and carry on with the rest.
        print(f'Skipping {i.get("uniqueId")}: {e}')

local_timezone = pytz.timezone('Australia/Sydney')
current_time = datetime.now(local_timezone).strftime("%Y-%m-%d %H:%M:%S %Z")

list_data["metadata"]["step_2"] = {
    "jsonl_batch_creation": {
        "timestamp": current_time,
        "type_breakdown": group_counts_dict,
    },
    "ai_parameters": _batch_request("SUBMISSION_ID", prompt, "SUBMISSION_CONTENT", function),
}

# Serialise first so a serialisation error cannot leave list.json truncated,
# and write list_data (the original passed the builtin `list`, which raised
# TypeError after the file had already been emptied).
serialized = json.dumps(list_data)
with open('./data/step1/list.json', 'w') as f:
    f.write(serialized)