## Step 2 (still being implemented)

This step processes each specific interest group. It is expected that, at this
stage, each individual submission has been correctly classified into one of the
following categories:

- News
- Platform
- Civil
- Academic
- Individual

We are interested in the first four. Where the AI classified a submission into
one of the first four categories that we had not identified in our original
sweep, we will have checked it as the last part of step_1 and corrected the
list in the file `list.json`.


In [9]:
# CELL 1
import json
import os
from datetime import datetime
import pytz
from collections import Counter

promt_file = 'prompt_no_guidance.txt'


def prompt_formatted() -> str:
    """Build the system prompt used for each individual request.

    Loads the template named by ``promt_file`` and fills its ``|issues|`` and
    ``|fact_sheet|`` placeholders with the contents of ``prompt_issues.md``
    and ``prompt_fact_sheet.md``. All paths are resolved relative to the
    current working directory.

    The ``|guidance_note|`` placeholder is left untouched: the step that would
    substitute ``prompt_guidance_note.md`` is currently disabled.

    Returns:
        The template text with both placeholders substituted.
    """
    def _slurp(path: str) -> str:
        # Return the whole content of a text file.
        with open(path, 'r') as handle:
            return handle.read()

    # Files are read in the same order as the placeholders are filled.
    template = _slurp(promt_file)
    issues_text = _slurp('prompt_issues.md')
    fact_sheet_text = _slurp('prompt_fact_sheet.md')

    return (
        template
        .replace('|issues|', issues_text)
        .replace('|fact_sheet|', fact_sheet_text)
    )

def get_function(type: str):
    """Load the tool (function-call) definition for a submission group.

    Groups map to definition files in the current working directory:

    - ``academic``, ``civil``, ``political``, ``government``
      -> ``function_civil_academic.json``
    - ``platform``, ``industry`` -> ``function_digital.json``
    - ``news`` -> ``function_news.json``

    Args:
        type: The group name (compared exactly, case-sensitive).

    Returns:
        The parsed JSON content of the matching file, or ``None`` when the
        group is not one of those listed above.
    """
    definition_sources = (
        (('academic', 'civil', 'political', 'government'), 'function_civil_academic.json'),
        (('platform', 'industry'), 'function_digital.json'),
        (('news',), 'function_news.json'),
    )
    for group_names, definition_path in definition_sources:
        if type in group_names:
            with open(definition_path, 'r') as handle:
                return json.load(handle)
    return None

def get_file_path(doc_id, folder_path = './data/files'):
    """Find the file in ``folder_path`` whose name starts with ``doc_id``.

    Args:
        doc_id: Prefix to look for at the start of each file name.
        folder_path: Directory to search (not recursive).

    Returns:
        ``folder_path`` joined with the first matching entry, in the order
        reported by ``os.listdir``, or ``None`` when nothing matches.
    """
    candidates = (
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.startswith(doc_id)
    )
    return next(candidates, None)

# Load the master list produced by step 1
with open('./data/step1/list.json', 'r') as f:
    list_data = json.load(f)

# Keep only the items of interest: drop individual submissions and anything
# without a group (i.e. items that were ignored)
to_process = [item for item in list_data["data"] if item['group'] and item['group'] != 'individual']

# Create step 2 directories
os.makedirs('./data/step2', exist_ok=True)
os.makedirs('./data/step2/toProcess', exist_ok=True)
os.makedirs('./data/step2/output', exist_ok=True)

# NOTE(review): batch files are opened in append mode below, so re-running
# this cell without first emptying ./data/step2/toProcess appends every request
# a second time to the existing files.
file_counter = 0
jsonl_file = f"./data/step2/toProcess/jsonl_{file_counter}.jsonl"

# A batch file is rotated once it reaches this size.
# NOTE(review): presumably chosen to stay under the Batch API input-file size
# limit - confirm against the current limit.
MAX_BATCH_FILE_BYTES = 85 * 1024 * 1024

# Number of items per group, recorded in the run metadata below
group_values = [item['group'] for item in to_process]
group_counts = Counter(group_values)
group_counts_dict = dict(group_counts)

# Tool definition used for each group, recorded in the run metadata below
functions = {}

# The system prompt is the same for every request, so build it once. This also
# guarantees `prompt` is defined for the metadata even if no item is processed.
prompt = prompt_formatted()


def _build_request(custom_id, system_prompt, user_content, tools):
    """Return one batch request (one JSONL line) for /v1/chat/completions.

    Used both for the real requests and for the parameter record stored in the
    metadata, so the two cannot drift apart.
    """
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-2024-05-13",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
            "max_tokens": 4096,
            "temperature": 1e-9,
            "frequency_penalty": 0,
            "presence_penalty": 0,
            "top_p": 0,
            "tools": tools,
            "tool_choice": {"type": "function", "function": {"name": "submission_eval"}},
        },
    }


for i in to_process:
    try:
        # Items without a source file in ./data/files are skipped.
        # NOTE(review): the file content itself is not sent; the request uses
        # metadata["SUBMISSION_CONTENT"] - confirm that is intended.
        md_file_path = get_file_path(i.get("uniqueId"))
        if md_file_path is None:
            raise FileNotFoundError("no matching file found in ./data/files")

        submission_formatted = i["metadata"]["SUBMISSION_CONTENT"]
        function = get_function(i["group"])
        functions[i["group"]] = function

        ldata = _build_request(i["uniqueId"], prompt, submission_formatted, [function])

        # Start a new batch file once the current one has reached the size cap
        if os.path.exists(jsonl_file) and os.path.getsize(jsonl_file) >= MAX_BATCH_FILE_BYTES:
            file_counter += 1
            jsonl_file = f"./data/step2/toProcess/jsonl_{file_counter}.jsonl"

        with open(jsonl_file, 'a') as f:
            json.dump(ldata, f)
            f.write('\n')

        # Record the batch file only after the request line was actually written
        i["metadata"]["step_2"] = {"batch": f'jsonl_{file_counter}.jsonl'}

    except Exception as e:
        # Best effort: report which item failed and carry on with the rest
        print(f'Skipping {i.get("uniqueId")}: {type(e).__name__}: {e}')
        continue

local_timezone = pytz.timezone('Australia/Sydney')
current_time = datetime.now(local_timezone).strftime("%Y-%m-%d %H:%M:%S %Z")

list_data["metadata"]["step_2"] = {
    "jsonl_batch_creation": {
        "timestamp": current_time,
        "type_breakdown": group_counts_dict,
    },
    # Same payload as the real requests, with placeholders for per-item values
    "ai_parameters": _build_request("SUBMISSION_ID", prompt, "SUBMISSION_CONTENT", "AS_PER_GROUP_FN"),
    "functions": functions,
}

with open('./data/step1/list.json', 'w') as f:
    json.dump(list_data, f)

## We now have a folder with all the prepared files for OpenAI batch calls

We will upload each of these files to OpenAI and then trigger batch processing
of each.


In [11]:
# CELL 2
import json
import os
from datetime import datetime

import pytz
from openai import OpenAI

client = OpenAI(api_key=os.getenv('OPENAI_KEY'), max_retries=3)

jsonl_dir = './data/step2/toProcess'

# Every regular *.jsonl file in the folder, sorted so the upload order (and
# therefore the order of the recorded file ids) is the same on every run
jsonl_files = sorted(
    f for f in os.listdir(jsonl_dir)
    if os.path.isfile(os.path.join(jsonl_dir, f)) and f.endswith('.jsonl')
)

file_ids = []

for file in jsonl_files:
    # Close the handle as soon as the upload has finished
    with open(f"{jsonl_dir}/{file}", "rb") as batch_file:
        file_object = client.files.create(
            file=batch_file,
            purpose="batch"
        )
    file_ids.append(file_object.id)

# `list_data` rather than `list`, so the builtin is not shadowed for the rest
# of the notebook session
with open('./data/step1/list.json', 'r') as f:
    list_data = json.load(f)

local_timezone = pytz.timezone('Australia/Sydney')
current_time = datetime.now(local_timezone).strftime("%Y-%m-%d %H:%M:%S %Z")

# Record the uploaded file ids; the batch-creation cell reads them back
list_data["metadata"]["step_2"]["file_upload"] = {
            "timestamp": current_time,
            "file_ids": file_ids,
        }

with open('./data/step1/list.json', 'w') as f:
    json.dump(list_data, f)

In [12]:
# CELL 3
import json
import os
from datetime import datetime

import pytz
from openai import OpenAI

client = OpenAI(api_key=os.getenv('OPENAI_KEY'), max_retries=3)

# We have now uploaded all the files and have their IDs, let's create a batch job for each
batch_ids = []

# `list_data` rather than `list`, so the builtin is not shadowed for the rest
# of the notebook session
with open('./data/step1/list.json', 'r') as f:
    list_data = json.load(f)

file_ids = list_data["metadata"]["step_2"]["file_upload"]["file_ids"]

for file_id in file_ids:
    job = client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h"
          )
    batch_ids.append(job.id)

print(batch_ids)

local_timezone = pytz.timezone('Australia/Sydney')
current_time = datetime.now(local_timezone).strftime("%Y-%m-%d %H:%M:%S %Z")

# Record the batch ids; the status and download cells read them back
list_data["metadata"]["step_2"]["batch_creation"] = {
            "timestamp": current_time,
            "batch_ids": batch_ids,
        }

with open('./data/step1/list.json', 'w') as f:
    json.dump(list_data, f)

['batch_sDcEEYbyii64rVYSXPYiRUP6']


#### The batch processes should now be underway; they can take up to 24 hours

We can run the following cell to check on their progress.


In [14]:
import json
import os

from openai import OpenAI

client = OpenAI(api_key=os.getenv('OPENAI_KEY'),max_retries=3)

# `list_data` rather than `list`, so the builtin is not shadowed for the rest
# of the notebook session
with open('./data/step1/list.json', 'r') as f:
    list_data = json.load(f)

desired_batch_ids = list_data["metadata"]["step_2"]["batch_creation"]["batch_ids"]

# Look each batch up by id. Reading `.data` of `client.batches.list()` only
# covers the first page of results, so batches older than that page would be
# silently missing from the report.
for batch_id in desired_batch_ids:
    batch = client.batches.retrieve(batch_id)
    print(batch.id, batch.status, batch.request_counts)

batch_sDcEEYbyii64rVYSXPYiRUP6 completed BatchRequestCounts(completed=123, failed=0, total=123)


#### Once processing is done, we can download the completed files

Files are saved here: `./data/step2/output`


In [17]:
# CELL 4
import json
import os
from datetime import datetime

import pytz
from openai import OpenAI

client = OpenAI(api_key=os.getenv('OPENAI_KEY'),max_retries=3)

# `list_data` rather than `list`, so the builtin is not shadowed for the rest
# of the notebook session
with open('./data/step1/list.json', 'r') as f:
    list_data = json.load(f)

# We only want to download the batch jobs that were set up in the
# batch-creation cell (CELL 3)
desired_batch_ids = list_data["metadata"]["step_2"]["batch_creation"]["batch_ids"]

success_count = 0
error_count = 0

created_time = None
completion_time = None

success_files = []
err_files = []

# Look each batch up by id. Reading `.data` of `client.batches.list()` only
# covers the first page of results, so older batches would be skipped.
for batch_id in desired_batch_ids:
    batch = client.batches.retrieve(batch_id)

    # Earliest creation time across the batches
    if created_time is None or batch.created_at < created_time:
        created_time = batch.created_at
    # Latest completion time; `completed_at` is None for a batch that has not
    # completed, so it must not take part in the comparison
    if batch.completed_at is not None and (
        completion_time is None or batch.completed_at > completion_time
    ):
        completion_time = batch.completed_at

    if batch.output_file_id:
        success_count += batch.request_counts.total - batch.request_counts.failed
        output_file = batch.output_file_id
        content = client.files.content(output_file)
        jsonl_file_path = f'./data/step2/output/{output_file}.jsonl'
        content.write_to_file(jsonl_file_path)
        success_files.append(jsonl_file_path)
    # Handle error files
    if batch.error_file_id:
        error_count += batch.request_counts.failed
        err_file = batch.error_file_id
        err_content = client.files.content(err_file)
        err_jsonl_file_path = f'./data/step2/output/err_{err_file}.jsonl'
        err_content.write_to_file(err_jsonl_file_path)
        err_files.append(err_jsonl_file_path)

local_timezone = pytz.timezone('Australia/Sydney')
current_time = datetime.now(local_timezone).strftime("%Y-%m-%d %H:%M:%S %Z")

# Seconds from the first batch being created to the last one completing;
# None when no batch has completed yet (or no batch ids were recorded)
if created_time is not None and completion_time is not None:
    time_difference = completion_time - created_time
else:
    time_difference = None

list_data["metadata"]["step_2"]["batch_creation"]["download_timestamp"] = current_time
# The key below is misspelt; it is kept as-is because it is part of the stored data
list_data["metadata"]["step_2"]["batch_creation"]["completetion_duration_seconds"] = time_difference
list_data["metadata"]["step_2"]["batch_creation"]["success"] = success_count
list_data["metadata"]["step_2"]["batch_creation"]["errors"] = error_count
list_data["metadata"]["step_2"]["batch_creation"]["success_files"] = success_files
list_data["metadata"]["step_2"]["batch_creation"]["error_files"] = err_files

with open('./data/step1/list.json', 'w') as f:
    json.dump(list_data, f)

## Process AI responses and save data

Now that we have all the AI responses, we need to process and save the results.
This will update the JSON list file (`./data/step1/list.json`) with the step 2
results, and also export the responses as an Excel file for review. The Excel
file will be located at: `./data/step2/step2.xlsx`


In [18]:
# CELL 5
from openai import OpenAI
import os
import json
import pandas as pd
from datetime import datetime
import pytz

client = OpenAI(api_key=os.getenv('OPENAI_KEY'),max_retries=3)

# Parses the JSON from a function call, if there is an error in JSON parsing, recalls the LLM with the fix json function to get a valid json response.
def parse_JSON(json_str: str) -> dict:
    """Parse the JSON arguments of a function call, repairing them if needed.

    The string is first parsed directly. If that fails, the string and the
    error message are sent to the model through the module-level ``client``,
    which is forced to call the ``fix_object`` tool; the ``fixedJSON`` value it
    returns is then parsed.

    Args:
        json_str: The JSON text to parse.

    Returns:
        The parsed JSON value.

    Raises:
        Any error raised by the repair request, or by parsing its result, is
        propagated to the caller.
    """
    try:
        return json.loads(json_str)
    except Exception as parse_error:
        # Tool the model is forced to call; its only argument carries the fix
        repair_tool = {
            'type': 'function',
            'function': {
                'name': 'fix_object',
                'description':
                    'You will be given an incorrectly formed JSON Object and a error message. You must fix the incorrect JSON Object and return the valid JSON object.',
                'parameters': {
                    'type': 'object',
                    'properties': {
                        'fixedJSON': {
                            'type': 'string',
                            'description': 'The reformated and error free JSON object. Return the JSON object only!',
                        },
                    },
                    'required': ['fixedJSON'],
                },
            },
        }

        conversation = [
            {
                'role': 'system',
                'content':
                    'Assistant is a large language model designed to fix and return correct JSON objects.',
            },
            {
                'role': 'user',
                'content': f'ORIGINAL ERROR CONTAINING JSON OBJECT:\n\n{json_str}\n\nERROR MESSAGE: {parse_error}',
            },
        ]

        completion = client.chat.completions.create(
            model='gpt-4o-2024-05-13',
            messages=conversation,
            max_tokens=4096,
            temperature=0,
            tools=[repair_tool],
            tool_choice={'type': 'function', 'function': {'name': 'fix_object'}},
        )

        # The tool-call arguments are themselves JSON wrapping the repaired
        # JSON string, hence the two rounds of parsing
        raw_arguments = completion.choices[0].message.tool_calls[0].function.arguments
        wrapper = json.loads(raw_arguments)
        return json.loads(wrapper['fixedJSON'])

# Folder holding the batch result files downloaded from OpenAI
output_folder = './data/step2/output'

# Names of all regular *.jsonl files directly inside the output folder
jsonl_files = []
for entry in os.listdir(output_folder):
    if os.path.isfile(os.path.join(output_folder, entry)) and entry.endswith('.jsonl'):
        jsonl_files.append(entry)

# Read the master list that the AI responses will be merged into
with open('./data/step1/list.json', 'r') as list_file:
    list_data = json.load(list_file)

def get_correct_category(AI_category):
    """Map a category label returned by the AI onto the group names used here.

    The label is lower-cased; ``digital platform`` becomes ``platform`` and
    ``civil society`` becomes ``civil``. Any other label is returned
    lower-cased and otherwise unchanged.
    """
    aliases = {
        'digital platform': 'platform',
        'civil society': 'civil',
    }
    normalised = AI_category.lower()
    return aliases.get(normalised, normalised)

# Running token totals across all successfully processed responses
prompt_tokens = 0
completion_tokens = 0
total_tokens = 0

# Index the list once by uniqueId instead of scanning it for every response
# line. `setdefault` keeps the first entry when an id appears more than once.
items_by_id = {}
for entry in list_data["data"]:
    if 'uniqueId' in entry:
        items_by_id.setdefault(entry['uniqueId'], entry)

# Load the JSONL files (both result files and err_* files live in the folder)
for file in jsonl_files:
    with open(f"{output_folder}/{file}", "r") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at end of file
                continue
            item = json.loads(line)
            # Grab the matching item in our list; ignore ids we do not know
            list_item = items_by_id.get(item['custom_id'])
            if list_item is None:
                continue

            # Per-item step 2 metadata; created here if it was not recorded earlier
            step_meta = list_item.setdefault("metadata", {}).setdefault("step_2", {})

            # `response` can be null when the request failed without an HTTP
            # response; the line's `error` field is stored in that case
            response = item.get("response")
            if not response or response.get("status_code") != 200:
                list_item["step_2"] = None
                step_meta["error"] = response if response else item.get("error")
                continue

            body = response['body']
            json_res = parse_JSON(body['choices'][0]['message']['tool_calls'][0]['function']['arguments'])
            list_item["step_2"] = json_res
            step_meta["system_fingerprint"] = body['system_fingerprint']
            # NOTE(review): `item['id']` is the id of this output line, which
            # looks like a per-request id rather than the batch id - confirm
            step_meta["batch_id"] = item['id']
            prompt_tokens += body["usage"]["prompt_tokens"]
            completion_tokens += body["usage"]["completion_tokens"]
            total_tokens += body["usage"]["total_tokens"]

local_timezone = pytz.timezone('Australia/Sydney')
current_time = datetime.now(local_timezone).strftime("%Y-%m-%d %H:%M:%S %Z")

list_data["metadata"]["step_2"]["batch_processed"] = {"timestamp": current_time}
list_data["metadata"]["step_2"]["usage"] = {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens}

# Save the updated list back to the json file
with open('./data/step1/list.json', 'w') as f:
    json.dump(list_data, f)

# Export the list to an Excel file for review:
# flatten the nested JSON records into one row per item
df = pd.json_normalize(list_data["data"])

# Save DataFrame to Excel
df.to_excel('./data/step2/step2.xlsx', index=False)