In [None]:
'''
TO-DO:
Prompt GPT to generate a ~2000-word report using 5-15 questions from one of the three ESG topics (the main topic). Also leak 1-2
questions from one of the other topics. Store all of these ground-truth questions along with the paragraph in a JSON file. Use
the following format:
[
{
    "text" : (str) the generated text,
    "main_topic": (str) the main topic,
    "main_ground_truth": (Dict) the questions relating to main topic,
    "leak_topic": (str) the leak topic,
    "leak_ground_truth" : (Dict) the questions relating to leak topic,
    "ground_truth": (List) all the questions as a list,
    "input_tokens": (int) number of input tokens,
    "output_tokens": (int) number of output tokens
}
]

The ground_truth dictionary must follow the format:

{
"main_topic": {
    "sub_topic_1": [list of questions],
    ...,
    }
"leak_topic": None or Dict in the same format as main_topic
}

For each of the topics in ESG, repeat this process about 6-7 times. End with ~20 total paragraphs.
'''

In [None]:
# general imports
import json
import os
import random
from pprint import pprint
from typing import Dict, Any, List

# third-party imports
import openai

# langchain imports
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

In [None]:
# Read the API key from the environment so a real key never has to be written
# into (and accidentally committed with) this notebook. The original
# placeholder is kept as the fallback for backward compatibility.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "API KEY HERE")
# Model name written into every batch request.
MODEL="gpt-4o-mini"

# OpenAI client used below to upload the batch file and to create/retrieve the batch.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# function to load the atomic query dictionary
def load_json(file_path):
    """Load a JSON file and return its parsed content wrapped in a list.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A one-element list whose only item is the parsed JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return [json.load(file)]

In [None]:
# function to write list of dictionaries to a json
def write_to_json(data, file_path):
    """Serialize ``data`` as JSON with a 4-space indent into ``file_path``.

    The target file is created or overwritten. Nothing is returned.
    """
    with open(file_path, mode='w') as handle:
        json.dump(data, fp=handle, indent=4)

In [None]:
# function to read jsonl file
def read_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed objects.

    Args:
        file_path: Path of the ``.jsonl`` file; each non-blank line must be
            one JSON document.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank lines (for example a trailing empty line) are not JSON
            # documents; json.loads("") would raise, so skip them.
            if not line:
                continue
            data.append(json.loads(line))
    return data

In [None]:
# Load the ESG question bank. load_json wraps the parsed document in a
# one-element list, so [0] unwraps it.
# NOTE(review): select_questions indexes this as topic -> sub-topic -> list of
# questions; the JSON file itself is not visible here, so confirm its layout.
esg_data = load_json('./atomic_factors/ESG_factors.json')[0]

In [None]:
def select_questions(questions: Dict[str, Dict[str, List[str]]]) -> Dict[str, Any]:
    '''
    Randomly select ground-truth questions for one synthetic report.

    A main topic is drawn from "Environment", "Social" and "Governance", and a
    target of 5 to 8 questions is drawn for it (capped at the number of
    questions the topic actually holds). The sub-topics of the main topic are
    visited repeatedly and a random number of not-yet-selected questions is
    taken from each until the target is reached, so the distribution over
    sub-topics is not uniform and some sub-topics may contribute nothing.

    30% of the time, 1 or 2 "leak" questions are additionally taken from one of
    the other two topics. Leak questions are drawn from that topic's sub-topics
    in dataset order until the leak budget is used up.

    Args:
        questions: Mapping topic -> sub-topic -> list of questions. It must
            contain the three topic names above. A ".txt" suffix in sub-topic
            names is stripped in the returned dictionaries.

    Returns:
        result = {
            "main_topic": main topic,
            "main_ground_truth": questions from the main topic as a Dict of sub-topics,
            "leak_topic": None or leak topic,
            "leak_ground_truth": None or questions from the leak topic as a Dict of sub-topics,
            "ground_truth": list of all questions (main first, then leak) for easy access
        }
    '''
    main_topics = ["Environment", "Social", "Governance"]
    main_topic = random.choice(main_topics)

    # Questions still available per sub-topic. Duplicates inside a sub-topic
    # are dropped (order kept) so that a question can be selected only once.
    remaining = {
        sub_topic: list(dict.fromkeys(sub_questions))
        for sub_topic, sub_questions in questions[main_topic].items()
    }
    available = sum(len(pool) for pool in remaining.values())
    # Never ask for more questions than exist, otherwise the loop below could
    # not terminate.
    num_questions = min(random.randint(5, 8), available)

    main_ground_truth = {}
    ground_truth = []

    # select questions for the main topic
    while len(ground_truth) < num_questions:
        for sub_topic, pool in remaining.items():
            budget = num_questions - len(ground_truth)
            # Bound the sample size by the pool size: random.sample raises
            # ValueError when asked for more items than the population holds.
            selected = random.sample(pool, random.randint(0, min(len(pool), budget)))
            if selected:
                chosen = set(selected)
                pool[:] = [question for question in pool if question not in chosen]
                # Extend rather than overwrite so that picks from an earlier
                # pass are kept and this dict stays in sync with ground_truth.
                main_ground_truth.setdefault(sub_topic.replace('.txt', ''), []).extend(selected)
                ground_truth.extend(selected)

    # prepare the result dictionary
    result = {
        "main_topic": main_topic,
        "main_ground_truth": main_ground_truth,
        "leak_topic": None,
        "leak_ground_truth": None,
        "ground_truth": ground_truth
    }

    # 30% chance to have a leak topic
    if random.random() < 0.3:
        leak_topic = random.choice([t for t in main_topics if t != main_topic])
        result["leak_topic"] = leak_topic

        leak_ground_truth = {}
        num_leak_questions = random.randint(1, 2)
        # Count selected questions, not sub-topics: counting dictionary
        # entries would let more than num_leak_questions questions through.
        leak_count = 0

        for sub_topic, sub_questions in questions[leak_topic].items():
            selected = random.sample(sub_questions, min(len(sub_questions), num_leak_questions - leak_count))
            if selected:
                leak_ground_truth[sub_topic.replace('.txt', '')] = selected
                ground_truth.extend(selected)
                leak_count += len(selected)
            if leak_count >= num_leak_questions:
                break

        result["leak_ground_truth"] = leak_ground_truth
        result["ground_truth"] = ground_truth

    return result

In [None]:
def generate_prompt(selected: Dict[str, Any]) -> str:
    """Build the user prompt asking the model for one synthetic disclosure report.

    Args:
        selected: Dictionary providing "ground_truth" (iterable of questions),
            "main_topic" and "leak_topic" (falsy when there is no leak topic).

    Returns:
        The prompt text: fixed instructions, one "- question" bullet per
        ground-truth question, the main-topic focus sentence, an optional
        leak-topic sentence, and the fixed formatting instructions.
    """
    # NOTE(review): the literals below contain typos ("avaible", "regulatory",
    # "textto"). They are reproduced verbatim because they are part of the
    # prompt that is sent to the model.
    instructions = "Generate a text of approximately 2000 words (plus or minus 200 words) that results in an excerpt from a listed company's disclosure report that will become avaible to investors and regulators. The text should also indirectly answer the following topics (in the form of questions) with supportive statements and actions that have taken place in the last fiscal year that will allow investors and regulatory to accurately determine that the company has definitively complied in a couple of measure steps. The answers when interpreted will in the regulators' mind check a 'Yes' without explicitly mentioning them:\n\n"
    bullets = "".join(f"- {question}\n" for question in selected['ground_truth'])
    focus = f"\nThe main focus should be on {selected['main_topic']}."

    if selected['leak_topic']:
        leak_hint = f" Also, subtly incorporate some elements related to {selected['leak_topic']}."
    else:
        leak_hint = ""

    style_rules = [
        "\nEnsure the text is complex and realistic, and avoid repeating key words from the questions. Do not have headings relating to each question but rather have paragraphs of texts that transition naturally.",
        "\nDo not dedicate paragraphs to each question but rather mix around the topics and also add text related to the topic that do not answer the questions.",
        "\nFormat the output text as a string as if it was the output of a PDF parser and not in markdown format. For example, replace new lines with '\\n' character.",
        "\nRandomly inject mild to severe parsing errors in the textto mimic realistic tools like pypdf2 or pdfminer.",
    ]

    return "".join([instructions, bullets, focus, leak_hint, *style_rules])

In [None]:
# function to create the input batches files
def create_batch_input(final_list: List[Dict[str, Any]], model_name: str) -> List[Dict[str, Any]]:
    """Turn prepared records into chat-completion batch request dictionaries.

    Each record receives an "id" of the form "request-<n>" (1-based, written
    into the record in place), and the same value is used as the request's
    "custom_id".

    Args:
        final_list: Records that each provide a "prompt" entry.
        model_name: Model name placed in every request body.

    Returns:
        One request dictionary per record, in input order.
    """
    system_text = "You are an expert AI trained to generate annual report excerpts following the prompt carefully."
    requests = []

    for position, record in enumerate(final_list, start=1):
        request_id = f"request-{position}"
        # Tag the record so the response can be matched back to it later.
        record["id"] = request_id

        conversation = [
            {"role": "system", "content": system_text},
            {"role": "user", "content": record["prompt"]},
        ]
        payload = {
            "model": model_name,
            "messages": conversation,
            "max_tokens": 3000,
        }
        requests.append({
            "custom_id": request_id,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": payload,
        })

    return requests

In [None]:
final_output = []

# Build one record per requested report (21 in total). Each record carries the
# sampled ground truth and its prompt; "id" is filled in by create_batch_input,
# while "text" and the token counts keep their empty defaults until the batch
# responses are merged back in.
for _ in range(21):
    ground_truth = select_questions(esg_data)
    prompt = generate_prompt(ground_truth)

    final_schema = {
        "id": "",
        "text" : "",
        "main_topic": ground_truth["main_topic"],
        "main_ground_truth": ground_truth["main_ground_truth"],
        "leak_topic": ground_truth["leak_topic"],
        "leak_ground_truth" : ground_truth["leak_ground_truth"],
        "ground_truth": ground_truth["ground_truth"],
        "prompt": prompt,
        "input_tokens": 0,
        "output_tokens": 0
    }
    final_output.append(final_schema)


# One batch request per record; this also assigns each record its "id".
input_batches = create_batch_input(final_output, model_name=MODEL)

In [None]:
# Serialize every batch request as one JSON document per line (JSONL).
with open('data_gen_batches.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(request) + '\n' for request in input_batches)

In [None]:
# upload the batch input file
# Open the file in a context manager so the handle is closed as soon as the
# upload call returns instead of staying open for the rest of the session.
with open("data_gen_batches.jsonl", "rb") as upload_handle:
    batch_input_file = client.files.create(
      file=upload_handle,
      purpose="batch"
    )

In [None]:
# Id of the uploaded JSONL file; the batch job refers to its input by this id.
batch_input_file_id = batch_input_file.id

# Create the batch job over the uploaded requests. The endpoint is the same
# path that create_batch_input writes into each request's "url".
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "generate synthetic data"
    }
)

In [None]:
# Fetch the current state of the batch job created above. This is a single
# snapshot, not a poll: run it again to refresh the status.
batch_output = client.batches.retrieve(batch.id)

In [None]:
# Show the batch status; results are only downloaded once it is "completed".
batch_output.status

In [None]:
# Download the result file only once the batch has finished.
if batch_output.status == "completed":
    output = client.files.content(batch_output.output_file_id)
else:
    # Without this message a batch that is not finished was skipped silently,
    # leaving `output` undefined (or stale from an earlier run) for later code.
    print(f"Batch is not completed yet (status: {batch_output.status}); retrieve the batch again before downloading.")

In [None]:
# Save the raw batch output to disk. write_to_file returns None, so printing
# its result only ever displayed "None"; print the destination path instead.
output.write_to_file('raw_batching_output.jsonl')
print('raw_batching_output.jsonl')

In [None]:
# Parse the downloaded batch output: one JSON object per line.
file_path = 'raw_batching_output.jsonl'
jsonl_data = read_jsonl(file_path)

In [None]:
# Running totals of prompt and completion tokens over all successful responses.
total_input = 0
total_output = 0

# Index the records by the id assigned in create_batch_input. Responses are
# matched by custom_id rather than by position, so an output file whose lines
# are in a different order than the input still lands on the right record.
records_by_id = {item["id"]: item for item in final_output}

for jsonl in jsonl_data:
    output_id = jsonl["custom_id"]
    body = (jsonl.get("response") or {}).get("body") or {}

    # A failed request has no completion to read; report it and move on
    # instead of crashing on the missing keys.
    if jsonl.get("error") or "choices" not in body:
        print(f"Skipping {output_id}: no completion in batch output (error: {jsonl.get('error')})")
        continue

    usage = body["usage"]
    record = records_by_id.get(output_id)
    if record is not None:
        record["text"] = body["choices"][0]["message"]["content"]
        record["input_tokens"] = usage["prompt_tokens"]
        record["output_tokens"] = usage["completion_tokens"]

    total_input += usage["prompt_tokens"]
    total_output += usage["completion_tokens"]

In [None]:
# save the records (ground truth, prompt, generated text, token counts) as JSON
write_to_json(final_output, 'processed_output.json')

In [None]:
# Report the prompt-token and completion-token totals accumulated above.
print(total_input, total_output)