In [None]:
import os
import json
import numpy as np
import openai
from typing import List, Dict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# saved inside the notebook; the original placeholder is kept as the fallback.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "API KEY HERE")
# Chat model used for the yes/no question answering step.
MODEL = "gpt-4o-mini"

client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [None]:
def load_json_files(folder_path):
    """Load the ``page_content`` field from every ``.json`` file in a folder.

    Files are visited in sorted filename order so the returned list (and
    therefore ``result[0]``) is the same on every run; ``os.listdir`` on its
    own returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list: The ``page_content`` value of each JSON file, ordered by filename.

    Raises:
        KeyError: If a JSON file has no ``page_content`` key.
    """
    json_data = []
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.json'):
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                data = json.load(file)
            json_data.append(data['page_content'])
    return json_data

In [None]:
# jinaai similarity function
def get_similar_questions(page_embedding, atomic_embeddings, top_n):
    """Return indices of the ``top_n`` atomic embeddings most similar to a page.

    Args:
        page_embedding: Embedding vector of a single page.
        atomic_embeddings: Sequence (or 2-D array) of candidate embeddings.
        top_n: Number of indices to return.

    Returns:
        numpy.ndarray: Indices into ``atomic_embeddings`` ordered from most to
        least cosine-similar. Empty when ``top_n`` is not positive or there are
        no candidates.
    """
    # `arr[-0:]` is the whole array, so top_n == 0 must be handled explicitly.
    if top_n <= 0 or len(atomic_embeddings) == 0:
        return np.array([], dtype=np.intp)

    page_vector = np.asarray(page_embedding).reshape(1, -1)
    # One row per candidate, flattened the same way the page vector is.
    atomic_matrix = np.asarray(atomic_embeddings).reshape(len(atomic_embeddings), -1)

    # A single batched call replaces one cosine_similarity call per candidate.
    similarities = cosine_similarity(page_vector, atomic_matrix)[0]

    top_indices = np.argsort(similarities)[-top_n:][::-1]

    return top_indices

In [None]:
# openai similarity function (currently only for one page)
def find_top_similar_embeddings(page_embeddings, atomic_embeddings, top_n=20):
    """Return indices of the atomic embeddings most similar to the first page.

    Args:
        page_embeddings: Sequence of page embeddings; only element 0 is used.
        atomic_embeddings: Sequence of candidate embedding vectors.
        top_n: Number of indices to return (default 20).

    Returns:
        list[int]: Indices into ``atomic_embeddings`` ordered from most to
        least cosine-similar. Empty when ``top_n`` is not positive or there are
        no candidates.
    """
    # `arr[-0:]` is the whole array, so top_n == 0 must be handled explicitly;
    # an empty candidate list would otherwise fail inside cosine_similarity.
    if top_n <= 0 or len(atomic_embeddings) == 0:
        return []

    page_embedding = np.array(page_embeddings[0])

    atomic_embeddings_array = np.array(atomic_embeddings)

    similarities = cosine_similarity([page_embedding], atomic_embeddings_array)[0]

    # argsort is ascending: take the last top_n and reverse for best-first order.
    top_indices = np.argsort(similarities)[-top_n:][::-1]

    return top_indices.tolist()

In [None]:
# Page texts: the 'page_content' value of every .json file in the test folder.
data_chunks = load_json_files('./testing/testing_json')

In [None]:
# One atomic question/statement per line; trailing whitespace (incl. the newline) is stripped.
# Explicit UTF-8 keeps this consistent with the other file I/O in this notebook
# instead of depending on the platform's default encoding.
with open('./testing/combined_test.txt', 'r', encoding='utf-8') as f:
    atomic_chunks = [line.rstrip() for line in f]

In [None]:
# Local sentence-embedding model used for the "jinaai" retrieval path.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally — confirm the source is trusted.
model = SentenceTransformer("jinaai/jina-embeddings-v2-base-en", trust_remote_code=True)
# NOTE(review): presumably caps the encoder input length at 1024 tokens — confirm.
model.max_seq_length = 1024

In [None]:
# Embed all page texts with the OpenAI embedding model (one request for the whole list).
# Note: every entry of data_chunks is sent here, although the extraction cell
# below only reads the first embedding from this response.
page_embeddings_response = client.embeddings.create(
    input=data_chunks,
    model="text-embedding-3-small"
)

# Embed all atomic chunks with the same model (again a single request).
atomic_embeddings_response = client.embeddings.create(
    input=atomic_chunks,
    model="text-embedding-3-small"
)

In [None]:
# Extract embeddings from the API responses.
# Only the first page's embedding (data[0]) is kept, wrapped in a one-element list.
page_embeddings_openai = [page_embeddings_response.data[0].embedding]
atomic_embeddings_openai = [item.embedding for item in atomic_embeddings_response.data]
# Store the prompt-token usage reported for each embedding request.
# The page count is the usage of the whole page request, not just the first page.
page_token_count_openai = page_embeddings_response.usage.prompt_tokens
atomic_token_count_openai = atomic_embeddings_response.usage.prompt_tokens

In [None]:
# Encode pages and atomic chunks with the local SentenceTransformer model.
page_embeddings = model.encode(data_chunks)
atomic_embeddings = model.encode(atomic_chunks)

In [None]:
# Indices of the 20 atomic chunks most similar to the first page (local-model embeddings).
# The variable name's spelling ("incides") is kept because later cells refer to it.
relevant_incides = get_similar_questions(page_embeddings[0], atomic_embeddings, top_n=20)
# Alternative retrieval using the OpenAI embeddings instead:
# relevant_incides = find_top_similar_embeddings(page_embeddings_openai, atomic_embeddings_openai, top_n=20)
print(relevant_incides)

In [None]:
# Show the text of the first loaded page for manual inspection.
print(data_chunks[0])

In [None]:
# Print the retrieved atomic chunks, in the order the indices were returned.
for i in relevant_incides:
    print(atomic_chunks[i])

In [None]:
# COMPUTING RECALL
# XY_chunks = positive samples in the data
# atomic_list = samples model detected as positive
with open('./testing/tax_test.txt', 'r', encoding='utf-8') as f:
    XY_chunks = [line.rstrip() for line in f]

atomic_list = [atomic_chunks[q] for q in relevant_incides]
# Set membership avoids rescanning the whole positives list for every retrieved chunk.
positive_chunks = set(XY_chunks)
count = sum(1 for chunk in atomic_list if chunk in positive_chunks)
print(count)
# Recall (%) = retrieved positives / all positives.
# An empty ground-truth file reports 0.0 instead of raising ZeroDivisionError.
print(count*100/len(XY_chunks) if XY_chunks else 0.0)

In [None]:
def extract_data(context: str, questions: List[str], model_name: str) -> List[Dict]:
    """Ask the chat model each yes/no question about ``context`` and collect the answers.

    One chat-completion request is sent per question through the module-level
    ``client``. Each request forces a call to the ``answer_question`` function
    so the reply arrives as structured JSON.

    Args:
        context: Report excerpt the questions are answered against.
        questions: Questions to ask; one request is made per question.
        model_name: Chat model identifier passed to the API.

    Returns:
        One dict per answered question with the keys ``question``, ``answer``,
        ``relevant_snippet``, ``input_tokens`` and ``output_tokens``. A question
        is omitted when the response carries no function call.

    Raises:
        json.JSONDecodeError: If the function-call arguments are not valid JSON.
        KeyError: If the arguments lack ``answer`` or ``snippet``.
    """
    # Prompts are built with implicit string concatenation. Backslash
    # continuations inside a string literal would embed the indentation of the
    # following source line in the prompt as a long run of spaces.
    system_message = {
        "role": "system",
        "content": (
            "You are an expert at analyzing corporate reports and answering questions with a 'Yes' or 'No'. "
            "You only answer 'Yes' if you find evidence to do so from the provided report. Otherwise, answer with 'No'. "
            "If evidence is found, save the snippet of text where the evidence is."
        ),
    }
    context_message = {
        "role": "user",
        "content": (
            f"Here's an excerpt from a corporate report:\n\n{context}\n\n"
            "Please answer the following questions based on this text. "
            "If the answer is clearly 'Yes' based on the text, provide the relevant snippet. "
            "If the answer is 'No' answer 'No' and provide the relevant snippet. "
            "If the answer is not mentioned, just answer 'No' and 'Not Found' as the snippet."
        ),
    }

    # NOTE(review): `functions` / `function_call` are the legacy form of tool
    # calling in the Chat Completions API; kept to preserve the request shape.
    functions = [
        {
            "name": "answer_question",
            "description": "Answers a question based on the provided context",
            "parameters": {
                "type": "object",
                "properties": {
                    "question": {"type": "string", "description": "The question to be answered"},
                    "answer": {"type": "string", "enum": ["Yes", "No"], "description": "The answer to the question"},
                    # Wording aligned with the user prompt, which asks for 'Not Found'
                    # rather than an empty string when nothing is mentioned.
                    "snippet": {"type": "string", "description": "The relevant text snippet supporting the answer, or 'Not Found' if the text does not mention it"}
                },
                "required": ["question", "answer", "snippet"]
            }
        }
    ]

    results = []

    for question in questions:
        messages = [system_message, context_message, {"role": "user", "content": question}]

        response = client.chat.completions.create(
            model=model_name,
            messages=messages,
            functions=functions,
            function_call={"name": "answer_question"}
        )

        response_message = response.choices[0].message

        if response_message.function_call:
            function_args = json.loads(response_message.function_call.arguments)
            results.append({
                # Record the question that was actually asked, not the model's
                # echo of it: downstream evaluation matches questions by exact
                # string, and the model is free to reword its copy.
                "question": question,
                "answer": function_args["answer"],
                "relevant_snippet": function_args["snippet"],
                "input_tokens": response.usage.prompt_tokens,
                "output_tokens": response.usage.completion_tokens
            })

    return results

In [None]:
# Ask the chat model every retrieved atomic question about the first page.
output = extract_data(data_chunks[0], atomic_list, MODEL)

In [None]:
# saving the output file as json
# Written under ./output because the evaluation step reads
# './output/tax_output.json'; saving to the working directory left that step
# reading a missing or stale file on a fresh run.
output_filename = "./output/tax_output.json"
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

In [None]:
# evaluate the output
def load_json_file(file_path):
    """Parse the UTF-8 encoded JSON document at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)


# Saved model answers to evaluate.
file_path = './output/tax_output.json'
loaded_data = load_json_file(file_path)

# Token usage summed over every answered question.
total_input_tokens = sum(entry['input_tokens'] for entry in loaded_data)
total_output_tokens = sum(entry['output_tokens'] for entry in loaded_data)
# Questions the model answered with 'Yes', in file order.
yes_questions = [entry['question'] for entry in loaded_data if entry['answer'] == 'Yes']

In [None]:
# Ground truth: questions whose correct answer is 'Yes'.
true_yes_questions = [
    'Does the company publicly disclose its tax strategy or policy?',
    'Does the company disclose its effective tax rate and explain any significant variations?',
    'Has the company implemented a responsible tax principles framework?',
    'Has the company implemented a whistleblowing mechanism for tax-related concerns?',
    'Does the company provide transparency on its tax governance structure?',
    'Does the company report on its engagement with tax authorities and stakeholders?',
]
# Number of predicted 'Yes' questions that also appear in the ground truth.
yes_count = len([asked for asked in yes_questions if asked in true_yes_questions])

# Share of ground-truth 'Yes' questions recovered, as a percentage.
print(yes_count*100/len(true_yes_questions))