In [None]:
'''
TO-DO:
Load same text and combined atomic queries. Convert both to embeddings using different embedding models.
Extract the top k atomic queries most similar to the text. Try out different similarity metrics.
Select the combination of k, embedding model, and similarity metric that gives the best recall.
'''

In [None]:
import os
import json
import numpy as np
import openai
from typing import List, Dict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Credentials and configuration for the OpenAI client.
# Prefer the OPENAI_API_KEY environment variable so that a real key never has
# to be written into (and committed with) the notebook; the placeholder is
# kept as a fallback so the name is always defined.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "API KEY HERE")
# NOTE(review): MODEL is not referenced by any of the cells visible here.
MODEL = "gpt-4o-mini"

client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [4]:
# function to load the atomic query dictionary
def load_json(file_path):
    """Load one JSON document and return it wrapped in a single-element list.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list whose only element is the parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which would mis-decode non-ASCII characters on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        return [json.load(file)]

In [2]:
# openai similarity function (currently only for one page)
def find_top_similar_embeddings(page_embeddings, atomic_embeddings, top_n=20):
    """Rank atomic-query embeddings by cosine similarity to the first page.

    Args:
        page_embeddings: Sequence of page embedding vectors. Only the first
            entry is used.
        atomic_embeddings: Sequence of embedding vectors to rank against the
            page embedding.
        top_n: Maximum number of indices to return.

    Returns:
        A list of integer indices into ``atomic_embeddings``, ordered from
        most to least similar. Empty when ``top_n`` is zero or negative.
    """
    # Guard before slicing: `[-0:]` would select *every* index, and a negative
    # top_n would slice from the wrong end of the sorted array.
    if top_n <= 0:
        return []

    page_embedding = np.array(page_embeddings[0])

    atomic_embeddings_array = np.array(atomic_embeddings)

    # One row of similarities: the page against each atomic embedding.
    similarities = cosine_similarity([page_embedding], atomic_embeddings_array)[0]

    # argsort is ascending, so take the tail and reverse it for best-first.
    top_indices = np.argsort(similarities)[-top_n:][::-1]

    return top_indices.tolist()

In [3]:
# load the text and ground truth
# A plain quoted string cannot contain a raw line break, so the page text is
# written as adjacent string literals that Python concatenates into a single
# string. Each piece keeps its trailing space so the joined text reads as one
# continuous paragraph. The ground-truth strings must stay character-for-
# character identical to the atomic queries they are matched against.
data = {
    "text": (
        "In the past fiscal year, our company has undertaken significant steps toward ensuring that our business model aligns with sustainable practices, particularly through a robust framework that emphasizes the intricate interdependencies with various ecosystems. We believe that understanding and disclosing the ecosystems we materially depend on is paramount, not only for regulatory compliance but also for long-term viability in a rapidly changing environment. Our comprehensive mapping of these ecosystems has been integrated into our strategic discussions, allowing us to recognize areas of vulnerability and opportunity. In our environmental impact assessments, biodiversity and ecosystem health are unequivocally prioritized. This past year, we enhanced our evaluation protocols to incorporate thorough analyses of local biodiversity affected by our operational footprint. We conducted detailed field studies that photographed and cataloged various species within our operational vicinity, thus ensuring that our impacts on biological diversity are not only measured but actively addressed. The findings from these studies have now been embedded in our environmental impact assessments, providing a clearer picture of our potential impacts and the necessary mitigative measures. Recognizing the escalating physical and transition risks linked to biodiversity and ecosystems has been a focal point in our risk management strategy. Throughout the year, we have diligently identified key factors that could impact our operations, from climate-related disruptions to shifts in regulatory standards aimed at preserving ecological balance. Our risk management framework has undergone a thorough review and now includes specific protocols for monitoring these salient risks. This holistic approach ensures that our responses to environmental challenges are both proactive and adaptive. "
        "Moreover, we are keenly aware of the pressing issues around deforestation and peatland loss, particularly within our supply chain. This year, we reiterated our commitment to eliminate these detrimental practices by 2025. As part of this strategy, we have forged partnerships with various environmental organizations to promote sustainable sourcing of materials and initiated training programs for suppliers to ensure compliance with our stringent policies. The integration of this objective into our supply chain management reflects our dedication to upholding environmental integrity across all facets of our operations. To support our commitment to sustainable development, we have embedded these risks within our overarching risk management framework. Every department within our organization is now required to assess potential environmental risks as a component of their operational strategies. This systematic approach has resulted in a dedicated task force that meets quarterly to evaluate and refine our processes, ensuring that we remain aligned with our sustainability goals. Transitioning to a low-carbon economy is no longer a mere objective but a strategic imperative for our company. This fiscal year, we have taken substantial strides in minimizing our carbon footprint, which included investing in renewable energy and improving energy efficiency across all primary facilities. Our carbon reduction goals have been publicly disclosed and serve as a guiding principle in our decision-making processes. This transition is reflected in our annual sustainability report, where we outline specific actions taken, such as procuring 50% of our energy from renewable sources, as part of a broader goal of achieving complete energy independence by 2030. In conjunction with our efforts regarding climate change, the company has made strides in formulating a coherent climate change policy that aligns with global standards and best practices. "
        "This policy not only reflects our commitment to reducing greenhouse gas emissions but also outlines our approach to engaging stakeholders in these initiatives. Our extensive consultations with environmental experts and community stakeholders have informed the policy, ensuring it is practical and effective in addressing the multifaceted challenges associated with climate change. Social responsibility remains a core value in our business operations, particularly regarding the rights of children within our supply chain. We have conducted comprehensive assessments to identify and mitigate risks to children's rights, ensuring that all business relationships, especially those concerning raw material sourcing, are compliant with international standards. Our collaborative efforts with key stakeholders, including NGOs and local communities, have strengthened our approach, allowing us to safeguard children's rights and promote their well-being effectively. The integration of social considerations does not merely stop short at compliance but extends to actively fostering an environment where children can thrive. Through community engagement initiatives, we are investing in education and health programs that directly benefit children in areas impacted by our operations. These initiatives are not only beneficial to the communities we operate in but also serve to enhance our corporate reputation among investors and stakeholders alike. We are committed to transparency and accountability in all aspects of our environmental and social governance. As such, we have established clear metrics for performance tracking in our sustainability practices and made these metrics publicly available. This level of transparency allows our investors and regulatory bodies to form an accurate assessment of our adherence to best practices in environmental stewardship and social responsibility. "
        "The dedication to continuous improvement in our environmental impact assessments has also led us to adopt innovative technologies that further enhance our data collection and reporting capabilities. By leveraging advanced analytics and geospatial technologies, we can gain deeper insights into the ecological implications of our operations. This technology-driven approach has proven instrumental in identifying trends in biodiversity health and ecosystem resilience, thus informing our decision-making process. Furthermore, our engagement with local communities has expanded to include educational workshops focused on environmental stewardship and biodiversity conservation. Participants include not only local residents but also employees, creating a shared understanding of how our business activities intersect with community interests and ecological health. By fostering dialogue and collaboration, we aim to jointly develop strategies that mitigate negative impacts and enhance positive outcomes for both the business and the ecosystems we depend on. In summary, our company has made meaningful advancements in integrating ecological considerations into our operational framework while ensuring that the social dimensions of our impact are not overlooked. The actions taken throughout the fiscal year reflect a comprehensive understanding of the challenges and opportunities presented by our interconnected ecosystems. The commitment to transparency, proactive measures in risk management, and strategic adaptations towards low-carbon operations substantiate our resolve to contribute positively to both the environment and society. Through sustained efforts and stakeholder engagement, we are poised to navigate the complexities of the modern corporate landscape, driving initiatives that resonate with our core values and the expectations of investors and regulators alike."
    ),
    "ground_truth": [
        "Does the company disclose any ecosystems it materially depends on?",
        "Are biodiversity and ecosystems included in the company’s environmental impact assessments?",
        "Has the company identified material physical and transition risks related to biodiversity and ecosystems?",
        "Has the company taken action to eliminate deforestation and peatland loss from its business activities and value chains by 2025?",
        "Are these risks incorporated into the company's risk framework?",
        "Does the company consider the transition to a low-carbon economy in its strategic decisions?",
        "Does the company have a publicly disclosed climate change policy?",
        "Does the company identify and assess salient risks related to children's rights in its business operations, supply chains (including raw material sourcing), and other business relationships, as well as the marketing and use of products and services?",
    ],
}

In [24]:
# Read the atomic queries, one per line; a query's position in the list is
# the index used for it everywhere else in this notebook.
# Open as UTF-8 explicitly: the queries contain non-ASCII punctuation (e.g. a
# curly apostrophe) that must compare equal to the ground-truth strings, so
# decoding must not depend on the platform default encoding.
with open('./atomic_factors/atomic_factors.txt', 'r', encoding='utf-8') as f:
    atomic_chunks = [line.rstrip() for line in f]

In [None]:
# Request embeddings from the OpenAI API with the same model for both inputs:
# one call for the page text, and one call for the whole list of atomic queries.
page_embeddings_response = client.embeddings.create(model="text-embedding-3-small", input=data["text"])

atomic_embeddings_response = client.embeddings.create(model="text-embedding-3-small", input=atomic_chunks)

In [None]:
# prompt-token usage reported with each response, kept for cost calculations
page_token_count_openai = page_embeddings_response.usage.prompt_tokens
atomic_token_count_openai = atomic_embeddings_response.usage.prompt_tokens

# embedding vectors: the first (page) result wrapped in a one-element list,
# followed by one vector per entry of the atomic-query response
page_embeddings_openai = [page_embeddings_response.data[0].embedding]
atomic_embeddings_openai = [entry.embedding for entry in atomic_embeddings_response.data]

In [None]:
# indices of the 20 atomic queries most similar to the page embedding
relevant_incides = find_top_similar_embeddings(
    page_embeddings_openai,
    atomic_embeddings_openai,
    20,
)
print(relevant_incides)

In [None]:
# show the text of each retrieved atomic query, in the order the indices were returned
for chunk_index in relevant_incides:
    print(atomic_chunks[chunk_index])

In [26]:
# Position of every ground-truth query inside atomic_chunks; list.index raises
# ValueError if a ground-truth string has no exact match.
ground_truth_indices = [atomic_chunks.index(query) for query in data['ground_truth']]

In [27]:
# Display the atomic_chunks positions of the ground-truth queries.
print(ground_truth_indices)

[101, 95, 91, 89, 92, 170, 164, 9]
