In [2]:
import os
import dotenv

# Load the environment variables from .venv file
dotenv.load_dotenv('.env')

# Get the value of CHATGPT_API_KEY from the environment
CHATGPT_API_KEY = os.getenv('CHATGPT_API_KEY')

#print(CHATGPT_API_KEY)

In [4]:
import glob
import os

ESBM = {}


# Define the path pattern to find .txt files in the data folder
path_pattern = 'data/dbpedia/*.txt'

# Use glob to get a list of all .txt files in the data folder
file_list = glob.glob(path_pattern)

# Loop through each file found
for file_path in file_list:
    print(f'Processing file: {file_path}')
    
    # Generate an output filename based on the input filename
    base_name = os.path.basename(file_path)
    output_file_name = f'output_{base_name}'
    
    clean_base_name = base_name.split('.txt')[0]
    print(clean_base_name)
    if clean_base_name not in ESBM:
        ESBM[clean_base_name] = {}

    with open(file_path, 'r') as file:

        for line in file:
            # Split the line on tabs to isolate the URL
            parts = line.split('\t')

            if len(parts) > 1:  # Check if there are at least two parts after splitting
                url = parts[1].strip()  # Get the URL part and remove any extra spaces
                
                # Extract the last part of the URL after the last '/'
                last_part = url.split('/')[-1]
                
                # Remove trailing comma if it exists
                if last_part.endswith(','):
                    last_part = last_part[:-1]
                
                # Replace underscores with spaces if necessary
                # Check if there are underscores and then replace them with spaces
                if '_' in last_part:
                    last_part = last_part.replace('_', ' ')

                # Check if the base_name is already in the dictionary
                
                
                ESBM[clean_base_name][parts[0]] = last_part
                #ids.append(parts[0])
                #data.append(last_part)

        #ESBM[base_name] = data

        #data = []

print(ESBM)


Processing file: data/dbpedia/ESBM_benchmark_v1.0.txt
ESBM_benchmark_v1.0
Processing file: data/dbpedia/ESBM_benchmark_v1.2.txt
ESBM_benchmark_v1.2
{'ESBM_benchmark_v1.0': {'1': '3WAY FM', '2': 'Adrian Griffin', '3': 'Andrew Kippis', '4': 'Anthony Beaumont-Dark', '5': 'Dallas Keuchel', '6': 'E. K. Mawlong', '7': 'Finn Schiander', '8': 'Hagar Wilde', '9': 'Ludwigsburg University', '10': 'Roderick Carr', '11': '2009–10 Swiss Cup', '12': '2011 Kor Royal Cup', '13': '2012 League of Ireland Cup Final', '14': '2011 Sparta Prague Open', '15': '2012–13 UEFA Champions League', '16': 'Battle of Bregalnica', '17': 'Battle of Rottofreddo', '18': 'Burgery ambush', '19': 'Massacre on 34th Street', '20': "Triathlon at the 2000 Summer Olympics – Men's", '21': 'Akalwadi', '22': 'Chitita', '23': 'Kuleh Bayan', '24': 'Phong Thạnh Tây', '25': 'Reamer Barn', '26': 'Richmond–Petersburg Turnpike', '27': 'Uelsby', '28': 'Wehlaberg', '29': 'Wernshausen', '30': 'Yayoidai Station', '31': 'African grey hornbill',

In [4]:
import math
import numpy as np

import glob
import os
import json  # Import JSON to write output files

import openai
from openai import OpenAI


# Set the API key
client = OpenAI(
  api_key=CHATGPT_API_KEY
)

MODEL = "gpt-4o-mini"

# Assuming ESBM is a predefined dictionary
for key, value in ESBM.items():
    path_pattern = f'data/dbpedia/{key}'
    folder_list = glob.glob(path_pattern)
    
    json_output_list = []  # List to hold output dictionaries

    # Loop through each file found
    for file_path in folder_list:
        print(f'Processing folder: {file_path}')
        
        # Generate an output filename based on the folder name
        base_name = os.path.basename(file_path)

        for key2, value2 in value.items():
            file_list = glob.glob(f'{file_path}/{key2}/*top*.nt')

            for nt_file in file_list:
                print(f'Processing file: {nt_file}')
                with open(nt_file, 'r') as file:
                    rdf_content = file.read()
                
                # Make a request to the OpenAI API
                try:
                    completion = client.chat.completions.create(
                        model=MODEL,
                        messages=[
                            {"role": "system", "content": "I will give you text - triples and you will respond strictly with coherent text without using your knowledge but only the triples."},
                            {"role": "user", "content": rdf_content}
                        ],
                        #logprobs=True,
                    )
                    gpt_answer = completion.choices[0].message.content

                    '''
                    # log probabilities from the model response
                    logprobs = [token_logprob.logprob for token_logprob in completion.choices[0].logprobs.content]

                    # Convert log probabilities to regular probabilities
                    probabilities = [math.exp(logprob) for logprob in logprobs]

                    # Print individual probabilities
                    #for logprob, prob in zip(logprobs, probabilities):
                    #    print(f"Log Probability: {logprob:.6f} => Probability: {prob:.6f}")

                    logprobs_mean = np.mean(logprobs)
                    total_probability = math.prod(probabilities)
                    average_probability = np.mean(probabilities)

                    #print(f"Mean Log Probability: {logprobs_mean:.6f}")
                    #print(f"Total Probability: {total_probability:.6f}")
                    #print(f"Average Probability: {average_probability:.6f}")
                    '''

                    # Create dictionary for JSON output
                    output_dict = {
                        "id": str(key2),
                        "filename": nt_file,
                        #"logprobs_mean": logprobs_mean,
                        #"total_probability": total_probability,
                        #"mean_probability": average_probability,
                        "Title": value[key2],  # Assuming the folder name as title
                        "Answer": gpt_answer,
                        "sparql_query": rdf_content  # Assuming this field is empty for now
                    }
                    json_output_list.append(output_dict)
                    

                except Exception as e:
                    print(f"Error processing file {nt_file}: {e}")

    # Write the output list to a JSON file for each folder
    json_output_path = os.path.join(file_path, f'{base_name}_output.json')
    with open(json_output_path, 'w') as json_file:
        json.dump(json_output_list, json_file, indent=4)

    print(f'JSON output written to: {json_output_path}')


Processing folder: data/dbpedia/ESBM_benchmark_v1.0
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top5_3.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top10_3.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top5_2.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top10_2.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top10_5.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top5_5.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top5_1.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top10_1.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top5_0.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top10_0.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top10_4.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/1/1_gold_top5_4.nt
Processing file: data/dbpedia/ESBM_benchmark_v1.0/2/2_gold_top5_3.nt
Processing file: data/dbpedia/ESBM_benchmark_