In [21]:
import json
import os
import warnings
from getpass import getpass

# Silence library warnings up front so import-time noise from the heavy
# third-party packages does not flood the notebook output.
warnings.filterwarnings("ignore")

import openai
import pandas as pd
import requests
from openai import OpenAI
from transformers import GPT2Tokenizer

# Never hard-code the API key in the notebook. Use the key already present in
# the environment; only when it is missing, ask for it interactively so it is
# neither saved in the file nor echoed in the output.
if not os.environ.get("OPENAI_API_KEY"):
	os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

# Shared client used by prompt_chatgpt(); created with no explicit key, so it
# relies on the OPENAI_API_KEY environment variable set above.
client = OpenAI()

# Tokenizers already loaded in this kernel, keyed by pretrained model name.
_TOKENIZER_CACHE = {}


def _default_tokenizer():
	"""Return the GPT-2 tokenizer, loading it only on first use."""
	if "gpt2" not in _TOKENIZER_CACHE:
		_TOKENIZER_CACHE["gpt2"] = GPT2Tokenizer.from_pretrained("gpt2")
	return _TOKENIZER_CACHE["gpt2"]


def exact_openai_token_count(text, tokenizer=None):
	"""Count the tokens in ``text``.

	Args:
		text: The string to tokenize.
		tokenizer: Optional object with an ``encode(text)`` method returning a
			sequence of tokens. When omitted, the pretrained "gpt2" tokenizer
			is used; it is loaded once and reused on later calls instead of
			being re-created for every count.

	Returns:
		The number of tokens produced by ``tokenizer.encode(text)``.

	NOTE(review): despite the name, the default count comes from the GPT-2
	vocabulary, which is presumably not the tokenizer used by the chat model
	queried elsewhere in this notebook - treat the result as an estimate and
	confirm before relying on it for billing.
	"""
	if tokenizer is None:
		tokenizer = _default_tokenizer()
	return len(tokenizer.encode(text))

def get_info(table_id: str, timeout: float = 30.0) -> tuple:
	"""Fetch the summary and description of a CBS open-data table.

	Queries the ``TableInfos`` OData feed on opendata.cbs.nl for ``table_id``
	and reads the first entry of the returned ``value`` list.

	Args:
		table_id: Identifier of the table, inserted into the feed URL.
		timeout: Seconds to wait for the HTTP request before giving up, so a
			stalled connection cannot hang the notebook indefinitely.

	Returns:
		A ``(summary, description)`` tuple taken from the entry's ``Summary``
		and ``Description`` fields.

	Raises:
		requests.HTTPError: If the server answers with an error status.
		IndexError: If the feed returns no entry for ``table_id``.
		KeyError: If the expected fields are missing from the response.
	"""
	url = f"https://opendata.cbs.nl/ODataFeed/odata/{table_id}/TableInfos?$format=json"
	response = requests.get(url, timeout=timeout)
	# Fail on HTTP errors here instead of on a confusing JSON/Key error below.
	response.raise_for_status()

	entries = response.json()['value']
	if not entries:
		raise IndexError(f"No TableInfos entry returned for table {table_id!r}")

	table_info = entries[0]
	return table_info['Summary'], table_info['Description']


def prompt_chatgpt(doc, model="gpt-3.5-turbo-0125", input_price_per_1k=0.0005, output_price_per_1k=0.0015):
	"""Ask the chat model for keywords and a summary of a table description.

	Args:
		doc: Text describing the table (title, characteristics, summary); it is
			embedded verbatim in the user prompt.
		model: Name of the chat completion model to call.
		input_price_per_1k: Price in dollars per 1000 input (prompt) tokens.
		output_price_per_1k: Price in dollars per 1000 output (completion) tokens.

	Returns:
		A ``(output, total_cost)`` tuple: ``output`` is the model's JSON answer
		parsed into a Python object, ``total_cost`` the cost of the call in
		dollars computed from the token counts and the two prices.

	Raises:
		json.JSONDecodeError: If the model's reply is not valid JSON.
	"""
	# NOTE(review): the `\\` sequences below put a literal backslash into the
	# prompt text; they are kept as-is so the prompt sent to the model does
	# not change.
	template = f"""
		You are provided a table title, its summary, and a description: '{doc}':

		Based on this I want you to return the following
		- "Keywords": List of Keywords and terms that best portray the table.
		- "Summary": A summary of the table that helps with the understanding \\
		of the table and is optimized for document retrieval, this summary should not \\
		contain more than 75 words. 
	"""
	system_prompt = "Based on a table description you generate a list of keywords and a Summary in a document retrieval optimized format. In the Dutch Language, in a json file"

	response = client.chat.completions.create(
		model=model,
		response_format={"type": "json_object"},
		messages=[
			{"role": "system", "content": system_prompt},
			{"role": "user", "content": template}
		],
		temperature=0.0,
	)
	raw_content = response.choices[0].message.content
	output = json.loads(raw_content)

	# Prefer the token counts reported with the response: they cover every
	# message that was sent and the reply exactly as it was generated.
	usage = getattr(response, "usage", None)
	if usage is not None:
		input_tokens = usage.prompt_tokens
		output_tokens = usage.completion_tokens
	else:
		# Fallback estimate from the local tokenizer when no usage is reported.
		input_tokens = exact_openai_token_count(system_prompt) + exact_openai_token_count(template)
		output_tokens = exact_openai_token_count(raw_content)

	input_cost = input_tokens / 1000 * input_price_per_1k
	output_cost = output_tokens / 1000 * output_price_per_1k
	total_cost = input_cost + output_cost

	return output, total_cost

In [22]:
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# were produced by this project or another trusted source.

# One row per table; the generation loop reads its 'Table Title', 'Measure'
# and 'table_id' columns.
tabledf = pd.read_pickle("data/tabledf.pkl")
# Lookup table whose 'id' and 'title' columns are used to turn a measure id
# into its title.
measure_df = pd.read_pickle("data/measure_dimensions_df.pkl")

In [28]:
import random

MAX_TABLES = 1     # number of rows from the top of tabledf to process
MAX_MEASURES = 5   # at most this many measure titles go into one prompt

# Dedicated, seeded generator so re-running the cell samples the same measures.
measure_rng = random.Random(42)

keywords, summaries = [], []
total_cost = 0.0   # summed cost (in dollars) of all model calls in this cell

for index, row in tabledf.iloc[:MAX_TABLES].iterrows():
    table_title = row['Table Title']

    measures = list(row['Measure'])
    # Keep the prompt short: use a random sample when there are more than
    # MAX_MEASURES measures, otherwise use all of them.
    if len(measures) > MAX_MEASURES:
        selected_measures = measure_rng.sample(measures, MAX_MEASURES)
    else:
        selected_measures = measures

    measure_string = "De tabel heeft de volgende karakteristieken:"
    for measure in selected_measures:
        matching_titles = measure_df.loc[measure_df['id'] == measure, 'title']
        if matching_titles.empty:
            # Measure id without an entry in measure_df: leave it out of the
            # prompt instead of hiding every possible error behind a bare except.
            continue
        measure_string += f" {matching_titles.iloc[0]},"

    # Fetch the table's summary and description; only the summary is used in
    # the prompt.
    summary, desc = get_info(row['table_id'])

    final_prompt = f"Voor de tabel met de titel '{table_title}', de volgende karakteristieken: {measure_string} en omschrijving {summary}"

    answer, cost = prompt_chatgpt(final_prompt)
    total_cost += cost
    keywords.append(answer['Keywords'])
    summaries.append(answer['Summary'])



