In [None]:
""" Install Dependencies """

!pip install openai
!pip install pandas

In [None]:
""" Interaction Encoder (Part) """

import os
import json

import httpx
import pandas as pd
from openai import OpenAI

# Colab only: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount("/content/drive")


def get_config(config_file_path, dataset_name):
  """Load the JSON data config and build the flat settings dict for one dataset.

  The resulting dict is stored in the module-level ``config`` (existing code
  reads it from there) and is also returned, so new callers do not have to
  depend on the global.

  Args:
    config_file_path: Path to the JSON configuration file (read as UTF-8).
    dataset_name: Key of the dataset entry under the file's ``"dataset"`` section.

  Returns:
    A dict with the keys ``MODEL_NAME``, ``OPENAI_API_BASE``, ``OPENAI_API_KEY``,
    the three input CSV paths (``*_FILE_PATH``, joined onto the dataset's
    ``dpath``) and the three output CSV paths (``ENCODING_*_FILE_PATH``, joined
    onto the dataset's ``save_path``).

  Raises:
    KeyError: If the ``large_language_model.openai`` section or the requested
      dataset entry is missing from the configuration file.
  """
  global config

  with open(config_file_path, "r", encoding="utf-8") as f:
    data_config = json.load(f)

  # Resolve both nested sections once; a missing one is reported by name
  # instead of surfacing later as an AttributeError on None.
  try:
    openai_cfg = data_config["large_language_model"]["openai"]
    dataset_cfg = data_config["dataset"][dataset_name]
  except KeyError as err:
    raise KeyError(f"{config_file_path}: missing config section {err}") from err

  dpath = dataset_cfg.get("dpath")
  save_path = dataset_cfg.get("save_path")

  config = {
    "MODEL_NAME": "text-embedding-3-small",
    "OPENAI_API_BASE": openai_cfg.get("openai_api_base"),
    "OPENAI_API_KEY": openai_cfg.get("openai_api_key"),
    "QUESTION_FILE_PATH": os.path.join(dpath, dataset_cfg.get("question_file")),
    "KNOWLEDGE_FILE_PATH": os.path.join(dpath, dataset_cfg.get("knowledge_file")),
    "COGNITIVE_FILE_PATH": os.path.join(dpath, dataset_cfg.get("cognitive_file")),
    "ENCODING_QUESTION_FILE_PATH": os.path.join(save_path, dataset_cfg.get("(encoding) question_file")),
    "ENCODING_KNOWLEDGE_FILE_PATH": os.path.join(save_path, dataset_cfg.get("(encoding) knowledge_file")),
    "ENCODING_COGNITIVE_FILE_PATH": os.path.join(save_path, dataset_cfg.get("(encoding) cognitive_file")),
  }
  return config


# OpenAI clients already built, keyed by (api_base, api_key). Reusing them
# avoids constructing a new client and httpx connection pool for every text,
# which the per-row calls below would otherwise open and never close.
_LLM_CLIENTS = {}


def _get_llm_client(api_base, api_key):
  """Return the cached OpenAI client for these credentials, creating it on first use."""
  cache_key = (api_base, api_key)
  client = _LLM_CLIENTS.get(cache_key)
  if client is None:
    client = OpenAI(
      base_url=api_base,
      api_key=api_key,
      http_client=httpx.Client(base_url=api_base, follow_redirects=True)
    )
    _LLM_CLIENTS[cache_key] = client
  return client


def get_llm_encoding(api_base, api_key, text, model="text-embedding-3-small"):
  """Return the embedding vector of ``text`` from an OpenAI-compatible endpoint.

  Args:
    api_base: Base URL of the embeddings API.
    api_key: API key sent with the request.
    text: String to embed; newlines are replaced with spaces before sending.
    model: Name of the embedding model to request.

  Returns:
    The ``embedding`` field of the first item in the API response.
  """
  text = text.replace("\n", " ")
  response = _get_llm_client(api_base, api_key).embeddings.create(input=[text], model=model)
  return response.data[0].embedding


def _encode_csv_column(source_path, target_path, text_column, encoding_column):
  """Read a CSV, embed one text column into a new column, save it, and return the frame."""
  # Create the output directory before any API call is made, so a missing
  # folder cannot discard embeddings that were already computed.
  target_dir = os.path.dirname(target_path)
  if target_dir:
    os.makedirs(target_dir, exist_ok=True)

  api_base = config.get("OPENAI_API_BASE")
  api_key = config.get("OPENAI_API_KEY")

  df = pd.read_csv(source_path)
  df[encoding_column] = df[text_column].apply(lambda x: get_llm_encoding(api_base, api_key, x))
  df.to_csv(target_path, index=False)
  return df


if __name__ == "__main__":
  # Populates the module-level `config` used below.
  get_config(config_file_path="./llm_agents/data/data_config.json", dataset_name="bepkt")

  # Question descriptions.
  df_q = _encode_csv_column(
    config.get("QUESTION_FILE_PATH"),
    config.get("ENCODING_QUESTION_FILE_PATH"),
    "description",
    "description_encoding",
  )

  # Knowledge component names.
  df_k = _encode_csv_column(
    config.get("KNOWLEDGE_FILE_PATH"),
    config.get("ENCODING_KNOWLEDGE_FILE_PATH"),
    "knowledge_component_name",
    "knowledge_component_encoding",
  )

  # Cognitive level names.
  df_c = _encode_csv_column(
    config.get("COGNITIVE_FILE_PATH"),
    config.get("ENCODING_COGNITIVE_FILE_PATH"),
    "cognitive_level_name",
    "cognitive_level_encoding",
  )