In [None]:
import nltk
from dotenv import load_dotenv
import openai
import os
import backoff
from openai.error import RateLimitError, OpenAIError
import pandas as pd
from transformers import GPT2TokenizerFast

# Module-level GPT-2 tokenizer, created once at import time and shared by
# Embeddings.count_tokens to measure text length in tokens.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

"""
This script scrapes the Microsoft Learn website for all the courses and their descriptions.
Reference (OpenAI Cookbook example): https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb
"""

# Load variables from a local .env file into the process environment FIRST,
# so that an OPENAI_API_KEY defined only in .env is visible to os.getenv below.
# (Reading the key before load_dotenv() left openai.api_key as None in that case.)
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# NOTE(review): this fetches every NLTK resource at import time, which is a
# large download; consider narrowing to the specific resources actually used.
nltk.download("all")

class Embeddings:
    """Compute and load OpenAI embedding vectors for the rows of a dataframe.

    Attributes:
        df: Dataframe whose rows are embedded; ``compute_doc_embeddings``
            reads each row's ``content`` field.
        token_limit: Token budget supplied by the caller. It is only stored
            here; none of the methods in this class read it.
        model: Name of the OpenAI embedding model passed to the API.
    """

    def __init__(self, df: pd.DataFrame, token_limit: int):
        self.df = df
        self.token_limit = token_limit
        self.model = "text-embedding-ada-002"

    # Declared static: the function takes no ``self``. Without the decorator,
    # calling it on an instance would bind the instance to ``text`` and fail
    # with a TypeError; ``Embeddings.count_tokens(text)`` keeps working as before.
    @staticmethod
    def count_tokens(text: str) -> int:
        """Return the number of tokens in ``text`` per the module-level tokenizer."""
        return len(tokenizer.encode(text))

    # NOTE(review): no max_tries/max_time is given, so with backoff's defaults
    # this appears to retry indefinitely, including on non-transient OpenAI
    # errors -- confirm that is intended.
    @backoff.on_exception(backoff.expo, (RateLimitError, OpenAIError))
    def get_embedding(self, text: str, model: str) -> list[float]:
        """Request an embedding for ``text`` from ``model`` and return the vector.

        Retried with exponential backoff when the API call raises
        ``RateLimitError`` or ``OpenAIError``.
        """
        result = openai.Embedding.create(model=model, input=text)
        # Only one input is sent, so the first (index 0) entry is its embedding.
        return result["data"][0]["embedding"]

    def get_doc_embedding(self, text: str) -> list[float]:
        """Return the embedding of a document text using ``self.model``."""
        return self.get_embedding(text=text, model=self.model)

    def compute_query_embeddings(self, text: str) -> list[float]:
        """Return the embedding of a query text using ``self.model``."""
        return self.get_embedding(text=text, model=self.model)

    def compute_doc_embeddings(self) -> dict[tuple[str, str], list[float]]:
        """
        Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

        Return a dictionary that maps the index of each row to the embedding
        vector computed from that row's ``content``. Newlines and tabs in the
        content are replaced by spaces before it is sent to the API.

        The keys are whatever ``self.df`` is indexed by; the return annotation
        presumably reflects a (title, heading) index -- verify against the caller.
        """
        return {
            idx: self.get_doc_embedding(
                str(r.content).replace("\n", " ").replace("\t", " ")
            )
            for idx, r in self.df.iterrows()
        }

    # Declared static for the same reason as ``count_tokens``: no ``self``.
    @staticmethod
    def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
        """
        Read the document embeddings and their keys from a CSV.

        fname is the path to a CSV with exactly these named columns:
            "title", "heading", "0", "1", ... up to the length of the embedding vectors.

        Returns a dictionary keyed by ``(title, heading)`` whose values are the
        embedding components in column order "0", "1", ...

        Raises:
            ValueError: if the CSV has no numeric columns, or a column other
                than "title"/"heading" is not an integer name.
        """
        df = pd.read_csv(fname, header=0)
        # The highest numeric column name is the last embedding dimension.
        max_dim = max(int(c) for c in df.columns if c not in ("title", "heading"))
        return {
            (r.title, r.heading): [r[str(i)] for i in range(max_dim + 1)]
            for _, r in df.iterrows()
        }
