In [7]:
import pandas as pd
import glob
import pickle

from os import path
from openai import OpenAI

import torch

from transformers import AutoTokenizer, AutoModel


# Shared API client; get_embedding() sends its requests through it.
# No credentials are passed here, so the client falls back to its own defaults.
# NOTE(review): presumably the OPENAI_API_KEY environment variable — confirm.
client = OpenAI()

def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` through the module-level ``client``.

    Newlines in ``text`` are replaced by single spaces before the request
    is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [8]:
def split_string(input_string, chunk_size=20000):
    """Cut ``input_string`` into consecutive slices of ``chunk_size`` items.

    Args:
        input_string: The sequence (normally a ``str``) to cut up.
        chunk_size: Length of every slice; the last slice may be shorter.

    Returns:
        A list of the slices in their original order; an empty list when
        ``input_string`` is empty.
    """
    pieces = []
    for start in range(0, len(input_string), chunk_size):
        stop = start + chunk_size
        pieces.append(input_string[start:stop])
    return pieces

In [15]:
def read_texts():
    """Load every ``data/*.txt`` file and split its contents into chunks.

    The file name without its extension is parsed as ``&``-separated
    ``key=value`` pairs, which become the metadata of each chunk of that
    file. The text is cut into chunks with ``split_string`` (default size).

    Returns:
        A ``(file_names, results)`` tuple of two parallel lists with one
        entry per chunk: ``file_names`` holds the extension-less name of the
        source file, and ``results`` holds a separate dict containing the
        file-name metadata plus the chunk itself under ``'text'``.

    Raises:
        ValueError: If a ``&``-separated part of a file name has no ``=``.
    """
    results, file_names = [], []
    # glob order depends on the file system; sort so reruns give the same rows.
    for file_path in sorted(glob.glob('data/*.txt')):
        file_name, _ = path.splitext(path.basename(file_path))
        # Split on the first '=' only, so values may themselves contain '='.
        metadata = dict(param.split('=', 1) for param in file_name.split('&'))

        # NOTE(review): no encoding is given, so the platform default is used.
        with open(file_path) as file:
            text = file.read()

        for chunk in split_string(text):
            file_names.append(file_name)
            # Build a new dict per chunk: appending one shared dict would make
            # every row of this file end up with the text of the last chunk.
            results.append({**metadata, 'text': chunk})

    return file_names, results

In [16]:
# Read all chunks from disk, then tabulate them: one row per chunk, indexed by
# the name of the file it came from (the label repeats when a file yields
# several chunks).
file_names, results = read_texts()
df = pd.DataFrame.from_records(data=results, index=file_names)

In [None]:
# Embed every chunk and pickle each embedding to its own file under data/.
# Rows that come from the same source file share one index label, so the label
# alone cannot name the output: every chunk would overwrite the previous one
# and only the last embedding would survive. The first chunk keeps the
# original file name; later chunks of the same file get a "-chunk<k>" suffix.
chunks_seen = {}
for index, row in df.iterrows():
    chunk_no = chunks_seen.get(index, 0)
    chunks_seen[index] = chunk_no + 1
    suffix = '' if chunk_no == 0 else f'-chunk{chunk_no}'

    embedding = get_embedding(row['text'])
    with open(f'data/embedding-ada-002-{index}{suffix}.pkl', 'wb') as file:
        pickle.dump(embedding, file)