In [50]:
import os
from dotenv import load_dotenv
import time
import random
import tqdm
import json

from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.document_loaders import JSONLoader

from datasets import load_dataset

In [8]:
# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the title, source and id of a record into its metadata.

    Args:
        record: One parsed JSON record. Fields that are absent are stored
            as ``None``.
        metadata: Metadata dict to fill in. It is updated in place, so an
            existing ``"title"``, ``"source"`` or ``"id"`` entry is
            overwritten.

    Returns:
        The same ``metadata`` object that was passed in.
    """
    for field in ("title", "source", "id"):
        metadata[field] = record.get(field)
    return metadata

In [2]:
# Pull settings from a local .env file into the process environment, then
# read the OpenAI key from it (None when the variable is not set).
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [19]:
# Prompt asking the model for a question that the given passage would answer.
# The passage is substituted for the {answer} placeholder.
prompt = PromptTemplate(
    template=(
        "Generate a one line question that would produce the following answer: "
        "\n\nAnswer: {answer}"
        "\n\nQuestion:"
    ),
    input_variables=["answer"],
)

# Completion model that writes the questions.
llm = OpenAI(
    model='text-davinci-003',
    temperature=0.2,
    openai_api_key=OPENAI_API_KEY,
)

# Chain that formats the prompt and sends it to the model.
chain = LLMChain(prompt=prompt, llm=llm)

In [9]:
# Load the JSON-lines training file. Each line is taken as a whole record
# (jq_schema '.'), with content_key pointing at its "text" field and
# metadata_func supplying title/source/id.
loader = JSONLoader(
    file_path="../data/cs_CL_train.jsonl",
    json_lines=True,
    jq_schema='.',
    content_key="text",
    metadata_func=metadata_func,
)

docs = loader.load()

In [47]:
# Generate one question per sampled document and collect the pairs in `qa`.
#
# The cell is safe to run on a fresh kernel and safe to re-run after an
# interrupted pass (e.g. an API error part-way through): pairs gathered so far
# are kept, and documents that already have a question are not sampled again.

# Upper bound on the number of documents processed per run of this cell.
SAMPLE_SIZE = 530
# Pause between requests, in seconds, to stay gentle on the API.
REQUEST_PAUSE_S = 0.5

# Start with an empty list on a fresh kernel instead of failing with
# NameError; keep the existing list when resuming.
if "qa" not in globals():
    qa = []

# Passages that were already turned into a pair by an earlier run.
answered = {pair["answer"] for pair in qa}
candidates = [doc for doc in docs if doc.page_content not in answered]

# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of remaining candidates.
batch = random.sample(candidates, min(SAMPLE_SIZE, len(candidates)))

for doc in tqdm.tqdm(batch):
    res = chain.run(doc.page_content)
    qa.append({"question": res.strip(),
               "answer": doc.page_content})

    time.sleep(REQUEST_PAUSE_S)

 46%|████████████████████████████████████████████████████████████▎                                                                      | 244/530 [05:52<06:22,  1.34s/it]Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Request failed due to server shutdown {
  "error": {
    "message": "Request failed due to server shutdown",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'Request failed due to server shutdown', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Mon, 25 Sep 2023 16:44:32 GMT', 'Content-Type': 'application/json', 'Content-Length': '141', 'Connection': 'keep-alive', 'access-control-allow-origin': '*', 'openai-model': 'text-davinci-003', 'openai-organization': 'user-so2qcd1fggxbyrn5icvmo3us', 'openai-processing-ms': '8709', 'openai-version': '2020-10-01', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-li

In [48]:
# Show how many question/answer pairs have been collected so far.
len(qa)

1000

In [51]:
# Write the collected pairs to disk as JSON lines: one object per line.
with open("../data/cs_CL_train_qa.jsonl", "w") as out_fh:
    out_fh.writelines(json.dumps(pair) + '\n' for pair in qa)