In [None]:
import hashlib

def generate_document_id(doc):
    """Derive a short, deterministic identifier for a FAQ document.

    The identifier is the first 8 hex characters of the MD5 digest of
    ``"<course>-<question>-<first 10 characters of text>"``.

    Args:
        doc: Mapping with the keys ``'course'``, ``'question'`` and ``'text'``.

    Returns:
        An 8-character lowercase hexadecimal string.

    Raises:
        KeyError: If any of the three required keys is missing.
    """
    course = doc['course']
    question = doc['question']
    # Only a short prefix of the answer takes part in the key, so documents
    # that differ only after the 10th character of the text share an id.
    text_prefix = doc['text'][:10]

    key = f"{course}-{question}-{text_prefix}"
    digest = hashlib.md5(key.encode()).hexdigest()

    return digest[:8]

In [None]:
import json

# Load the raw FAQ export. JSON text is UTF-8 by convention, so the encoding is
# set explicitly instead of relying on the platform default (which is not
# UTF-8 on every system and can break on non-ASCII text).
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course structure into one list of documents. Each entry of
# docs_raw carries a 'course' name and a list of 'documents'; the course name
# is copied onto every document and a generated id is attached.
documents = []

for course in docs_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        doc['id'] = generate_document_id(doc)

        documents.append(doc)

In [None]:
from collections import defaultdict

# Bucket the documents by their generated id so that id collisions
# (several documents sharing one id) become visible.
hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

In [None]:
# Number of distinct ids vs. number of documents; if the first is smaller,
# some documents share the same generated id.
len(hashes), len(documents)

In [None]:
# Persist the flattened documents (now carrying their ids) as indented JSON.
serialized_documents = json.dumps(documents, indent=4)

with open('documents-with-idxs.json', 'wt') as f_out:
    f_out.write(serialized_documents)

In [None]:
# Prompt sent to the LLM for every FAQ record. The {section}, {question} and
# {text} placeholders are filled with str.format from a document dict (see
# generate_question). The model is asked to answer with a bare JSON array of
# five question strings, which the results loop then parses with json.loads.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import torch
import gc

# Hugging Face model id shared by the tokenizer and the model.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# Free cached CUDA memory and run a garbage collection pass before loading;
# presumably useful when this cell is re-run in a kernel that already holds a model.
torch.cuda.empty_cache()
gc.collect()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Half-precision weights; trust_remote_code allows the model repository's own
# modelling code to be executed. (The original cell left a quantization_config
# argument commented out; it is still not used.)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

In [None]:
# Cache for the text-generation pipeline, keyed by the identity of the
# model/tokenizer objects it was built from.
_PIPE_CACHE = {}


def _get_pipe():
    """Return the text-generation pipeline, building it only when needed.

    The pipeline is rebuilt only if the global ``model`` or ``tokenizer`` has
    been rebound since the last call (e.g. the loading cell was re-run).
    """
    key = (id(model), id(tokenizer))
    if key not in _PIPE_CACHE:
        # Drop any pipeline built for a previous model so it can be freed.
        _PIPE_CACHE.clear()
        _PIPE_CACHE[key] = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device="cuda",
        )
    return _PIPE_CACHE[key]


def llm(prompt):
    """Send a single user message to the model and return the generated text.

    Args:
        prompt: The full prompt text, sent as the content of one ``user``
            chat message.

    Returns:
        The newly generated text only (``return_full_text`` is False, so the
        prompt is not echoed back).
    """
    # Building the pipeline once instead of on every call avoids repeating the
    # setup work for each document in the generation loop.
    pipe = _get_pipe()

    generation_args = {
        "max_new_tokens": 200,
        "return_full_text": False,
        # With do_sample=False decoding should be greedy, in which case the
        # temperature is not expected to influence the output.
        "temperature": 1,
        "do_sample": False,
    }

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = pipe([{"role": "user", "content": prompt}], **generation_args)

    return output[0]['generated_text']


def generate_question(doc):
    """Ask the LLM for student-style questions about one FAQ document.

    Args:
        doc: Document mapping; its items are passed as keyword arguments to
            ``prompt_template.format``, so it must provide every placeholder
            the template uses.

    Returns:
        The raw text returned by ``llm`` (not yet parsed).
    """
    filled_prompt = prompt_template.format(**doc)
    return llm(filled_prompt)

In [None]:
from tqdm.auto import tqdm

def _parse_questions(raw):
    """Parse the model's reply into Python data.

    Only the *surrounding* markdown fence (leading/trailing backticks and an
    optional ``json`` language tag) is removed. The text of the questions is
    left intact, so a question that mentions "json" or contains a backtick is
    no longer altered before parsing.

    Raises:
        json.JSONDecodeError: If the cleaned text is not valid JSON.
    """
    # Newlines are dropped as before, so multi-line replies parse the same way.
    text = raw.replace("\n", "").strip()
    text = text.strip("`").strip()
    if text[:4].lower() == "json":
        text = text[4:]
    return json.loads(text.strip())


# doc id -> parsed questions
results = {}
# doc id -> raw model reply that could not be parsed as JSON; kept so these
# documents can be inspected or retried without aborting the whole run.
failed_responses = {}

for doc in tqdm(documents):
    doc_id = doc['id']
    # Documents that share an id are only processed once.
    if doc_id in results:
        continue

    questions = generate_question(doc)

    try:
        results[doc_id] = _parse_questions(questions)
    except json.JSONDecodeError:
        failed_responses[doc_id] = questions
    else:
        # A later document with the same id may succeed after an earlier failure.
        failed_responses.pop(doc_id, None)