In [None]:
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

from IPython.display import Markdown, display
from dotenv import load_dotenv
from openai import OpenAI

# from config import system_prompt, user_prompt_1, user_prompt_2, clean

# Load settings via python-dotenv (typically from a local .env file), then
# build the shared API client from the OPENAI_KEY environment variable.
load_dotenv()
openai_key = os.environ.get('OPENAI_KEY')
client = OpenAI(api_key=openai_key)

In [28]:
# Development convenience: reload the config module each time this cell runs
# and copy the names used below into the notebook namespace.
# In production, replace this cell with a direct import of those names.

import importlib

import config

config = importlib.reload(config)

# The mapping is built in full before the namespace is touched, so a missing
# attribute leaves the existing globals unchanged.
globals().update({
    name: getattr(config, name)
    for name in ('system_prompt', 'user_prompt_1', 'user_prompt_2', 'clean')
})

In [12]:
def generate(messages, model='gpt-4.1-nano', temperature=0.3,
             max_tokens=10000, top_p=0.3):
    """Request one chat completion and report how long it took.

    Args:
        messages: List of chat message dicts (``role`` / ``content``) that is
            passed unchanged to the chat completions endpoint.
        model: Name of the model to query.
        temperature: Sampling temperature; adjust for more creative or more
            deterministic output.
        max_tokens: Upper bound on generated tokens; increase this if
            responses are getting cut off.
        top_p: Nucleus-sampling cutoff.

    Returns:
        A ``(text, completion)`` tuple: the message content of the first
        choice, and the full completion object returned by the client.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        frequency_penalty=0,
        presence_penalty=0
    )
    elapsed = time.perf_counter() - start
    print(f"Completion took {elapsed:.2f} seconds")

    return completion.choices[0].message.content, completion

In [9]:
import json

# Read the lecture records from disk; later cells access the 'index',
# 'title' and 'content' keys of each entry.
with open('Lectures.json', encoding='utf-8') as lectures_file:
    lectures = json.loads(lectures_file.read())


In [21]:
# Shared results: lecture index -> {"title", "content"}. Writers take
# transcripts_lock before updating the dict.
transcripts_lock = threading.Lock()
transcripts = {}

# Directory that receives the generated files; fine if it already exists.
os.makedirs("outputs", exist_ok=True)

def process_lecture(lecture, max_index=2):
    """Run the two-step generation pipeline for a single lecture.

    Step 1 sends ``user_prompt_1`` plus the lecture title and content to the
    model. The cleaned reply is written to ``outputs/<NN> <title>.md``.
    Step 2 continues the same conversation with ``user_prompt_2`` and stores
    the reply in the shared ``transcripts`` dict under the lecture's index.

    Args:
        lecture: Mapping with the keys ``index`` (int), ``title`` (str) and
            ``content`` (str).
        max_index: Lectures whose index is greater than this are skipped.
            Pass ``None`` to disable the limit.

    Returns:
        None. Results are written to disk and to ``transcripts``.
    """
    lecture_index = lecture['index']
    title = lecture['title']
    content = lecture['content']

    if max_index is not None and lecture_index > max_index:
        return

    lec_prompt_1 = user_prompt_1 + title + "\n\n" + content

    # Step 1

    text_1, _ = generate([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": lec_prompt_1}])

    # Step 2 A

    # Strip characters that are not allowed in file names. This is done
    # outside the f-string because quotes and backslashes inside an f-string
    # replacement field are a SyntaxError before Python 3.12.
    safe_title = re.sub(r'[<>:"/\\|?*]', '', title)
    filepath = os.path.join("outputs", f"{lecture_index:02d} {safe_title}.md")

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(clean(text_1))

    # Step 2 B

    text_2, _ = generate([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": lec_prompt_1},
        {"role": "assistant", "content": text_1},
        {"role": "user", "content": user_prompt_2},])

    # Several worker threads may finish at once; serialize the dict update.
    with transcripts_lock:
        transcripts[lecture_index] = {
            "title": title,
            "content": text_2,
        }

In [None]:
# Only lectures with index <= 2 are processed in this run.
target_lectures = [lecture for lecture in lectures if lecture['index'] <= 2]

# executor.map() returns a lazy iterator and only raises a worker's exception
# when that iterator is consumed. Submitting futures and calling result() on
# each one makes every failure visible instead of silently dropping it.
failed_lectures = []
with ThreadPoolExecutor() as executor:
    pending = {
        executor.submit(process_lecture, lecture): lecture
        for lecture in target_lectures
    }
    for future in as_completed(pending):
        finished = pending[future]
        try:
            future.result()
        except Exception as exc:
            # Report and keep going so one bad lecture does not hide the rest.
            failed_lectures.append(finished['index'])
            print(f"Lecture {finished['index']} failed: {exc!r}")

if failed_lectures:
    print(f"Failed lecture indexes: {sorted(failed_lectures)}")

Completion took 12.16 seconds
Completion took 13.34 seconds
Completion took 14.55 seconds
Completion took 8.75 seconds
Completion took 10.44 seconds
Completion took 9.20 seconds


In [23]:
# Save Transcripts

# Flatten the index -> transcript mapping into a list ordered by lecture index.
transcripts_list = [
    {
        "index": idx,
        "title": transcripts[idx]["title"],
        "content": transcripts[idx]["content"],
    }
    for idx in sorted(transcripts)
]

# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
serialized = json.dumps(transcripts_list, ensure_ascii=False, indent=2)
with open("outputs/transcripts.json", "w", encoding="utf-8") as out_file:
    out_file.write(serialized)