# Attempt 1 (embedding the full "Content" of each PDF): fails because each document's text exceeds the embedding model's 8192-token context limit

In [7]:
import asyncio
import json
from PyPDF2 import PdfReader
import os

from embeddings import compute_text_embedding  
from openai_clients import create_openai_embed_client  

async def extract_text_from_pdf(path):
    """Return the text of the PDF at *path*, one page per line group.

    Each page's extracted text is collected and the pages are joined with
    newlines. Pages that yield no text are reported on stdout and skipped.

    Returns:
        The joined text (possibly an empty string when no page produced
        text), or ``None`` if anything raised while reading the file; the
        error is printed rather than propagated.

    Note:
        The function is declared ``async`` but contains no ``await``, so the
        PDF is read synchronously.
    """
    try:
        page_texts = []
        for pdf_page in PdfReader(path).pages:
            extracted = pdf_page.extract_text()
            if not extracted:
                print(f"No text extracted from page in {path}")
                continue
            page_texts.append(extracted)
        return "\n".join(page_texts)
    except Exception as e:
        # Best-effort: report the failure and signal it to the caller with None.
        print(f"Failed to extract text from {path}: {e}")
        return None

async def generate_json_entry(doc_id, doc_name, doc_path, embed_client, embed_model):
    """Build one seed record for a PDF by embedding its full extracted text.

    Args:
        doc_id: Identifier for the document; stored as a string under ``"Id"``.
        doc_name: Display name, stored under ``"Name"`` and used in messages.
        doc_path: Path of the PDF handed to ``extract_text_from_pdf``.
        embed_client: Client passed through to ``compute_text_embedding``.
        embed_model: Model name passed through to ``compute_text_embedding``.

    Returns:
        A dict with ``Id``, ``Name``, ``Content`` and ``Embedding`` keys, or
        ``None`` when no text could be extracted or the embedding call raised.
        Failures are printed, not raised.
    """
    text_content = await extract_text_from_pdf(doc_path)

    # Guard: nothing to embed (extraction failed or produced an empty string).
    if not text_content:
        print(f"No content to embed for {doc_name}")
        return None

    try:
        # The whole document text is sent as a single embedding input.
        vector = await compute_text_embedding(
            q=text_content,
            openai_client=embed_client,
            embed_model=embed_model
        )
        record = {
            "Id": str(doc_id),
            "Name": doc_name,
            "Content": text_content,
            "Embedding": vector
        }
        return record
    except Exception as e:
        print(f"Failed to compute embedding for {doc_name}: {e}")
        return None

async def main():
    """Generate ``seed_lse_data.json`` from the full text of two local PDFs.

    Creates the embedding client, builds one entry per document with
    ``generate_json_entry`` and dumps the successful entries as a JSON list.
    Documents whose entry could not be built are reported and skipped; the
    output file is written even when the list ends up empty.
    """
    embed_client, embed_model, _ = await create_openai_embed_client()
    client_ready = embed_client is not None and embed_model is not None
    if not client_ready:
        print("Failed to initialize OpenAI client.")
        return

    # Each tuple is (id, display name, local PDF path).
    documents = [
        (
            "1",
            "Exam Procedures for Candidates",
            "/Users/akshsabherwal/Desktop/Exam-Procedures-for-Candidates.pdf",
        ),
        (
            "2",
            "Spring Exam Timetable 2024",
            "/Users/akshsabherwal/Desktop/Spring-Exam-Timetable-2024-Final.pdf",
        ),
    ]

    # Documents are processed one at a time, in list order.
    entries = []
    for ident, title, pdf_path in documents:
        entry = await generate_json_entry(ident, title, pdf_path, embed_client, embed_model)
        if not entry:
            print(f"Failed to create JSON entry for {title}")
            continue
        entries.append(entry)

    output_path = "seed_lse_data.json"
    try:
        with open(output_path, "w") as handle:
            json.dump(entries, handle, indent=4)
        print(f"JSON file created successfully at {output_path}")
    except Exception as e:
        print(f"Failed to write JSON file: {e}")

# Entry point: run the async pipeline only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())


Failed to compute embedding for Exam Procedures for Candidates: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 11846 tokens (11846 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
Failed to create JSON entry for Exam Procedures for Candidates
Failed to compute embedding for Spring Exam Timetable 2024: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 16847 tokens (16847 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
Failed to create JSON entry for Spring Exam Timetable 2024
JSON file created successfully at seed_lse_data.json


# Attempt 2 (using "description"): embed a short hand-written description of each document instead of its full text

In [10]:
import asyncio
import json
from embeddings import compute_text_embedding  # Ensure correct imports
from openai_clients import create_openai_embed_client  # Import OpenAI client creation function

async def generate_json_entry(doc_id, doc_name, description, link, embed_client, embed_model):
    """Build one seed record by embedding a document's description.

    Args:
        doc_id: Identifier for the document; stored as a string under ``"Id"``.
        doc_name: Display name, stored under ``"Name"`` and used in messages.
        description: Text that is embedded and stored under ``"Description"``.
        link: Value stored under ``"Link"``.
        embed_client: Client passed through to ``compute_text_embedding``.
        embed_model: Model name passed through to ``compute_text_embedding``.

    Returns:
        A dict with ``Id``, ``Name``, ``Description``, ``Link`` and
        ``Embedding`` keys, or ``None`` if building it raised. The error is
        printed, not propagated.
    """
    try:
        vector = await compute_text_embedding(
            q=description,
            openai_client=embed_client,
            embed_model=embed_model
        )
        record = {
            "Id": str(doc_id),
            "Name": doc_name,
            "Description": description,
            "Link": link,
            "Embedding": vector
        }
        return record
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this document.
        print(f"Failed to compute embedding for {doc_name}: {e}")
        return None

async def main():
    """Generate ``seed_lse_data.json`` from hand-written document descriptions.

    Creates the embedding client, builds one entry per document with
    ``generate_json_entry`` and dumps the successful entries as a JSON list.
    Documents whose entry could not be built are reported and skipped; the
    output file is written even when the list ends up empty.
    """
    embed_client, embed_model, _ = await create_openai_embed_client()
    client_ready = embed_client is not None and embed_model is not None
    if not client_ready:
        print("Failed to initialize OpenAI client.")
        return

    # Manually defined descriptions and links for each document.
    # Each tuple is (id, display name, description, link).
    documents = [
        (
            "1",
            "Exam Procedures for Candidates",
            "Outlines essential procedures for LSE's in-person exams for the 2023/24 academic year. It details candidate responsibilities, exam conduct rules, permitted materials, use of electronic devices, and protocols for e-Exams, including equipment requirements and emergency procedures, to ensure fairness and integrity during the examination process.",
            "https://info.lse.ac.uk/current-students/services/assets/documents/Exam-Procedures-for-Candidates.pdf",
        ),
        (
            "2",
            "Spring Exam Timetable 2024",
            "Detailed exam schedule for LSE's Spring 2024 session, including dates, times, and courses.",
            "https://info.lse.ac.uk/current-students/services/assets/documents/Spring-Exam-Timetable-2024-Final.pdf",
        ),
    ]

    # Documents are processed one at a time, in list order.
    entries = []
    for ident, title, summary, url in documents:
        entry = await generate_json_entry(ident, title, summary, url, embed_client, embed_model)
        if not entry:
            print(f"Failed to create JSON entry for {title}")
            continue
        entries.append(entry)

    output_path = "seed_lse_data.json"
    try:
        with open(output_path, "w") as handle:
            json.dump(entries, handle, indent=4)
        print(f"JSON file created successfully at {output_path}")
    except Exception as e:
        print(f"Failed to write JSON file: {e}")

# Entry point: run the async pipeline only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())


JSON file created successfully at seed_lse_data.json
