In [41]:
import os
import time
import tiktoken
from dotenv import load_dotenv
from typing import List, Tuple
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
load_dotenv()

True

In [21]:
# Shared tokenizer: tiktoken's "cl100k_base" encoding, used by the token helpers below.
enc = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    """Return how many tokens `text` encodes to with the module-level `enc`.

    Args:
        text: The string to measure.

    Returns:
        The length of `enc.encode(text)` as an int, so the value can be
        summed and compared against token budgets.
    """
    return len(enc.encode(text))

def split_text_to_chunks(text: str, chunk_size:int=500) -> List[str]:
    """Split `text` into consecutive pieces of at most `chunk_size` tokens.

    The text is encoded once with `enc`, the token list is sliced into
    windows of `chunk_size`, and each window is decoded back to a string.

    Args:
        text: The string to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.

    Returns:
        The decoded chunks in document order. Text that encodes to no tokens
        yields an empty list.

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        # A negative step would make range() empty and silently drop the text.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Encode directly: the slicing below needs the token ids, not their count.
    tokens = enc.encode(text)
    return [
        enc.decode(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

In [22]:
# Read the OpenAI key from the environment; os.getenv returns None when the
# variable is not set.
API_KEY = os.getenv("OPENAI_API_KEY")
# Shared chat model used by the summarization functions in this notebook:
# temperature 0, with max_completion_tokens set to 300.
llm = ChatOpenAI(model='gpt-4.1-nano', max_completion_tokens=300, temperature=0, api_key=API_KEY)

In [11]:
# Prompt for summarizing one chunk in the context of its full document.
# Template variables: `document_text` (the whole document) and `chunk_text`
# (the chunk to summarize). The instructions ask for exactly one concise,
# standalone paragraph.
chunk_prompt_template = ChatPromptTemplate.from_template("""
<document>
{document_text}
</document>

<chunk>
{chunk_text}
</chunk>

You are summarizing a chunk of a larger document.
The summary must:
- Briefly capture the main idea of the chunk (max 300 tokens).
- Clearly link the chunk to the broader document context.
- Avoid filler, repetition, or generic phrases.
- Be standalone so it can be stored and retrieved without needing the full document.

Write exactly ONE concise paragraph.
""")

In [49]:
def split_and_summarize_body(body_list: List[str], tpm_limit:int=180000, min_batch_interval:float=30.0) -> List[List[str]]:
    """Summarize every chunk of every document body with the shared `llm`.

    Each non-empty body is split with `split_text_to_chunks`. Every chunk is
    rendered through `chunk_prompt_template` together with its full body, and
    the prompts are sent through `llm.batch` in groups whose estimated
    prompt-token total stays within `tpm_limit`. A single prompt larger than
    the limit is still sent, alone in its own batch.

    Args:
        body_list: Document bodies; falsy entries (empty string / None) are
            skipped and produce no summaries.
        tpm_limit: Maximum estimated prompt tokens per batch. Only prompt
            tokens are counted; completion tokens are not included.
        min_batch_interval: Minimum number of seconds between the start of
            one batch window and the next. NOTE(review): the default of 30
            keeps the original pacing; a per-minute budget would normally
            imply 60 -- confirm against the account's rate limits.

    Returns:
        One list per entry of `body_list`, in the same order. Each inner list
        holds the whitespace-stripped summaries of that body's chunks, in
        chunk order.
    """
    # (document index, messages for one model call, estimated prompt tokens)
    tasks: List[Tuple[int, List[HumanMessage], int]] = []
    for doc_index, body_text in enumerate(body_list):
        if not body_text:
            continue

        for chunk in split_text_to_chunks(body_text):
            # format_messages yields the list-of-messages form that one
            # llm.batch input must have; a bare message object is rejected.
            messages = chunk_prompt_template.format_messages(
                document_text=body_text, chunk_text=chunk
            )
            # Count tokens straight from the encoder so the estimate is an int.
            token_est = sum(len(enc.encode(message.content)) for message in messages)
            tasks.append((doc_index, messages, token_est))

    summaries: List[List[str]] = [[] for _ in body_list]
    if not tasks:
        return summaries

    def run_batch(batch):
        """Send one batch to the model and file each reply under its document."""
        responses = llm.batch([messages for _, messages, _ in batch])
        for (doc_index, _, _), resp in zip(batch, responses):
            summaries[doc_index].append(resp.content.strip())

    current_batch: List[Tuple[int, List[HumanMessage], int]] = []
    current_tokens = 0
    # monotonic() is immune to wall-clock adjustments while measuring elapsed time.
    window_start = time.monotonic()

    for task in tasks:
        token = task[2]
        if current_batch and current_tokens + token > tpm_limit:
            run_batch(current_batch)
            current_batch, current_tokens = [], 0

            # Pace the next batch so consecutive windows are spaced apart.
            elapsed = time.monotonic() - window_start
            if elapsed < min_batch_interval:
                time.sleep(min_batch_interval - elapsed)
            window_start = time.monotonic()

        current_batch.append(task)
        current_tokens += token

    if current_batch:
        run_batch(current_batch)

    return summaries

In [33]:
# Prompt for turning one email's metadata into a single summary paragraph.
# Template variable: `metadata`. The rules require every listed field to be
# reproduced verbatim, "None" for empty fields, and a fixed sentence format.
meta_data_prompt = ChatPromptTemplate.from_template("""
You are an expert email summarizer. Generate ONE professional paragraph summarizing the email.
STRICT RULES:
1. You MUST include these fields exactly once: Email_id, Thread_id, From, To, CC, Source, Date, Labels, Subject.
2. You MUST reproduce every field value exactly as provided, without paraphrasing or omission.
3. You MUST explicitly list EVERY recipient in "To" and "CC". Do NOT use 'others', 'etc.', or shorten lists.
4. If a field is empty or 'None', output the word "None".
5. Write as a single paragraph, grammatically correct, no bullet points.
6. The output MUST follow this format:
Email Email_id, part of thread Thread_id, was sent by From. The To recipients are To. The CC's are CC. It originated from Source on Date. The email carries the labels Labels. The subject of the email is 'Subject'.

Here is the metadata for the email:
{metadata}
""")

In [None]:
# String output parser instance.
# NOTE(review): no code visible in this notebook excerpt references `parser`;
# confirm it is still needed before keeping it.
parser = StrOutputParser()
def get_metadata_summaries(metadata_list: List[str], tpm_limit:int = 180000, min_batch_interval:float = 30.0) -> List[str]:
    """Produce one summary paragraph per metadata string using the shared `llm`.

    Each entry is rendered through `meta_data_prompt`, and the prompts are
    sent through `llm.batch` in groups whose estimated prompt-token total
    stays within `tpm_limit`. A single prompt larger than the limit is still
    sent, alone in its own batch.

    Args:
        metadata_list: Metadata strings, one per email.
        tpm_limit: Maximum estimated prompt tokens per batch. Only prompt
            tokens are counted; completion tokens are not included.
        min_batch_interval: Minimum number of seconds between the start of
            one batch window and the next. NOTE(review): the default of 30
            keeps the original pacing; a per-minute budget would normally
            imply 60 -- confirm against the account's rate limits.

    Returns:
        The whitespace-stripped model replies, in the same order as
        `metadata_list`. An empty input yields an empty list.
    """
    # (messages for one model call, estimated prompt tokens)
    meta_tasks: List[Tuple[List[HumanMessage], int]] = []
    for md in metadata_list:
        # format_messages yields the list-of-messages form that one
        # llm.batch input must have; a bare message object is rejected.
        messages = meta_data_prompt.format_messages(metadata=md)
        # Count tokens straight from the encoder so the estimate is an int.
        token_est = sum(len(enc.encode(message.content)) for message in messages)
        meta_tasks.append((messages, token_est))

    meta_summaries: List[str] = []
    if not meta_tasks:
        return meta_summaries

    def run_batch(batch):
        """Send one batch to the model and collect the replies in order."""
        responses = llm.batch([messages for messages, _ in batch])
        meta_summaries.extend(resp.content.strip() for resp in responses)

    current_batch: List[Tuple[List[HumanMessage], int]] = []
    current_tokens = 0
    # monotonic() is immune to wall-clock adjustments while measuring elapsed time.
    window_start = time.monotonic()

    for task in meta_tasks:
        _, token = task
        if current_batch and current_tokens + token > tpm_limit:
            run_batch(current_batch)
            current_batch, current_tokens = [], 0

            # Sleep only for the remainder of the window, never a negative time.
            elapsed = time.monotonic() - window_start
            if elapsed < min_batch_interval:
                time.sleep(min_batch_interval - elapsed)
            window_start = time.monotonic()

        current_batch.append(task)
        current_tokens += token

    if current_batch:
        run_batch(current_batch)

    return meta_summaries