In [None]:
from abc import ABC, abstractmethod

class LLM(ABC):
    """Abstract interface to a language model that serves next-token logits."""

    @abstractmethod
    def next_token_logits(self, prev_token_ids: list[int]) -> list[float]:
        """
        Return the logits of the next token given the tokens seen so far.

        The call performs an HTTP request against a server that is able to
        serve many requests in parallel.

        Args:
            prev_token_ids (list[int]): Token IDs of the sequence so far, used as context.

        Returns:
            list[float]: Next-token logits (pre-softmax scores, not probabilities),
                one entry per vocabulary item.
        """

    @abstractmethod
    def get_stop_token_id(self) -> int:
        """
        Return the ID of the token that marks the end of generation.

        Returns:
            int: Token ID whose appearance means generation must terminate.
        """

 Part 1: Autoregressive generation of a single sequence

 

In [None]:
def generate_token_ids(input_token_ids: list[int], llm: LLM) -> list[int]:
    """
    Generate tokens autoregressively using a language model until the stop token is
    encountered.

    This cell is the exercise statement: the body is intentionally left empty, so
    calling this definition as written returns None. A later cell in this notebook
    redefines `generate_token_ids` with an actual implementation.

    Args:
        input_token_ids (list[int]): Token IDs for the prompt.
        llm (LLM): The language model. It provides the next-token logits
            (`next_token_logits`) and the stop token ID (`get_stop_token_id`).

    Returns:
        list[int]: A list of generated token IDs, including the stop token at the end.
    """
    # your code here (placeholder; the solution lives in the following cell)

In [None]:
import numpy as np

def softmax(x: list[float]) -> list[float]:
    """
    Convert logits into a probability distribution.

    The largest logit is subtracted before exponentiating so that `np.exp`
    never sees a large positive argument; this leaves the result unchanged
    mathematically but avoids overflow.

    Args:
        x (list[float]): Logits.

    Returns:
        list[float]: Probabilities in the same order as `x`, summing to 1.
    """
    logits = np.array(x)
    exp_shifted = np.exp(logits - logits.max())
    normalizer = exp_shifted.sum()
    return [p for p in exp_shifted / normalizer]

def generate_token_ids(input_token_ids: list[int], llm: LLM) -> list[int]:
    """
    Sample tokens autoregressively from `llm` until the stop token is generated.

    At every step the next-token logits are turned into probabilities with
    `softmax` and one token is sampled from them using NumPy's global random
    state (seed it with `np.random.seed` beforehand for reproducible output).

    The caller's `input_token_ids` list is never modified. Generation always
    produces at least one token: a prompt that is empty or that already ends
    with the stop token is still continued, because only a *generated* stop
    token terminates the loop. There is no cap on the number of generated
    tokens, so the loop runs for as long as the model keeps avoiding the stop
    token.

    Args:
        input_token_ids (list[int]): Token IDs for the prompt.
        llm (LLM): The language model.

    Returns:
        list[int]: A new list holding the prompt token IDs followed by the
            generated token IDs; the last element is the stop token.
    """
    id_stop_token = llm.get_stop_token_id()

    # Work on a copy so the prompt passed in by the caller stays untouched.
    token_ids = list(input_token_ids)

    while True:
        logits = llm.next_token_logits(token_ids)
        prob = softmax(logits)
        # Sample an index in [0, vocab_size); cast so the result holds plain
        # Python ints rather than numpy integer scalars.
        id_next_token = int(np.random.choice(len(prob), p=prob))
        token_ids.append(id_next_token)

        # Check the token just generated (not the prompt tail) for termination.
        if id_next_token == id_stop_token:
            break

    return token_ids


Part 2: Generating multiple i.i.d. sequences

In [None]:
def generate_token_ids_multiple(input_token_ids: list[int], llm: LLM, n: int) -> list[list[int]]:
    """
    Generate multiple i.i.d. sequences from a language model.

    This cell is the exercise statement: the body is intentionally left empty, so
    calling this definition as written returns None. Later cells in this notebook
    redefine `generate_token_ids_multiple`, first as a sequential function and
    then as an `async` coroutine.

    Args:
        input_token_ids (list[int]): Token IDs for the prompt.
        llm (LLM): The language model.
        n (int): Number of i.i.d. sequences to return.

    Returns:
        list[list[int]]: A list of lists of generated token IDs. Length n.
    """
    # your code here (placeholder; the solutions live in the following cells)

In [None]:
from copy import copy

def generate_token_ids_multiple(input_token_ids: list[int], llm: LLM, n: int) -> list[list[int]]:
    """
    Generate `n` independent sequences from the same prompt, one after another.

    Each rollout is produced by `generate_token_ids` and draws from NumPy's
    global random state. The random state is deliberately NOT reseeded here:
    reseeding with fixed values on every call would make repeated calls return
    the same "samples" and would silently reset the RNG for the rest of the
    notebook. For reproducible results, call `np.random.seed(...)` once before
    calling this function.

    Args:
        input_token_ids (list[int]): Token IDs for the prompt. Not modified.
        llm (LLM): The language model.
        n (int): Number of i.i.d. sequences to return. Values <= 0 yield an
            empty list.

    Returns:
        list[list[int]]: `n` lists, each being whatever `generate_token_ids`
            returns for its own private copy of the prompt.
    """
    # Every rollout gets its own copy of the prompt so that sequences never
    # share (or mutate) the same underlying list.
    return [generate_token_ids(list(input_token_ids), llm) for _ in range(n)]

In [None]:
import asyncio

async def generate_token_ids_multiple(input_token_ids: list[int], llm: LLM, n: int) -> list[list[int]]:
    """
    Generate `n` independent sequences from the same prompt, issuing the
    per-step model requests concurrently.

    This is a coroutine and must be awaited. At every step, one
    `llm.next_token_logits` call per unfinished sequence is dispatched to a
    worker thread with `asyncio.to_thread` (the method is a blocking,
    synchronous call) and the results are collected with `asyncio.gather`,
    which preserves the order of its arguments. Sampling is then done on the
    event-loop thread from NumPy's global random state, which is not reseeded
    here; call `np.random.seed(...)` once beforehand for reproducibility.

    Concurrency is bounded by the size of the event loop's default thread
    pool. There is no cap on sequence length.

    Args:
        input_token_ids (list[int]): Token IDs for the prompt. Not modified.
        llm (LLM): The language model.
        n (int): Number of i.i.d. sequences to return. Values <= 0 yield an
            empty list.

    Returns:
        list[list[int]]: `n` lists; each holds the prompt token IDs followed
            by that sequence's generated token IDs, ending with the stop token.
    """
    id_stop_token = llm.get_stop_token_id()

    # One private copy of the prompt per sequence.
    tokens = [list(input_token_ids) for _ in range(n)]

    # Indices (into `tokens`) of the sequences that have not produced the stop token yet.
    not_done = list(range(n))

    while not_done:
        # Run the blocking requests in worker threads so they overlap;
        # result k corresponds to sequence not_done[k].
        logits_batch = await asyncio.gather(
            *(asyncio.to_thread(llm.next_token_logits, tokens[seq]) for seq in not_done)
        )

        # Build the next round's work list instead of removing entries from
        # `not_done` while iterating over it.
        still_running = []
        for seq, logits in zip(not_done, logits_batch):
            prob = softmax(logits)
            id_next_token = int(np.random.choice(len(prob), p=prob))
            tokens[seq].append(id_next_token)
            if id_next_token != id_stop_token:
                still_running.append(seq)
        not_done = still_running

    return tokens

