#CafChem Teaching - LLMs and token sampling.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MauricioCafiero/CafChemTeach/blob/main/notebooks/LLM_Sampling_CafChem.ipynb)

## This notebook allows you to:
- Load models and tokenizers from Huggingface
- use greedy or temperature-based sampling for token selection
- use a pipeline for model use

## Requirements:
- GPU; Colab free tier, or will also run on Kaggle free tier (turn on accelerator GPU).
- Huggingface token in secrets; may need repo access.

##Import libraries

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
import pandas as pd
import numpy as np
import random

## Define functions

In [None]:
def setup_decoder(model_name: str):
  '''
    Setup the decoder model. Pulls model and tokenizer from HuggingFace, sets up device.

      Args:
        model_name: model to pull from HuggingFace.
      Returns:
        tokenizer, model, device
  '''
  device = "cuda" if torch.cuda.is_available() else "cpu"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

  print(f"model setup complete: {model_name}")
  return tokenizer, model, device

def decoder_inference(model, tokenizer, device, input_txt: str, n_steps: int,
                      TEMP = 0.0):
  '''
    Takes input_txt and performs autoregressive inference. Captures the chosen token and
    probability at each step.

      Args:
        model: model to pull from HuggingFace.
        tokenizer: tokenier associated with the model
        device: cpu or cuda
        input_txt: prompt to decoder
        n_steps: how many tokens to use
        TEMP: temperature for inference
        use_ramp: Boolean, use temperature ramp?
      Returns:
        iterations: a list containing the input at each step, the chosen token
                    and probability
  '''
  input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)

  iterations = []

  with torch.no_grad():
      for c in range(n_steps):
          iteration = dict()

          iteration["Input"] = tokenizer.decode(input_ids[0])
          output = model(input_ids=input_ids)

          # Select logits of the first batch and the last token and apply softmax
          next_token_logits = output.logits[0, -1, :]
          next_token_probs = torch.softmax(next_token_logits, dim=-1)

          if TEMP < 0.015:
            token_id = torch.argmax(next_token_probs,axis=-1)
          else:
            scaled_probs = next_token_probs**(1/TEMP)
            scaled_probs = scaled_probs / scaled_probs.sum()

            token_id = np.random.choice(scaled_probs.shape[-1], p=scaled_probs.cpu().numpy())
            token_id = torch.tensor(token_id).to(device)

          token_prob = next_token_probs[token_id].cpu().numpy()
          token_choice = (f"{tokenizer.decode(token_id)} ({100 * token_prob:.2f}%)")

          iteration[f"Choice"] = token_choice

          # Append predicted next token to input
          input_ids = torch.cat([input_ids, token_id.unsqueeze(0).unsqueeze(0)], dim=-1)

          iterations.append(iteration)
  return iterations

## Set up model

In [None]:
model_name = 'HuggingFaceTB/SmolLM2-360M-Instruct'

tokenizer, model, device = setup_decoder(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/724M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

model setup complete: HuggingFaceTB/SmolLM2-360M-Instruct


## View output token-by-token

In [None]:
input_txt = "What country has the best cuisine?"
iterations = decoder_inference(model, tokenizer,device, input_txt, n_steps= 50,
                               TEMP = 0.4)

In [None]:
iterations

[{'Input': 'What country has the best cuisine?',
  'Choice': '<|im_end|> (36.10%)'},
 {'Input': 'What country has the best cuisine?<|im_end|>',
  'Choice': '\n (100.00%)'},
 {'Input': 'What country has the best cuisine?<|im_end|>\n',
  'Choice': '<|im_start|> (93.35%)'},
 {'Input': 'What country has the best cuisine?<|im_end|>\n<|im_start|>',
  'Choice': 'ass (99.89%)'},
 {'Input': 'What country has the best cuisine?<|im_end|>\n<|im_start|>ass',
  'Choice': 'istant (100.00%)'},
 {'Input': 'What country has the best cuisine?<|im_end|>\n<|im_start|>assistant',
  'Choice': '\n (100.00%)'},
 {'Input': 'What country has the best cuisine?<|im_end|>\n<|im_start|>assistant\n',
  'Choice': 'The (16.59%)'},
 {'Input': 'What country has the best cuisine?<|im_end|>\n<|im_start|>assistant\nThe',
  'Choice': ' best (27.77%)'},
 {'Input': 'What country has the best cuisine?<|im_end|>\n<|im_start|>assistant\nThe best',
  'Choice': ' country (32.48%)'},
 {'Input': 'What country has the best cuisine?<|i

In [None]:
input_txt = "What country has the best cuisine?"
iterations = decoder_inference(model, tokenizer,device, input_txt, n_steps= 50,
                               TEMP = 0.2)

In [None]:
iterations

[{'Input': 'What country has the best cuisine?', 'Choice': '\n (33.93%)'},
 {'Input': 'What country has the best cuisine?\n', 'Choice': '\n (42.38%)'},
 {'Input': 'What country has the best cuisine?\n\n', 'Choice': 'I (7.01%)'},
 {'Input': 'What country has the best cuisine?\n\nI',
  'Choice': ' am (10.45%)'},
 {'Input': 'What country has the best cuisine?\n\nI am',
  'Choice': ' asking (8.78%)'},
 {'Input': 'What country has the best cuisine?\n\nI am asking',
  'Choice': ' for (27.63%)'},
 {'Input': 'What country has the best cuisine?\n\nI am asking for',
  'Choice': ' the (20.85%)'},
 {'Input': 'What country has the best cuisine?\n\nI am asking for the',
  'Choice': ' best (29.50%)'},
 {'Input': 'What country has the best cuisine?\n\nI am asking for the best',
  'Choice': ' country (38.75%)'},
 {'Input': 'What country has the best cuisine?\n\nI am asking for the best country',
  'Choice': ' for (24.46%)'},
 {'Input': 'What country has the best cuisine?\n\nI am asking for the best cou

## Use a pipeline

In [45]:
pipe_llm = pipeline("text-generation", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [52]:
messages = [{"system": "you are a helpful assistant","role": "user", "content": input_txt}]
result = pipe_llm(messages, max_new_tokens=500, temperature=1.0)

In [53]:
result

[{'generated_text': [{'system': 'you are a helpful assistant',
    'role': 'user',
    'content': 'What country has the best cuisine?'},
   {'role': 'assistant',
    'content': 'Based on various datasets and research, the best cuisine can vary depending on what it refers to. However, some of the top culinary destinations include France, Italy, Spain, Japan, Mexico, China, India, India, Thailand, Japan, and Thailand. Each has its own unique specialties and culinary traditions.'}]}]