|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 3:</h2>|<h1>Evaluating LLMs</h1>|
|<h2>Section:</h2>|<h1>Quantitative evaluations</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: MAUVE diversity</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202509" target="_blank">udemy.com/course/dulm_x/?couponCode=202509</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
!pip install mauve-text

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import requests

import mauve
from tqdm import tqdm

import torch
from transformers import AutoTokenizer,AutoModelForCausalLM

# render inline matplotlib figures in SVG format (vector graphics)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# run on the GPU when torch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [None]:
# load the small and large GPT-2 models onto the compute device and put them
# in evaluation mode (inference behavior, e.g., dropout switched off)
gptSmall = AutoModelForCausalLM.from_pretrained('gpt2').to(device)
gptLarge = AutoModelForCausalLM.from_pretrained('gpt2-large').to(device)
gptSmall.eval()
gptLarge.eval()

# one tokenizer is used for both models; reuse the end-of-sequence token as the padding token
# (the original assigned to a non-existent `set_pad_token_id` attribute, which had no effect)
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token_id = tokenizer.eos_token_id

# Exercise 1: Get models data

In [None]:
# ~25 mins
numReps = 100
numTokens = 400

def _generate_unprompted(model,n_tokens):
  """Sample one token sequence from `model`, starting from only the BOS token.

  min_length and max_length are both set to `n_tokens`, so generation is asked
  for a fixed total length (the starting BOS token counts toward that length).

  Returns the generated token ids with the leading BOS token removed.
  """
  out = model.generate(
      torch.tensor([[tokenizer.bos_token_id]]).to(device),
      pad_token_id = tokenizer.eos_token_id,
      min_length = n_tokens,
      max_length = n_tokens,
      do_sample  = True,
      top_k      = 50,
      top_p      = .95,
  )
  return out[0][1:]


gptSmall_data = []
gptLarge_data = []

# generate token sequences (small then large within each repetition)
for _ in tqdm(range(numReps),desc='Generating tokens...'):
  gptSmall_data.append(_generate_unprompted(gptSmall,numTokens))
  gptLarge_data.append(_generate_unprompted(gptLarge,numTokens))

# Exercise 2: Test against human texts

In [None]:
# every book is served from the same gutenberg.org path prefix;
# only the numerical code differs from one book to the next
baseurl = 'https://www.gutenberg.org/cache/epub/'

# one [code, title] pair per book
bookurls = [
    ['64317','GreatGatsby'],
    ['11','AliceWonderland'],
    ['1513','RomeoJuliet'],
    ['76','HuckFinn'],
    ['2148','EdgarAllenPoe'],
    ['829','GulliversTravels'],
]

In [None]:
# MAUVE scores: row 0 = GPT2-small, row 1 = GPT2-large; one column per book
mauves = np.zeros((2,len(bookurls)))

xticklabels = []

# loop through the books
for booki,(code,title) in enumerate(bookurls):

  # get the text; stop on a failed download instead of silently scoring an error page
  fullurl = baseurl + code + '/pg' + code + '.txt'
  response = requests.get(fullurl,timeout=60)
  response.raise_for_status()
  txt = response.text

  # short label for the bar plot
  xticklabels.append(title[:5])

  # tokenize the text
  tokens = tokenizer.encode(txt,return_tensors='pt')

  # get random contiguous segments: numReps random start indices, each expanded
  # into numTokens consecutive positions via broadcasting
  ix = torch.randint(len(tokens[0])-numTokens,size=(numReps,))
  human_data = tokens[0][ix[:,None] + torch.arange(numTokens)].to(device)

  # --- MAUVE score of each model's generations against this book's segments
  for modeli,model_data in enumerate((gptSmall_data,gptLarge_data)):
    mauve_output  = mauve.compute_mauve(
        p_tokens  = model_data,
        q_tokens  = human_data,
        verbose   = False,
        device_id = 0
    )
    mauves[modeli,booki] = mauve_output.mauve

# NOTE: human_data keeps the segments of the final book after the loop;
# Exercise 3 below reuses it as prompts and as the reference text.


In [None]:
plt.figure(figsize=(10,4))

# one group of two bars per book; the positions follow the book list
# rather than a hard-coded count
xpos = np.arange(len(bookurls))
plt.bar(xpos-.2,mauves[0,:],width=.4,facecolor=[.7,.9,.7],edgecolor='k',label='Small')
plt.bar(xpos+.2,mauves[1,:],width=.4,facecolor=[.7,.7,.9],edgecolor='k',label='Large')

plt.gca().set(xticks=xpos,xticklabels=xticklabels,ylabel='MAUVE score')
plt.legend()
plt.show()

# Exercise 3: MAUVE after prompting

In [None]:
# ~18 mins
nPromptTokens = 100
totalLength = nPromptTokens + numTokens
gptLarge_dataGT = []

# prompt GPT2-large with the start of each human segment and keep only
# the tokens that come after the prompt
for b in range(numReps):
  prompt = human_data[b,:nPromptTokens].unsqueeze(0).to(device)
  out = gptLarge.generate(
      prompt,
      pad_token_id = tokenizer.eos_token_id,
      min_length = totalLength,
      max_length = totalLength,
      do_sample  = True,
      top_k      = 50,
      top_p      = .95,
  )
  continuation = out[0,nPromptTokens:]
  gptLarge_dataGT.append(continuation)

  # progress report on every fifth repetition
  if b%5==0:
    print(f'Finished {b:2}/{numReps} generations.')

In [None]:
# --- GPT_LARGE mauve score: prompted generations vs. the human segments
mauve_output = mauve.compute_mauve(p_tokens=gptLarge_dataGT, q_tokens=human_data, verbose=False, device_id=0)

In [None]:
# compare the unprompted GPT2-large score for the last book with the prompted score
print(
    f'Naive MAUVE score : {mauves[1,-1]:.3f}',
    f'Prompt MAUVE score: {mauve_output.mauve:.3f}',
    sep='\n',
)