### Using Flan T5

Google just released new SOTA LLMs on huggingface (better than OPT and Bloom). 
It's instruction-finetuned via reinforcement learning. This might be the best open-source LLM for our system, and should replace OPT.

* Paper: https://arxiv.org/abs/2210.11416
* huggingface: https://huggingface.co/docs/transformers/model_doc/flan-t5

In [None]:
!pip install accelerate sentencepiece

In [14]:
import torch
import accelerate
from transformers import T5Tokenizer, T5ForConditionalGeneration

from transformers import T5Tokenizer, T5EncoderModel
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5EncoderModel.from_pretrained("google/flan-t5-large", device_map="auto", torch_dtype=torch.float16)

input_ids = tokenizer(
    "One", return_tensors="pt"
).input_ids  # Batch size 1
outputs = model(input_ids=input_ids, )
last_hidden_states = outputs.last_hidden_state
last_hidden_states

Some weights of the model checkpoint at google/flan-t5-large were not used when initializing T5EncoderModel: ['decoder.block.1.layer.1.layer_norm.weight', 'decoder.block.11.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.14.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.1.EncDecAttention.q.weight', 'decoder.block.0.layer.2.layer_norm.weight', 'decoder.block.4.layer.1.layer_norm.weight', 'decoder.block.17.layer.0.layer_norm.weight', 'decoder.block.22.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.layer_norm.weight', 'decoder.block.13.layer.2.DenseReluDense.wo.weight', 'decoder.block.22.layer.0.SelfAttention.o.weight', 'decoder.block.4.layer.2.DenseReluDense.wo.weight', 'decoder.block.9.layer.0.SelfAttention.k.weight', 'decoder.block.21.layer.1.EncDecAttention.o.weight', 'decoder.block.8.layer.0.SelfAttention.v.weight', 'decoder.block.5.layer.0.SelfAttention.k.weight', 'decoder.block.2.layer.0.SelfAttention.v.weight', 'decoder.block.2.layer.1.EncDecAttention.

tensor([[[ 0.0033,  0.0014,  0.0020,  ...,  0.0083, -0.0006,  0.0019],
         [-0.0797,  0.0751,  0.0628,  ..., -0.0573,  0.1196,  0.0057]]],
       dtype=torch.float16, grad_fn=<ToCopyBackward0>)

In [5]:
import more_itertools
files = ['hi', 'be', 'oh', 'my']
batches = list(more_itertools.divide(2, files))
batches
for b in batches:
  print(b)
  for iner in b:
    print(iner)

<list_iterator object at 0x7f088e676b00>
hi
be
<list_iterator object at 0x7f088e5ccfa0>
oh
my


In [15]:
import lovely_tensors as lt
lt.monkey_patch()

print(input_ids)
last_hidden_states

tensor[1, 2] i64 μ=278.000 σ=391.737 [[555, 1]]


tensor[1, 2, 1024] f16 n=2048 x∈[-0.254, 0.297] μ=0.000 σ=0.043 grad ToCopyBackward0

: 

In [52]:
import deeplake as dl
BATCH_NAME          = 'parallel_15'
# WHISPER_RESULTS_DATASET_PATH        = f'/mnt/storage_ssd/v1_whisper_results_{BATCH_NAME}'
WHISPER_RESULTS_DATASET_PATH        = f'/mnt/storage_ssd/whisper_results_{BATCH_NAME}'

ds = dl.load(WHISPER_RESULTS_DATASET_PATH)
ds.summary()

/mnt/storage_ssd/whisper_results_parallel_15 loaded successfully.
Dataset(path='/mnt/storage_ssd/whisper_results_parallel_15', tensors=['caption', 'segment_metadata', 'video_filename', 'video_filepath'])

      tensor        htype     shape     dtype  compression
     -------       -------   -------   -------  ------- 
     caption        text    (5346, 1)    str     None   
 segment_metadata   json    (5346, 1)    str      lz4   
  video_filename    text    (5346, 1)    str     None   
  video_filepath    text    (5346, 1)    str     None   


In [53]:
start_index = 16748-800
all_tokenized = []
for i, sample in enumerate(ds):
  # if i < start_index:
  #   continue
  # print(sample.caption.data())
  # print(sample.segment_metadata.data())
  import traceback
  try:
    if sample.caption.data()['value']:
      # print(i)
      # print(sample.caption.data()['value'])
      input_ids = tokenizer(
          sample.caption.data()['value'], return_tensors="pt"
      ).input_ids
      all_tokenized.append(input_ids.shape[1])
  except Exception as e:
    # print(f"Error {e}")
    # print(traceback.print_exc())
    break
  # if i > start_index:
  #   break
all_tokenized

[38,
 30,
 33,
 34,
 31,
 30,
 34,
 36,
 31,
 29,
 40,
 41,
 33,
 34,
 35,
 30,
 31,
 28,
 34,
 35,
 31,
 32,
 32,
 38,
 30,
 32,
 39,
 34,
 31,
 32,
 32,
 33,
 37,
 34,
 27,
 29,
 35,
 34,
 40,
 31,
 33,
 40,
 31,
 36,
 35,
 32,
 34,
 28,
 29,
 29,
 27,
 26,
 36,
 36,
 34,
 33,
 32,
 36,
 36,
 34,
 35,
 33,
 32,
 37,
 31,
 35,
 37,
 27,
 25,
 27,
 23,
 31,
 31,
 27,
 29,
 31,
 28,
 31,
 30,
 32,
 29,
 31,
 33,
 46,
 30,
 30,
 40,
 33,
 42,
 44,
 38,
 40,
 37,
 33,
 32,
 33,
 35,
 35,
 37,
 37,
 36,
 37,
 44,
 38,
 43,
 41,
 43,
 32,
 40,
 37,
 32,
 42,
 34,
 40,
 35,
 38,
 31,
 44,
 31,
 33,
 34,
 35,
 29,
 33,
 31,
 31,
 27,
 32,
 31,
 27,
 32,
 28,
 29,
 32,
 29,
 34,
 40,
 37,
 32,
 28,
 27,
 34,
 28,
 28,
 31,
 30,
 31,
 31,
 33,
 31,
 34,
 32,
 31,
 33,
 36,
 41,
 27,
 28,
 30,
 34,
 29,
 31,
 25,
 31,
 32,
 29,
 25,
 27,
 32,
 24,
 30,
 30,
 29,
 28,
 24,
 33,
 36,
 24,
 28,
 26,
 31,
 26,
 30,
 31,
 28,
 27,
 26,
 24,
 34,
 26,
 27,
 29,
 34,
 40,
 38,
 30,
 33,
 25,
 30,
 32,


In [56]:
ds[2672].caption.data()['value']

'PIPETTE PUMP ONE TWENTY-TWO HUNDRED-MICROLITER PIPETTER ONE ONE HUNDRED-ONE THOUSAND-MICROLITER PIPETTER ONE EIGHT-CHANNEL TEN-ONE HUNDRED-MICROLITER'

In [50]:
input_ids = tokenizer(
          ds[1576].caption.data()['value'], return_tensors="pt"
      ).input_ids
input_ids

tensor[1, 57] i64 x∈[1, 28889] μ=6.584e+03 σ=7.789e+03

In [55]:
all_tokenized

# take mean of list all_tokenized
import numpy as np
print("mean", np.mean(all_tokenized))
# take max of list all_tokenized
print("max", np.max(all_tokenized))

print("argmax", np.argmax(all_tokenized))

# take min of list all_tokenized
print("min", np.min(all_tokenized))

# take median of list all_tokenized
print("median", np.median(all_tokenized))

# take std of list all_tokenized
print("std", np.std(all_tokenized))

mean 32.398615787504674
max 70
argmax 2672
min 20
median 32.0
std 4.776280983863671


In [37]:
from transformers import T5Tokenizer, T5EncoderModel
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5EncoderModel.from_pretrained("google/flan-t5-large", device_map="auto", torch_dtype=torch.float16)

input_ids = tokenizer(
    "Studies have been shown that owning a dog is good for you", return_tensors="pt"
).input_ids  # Batch size 1
outputs = model(input_ids=input_ids, )
last_hidden_states = outputs.last_hidden_state
last_hidden_states

KeyboardInterrupt: 

In [None]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto", torch_dtype=torch.float16)

CONTEXT = "{Example: A Two-Bit Gray Code Counter}Let's begin with a two-bit Gray code counter with no inputs.As we mentioned in Notes Set 2.1, a Gray code is a cycle over allbit patterns of a certain length in which consecutive patterns differin exactly one bit.For simplicity, our first few examples are based on counters anduse the internal stateof the FSM as the output values.  You should already knowhow to design combinational logic for the outputs if it were necessary.The inputs to a counter, if any, are typically limited to functionssuch as starting and stopping the counter, controlling the counting direction, and resetting the counter to a particular state.A fully-specified transition diagram for a two-bit Gray code counter appears below.With no inputs, the states simply form a loop, withthe counter moving from one state to the next each cycle.Each state in the diagram is marked with the internal state value S_1S_0 (before the ``/'') and the output Z_1Z_0 (after the ``/''), which are always equal for this counter.Based on the transition diagram, we can fill in the K-maps for the next-state values S_1^+ and S_0^+ as shown to the right of thetransition diagram, then derive algebraic expressions in the usual way to obtainS_1^+=S_0 and S_0^+={{S_1}}.We then use the next-state logic to develop the implementationshown on the far right, completing our first counter design."
PROMPT = "Please answer this person's question accurately, clearly and concicely. Context: "
QUESTION = "Question: What are the inputs and outputs of a Gray code counter? "
input_text = PROMPT + CONTEXT + QUESTION + "Answer: "

article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
summary = "Weiter Verhandlung in Syrien."

input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
labels = tokenizer(text_target=summary, return_tensors="pt")

outputs = model.generate(input_ids, max_length=1024)
print(tokenizer.decode(outputs[0]))

In [None]:
last_hidden_states.shape

In [None]:
from transformers import T5Tokenizer, T5EncoderModel

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5EncoderModel.from_pretrained("t5-small")
input_ids = tokenizer(
    "Studies have been shown that owning a dog is good for you", return_tensors="pt"
).input_ids  # Batch size 1
outputs = model(input_ids=input_ids)
last_hidden_states = outputs.last_hidden_state

In [None]:
last_hidden_states.shape