# Installation

In [None]:
# Install the Hugging Face libraries used below. The recorded run of this cell
# resolved transformers 4.24.0 and datasets 2.6.1.
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 6.5 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 60.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 57.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 43.1 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 54.1 MB/s 
Collecting dill<0.3.6
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
[K     |██████

# Imports

In [None]:
import tensorflow as tf
import numpy as np
import io
import os
import pandas as pd
import re
import string
import time
from numpy import random
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense,Flatten,InputLayer,BatchNormalization,Dropout,Input,LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from datasets import load_dataset
from transformers import create_optimizer,DataCollatorForSeq2Seq,DataCollatorForLanguageModeling,BlenderbotTokenizerFast,BlenderbotSmallTokenizerFast,TFBlenderbotForConditionalGeneration

In [None]:
# Sequence-length constant.
# NOTE(review): none of the cells visible in this notebook read this value
# before the Chat section assigns MAX_LENGTH again — confirm it is still needed.
MAX_LENGTH=256

# Dataset Preparation

In [None]:
#kaggle datasets download -d drmatters/joe-rogan

In [None]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d christianlillelund/joe-rogan-experience-1169-elon-musk
!unzip "/content/joe-rogan-experience-1169-elon-musk.zip" -d "/content/dataset/"

Downloading joe-rogan-experience-1169-elon-musk.zip to /content
  0% 0.00/59.1k [00:00<?, ?B/s]
100% 59.1k/59.1k [00:00<00:00, 41.1MB/s]
Archive:  /content/joe-rogan-experience-1169-elon-musk.zip
  inflating: /content/dataset/joe-rogan-experience-1169-elon-musk.csv  


In [None]:
# Load the raw transcript CSV as a Hugging Face DatasetDict (the recorded
# output shows a single "train" split).
filepath = "/content/dataset/joe-rogan-experience-1169-elon-musk.csv"
dataset = load_dataset(
    "csv",
    data_files=filepath,
)



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-90edec479b41c1d7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-90edec479b41c1d7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Timestamp', 'Speaker', 'Text'],
        num_rows: 1831
    })
})

In [None]:
dataset['train'][0]

{'Timestamp': '[00:00:00]',
 'Speaker': 'Joe Rogan',
 'Text': 'Ah, ha, ha, ha. Four, three, two, one, boom. Thank you. Thanks for doing this, man. Really appreciate it.'}

In [None]:
# Checkpoint id shared by the tokenizer here and the model loaded later.
model_id = "facebook/blenderbot-400M-distill"
# truncation_side="left": when truncation is applied, tokens are dropped from
# the start of the sequence.
tokenizer = BlenderbotTokenizerFast.from_pretrained(
    model_id,
    truncation_side="left",
)

Downloading:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.9k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

In [None]:
# Accumulator for the [discussion, bot_output] training pairs.
data_array=[]
# Number of preceding transcript lines used as context for each reply.
NUM_SAMPLES=3

In [None]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): padded positions then carry the EOS id, so later steps cannot
# tell padding apart from a genuine end-of-sequence token — confirm this is intended.
tokenizer.pad_token=tokenizer.eos_token

In [None]:
# Build (context, reply) pairs. For every reply i, emit NUM_SAMPLES examples
# whose context grows from the single preceding line up to the NUM_SAMPLES
# preceding lines; every line is wrapped in the tokenizer's bos/eos tokens.
data_array=[]  # reset here so re-running this cell does not append duplicates
bos,eos=tokenizer.bos_token,tokenizer.eos_token
texts=dataset['train']["Text"]  # read the column once instead of one row per access
for i in range(NUM_SAMPLES,len(texts)):
  discussion=""
  bot_output=bos+texts[i]+eos
  for j in reversed(range(i-NUM_SAMPLES,i)):
    # Prepend the older line so the context stays in chronological order.
    discussion=bos+texts[j]+eos+discussion
    data_array.append([discussion,bot_output])

In [None]:
# Persist the pairs to the absolute path the later cells read from.
# index=False keeps the row index out of the file, so it does not come back as
# a spurious "Unnamed: 0" column when the CSV is reloaded.
pd.DataFrame(data_array,columns=["discussion","bot_output"]).to_csv('/content/discussion.csv',index=False)

In [None]:
import pandas as pd

# Reload the saved pairs into a DataFrame for inspection.
df = pd.read_csv('/content/discussion.csv')

In [None]:
df

Unnamed: 0.1,Unnamed: 0,discussion,bot_output
0,0,<s>It's very good to meet you.</s>,<s>Nice to meet you too.</s>
1,1,<s>You're welcome.</s><s>It's very good to mee...,<s>Nice to meet you too.</s>
2,2,"<s>Ah, ha, ha, ha. Four, three, two, one, boom...",<s>Nice to meet you too.</s>
3,3,<s>Nice to meet you too.</s>,<s>And thanks for not lighting this place on f...
4,4,<s>It's very good to meet you.</s><s>Nice to m...,<s>And thanks for not lighting this place on f...
...,...,...,...
5479,5479,<s>You're welcome.</s><s>All you assholes out ...,"<s>All right, thank you.</s>"
5480,5480,"<s>I believe it's true too. So, thank you.</s>...","<s>All right, thank you.</s>"
5481,5481,"<s>All right, thank you.</s>","<s>Good night, everybody. END OF TRANSCRIPTAut..."
5482,5482,"<s>All you assholes out there, be nice. Be nic...","<s>Good night, everybody. END OF TRANSCRIPTAut..."


In [None]:
# Load the generated pairs through `datasets`. This rebinds `filepath` and
# `dataset`, which until now referred to the raw transcript.
filepath = "/content/discussion.csv"
dataset = load_dataset(
    "csv",
    data_files=filepath,
)



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-db887086d62d20e8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-db887086d62d20e8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'discussion', 'bot_output'],
        num_rows: 5484
    })
})

In [None]:
def preprocess_function(example,max_length=512):
  """Tokenize one (discussion, bot_output) pair for seq2seq training.

  Args:
    example: Mapping with a 'discussion' string (model input) and a
      'bot_output' string (target text).
    max_length: Length passed to the tokenizer together with
      padding='max_length' and truncation=True. Defaults to 512, the value
      that was previously hard-coded here.

  Returns:
    The tokenizer's output for the pair.
  """
  # NOTE(review): presumably the checkpoint's position-embedding table is
  # shorter than 512; confirm against the model config
  # (max_position_embeddings) and lower `max_length` if so.
  return tokenizer(
    example['discussion'],
    text_target=example['bot_output'],
    padding='max_length',
    max_length=max_length,
    truncation=True,)

In [None]:
# Tokenize every pair and drop the original columns so only the tokenizer's
# output features remain.
source_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=source_columns,
)

  0%|          | 0/5484 [00:00<?, ?ex/s]

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5484
    })
})

In [None]:
# Load the pretrained TensorFlow Blenderbot checkpoint named by model_id.
model = TFBlenderbotForConditionalGeneration.from_pretrained(model_id)

Downloading:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBlenderbotForConditionalGeneration.

Some layers of TFBlenderbotForConditionalGeneration were not initialized from the model checkpoint at facebook/blenderbot-400M-distill and are newly initialized: ['final_logits_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Seq2seq collator bound to this tokenizer and model, returning TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="tf",
)

In [None]:
# Shuffled tf.data pipeline that yields collated batches of 4 examples.
train_split = tokenized_dataset["train"]
tf_train_dataset = train_split.to_tf_dataset(
    collate_fn=data_collator,
    batch_size=4,
    shuffle=True,
)

You're using a BlenderbotTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Print a single batch to inspect its tensors.
for batch in tf_train_dataset.take(1):
  print(batch)

{'input_ids': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[   1, 1445,   21, ...,    2,    2,    2],
       [   1, 7721,  411, ...,    2,    2,    2],
       [   1,  452,  341, ...,    2,    2,    2],
       [   1, 1117,  341, ...,    2,    2,    2]])>, 'attention_mask': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>, 'labels': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[   1, 1720,  327, ...,    2,    2,    2],
       [   1, 1167,   19, ...,    2,    2,    2],
       [   1, 1182,  758, ...,    2,    2,    2],
       [   1, 1167,   21, ...,    2,    2,    2]])>}


In [None]:
tf_train_dataset

<PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'labels': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None)}>

In [None]:
def replacements(a,ignore_ids=(1,2),ignore_index=-100):
  """Overwrite selected token ids in `a` with an ignore marker.

  Args:
    a: Integer tensor of token ids.
    ignore_ids: Ids to overwrite. Defaults to (1, 2), the values that were
      previously hard-coded.
    ignore_index: Value written in their place. Defaults to -100.

  Returns:
    A tensor shaped like `a` in which every element equal to one of
    `ignore_ids` is replaced by `ignore_index`; other elements are unchanged.
  """
  # NOTE(review): in the batches printed in this notebook, 1 and 2 are the
  # leading and trailing label ids. Because the pad token was set to the EOS
  # token, masking 2 presumably also hides the genuine end-of-sequence label —
  # confirm that is intended.
  fill=ignore_index*tf.ones_like(a)  # same shape/dtype as `a`; built once, not per id
  for token_id in ignore_ids:
    a=tf.where(tf.equal(a, token_id), fill, a)
  return a

In [None]:
# Toy check: every 1 and 2 in this small batch should come back as -100.
a = tf.constant([
    [1, 3, 234, 445, 2, 2, 2],
    [1, 3445, 234, 34, 23, 2, 2],
])
replacements(a)

<tf.Tensor: shape=(2, 7), dtype=int32, numpy=
array([[-100,    3,  234,  445, -100, -100, -100],
       [-100, 3445,  234,   34,   23, -100, -100]], dtype=int32)>

In [None]:
def prepare_labels(inputs):
  """Return the batch dict with its 'labels' entry passed through `replacements`.

  'input_ids' and 'attention_mask' are forwarded unchanged.
  """
  batch = {key: inputs[key] for key in ('input_ids', 'attention_mask')}
  batch['labels'] = replacements(inputs['labels'])
  return batch

In [None]:
# Apply prepare_labels to every batch of the training pipeline.
train_dataset=tf_train_dataset.map(prepare_labels)

In [None]:
# Print a single batch to check the labels after masking.
for batch in train_dataset.take(1):
  print(batch)

{'input_ids': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[   1,  452, 2735, ...,    2,    2,    2],
       [   1,  649,  341, ...,    2,    2,    2],
       [   1, 3276,   21, ...,    2,    2,    2],
       [   1,  946,  304, ...,    2,    2,    2]])>, 'attention_mask': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>, 'labels': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[-100,  880,   21, ..., -100, -100, -100],
       [-100,  691,   19, ..., -100, -100, -100],
       [-100,  553,  513, ..., -100, -100, -100],
       [-100, 1216,  324, ..., -100, -100, -100]])>}


# Modeling

In [None]:
#model = TFBlenderbotForConditionalGeneration.from_pretrained(model_id)
model.summary()

Model: "tf_blenderbot_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFBlenderbotMainLaye  multiple                 364802560 
 r)                                                              
                                                                 
 final_logits_bias (BiasLaye  multiple                 8008      
 r)                                                              
                                                                 
Total params: 364,810,568
Trainable params: 364,802,560
Non-trainable params: 8,008
_________________________________________________________________


In [None]:
# Optimizer and learning-rate schedule sized to one pass over the batched
# training set.
# NOTE(review): the recorded run shows 1371 batches per epoch, so the 1_000
# warmup steps span most of the schedule — confirm this is intended.
num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    num_train_steps=num_train_steps,
    num_warmup_steps=1_000,
    init_lr=6e-5,
)
# No loss is passed to compile() here.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# Fine-tune for one epoch on the label-masked batches; `history` holds the
# value returned by fit().
history=model.fit(train_dataset, epochs=1)

Epoch 1/2
 239/1371 [====>.........................] - ETA: 31:34 - loss: 1.9835

KeyboardInterrupt: ignored

In [None]:
# Restore previously fine-tuned weights from Google Drive.
# The checkpoint lives under /content/drive, which only exists once Drive is
# mounted; mount it here if needed so the cell also works on a fresh runtime.
if not os.path.isdir('/content/drive/MyDrive'):
  drive.mount('/content/drive')
model.load_weights('/content/drive/MyDrive/nlp/text_generation/blenderbot.h5')

In [None]:
#model.save_weights('/content/drive/MyDrive/nlp/text_generation/blenderbot.h5')

# Testing

In [None]:
# Hand-written conversation used as the prompt; each turn is wrapped in the
# tokenizer's bos/eos tokens and the turns are concatenated in order.
turns = [
    "Hello Elon glad to have you on my podcast.",
    "Thanks for Having me.",
    "i heard you are building robots. Tell me more about them.",
    "Well... Currently working on a robot which can do all house chores for you ",
    "Can this robot be used in Mars?",
]
input_text = "".join(
    tokenizer.bos_token + turn + tokenizer.eos_token for turn in turns
)

In [None]:
print(input_text)

<s>Hello Elon glad to have you on my podcast.</s><s>Thanks for Having me.</s><s>i heard you are building robots. Tell me more about them.</s><s>Well... Currently working on a robot which can do all house chores for you </s><s>Can this robot be used in Mars?</s>


In [None]:
# Tokenize the prompt into TensorFlow tensors for generate().
# NOTE: this rebinds `history`, which previously held the return value of model.fit().
history=tokenizer(input_text, return_tensors="tf")

In [None]:
# Value passed as max_new_tokens to the generate() calls in the sampling cells.
MAX_NEW_TOKENS=16

In [None]:
# Sampling with top_p=0.9; the timing covers generation, decoding and printing.
init_time = time.time()
output = model.generate(
    **history,
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=True,
    top_p=0.9,
)
print(tokenizer.decode(output[0]))
elapsed = time.time() - init_time
print(elapsed)

<s> I don't think so. I think it's just going to do household chores for you. That's it. You're welcome. Thank you, bye.
19.985085010528564


In [None]:
# Sampling with temperature=1.0 and top_k=0; the timing covers generation,
# decoding and printing.
init_time = time.time()
output_temp = model.generate(
    **history,
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=True,
    temperature=1.0,
    top_k=0,
)
print(tokenizer.decode(output_temp[0]))
elapsed = time.time() - init_time
print(elapsed)

<s> I don't think so. I think it's just going to do house chores. That's all it does. No, it doesn't do anything useful. It's just useless. It doesn't know what to do with it. So, I'm trying to figure out a way to
17.997707843780518


In [None]:
# Sampling with temperature=2.0 and top_k=0; the timing covers generation,
# decoding and printing.
init_time = time.time()
output_temp = model.generate(
    **history,
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=True,
    temperature=2.0,
    top_k=0,
)
print(tokenizer.decode(output_temp[0]))
elapsed = time.time() - init_time
print(elapsed)

<s> Yes, yes it can be. But you do not want to make it stupid. Otherwise, it is going to do a lot of work. It will waste time dig tunnels.That's not going to help.
17.967918872833252


In [None]:
# Sampling with temperature=0.5 and top_k=0; the timing covers generation,
# decoding and printing.
init_time = time.time()
output_temp = model.generate(
    **history,
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=True,
    temperature=0.5,
    top_k=0,
)
print(tokenizer.decode(output_temp[0]))
elapsed = time.time() - init_time
print(elapsed)

<s><s> I don't think so. I think it's just going to do it for you. You can do it. It's just, you know, it's going to be on your roof. You have to hook it up to some sort of a magnetic detector.
17.95498299598694


In [None]:
# Sampling with top_k=50; the timing covers generation, decoding and printing.
init_time = time.time()
output_topk = model.generate(
    **history,
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=True,
    top_k=50,
)
print(tokenizer.decode(output_topk[0]))
elapsed = time.time() - init_time
print(elapsed)

<s> I don't think so. I think it's just going to do house chores. That's all it does. It doesn't do anything useful. It's just doing it for you. You're welcome. Thank you, bye.
18.42564582824707


In [None]:
# Sampling with temperature=2.0 and top_k=50; the timing covers generation,
# decoding and printing.
init_time = time.time()
output_topk = model.generate(
    **history,
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=True,
    temperature=2.0,
    top_k=50,
)
print(tokenizer.decode(output_topk[0]))
elapsed = time.time() - init_time
print(elapsed)

<s> I don't see why not. What would you use a robot for? I mean, it doesn't necessarily have to do with robotics, right? It's more about creativity. It's your imagination. What ideas do you have?
18.44620966911316


## Chat

In [None]:
# Interactive 10-turn chat. `chat_input` is the running transcript: every host
# line and every bot reply is appended wrapped in the tokenizer's bos/eos
# tokens, and the whole transcript is re-encoded as the model input each turn.
MAX_LENGTH=1024
# Optional seed text may be placed here, e.g.
# "<s> A discussion between myself and Elon Musk who thinks his robots can get to mars</s>"
chat_input=""

for step in range(10):
  my_text=input(">> Host:")
  if step>0:
    # Record the bot's previous reply before adding the new host line.
    chat_input+=tokenizer.bos_token+chat_history+tokenizer.eos_token
  # Appending (rather than overwriting on the first turn) keeps any seed text.
  chat_input+=tokenizer.bos_token+my_text+tokenizer.eos_token
  # NOTE(review): the transcript grows every turn and is encoded without
  # truncation — confirm it stays within the model's input limit.
  bot_input_ids=tokenizer.encode(chat_input,return_tensors='tf')

  chat_history_ids=model.generate(
      bot_input_ids,max_length=MAX_LENGTH,
      do_sample=True,
      temperature=2.0,top_k=50)

  # Decode once; the same text is printed now and fed back in on the next turn.
  chat_history=tokenizer.decode(chat_history_ids[0],skip_special_tokens=True)

  print(">> Elon Musk: {}".format(chat_history))

>> Host:Host:Hello Elon, how are you doing?
>> Elon Musk:  I'm doing very well. Thank you for asking. How are you? What are you up to?
>> Host:I'm doing quite well. Currently building this robot for Mars
>> Elon Musk:  That's really cool. How did you come up with that idea? Do you program it yourself?
>> Host:Yes of course, though i have a team. What about you, what are you building at Tesla?
>> Elon Musk:  Right now, nothing. Eventually, though, I'd like to turn it into Elon Musk and call it Tesla SpaceX. I don't know yet. We'll find out in a couple of years probably.
>> Host:Oh really? What about the robot which helps humans do house chores?
>> Elon Musk:  I don't see why not. I mean, who doesn't want a chimpanzee in their house anyway? Right, right. Right. Okay. Let's just let it go. We'll move on to the next thing. That's going to be the end of the world. We're going to turn it into anthropomorphic beings. We won't even be able to program it. It won't last long. It's kind of sad. I