In [None]:
# !pip install "tensorflow-text==2.4.*"

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
import datetime

In [None]:
print(tf.__version__)

2.9.2


In [None]:
chars = sorted(set("abcdefghijklmnopqrstuvwxyz0123456789 -,;.!?:’’’/\|_@#$%ˆ&*˜‘+-=()[]{}' ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
chars = list(chars)
EOS = ''
UNK = ""
PAD = "" 
chars.append(UNK)
chars.append(EOS) 
chars.insert(0, PAD)  

In [None]:
char2idx = {u:i for i, u in enumerate(chars)}
idx2char = np.array(chars)

In [None]:
def char_idx(c):
  if c in chars:
    return char2idx[c]
  return char2idx[UNK]

In [None]:
data = []
MAX_LEN = 75
with open("/content/drive/MyDrive/NLP/text_generation/resume/data.tsv","r") as file:
  lines = csv.reader(file,delimiter='\t')
  for line in lines:
    hdline = line[0]
    cnvrtd = [char_idx(c) for c in hdline]
    if len(cnvrtd) >= MAX_LEN:
      cnvrtd = cnvrtd[0:MAX_LEN-1]
      cnvrtd.append(char2idx[EOS])
    else:
      cnvrtd.append(char_idx(EOS))
      remain = MAX_LEN - len(cnvrtd)
      if remain > 0:
        for i in range(remain):
          cnvrtd.append(char2idx[PAD])
    data.append(cnvrtd) 


In [None]:
len(data)

623272

In [None]:
np_data = np.array(data)

In [None]:
np_data_in = np_data[:,:-1]
np_data_out = np_data[:,1:]

In [None]:
# np_data_in[0],np_data_out[0]

In [None]:
vocab_size = len(chars)
embedding_dim = 256
rnn_units = 1024
BATCH_SIZE=256

In [None]:
X_train = tf.data.Dataset.from_tensor_slices((np_data_in,np_data_out))
X_train = X_train.shuffle(100000,reshuffle_each_iteration=True).batch(BATCH_SIZE,drop_remainder=True)

In [None]:
def build_model(vocab_size,embedding_dim,rnn_units,batch_size):
  model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size,embedding_dim,mask_zero=True,batch_input_shape=[batch_size,None]),
      tf.keras.layers.GRU(rnn_units,return_sequences=True,stateful=True,recurrent_initializer='glorot_uniform'),
      tf.keras.layers.Dropout(0.1),
      tf.keras.layers.Dense(vocab_size,activation="softmax")
  ])
  return model

In [None]:
import os
dt = datetime.datetime.today().strftime("%Y-%b-%d-%H-%M-%S")
checkpoint_dir = '/content/drive/MyDrive/NLP/text_generation/resume/checkpoints/epoch_'+dt
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,save_weights_only=True)

In [None]:
class LearningRateScheduler(tf.keras.callbacks.Callback):
  def __init__(self,init_lr,decay,steps,start_epoch):
    super().__init__()
    self.init_lr = init_lr
    self.decay = decay
    self.steps = steps
    self.start_epoch = start_epoch

  def on_epoch_begin(self,epoch,logs=None):
    if not hasattr(self.model.optimizer,'lr'):
      raise ValueError("Optimizer must have 'lr' attribute")
    lr = float(tf.keras.backend.get_value(self.model.optimizer.lr))
    if(epoch >= self.start_epoch):
      scheduled_lr = self.init_lr / (1 + self.decay * (epoch/self.steps))
      tf.keras.backend.set_value(self.model.optimizer.lr,scheduled_lr)
      print(f'\n epoch : {epoch} learning rate is {scheduled_lr}')

In [None]:
tf.config.run_functions_eagerly(True) 
model = build_model(vocab_size,embedding_dim,rnn_units,BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (256, None, 256)          24576     
                                                                 
 gru (GRU)                   (256, None, 1024)         3938304   
                                                                 
 dropout (Dropout)           (256, None, 1024)         0         
                                                                 
 dense (Dense)               (256, None, 96)           98400     
                                                                 
Total params: 4,061,280
Trainable params: 4,061,280
Non-trainable params: 0
_________________________________________________________________


In [None]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam',loss=loss)

In [None]:
# EPOCHS=10
# lr_decay = LearningRateScheduler(0.001, 4., EPOCHS, 10)
# history = model.fit(X_train, epochs=EPOCHS,callbacks=[checkpoint_callback, lr_decay])

Epoch 1/10


  return dispatch_target(*args, **kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# model.save("/content/drive/MyDrive/NLP/text_generation/resume/saved_model")



In [None]:
# model = tf.keras.models.load_model("/content/drive/MyDrive/NLP/text_generation/resume/saved_model")

In [None]:
def build_gen_model(vocab_size,embedding_dim,rnn_units,batch_size):
  model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size,embedding_dim,batch_input_shape=[batch_size,None]),
      tf.keras.layers.GRU(rnn_units,return_sequences=True,stateful=True,recurrent_initializer='glorot_uniform'),
      tf.keras.layers.Dense(vocab_size)
  ])
  return model

In [None]:
gen_model = build_gen_model(vocab_size,embedding_dim,rnn_units,batch_size=1)
checkpoint_dir = '/content/drive/MyDrive/NLP/text_generation/resume/checkpoints/epoch_2022-Nov-23-21-07-50'
gen_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
gen_model.build(tf.TensorShape([1,None]))
gen_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (1, None, 256)            24576     
                                                                 
 gru_3 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_3 (Dense)             (1, None, 96)             98400     
                                                                 
Total params: 4,061,280
Trainable params: 4,061,280
Non-trainable params: 0
_________________________________________________________________




In [None]:
def generate_text(model,start_string,temprature=0.7,num_generate=75):
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval,0)
  print(input_eval)
  text_generated = []
  for i in range(num_generate):
    predicitions = model(input_eval)
    predicitions = tf.squeeze(predicitions,0)
    predicitions = predicitions / temprature

    predicted_id = tf.random.categorical(predicitions,num_samples=1)[-1,0].numpy()
    input_eval = tf.expand_dims([predicted_id],0)
    text_generated.append(idx2char[predicted_id])
  return (start_string + ''.join(text_generated))

In [None]:
generate_text(gen_model,start_string="apple")

tf.Tensor([[61 76 76 72 65]], shape=(1, 5), dtype=int32)


'apple partnership with Beyonc record $3 billion for most remarks'

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 7.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 24.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 48.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [2]:
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

In [3]:
gpt2tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2 = TFGPT2LMHeadModel.from_pretrained("gpt2",pad_token_id=gpt2tokenizer.eos_token_id)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [8]:
input_ids = gpt2tokenizer.encode("this week",return_tensors='tf')

In [9]:
greedy_output = gpt2.generate(input_ids,max_length=50)
print("Output:\n" + 50 * '-')
print(gpt2tokenizer.decode(greedy_output[0],skip_special_tokens=True))

Output:
--------------------------------------------------
this week, the U.S. Supreme Court ruled that the government can't force a person to pay for a prescription drug.

The ruling, which was announced by the U.S. Court of Appeals for the D.C. Circuit


In [10]:
beam_output = gpt2.generate(input_ids,max_length=50,num_beams=5,no_repeat_ngram_size=3,early_stopping=True)
print("Output:\n" + 50 * '-')
print(gpt2tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
--------------------------------------------------
this week).

"I think it's important for us to be able to make sure that we're doing everything we can to ensure that our players are getting the best out of each other," he said.


In [11]:
beam_output = gpt2.generate(input_ids, max_length=50, do_sample=True, top_k=25,temperature=0.58)
print("Output:\n" + 50 * '-')
print(gpt2tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
--------------------------------------------------
this week) was a great opportunity to see what the new team is doing and learn from the mistakes we made. I'm happy to be here and I'm excited to be in this league for the long haul. I know we're going to be
