## ML Project Ideas Generator
Stylized text generation from a small text corpus using transfer-learning

In [1]:
import os

import numpy as np
import tensorflow as tf

from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

from transformers import GPT2Config, GPT2TokenizerFast, TFGPT2LMHeadModel

In [2]:
tf.device('/GPU:0') if tf.config.list_physical_devices('GPU') else tf.device('/CPU:0')

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-12-06 16:35:01.025058: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-06 16:35:01.025409: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


<tensorflow.python.eager.context._EagerDeviceContext at 0x107f3fbc0>

### Preparing the tokenizer

In [3]:
# initialize tokenizer

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([Lowercase()])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()


In [4]:
# train the tokenizer

trainer = BpeTrainer(vocab_size=50000, initial_alphabet=ByteLevel.alphabet(), special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])
tokenizer.train(files=['./data/ml_project_ideas.txt'], trainer=trainer)






In [5]:
# save the tokenizer

tokenizer.save('./tokenizer/tokenizer.json')

In [6]:
# import the saved tokenizer

tokenizer = GPT2TokenizerFast.from_pretrained('tokenizer')

# add special tokens to the vocabulary
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'mask_token': '[MASK]'})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0

### Loading and pre-processing the dataset 

In [7]:
# load dataset
dataset_path = './data/ml_project_ideas.txt'
content = open(dataset_path, 'r', encoding='utf-8').readlines()

len(content)


2592

In [8]:
# clean the dataset
data = tokenizer.eos_token.join(map(lambda seq: seq.strip(), filter(lambda seq: len(seq) > 10, content)))

# encode the dataset
encoded_dataset = tokenizer.encode(data)

# fix sequence length
sequence_length = len(max(content, key=len))

# create training dataset

encoded_dataset = encoded_dataset[:len(encoded_dataset) // sequence_length * sequence_length]

train_data = encoded_dataset[:int(len(encoded_dataset) * 0.9)]
train_data = np.array(train_data).astype(np.int32)

labels = encoded_dataset[1:int(len(encoded_dataset) * 0.9) + 1]
labels = np.array(labels).astype(np.int32)

batch_size = 32
dataset = tf.data.Dataset.from_tensor_slices((train_data, labels))
dataset = dataset.shuffle(buffer_size=len(train_data)).batch(batch_size, drop_remainder=True)

len(dataset)


1365

### Initializing the model

In [9]:
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

model = TFGPT2LMHeadModel(config)


In [10]:
config


GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 2,
  "embd_pdrop": 0.1,
  "eos_token_id": 3,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.21.0",
  "use_cache": true,
  "vocab_size": 7946
}

### Creating the model

In [11]:
optimizer = tf.keras.optimizers.Adam(learning_rate = 3e-5, epsilon = 1e-08, clipnorm = 1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # from_logits=True is important here
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])


### Fine-tuning the model

In [12]:
epochs = 1
model.fit(dataset.repeat(epochs), steps_per_epoch=len(dataset), epochs=epochs)


2022-12-06 16:35:10.293701: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-06 16:35:10.299378: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




<keras.callbacks.History at 0x17ecb5280>

In [13]:
model.summary()

Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 91944960  
 r)                                                              
                                                                 
Total params: 91,944,960
Trainable params: 91,944,960
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.save_weights('./model/model_weights.h5')


In [15]:
model.save_pretrained('./model/model')


### Generating text

In [16]:
model = TFGPT2LMHeadModel.from_pretrained('./model/model')


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./model/model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [17]:
def generate(model):
    input_ids = tokenizer.encode('[CLS] ML', return_tensors='tf')
    output = model.generate(input_ids, max_length=20, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [18]:
generate(model)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to 3 (first `eos_token_id`) to generate sequence


' ml'