**extract embedding layers from**:
- pre-trained TFAutoModelForSequenceClassification: can't get the hidden layers
    - reference: https://huggingface.co/docs/transformers/v4.15.0/custom_datasets#sequence-classification-with-imdb-reviews
- pre-trained TFBertModel/TFGPT2Model: can access the hidden layers + [CLS] + customer info
    - reference: https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing
    - https://huggingface.co/docs/transformers/model_doc/gpt2
    - https://github.com/huggingface/transformers/issues/11891
    - huggingface top 10 models: https://www.sabrepc.com/blog/Deep-Learning-and-AI/top-10-hugging-face-models-for-tensorflow
- transformers fine-tuned
- self-trained language model with attention: can access the hidden layers

In [4]:
import gzip
import pandas as pd
import pickle
from tqdm.notebook import tqdm as tqdm
tqdm.pandas()

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

from sklearn.model_selection import train_test_split
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset

from transformers import create_optimizer
import tensorflow as tf
from transformers import BertModel, BertConfig
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification

from tensorflow.keras import regularizers

from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers import GPT2Tokenizer, TFGPT2Model, GPT2Config #distGPT2

import numpy as np
from datasets import load_metric
from transformers import TrainingArguments, Trainer

## read data

In [2]:
# def parse(path):
#   g = gzip.open(path, 'rb')
#   for l in g:
#     yield eval(l)

# def getDF(path):
#   i = 0
#   df = {}
#   for d in parse(path):
#     df[i] = d
#     i += 1
#   return pd.DataFrame.from_dict(df, orient='index')

# df = getDF('./data/reviews_Pet_Supplies.json.gz') #1,235,316
# metadata_df = getDF('./data/meta_Pet_Supplies.json.gz')
# merged_df = pd.merge(df, metadata_df, on = 'asin', how = 'left')

## Prepare a dataset: AutoTokenizer, and convert `datasets.arrow_dataset.Dataset` to TF_DF format

In [5]:
# Tokenizer for the DistilBERT checkpoint.
# `output_hidden_states` is a model-configuration option and has no meaning
# for a tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [4]:
# df.overall.head()

In [5]:
# df['reviewText'].head()

In [6]:
# df[['overall', 'reviewText']].shape

In [7]:
# def preprocess_function(sen):
#     return tokenizer(sen, truncation=True)

In [8]:
# tokenized_Review = df['reviewText'].map(preprocess_function)

In [9]:
# for token_dict, label in zip(tokenized_Review, df.overall):
#     token_dict['label'] = label - 1

In [10]:
# tokenized_Review[1]

In [11]:
# with open("./data/tokenized_Review.pickle", "wb") as handle:
#     pickle.dump(tokenized_Review, handle)

In [6]:
# Reload the tokenized reviews from the local pickle cache.
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open("./data/tokenized_Review.pickle", "rb") as pickle_file:
    tokenized_Review = pickle.load(pickle_file)

In [23]:
# Collator that pads the examples of a batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [24]:
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='tf')

In [25]:
# Hold out 20% of the tokenized reviews; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    tokenized_Review,
    random_state=1234,
    test_size=0.2,
)

In [26]:
# Draw a reproducible 1% subsample from each split
# (presumably to keep experiment runtime manageable).
sample_train, sample_test = (
    split.sample(frac=0.01, random_state=1234) for split in (train, test)
)

In [27]:
#ds = tokenized_Review.train_test_split(test_size=0.2, seed=1234)

In [28]:
# Turn the sampled training rows into a Hugging Face Dataset.
# `Dataset.from_pandas` is the documented constructor for this; it replaces
# calling `Dataset(...)` directly on a raw pyarrow table.
new_df = pd.DataFrame(list(sample_train))
train = Dataset.from_pandas(new_df)

In [29]:
train

Dataset({
    features: ['attention_mask', 'input_ids', 'label'],
    num_rows: 9883
})

In [30]:
# Turn the sampled held-out rows into a Hugging Face Dataset.
# `Dataset.from_pandas` is the documented constructor for this; it replaces
# calling `Dataset(...)` directly on a raw pyarrow table.
new_df2 = pd.DataFrame(list(sample_test))
test = Dataset.from_pandas(new_df2)

In [31]:
test

Dataset({
    features: ['attention_mask', 'input_ids', 'label'],
    num_rows: 2471
})

In [32]:
def _to_tf_batches(split, shuffle):
    """Convert a `datasets.Dataset` split via `to_tf_dataset`.

    Batches hold 16 examples and are assembled by `data_collator`.
    """
    return split.to_tf_dataset(
        columns=["attention_mask", "input_ids", "label"],
        shuffle=shuffle,
        batch_size=16,
        collate_fn=data_collator,
    )


# Training batches are shuffled; validation batches keep their order.
tf_train_set = _to_tf_batches(train, shuffle=True)
tf_validation_set = _to_tf_batches(test, shuffle=False)

## TFAutoModelForSequenceClassification: can't access the hidden layers

In [26]:

batch_size = 16
num_epochs = 5

# Total number of optimizer steps: full batches per epoch times epochs.
batches_per_epoch = len(train) // batch_size
total_train_steps = batches_per_epoch * num_epochs

# Optimizer plus learning-rate schedule sized to the whole run, with no warm-up.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
)

In [39]:

# DistilBERT with a 5-class sequence-classification head; hidden states are
# requested so they are included in the model outputs.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=5,
    output_hidden_states=True,
)


# isInstance of distilbert configuration class: DistilBertForSequenceClassification (DistilBERT model)

# isInstance of albert configuration class: AlbertForSequenceClassification (ALBERT model)

# isInstance of camembert configuration class: CamembertForSequenceClassification (CamemBERT model)

# isInstance of xlm roberta configuration class: XLMRobertaForSequenceClassification (XLM-RoBERTa model)

# isInstance of roberta configuration class: RobertaForSequenceClassification (RoBERTa model)

# isInstance of bert configuration class: BertForSequenceClassification (Bert model)

# isInstance of xlnet configuration class: XLNetForSequenceClassification (XLNet model)

# isInstance of xlm configuration class: XLMForSequenceClassification (XLM model)

# isInstance of flaubert configuration class: FlaubertForSequenceClassification (Flaubert model)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_242', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use 

In [28]:


# No `loss=` is passed, so the model's internal loss computation is used.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [29]:
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,957,317
Trainable params: 66,957,317
Non-trainable params: 0
_________________________________________________________________


In [30]:
# model.fit(
#     tf_train_set,
#     validation_data=tf_validation_set,
#     epochs=1,
# ) #one batch took more than half hour

In [31]:
#model.layers[0].embeddings.activity_regularizer = regularizers.l2(1e-5)

In [32]:
model.layers[0]

<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertMainLayer at 0x192dee584c0>

## Pre-trained TFBertModel/TFGPT2Model: can access the hidden layers

In [6]:
# Load the checkpoint's own configuration and only switch on hidden-state
# output, rather than relying on `BertConfig()` defaults happening to match
# the bert-base-uncased architecture.
configuration = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True)
# `output_hidden_states` belongs to the model config; the tokenizer takes no such option.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertModel.from_pretrained("bert-base-uncased", config=configuration)
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors="tf")
# With hidden-state output enabled, `output` also carries the per-layer hidden states.
output = model(encoded_input)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [7]:
# output[2] is the tuple of hidden states; [1:2] keeps a one-element tuple
# holding the entry at index 1. The displayed shape (1, 12, 768) is
# (batch, tokens, hidden size): 12 is the tokenized sequence length of `text`,
# not a number of layers.
output[2][1:2]

(<tf.Tensor: shape=(1, 12, 768), dtype=float32, numpy=
 array([[[-0.02207932,  0.07241669, -0.21231124, ...,  0.25583994,
          -0.06988102, -0.05784186],
         [-0.10386208,  0.62958056,  1.3882225 , ...,  0.2654632 ,
           1.852907  ,  0.35319582],
         [ 0.26801813,  0.83381635, -0.67421234, ...,  0.1436718 ,
           1.358183  ,  0.5122263 ],
         ...,
         [ 0.91765064,  0.9559129 ,  0.10018361, ..., -0.18450573,
           0.6329654 ,  0.11736821],
         [-0.1476281 ,  0.09667887, -0.20371097, ...,  0.28010172,
           0.36521253,  0.44875804],
         [-0.38059896,  0.01264042, -0.15247092, ..., -0.19050437,
           0.3529018 ,  0.13312998]]], dtype=float32)>,)

In [9]:
# Start from the gpt2 checkpoint's own configuration and only enable
# hidden-state output, rather than relying on `GPT2Config()` defaults
# happening to match the checkpoint.
configuration = GPT2Config.from_pretrained("gpt2", output_hidden_states=True)

# `output_hidden_states` belongs to the model config; the tokenizer takes no such option.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2Model.from_pretrained("gpt2", config=configuration)
# Save to disk and reload; the reloaded copy is the one evaluated below.
model.save_pretrained("saved_gpt2")
new_model = TFGPT2Model.from_pretrained("saved_gpt2")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors="tf")
output = new_model(encoded_input)

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.
All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at saved_gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


In [10]:
# output[2] is the tuple of hidden states; [1:2] keeps a one-element tuple
# holding the entry at index 1. The displayed shape (1, 10, 768) is
# (batch, tokens, hidden size): 10 is the tokenized sequence length of `text`,
# not a number of layers.
output[2][1:2]

(<tf.Tensor: shape=(1, 10, 768), dtype=float32, numpy=
 array([[[ 3.5697792 , -2.658392  ,  1.6498376 , ..., -1.2772871 ,
          -1.2973205 ,  0.3542018 ],
         [ 1.2458534 , -1.5179596 , -0.6840469 , ..., -0.03568979,
           0.2956106 , -1.1468014 ],
         [-0.86415637, -0.38782948, -0.26648208, ...,  0.86999714,
           1.5778211 , -0.9438042 ],
         ...,
         [-0.55909264,  0.6272529 ,  0.45439622, ..., -0.5477285 ,
          -0.48422644, -0.04893164],
         [-0.2099823 , -0.49616966, -1.2750123 , ..., -1.1759973 ,
           0.05631482, -0.07708734],
         [ 0.52308846,  0.35600227, -0.3611122 , ..., -0.25321734,
          -0.15091537,  0.11468042]]], dtype=float32)>,)

## transformers fine-tuned

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
## pytorch ----------------------

# training_args = TrainingArguments(output_dir="test_trainer")

# metric = load_metric("accuracy")
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)


# training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

# NOTE(review): `training_args` and `compute_metrics` are only defined in the
# commented-out lines above, and `small_train_dataset` / `small_eval_dataset`
# are not defined in any visible cell, so this call raises NameError unless
# they exist elsewhere in the session -- confirm before running.
# NOTE(review): the `model` objects created in the visible cells are TensorFlow
# models, while this section is labelled PyTorch -- verify which model is
# meant to be handed to `Trainer`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)


# Start the Trainer's training loop.
trainer.train()

In [41]:
## tensorflow -------------------------

output_hidden_state = True

model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", 
                                                             output_hidden_states = True, 
                                                             num_labels=5)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

new_model = model.fit(tf_train_set, validation_data=tf_validation_set, epochs=1)

In [36]:
model.summary()

Model: "tf_bert_model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


## Self-trained language model with attention: can access the hidden layers

glue/sst2: General Language Understanding Evaluation benchmark (the Stanford Sentiment Treebank consists of sentences from movie reviews and human annotations of their sentiment; the task is to predict the sentiment of a given sentence).

https://huggingface.co/transformers/v3.3.1/task_summary.html#sequence-classification