In [8]:
# Install ransformers and datasets library
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
# Import function and numpy
from datasets import load_dataset
import numpy as np

In [10]:
#https://huggingface.co/datasets/amazon_polarity
# takes a long time to process
#raw_dataset = load_dataset("amazon_polarity")

In [11]:
# Glue dataset with subtask sst2
raw_datasets = load_dataset("glue", "sst2")



  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Inspect the dataset
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [13]:
# Select one dataset, our variable behaves like a dict, we look at the train dataset
raw_datasets['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [14]:
# Check attributes and methods on the object
dir(raw_datasets['train']) 

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_examples_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguous',
 '_select_with_indices_mapping',
 '_split',
 'add_column',
 'add

In [15]:
# Check th type of object
type(raw_datasets['train'])

datasets.arrow_dataset.Dataset

In [16]:
# Check the data attribute
raw_datasets['train'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [17]:
# We can index the dataset object as if it was a list or an array
raw_datasets['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [18]:
# We can index the dataset with a range
raw_datasets['train'][50000:50003]

{'sentence': ['glow ',
  'a classical dramatic animated feature ',
  'best espionage picture '],
 'label': [1, 1, 1],
 'idx': [50000, 50001, 50002]}

In [19]:
# Check the features attribute
raw_datasets['train'].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [20]:
#raw_dataset['test']

In [21]:
# Import the autotokenizer class
from transformers import AutoTokenizer

In [22]:
# Define the model checkpoint we want to use && load up the pre-trained tokenizer for this model
# checkpoint = "bert-base-uncased"
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [24]:
# Test the tokenizer on a subset
tokenized_sentences = tokenizer(raw_datasets['train'][0:3]['sentence'])
from pprint import pprint
pprint(tokenized_sentences)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
               [101,
                3397,
                2053,
                15966,
                1010,
                2069,
                4450,
                2098,
                18201,
                2015,
                102],
               [101,
                2008,
                7459,
                2049,
                3494,
                1998,
                10639,
                2015,
                2242,
                2738,
                3376,
                2055,
                2529,
                3267,
                102]]}


In [27]:
# Wrap our tokenizer with a new function, so that we can pas in th argument truncaion=True
def tokenize_fn(batch):
  return tokenizer(batch['sentence'], truncation=True)

In [28]:
# Call the map function to map the tokenizer function to each of our sampples in all of our datasets
# This is whats will be passed into the trainner later on
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [29]:
# Import class
from transformers import TrainingArguments

In [30]:
# Define a training arguments object, it will take som time to rain and will overfit(hence epochs=1)
training_args = TrainingArguments(
    'my_trainer',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=1,
)

In [31]:
# Import class
from transformers import AutoModelForSequenceClassification

In [32]:
# Load up our pre-trained model, specifying the checkppoint and the number of labels
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2
)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

In [33]:
# Check the type of the model object
type(model)


transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [34]:
# Print out the model-object
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [35]:
# install this to get the summary function
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.2-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.2


In [36]:
# Import the summary function and call it on our model
from torchinfo import summary
# summary(model, input_size(16,512), dtypes=['torch.IntTensor'], devices='cpu')
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [37]:
# Save the model parameters before we begin the training process
params_before = []
for name, p in model.named_parameters():
  params_before.append(p.detach().cpu().numpy())

In [38]:
# Import the trainer class
from transformers import Trainer

In [39]:
# Import the load metric funtion
from datasets import load_metric

In [40]:
# Call the load_metric function, passing in our task
metric = load_metric("glue", "sst2")

  metric = load_metric("glue", "sst2")


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [41]:
# Pass in some dummy-list just to se what we get back
metric.compute(predictions=[1,0,1], references=[1,0,0])

{'accuracy': 0.6666666666666666}

In [42]:
# Define our own compute_metrics function
def compute_metrics(logits_and_labels):
  # metric = load_metric("glue", "sst2")
  logits, labels = logits_and_labels
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [43]:
# Create our trainer object
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [44]:
# Begin the training process
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.2123,0.343148


TrainOutput(global_step=8419, training_loss=0.2694392541282066, metrics={'train_runtime': 521.5061, 'train_samples_per_second': 129.143, 'train_steps_per_second': 16.144, 'total_flos': 518400815624736.0, 'train_loss': 0.2694392541282066, 'epoch': 1.0})

In [45]:
# Save the model into th directory my_saved_model
trainer.save_model('my_saved_model')

In [46]:
# Check for the directory
!ls

my_saved_model	my_trainer  sample_data


In [47]:
# Run the command again inside the directory
!ls my_saved_model

config.json	   special_tokens_map.json  tokenizer.json     vocab.txt
pytorch_model.bin  tokenizer_config.json    training_args.bin


In [48]:
# Demonstrate how to use our new model 
from transformers import pipeline

In [51]:
# Create our pipeline object
newmodel = pipeline('text-classification', model="my_saved_model", device=0)

In [52]:
# Test the model with sentiment analysis, LABEL_1 is the positive class
newmodel('This movie is great')

[{'label': 'LABEL_1', 'score': 0.9994105100631714}]

In [54]:
# LABEL_0 is the negative class
newmodel('This movie sucks')

[{'label': 'LABEL_0', 'score': 0.9952419996261597}]

In [55]:
# Inspect the confi.json file to be able to change the LABEL_X
!cat my_saved_model/config.json

{
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.27.4",
  "vocab_size": 30522
}


In [56]:
# Import the json library
import json

In [63]:
# Edit the config file
config_path = 'my_saved_model/config.json'
with open(config_path) as f:
  j = json.load(f)

j['id2label'] = {0: 'negative', 1: 'positive'}

with open(config_path, 'w') as f:
  json.dump(j, f, indent=2)

In [64]:
# Inspect the new label we just added
!cat my_saved_model/config.json

{
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.27.4",
  "vocab_size": 30522,
  "id2label": {
    "0": "negative",
    "1": "positive"
  }
}

In [65]:
# Load the pipelina again with the changes
newmodel = pipeline('text-classification', model='my_saved_model', device=0)

In [66]:
# 
newmodel('This movie is great!')

[{'label': 'positive', 'score': 0.9994620680809021}]

In [67]:
# Check if the weights in the neural network wa chnged during trainig
params_after = [] 
for name, p in model.named_parameters(): 
  params_after.append(p.detach().cpu().numpy())

In [68]:
# Compute the sum of the differencies of each parameter
# Because the sums are non-0 all the model weights were updated during the trainig process
for p1, p2, in zip(params_before, params_after):
  print(np.sum(np.abs(p1-p2)))

13411.57
91.70545
1.7868404
1.1527494
1299.8815
1.6974716
1284.7274
0.0026946187
1190.4558
1.0754144
1120.9456
0.8647071
1.651068
0.8612735
4961.4834
5.8027415
4591.105
0.72605497
1.5727199
0.7166341
1284.419
1.5467904
1283.8319
0.0024937373
1122.8733
0.8495246
1076.9614
0.7539193
1.5123565
0.7599005
4882.177
5.430606
4444.865
0.6867146
1.4279854
0.7591415
1271.9037
1.6015795
1267.7808
0.0024621873
1099.3817
0.8031607
1079.3221
0.71645606
1.4838234
0.7797825
4896.6553
5.526805
4364.582
0.7226196
1.3818038
0.68345284
1293.7256
1.4888422
1307.1149
0.0028757444
1133.8441
0.7546939
1087.4192
0.7417554
1.3784317
0.77667284
4788.1772
5.48077
4099.07
0.7250744
1.3830872
0.6828912
1179.8273
1.5384965
1191.5084
0.0014011515
980.19226
0.75972956
1005.5389
0.8710336
1.3857253
0.93116826
4281.448
5.0803967
3342.9294
0.7116433
1.2577976
0.64145696
1102.4678
1.2962468
1100.1824
0.00076559064
913.8469
0.75459516
941.7147
0.97939575
1.3132355
1.096585
3601.4243
4.6784234
3271.644
0.9917672
1.3564916
0