In [1]:
! pip install transformers datasets



In [2]:
from datasets import load_dataset
import numpy as np

In [3]:
raw_datasets=load_dataset("glue","sst2")

In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [5]:
raw_datasets['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [6]:
dir(raw_datasets['train']) # To see what attributes and method object has

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguous',
 '_select_with_indices_mappin

In [7]:
type(raw_datasets['train'])

datasets.arrow_dataset.Dataset

In [8]:
raw_datasets['train'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [9]:
raw_datasets['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [10]:
raw_datasets['train'][50000:50003]

{'sentence': ['glow ',
  'a classical dramatic animated feature ',
  'best espionage picture '],
 'label': [1, 1, 1],
 'idx': [50000, 50001, 50002]}

In [11]:
raw_datasets['train'].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [12]:
from transformers import AutoTokenizer

In [13]:
checkpoint="distilbert-base-uncased"
tokenizer=AutoTokenizer.from_pretrained(checkpoint)

In [14]:
tokenized_sentences=tokenizer(raw_datasets['train'][0:3]['sentence'])
from pprint import pprint
pprint(tokenized_sentences)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
               [101,
                3397,
                2053,
                15966,
                1010,
                2069,
                4450,
                2098,
                18201,
                2015,
                102],
               [101,
                2008,
                7459,
                2049,
                3494,
                1998,
                10639,
                2015,
                2242,
                2738,
                3376,
                2055,
                2529,
                3267,
                102]]}


In [15]:
def tokenize_fn(batch):
  return tokenizer(batch['sentence'],truncation=True)

In [16]:
tokenized_datasets=raw_datasets.map(tokenize_fn,batched=True)

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [17]:
from transformers import TrainingArguments

In [18]:
training_args=TrainingArguments(
    'my_trainer',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=1
)

In [19]:
pip install accelerate -U #While using TraingArgs,U will run into error ,so run this and below cell and restart runtime in collab



In [20]:
pip install transformers[torch]



In [22]:
from transformers import AutoModelForSequenceClassification

In [23]:
model=AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [25]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [26]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [27]:
from torchinfo import summary

summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [28]:
params_before=[]

for name,p in model.named_parameters():
  params_before.append(p.detach().cpu().numpy())

In [29]:
from transformers import Trainer

In [30]:
from datasets import load_metric

In [31]:
metric=load_metric("glue","sst2")

  metric=load_metric("glue","sst2")


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [32]:
metric.compute(predictions=[1,0,1],references=[1,0,0])

{'accuracy': 0.6666666666666666}

In [33]:
def compute_metrics(logits_and_labels):
  #Metric=load_metric("glue","sst2")
  logits,labels=logits_and_labels
  predictions=np.argmax(logits,axis=-1)
  return metric.compute(predictions=predictions,references=labels)

In [34]:
trainer=Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )

In [35]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2108,0.353616,0.902523


TrainOutput(global_step=8419, training_loss=0.2717867206827679, metrics={'train_runtime': 427.624, 'train_samples_per_second': 157.496, 'train_steps_per_second': 19.688, 'total_flos': 518596929468840.0, 'train_loss': 0.2717867206827679, 'epoch': 1.0})

In [36]:
trainer.save_model('my_saved_model')

In [37]:
!ls

my_saved_model	my_trainer  sample_data


In [38]:
!ls my_saved_model

config.json	   special_tokens_map.json  tokenizer.json     vocab.txt
pytorch_model.bin  tokenizer_config.json    training_args.bin


In [39]:
from transformers import pipeline

In [40]:
newmodel=pipeline('text-classification',model="my_saved_model",device=0)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [43]:
newmodel('This movie is great!')

[{'label': 'LABEL_1', 'score': 0.9994888305664062}]

In [45]:
newmodel("I cant say whether i like the movie or not,but it was worth watching")

[{'label': 'LABEL_1', 'score': 0.9546936750411987}]

In [46]:
!cat my_saved_model/config.json

{
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "vocab_size": 30522
}


In [47]:
import json

In [48]:
config_path="my_saved_model/config.json"

with open(config_path) as f:
  j=json.load(f)
  j['id2label']={0:'negative',1:'positive'}


with open(config_path,'w') as f:
  json.dump(j,f,indent=2)

In [49]:
!cat my_saved_model/config.json

{
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "vocab_size": 30522,
  "id2label": {
    "0": "negative",
    "1": "positive"
  }
}

In [50]:
newmodel=pipeline('text-classification',model="my_saved_model",device=0)

In [51]:
newmodel('This movie is great!')

[{'label': 'positive', 'score': 0.9994888305664062}]

In [54]:
params_after=[]

for name,p in model.named_parameters():
  params_after.append(p.detach().cpu().numpy())

In [55]:
for p1,p2 in zip(params_before,params_after):
  print(np.sum(np.abs(p1-p2)))

13318.021
90.566185
1.6820736
1.0972003
1312.2317
1.7600219
1291.7142
0.0027800032
1197.7825
1.0343449
1144.6968
0.91132784
1.650372
0.917628
4948.3936
5.654251
4554.82
0.7226428
1.6151046
0.71951497
1300.6455
1.6097716
1303.488
0.0027740207
1125.7512
0.8417599
1071.2814
0.74634564
1.5749184
0.7400024
4932.3477
5.486819
4472.812
0.69851243
1.5077927
0.7473698
1272.4081
1.5247881
1275.3999
0.0023799199
1101.599
0.78481203
1083.817
0.7278465
1.5477228
0.7831414
4969.423
5.674129
4422.5015
0.70336777
1.4595897
0.6933882
1302.3636
1.4930966
1310.155
0.0027181492
1153.2665
0.68039286
1096.2336
0.7597605
1.4234436
0.7471764
4867.212
5.599474
4173.184
0.78466666
1.3296695
0.8034543
1178.8778
1.53914
1181.2269
0.001579574
959.65967
0.698666
970.7705
0.855132
1.37456
0.91208607
4357.987
5.1237597
3351.9214
0.66861796
1.1927235
0.69898427
1076.4995
1.232745
1101.5186
0.00082883285
908.434
0.89366007
893.24603
1.0205469
1.2117261
1.0900359
3488.4226
4.471657
3178.0215
0.9781238
1.2903448
0.687665