# 1. Activate GPU and Install Dependencies

In [1]:
# Activate GPU for faster training by clicking on 'Runtime' > 'Change runtime type' and then selecting GPU as the Hardware accelerator
# Then check if GPU is available
import torch
torch.cuda.is_available()

False

In [4]:
# Install required libraries
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs

zsh:1: command not found: apt-get


#2. Preprocess data

In [5]:
# Load data
from datasets import load_dataset
imdb = load_dataset("imdb")

Found cached dataset imdb (/Users/monusingh/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Create a smaller training dataset for faster training times
small_train_dataset = imdb["train"].shuffle(seed=42).select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=42).select([i for i in list(range(300))])

Loading cached shuffled indices for dataset at /Users/monusingh/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-8a9e43a6ac4acdff.arrow
Loading cached shuffled indices for dataset at /Users/monusingh/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-2eff9f118d84c6fe.arrow


In [10]:
# Set DistilBERT tokenizer # To pre-process our data
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [11]:
# Prepare the text inputs for the model
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

Loading cached processed dataset at /Users/monusingh/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-3b566aa4b63109c1.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# Use data_collector to convert our samples to PyTorch tensors and concatenate them with the correct amount of padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 3. Training the model

In [13]:
# Define DistilBERT as our base model:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

In [14]:
# Define the evaluation metrics 
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [23]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer

repo_name = "finetuning-sentiment-model-3000-samples"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch", 
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/monusingh/finetuning-sentiment-model-3000-samples into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Download file pytorch_model.bin:   0%|          | 15.4k/255M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.37k/3.37k [00:00<?, ?B/s]

Download file runs/Dec15_12-53-05_94bd8623da9c/1671108798.3752475/events.out.tfevents.1671108798.94bd8623da9c.…

Download file runs/Dec15_12-53-05_94bd8623da9c/events.out.tfevents.1671108798.94bd8623da9c.71.0: 100%|########…

Download file runs/Dec15_12-53-05_94bd8623da9c/events.out.tfevents.1671109250.94bd8623da9c.71.2: 100%|########…

Clean file training_args.bin:  30%|##9       | 1.00k/3.37k [00:00<?, ?B/s]

Clean file runs/Dec15_12-53-05_94bd8623da9c/1671108798.3752475/events.out.tfevents.1671108798.94bd8623da9c.71.…

Clean file runs/Dec15_12-53-05_94bd8623da9c/events.out.tfevents.1671108798.94bd8623da9c.71.0:  25%|##5       |…

Clean file runs/Dec15_12-53-05_94bd8623da9c/events.out.tfevents.1671109250.94bd8623da9c.71.2: 100%|##########|…

Clean file pytorch_model.bin:   0%|          | 1.00k/255M [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [24]:
# Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3000
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 376
  Number of trainable parameters = 66955010


  0%|          | 0/376 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

In [18]:
# Compute the evaluation metrics
trainer.evaluate()

NameError: name 'trainer' is not defined

# 4. Analyzing new data with the model

In [16]:
# Upload the model to the Hub
trainer.push_to_hub()

Saving model checkpoint to finetuning-sentiment-model-3000-samples
Configuration saved in finetuning-sentiment-model-3000-samples/config.json
Model weights saved in finetuning-sentiment-model-3000-samples/pytorch_model.bin
tokenizer config file saved in finetuning-sentiment-model-3000-samples/tokenizer_config.json
Special tokens file saved in finetuning-sentiment-model-3000-samples/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.30k/255M [00:00<?, ?B/s]

Upload file runs/Dec15_12-53-05_94bd8623da9c/events.out.tfevents.1671109250.94bd8623da9c.71.2: 100%|##########…

Upload file runs/Dec15_12-53-05_94bd8623da9c/events.out.tfevents.1671108798.94bd8623da9c.71.0:  84%|########4 …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/monusingh/finetuning-sentiment-model-3000-samples
   ab9694e..86cf245  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/monusingh/finetuning-sentiment-model-3000-samples
   ab9694e..86cf245  main -> main

To https://huggingface.co/monusingh/finetuning-sentiment-model-3000-samples
   86cf245..d1fb5aa  main -> main

   86cf245..d1fb5aa  main -> main



'https://huggingface.co/monusingh/finetuning-sentiment-model-3000-samples/commit/86cf245ede9a5f31ce7f5e994856633a1f7d152a'

In [17]:
# Run inferences with your new model using Pipeline
from transformers import pipeline

sentiment_model = pipeline(model="monusingh/finetuning-sentiment-model-3000-samples")

sentiment_model(["I love this move", "This movie sucks!"])

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--monusingh--finetuning-sentiment-model-3000-samples/snapshots/d1fb5aa1d062f126eb8ed22234ced30dd803ee61/config.json
Model config DistilBertConfig {
  "_name_or_path": "monusingh/finetuning-sentiment-model-3000-samples",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--monusingh--finetuning-sentiment-mo

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--monusingh--finetuning-sentiment-model-3000-samples/snapshots/d1fb5aa1d062f126eb8ed22234ced30dd803ee61/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at monusingh/finetuning-sentiment-model-3000-samples.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


Downloading:   0%|          | 0.00/360 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--monusingh--finetuning-sentiment-model-3000-samples/snapshots/d1fb5aa1d062f126eb8ed22234ced30dd803ee61/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--monusingh--finetuning-sentiment-model-3000-samples/snapshots/d1fb5aa1d062f126eb8ed22234ced30dd803ee61/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--monusingh--finetuning-sentiment-model-3000-samples/snapshots/d1fb5aa1d062f126eb8ed22234ced30dd803ee61/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--monusingh--finetuning-sentiment-model-3000-samples/snapshots/d1fb5aa1d062f126eb8ed22234ced30dd803ee61/tokenizer_config.json
Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'label': 'LABEL_1', 'score': 0.9565949440002441},
 {'label': 'LABEL_0', 'score': 0.9415571689605713}]