# 🧪 Practical: Load and Test BERT from Hugging Face Transformers
# ✅ What you'll learn:
- How to install and import Hugging Face Transformers

- How to load the BERT tokenizer and model

- How to tokenize synthetic sentences

- How to extract and understand embeddings

In [1]:
# 🧩 Step 1: Install Required Libraries
# Run this in a cell (only once)

!pip install -q transformers
!pip install -q torch


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# 🧩 Step 2: Import BERT and Tokenizer

from transformers import BertTokenizer, BertModel
import torch

# Single source of truth for the checkpoint, so the tokenizer's vocabulary
# always matches the weights that are loaded (change it in one place only).
MODEL_NAME = 'bert-base-uncased'

# Load BERT tokenizer and model (base uncased version).
# The first call downloads the files from the Hugging Face Hub.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
# 🧩 Step 3: Create Synthetic Input Data

# Sample synthetic sentences, written one per line and split into a list of
# three strings (no trailing newline characters are kept by splitlines()).
sentences = """\
Alice is working at Google in California.
The AI conference was held in Paris by OpenAI.
John and Mary went to New York last summer.
""".splitlines()


In [4]:
# 🧩 Step 4: Tokenize Input

# Encode the whole batch in a single tokenizer call:
# - padding=True        pads shorter sentences so every row has equal length
# - truncation=True     cuts sequences that exceed the model's maximum length
# - return_tensors="pt" returns PyTorch tensors instead of Python lists
inputs = tokenizer(
    sentences,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

# Show the encoded batch (input_ids, token_type_ids and attention_mask)
print("🧾 Tokenized Inputs:", inputs, sep="\n")


🧾 Tokenized Inputs:
{'input_ids': tensor([[ 101, 5650, 2003, 2551, 2012, 8224, 1999, 2662, 1012,  102,    0,    0,
            0],
        [ 101, 1996, 9932, 3034, 2001, 2218, 1999, 3000, 2011, 2330, 4886, 1012,
          102],
        [ 101, 2198, 1998, 2984, 2253, 2000, 2047, 2259, 2197, 2621, 1012,  102,
            0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}


In [5]:
# 🧩 Step 5: Pass Inputs Through BERT Model

# Make sure dropout is disabled even if another cell switched the model to
# training mode; otherwise the embeddings would differ from run to run.
model.eval()

# Disable gradient calculation (inference only, saves memory and time)
with torch.no_grad():
    outputs = model(**inputs)

# The output contains:
# - last_hidden_state: one contextual embedding per token
# - pooler_output: the [CLS] hidden state passed through BERT's pooler
#   (a linear layer followed by tanh) — it is NOT the same tensor as the raw
#   [CLS] embedding last_hidden_state[:, 0, :]
print("✅ Output Keys:", outputs.keys())


✅ Output Keys: odict_keys(['last_hidden_state', 'pooler_output'])


In [6]:
# 🧩 Step 6: Inspect the Outputs

# Token-level embeddings: one vector per token of every sentence in the batch
last_hidden_state = outputs.last_hidden_state
print("🔍 Shape of last_hidden_state:", last_hidden_state.shape)

# [CLS] is the first token of each sequence, so select position 0 along the
# token axis to get one vector per sentence
cls_embeddings = last_hidden_state[:, 0]
print("🧠 Shape of CLS embeddings:", cls_embeddings.shape)

# Example: show the [CLS] vector that belongs to the first sentence
print("\n🔹 CLS Embedding for First Sentence:\n", cls_embeddings[0])


🔍 Shape of last_hidden_state: torch.Size([3, 13, 768])
🧠 Shape of CLS embeddings: torch.Size([3, 768])

🔹 CLS Embedding for First Sentence:
 tensor([-1.0084e-01,  1.9006e-02, -3.5910e-01, -1.9640e-01, -1.8793e-01,
        -3.3190e-01,  8.6121e-02,  1.1071e+00, -2.3831e-01, -6.9766e-02,
         1.0297e-01, -3.6435e-01,  5.3557e-01,  3.0533e-01,  2.6839e-01,
        -3.1194e-01, -1.9098e-01,  1.0715e+00,  3.4621e-01, -2.8949e-02,
        -1.9344e-02, -3.4536e-01,  9.6315e-04, -4.9736e-02,  2.6427e-01,
         6.5264e-02, -1.4412e-01, -2.6914e-01,  3.1713e-01, -3.3727e-01,
        -2.8751e-01,  4.3333e-01, -1.5416e-01, -1.3125e-01,  7.5062e-01,
        -1.1335e-01,  3.3330e-01,  6.6608e-02,  1.7784e-01,  2.1899e-01,
        -1.9877e-01,  1.1864e-01,  1.0670e-02, -3.4115e-03, -1.7706e-01,
        -6.4897e-01, -2.5609e+00, -3.3245e-01,  3.2724e-01, -1.4660e-01,
        -9.1790e-02, -1.9389e-01,  5.9554e-01,  3.1367e-01,  2.6510e-01,
         6.4595e-01, -3.2484e-01,  1.9475e-01,  1.4232e-

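# ✅ Output Explanation:
- `last_hidden_state.shape` = `[batch_size, sequence_length, hidden_size]` — in the run above `[3, 13, 768]`: 3 sentences, each padded to 13 tokens

- For BERT-base: `hidden_size = 768`

- The hidden state of the `[CLS]` token (index 0 of each sequence) can be used as a sentence representation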

# 📌 Summary
| Step | What You Did                                     |
| ---- | ------------------------------------------------ |
| 1    | Installed `transformers` and `torch`             |
| 2    | Loaded BERT tokenizer and model                  |
| 3    | Used synthetic sentences                         |
| 4    | Tokenized text input                             |
| 5    | Ran the model and extracted embeddings           |
| 6    | Viewed token-level and sentence-level embeddings |
