In [1]:
from transformers import pipeline

# Load a pre-trained question-answering model, e.g., BERT fine-tuned on SQuAD
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

context = """
The Amazon River is the largest river by discharge volume of water in the world,
and by some definitions, it is the longest. It flows through Peru,
Colombia, and Brazil before emptying into the Atlantic Ocean.
"""
question = "Which countries does the Amazon River flow through?"

result = qa_pipeline(question=question, context=context)

print(f"Question: {question}")
print(f"Answer: {result['answer']}")
print(f"Score: {result['score']:.2f}")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


Question: Which countries does the Amazon River flow through?
Answer: Peru,
Colombia, and Brazil
Score: 0.99


In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, pipeline

# --- Step 1: Load Pre-trained Tokenizer and Model ---
# The tokenizer is responsible for converting raw text into numerical IDs that the model understands.
# It also handles special tokens (like [CLS], [SEP]) and subword tokenization.
print("--- Step 1: Loading Tokenizer and Model ---")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# DistilBertForSequenceClassification is a DistilBERT model with a classification head
# (a linear layer) on top, specifically designed for tasks like sentiment analysis.
# This model has been pre-trained on a massive text corpus and then
# fine-tuned on a sentiment dataset (implicitly, by using a model often associated with sentiment).
# For a raw pre-trained model that hasn't been fine-tuned for classification yet,
# you would typically use `DistilBertModel` and add your own classification head.
# Here, we're simulating a common scenario where a pre-trained model *is* the sentiment analyzer.
# We're loading a general DistilBERT, then we'll show how its outputs are used for classification.
# For a truly 'ready-to-go' sentiment model, you'd load one specifically fine-tuned for it.
# Let's load the *base* model first to show the embedding process.
# We will later use a `pipeline` for an end-to-end solution.
model_base = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
print("Tokenizer and base model loaded.")

# --- Step 2: Prepare Input Text ---
# We'll use a sample sentence for sentiment analysis.
text = "This movie was absolutely fantastic! I loved every minute of it."
print(f"\n--- Step 2: Input Text ---")
print(f"Original Text: '{text}'")

# --- Step 3: Tokenize the Input Text ---
# The tokenizer converts the text into a sequence of numerical IDs (input_ids),
# and also generates an attention mask.
# `input_ids`: numerical representation of tokens.
# `attention_mask`: indicates which tokens are real words (1) and which are padding (0).
# `return_tensors='pt'` ensures the output is a PyTorch tensor.
print("\n--- Step 3: Tokenization ---")
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
print(f"Tokenized Input IDs: {inputs['input_ids']}")
print(f"Attention Mask: {inputs['attention_mask']}")
print(f"Decoded Tokens: {tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])}")
# Note: [CLS] is a special token at the beginning for classification tasks.
# [SEP] is a special token to separate sentences. DistilBERT does not explicitly use [SEP] for single sentences,
# but it's common in BERT-like models for pair tasks.

# --- Step 4: Obtain Embeddings (and logits) from the Pre-trained Model ---
# When you pass the tokenized inputs to the DistilBertForSequenceClassification model,
# it first generates contextual embeddings for each token.
# Then, a classification head (a dense layer) on top of these embeddings
# produces logits (raw scores) for each possible class (e.g., positive, negative).
print("\n--- Step 4: Model Forward Pass & Logits ---")
# When `output_hidden_states=True` is passed, the model will also return
# the hidden states (which include the embeddings at different layers).
outputs = model_base(**inputs, output_hidden_states=True)

# The `logits` are the raw, unnormalized scores for each class.
# For binary sentiment (positive/negative), there would be 2 logits.
# For 3 classes (positive/negative/neutral), there would be 3 logits.
# By default, distilbert-base-uncased is not fine-tuned for classification,
# so it just has a default number of classes (usually 2, as it's often set up for binary tasks).
logits = outputs.logits
print(f"Raw Logits from model: {logits}") # e.g., tensor([[-0.2467,  0.1362]])

# The `hidden_states` contain the embeddings from all layers.
# The last hidden state is often used for downstream tasks.
# `hidden_states[0]` is the embedding layer output (input embeddings before any transformer layers).
# `hidden_states[-1]` is the output of the *last* Transformer layer (contextualized embeddings).
# The shape will be (batch_size, sequence_length, hidden_size).
# For `distilbert-base-uncased`, hidden_size is 768.
last_hidden_state_embeddings = outputs.hidden_states[-1]
print(f"Shape of Last Layer Contextual Embeddings: {last_hidden_state_embeddings.shape}")
# Example: torch.Size([1, 12, 768]) for our sample sentence.
# 1 (batch size), 12 (number of tokens), 768 (embedding dimension)

# To get the embedding for the [CLS] token (often used for classification):
cls_embedding = last_hidden_state_embeddings[:, 0, :]
print(f"Shape of [CLS] Token Embedding: {cls_embedding.shape}")
# Example: torch.Size([1, 768]) - this single vector represents the entire sentence's context
# and is passed to the classification head.

# --- Step 5: Convert Logits to Probabilities and Predict Sentiment ---
# We apply a softmax function to the logits to get probabilities across classes.
print("\n--- Step 5: Probabilities and Prediction ---")
probabilities = torch.softmax(logits, dim=1)
print(f"Probabilities (e.g., for 2 classes): {probabilities}")

# To get the predicted class index (0 or 1 for binary classification)
predicted_class_idx = torch.argmax(probabilities, dim=1).item()
print(f"Predicted Class Index: {predicted_class_idx}")

# Note: Without knowing the specific fine-tuning mapping (e.g., 0=negative, 1=positive),
# these indices are abstract for `model_base`.
# For real sentiment analysis, you'd use a model already fine-tuned with labels.

# --- Step 6: Using a Pre-trained Sentiment Analysis Pipeline (End-to-End) ---
# For practical sentiment analysis, you'd typically use a model already fine-tuned for the task.
# Hugging Face `pipeline` abstracts away much of the above.
print("\n--- Step 6: Using a Dedicated Sentiment Analysis Pipeline ---")
# This loads a model already fine-tuned for sentiment, complete with appropriate labels.
# E.g., 'nlptown/bert-base-multilingual-uncased-sentiment' is a common one for 5-star sentiment.
# Let's use a simpler one if available, or just demonstrate the concept.
# A common choice for English sentiment is 'distilbert-base-uncased-finetuned-sst-2-english'
# but it's better to use one that clearly outputs "positive" or "negative".
# For simplicity, let's use a popular readily available one.
try:
    sentiment_pipeline = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis")
    print(f"Model loaded: finiteautomata/bertweet-base-sentiment-analysis")

    result = sentiment_pipeline(text)
    print(f"Sentence: '{text}'")
    print(f"Sentiment Analysis Result: {result}")
    # Example Output: [{'label': 'POS', 'score': 0.9989}]

    text_negative = "This product broke after one day, completely useless."
    result_negative = sentiment_pipeline(text_negative)
    print(f"\nSentence: '{text_negative}'")
    print(f"Sentiment Analysis Result: {result_negative}")

except Exception as e:
    print(f"Could not load specific sentiment model: {e}")
    print("Falling back to a general example or skipping pipeline demo.")
    print("The key takeaway is that the fine-tuned model leverages the pre-trained embeddings.")


print("\n--- Sentiment Analysis Process Complete ---")
print("Key takeaway: Pre-trained embeddings provide meaningful numerical representations that a classification head then uses to predict sentiment.")

--- Step 1: Loading Tokenizer and Model ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer and base model loaded.

--- Step 2: Input Text ---
Original Text: 'This movie was absolutely fantastic! I loved every minute of it.'

--- Step 3: Tokenization ---
Tokenized Input IDs: tensor([[  101,  2023,  3185,  2001,  7078, 10392,   999,  1045,  3866,  2296,
          3371,  1997,  2009,  1012,   102]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Decoded Tokens: ['[CLS]', 'this', 'movie', 'was', 'absolutely', 'fantastic', '!', 'i', 'loved', 'every', 'minute', 'of', 'it', '.', '[SEP]']

--- Step 4: Model Forward Pass & Logits ---
Raw Logits from model: tensor([[-0.0541,  0.0460]], grad_fn=<AddmmBackward0>)
Shape of Last Layer Contextual Embeddings: torch.Size([1, 15, 768])
Shape of [CLS] Token Embedding: torch.Size([1, 768])

--- Step 5: Probabilities and Prediction ---
Probabilities (e.g., for 2 classes): tensor([[0.4750, 0.5250]], grad_fn=<SoftmaxBackward0>)
Predicted Class Index: 1

--- Step 6: Using a Dedicated Sentiment Analysis Pipeline --

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Device set to use mps:0


Model loaded: finiteautomata/bertweet-base-sentiment-analysis
Sentence: 'This movie was absolutely fantastic! I loved every minute of it.'
Sentiment Analysis Result: [{'label': 'POS', 'score': 0.9918115139007568}]

Sentence: 'This product broke after one day, completely useless.'
Sentiment Analysis Result: [{'label': 'NEG', 'score': 0.9826966524124146}]

--- Sentiment Analysis Process Complete ---
Key takeaway: Pre-trained embeddings provide meaningful numerical representations that a classification head then uses to predict sentiment.
