<a href="https://colab.research.google.com/github/Lcocks/DS6050-DeepLearning/blob/main/10L_HuggingFace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


<img src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-title.png" width="400">

Browse the full catalog of pre-trained models on the Hub: https://huggingface.co/models

The **Transformers library** (by HuggingFace) is the **industry standard** for working with pre-trained language models. Think of it as:

> **"PyTorch is the framework, Transformers is the model zoo + utilities"**

The library is built on three key principles:

#### 1. **Standardized Interface**
Every model follows the same API pattern:
```python
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("model-name")
model = AutoModel.from_pretrained("model-name")
```

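#### 2. **Pre-trained Models + Tokenizers = Complete Package**
A checkpoint name identifies both the model weights and the tokenizer that goes with them, so loading the two with the same name (as in the snippet above) keeps them in sync.

#### 3. **Built ON TOP of PyTorch** (not replacing it)
The loaded models are ordinary `torch.nn.Module` objects, so `model.to(device)`, `model.eval()`, and `torch.no_grad()` are used exactly as in plain PyTorch.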

In [None]:
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Show which PyTorch build is installed before any model code runs.
print(f"PyTorch version: {torch.__version__}")

print("\n--- Basic Sentiment Analysis ---")
# No model is named, so pipeline() falls back to its default checkpoint for
# the "sentiment-analysis" task.
classifier = pipeline("sentiment-analysis")
result = classifier(
    "I have waited for a HuggingFace tutorial longer than my sourdough starter."
)
print("Text: 'I have waited for a HuggingFace tutorial longer than my sourdough starter.'")
print(f"Result: {result}")
# The output is indexed per input sentence; only one sentence was given.
top_prediction = result[0]
print(f"Label: {top_prediction['label']}, Score: {top_prediction['score']:.4f}")

In [None]:
print("\n--- Text Generation Pipeline ---")
# GPT-2 is requested explicitly instead of relying on the task default.
generator = pipeline("text-generation", model="gpt2")
# Ask for two alternative continuations of the same prompt.
generated = generator(
    "By the end of this course, you will finally learn how to",
    num_return_sequences=2,
    max_length=30,
)
print("Prompt: 'By the end of this course, you will finally learn how to'")
print("\nGenerated sequences:")
for rank, candidate in enumerate(generated, start=1):
    print(f"{rank}. {candidate['generated_text']}")

In [None]:
print("\n--- Zero-Shot Classification ---")
# The candidate labels are supplied at call time rather than fixed by the model.
zero_shot = pipeline("zero-shot-classification")
candidate_labels = ["education", "politics", "business"]
text = "This course turns the Transformers library into legible magic for busy humans"
result = zero_shot(text, candidate_labels)
print(f"Text: '{text}'")
print(f"Candidate labels: {candidate_labels}")
print("\nResults:")
# 'labels' and 'scores' are parallel lists in the pipeline output.
label_score_pairs = zip(result["labels"], result["scores"])
for label, score in label_score_pairs:
    print(f"  {label}: {score:.4f}")

In [None]:
print("\n--- Other Available Pipeline Tasks ---")
# Fill-mask: the model proposes tokens for the [MASK] placeholder.
unmasker = pipeline("fill-mask", model="bert-base-uncased")
result = unmasker("The tallest animal is [MASK].")
best_candidate = result[0]
print("\nFill-mask example:")
print("Input: 'The tallest animal is [MASK].'")
print(f"Top prediction: {best_candidate['token_str']} (score: {best_candidate['score']:.4f})")

print("\nTop 5 predictions:")
# Walk every candidate the pipeline returned, numbering from 1.
for rank, candidate in enumerate(result, start=1):
    print(f"{rank}. {candidate['token_str']} (score: {candidate['score']:.4f})")

print("\n" + "=" * 80)

In [None]:
print("\n--- Loading Tokenizer and Model ---")
# Tokenizer and model are loaded from the same checkpoint name.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
print(f"Loaded tokenizer and model: {model_name}")
model_class_name = type(model).__name__
tokenizer_class_name = type(tokenizer).__name__
print(f"Model type: {model_class_name}")
print(f"Tokenizer type: {tokenizer_class_name}")

# Build a pipeline from the already-loaded objects instead of a task default.
classifier_custom = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
result = classifier_custom(
    "I have waited for a HuggingFace tutorial longer than my sourdough starter."
)
print(f"\nUsing custom tokenizer/model: {result}")


In [None]:
print("\n--- Understanding Tokenization ---")
text = "Greetings from the future, how are the GPUs today?"

# Path 1: call the tokenizer directly to get ids and attention mask at once.
encoded = tokenizer(text)
print(f"Text: '{text}'")
print("\nDirect tokenization:")
print(f"  Input IDs: {encoded['input_ids']}")
print(f"  Attention mask: {encoded['attention_mask']}")

# Path 2: the same work split into text -> tokens -> ids -> text.
tokens = tokenizer.tokenize(text)
print("\nStep-by-step tokenization:")
print(f"  Tokens: {tokens}")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"  Token IDs: {ids}")
decoded = tokenizer.decode(ids)
print(f"  Decoded: '{decoded}'")

print("\nAttention mask explanation:")
print("  1 means: attend to this token")
print("  0 means: ignore this token (padding)")

# Ids the tokenizer reserves for its special tokens.
print("\nSpecial tokens:")
print(f"  [CLS] token ID: {tokenizer.cls_token_id}")
print(f"  [SEP] token ID: {tokenizer.sep_token_id}")
print(f"  [PAD] token ID: {tokenizer.pad_token_id}")

In [None]:
print("\n--- Tokenizing Batches ---")
# Three sentences of very different lengths, to make padding/truncation visible.
sentences = [
    "This sentence is short and caffeinated.",
    "This one is dramatically longer because it decided to explain its feelings about gradient descent at 2 AM.",
    "Tiny.",
]
# padding + truncation give every row the same length (capped at max_length),
# and return_tensors="pt" asks for PyTorch tensors.
batch = tokenizer(
    sentences,
    return_tensors="pt",
    max_length=20,
    truncation=True,
    padding=True,
)
print("Batch of sentences:")
for number, sent in enumerate(sentences, start=1):
    print(f"  {number}. {sent}")

batch_ids = batch["input_ids"]
batch_mask = batch["attention_mask"]
print("\nTokenized batch:")
print(f"  Input IDs shape: {batch_ids.shape}")
print(f"  Attention mask shape: {batch_mask.shape}")
print("\nFirst sentence tokens:")
print(f"  Input IDs: {batch_ids[0]}")
print(f"  Attention mask: {batch_mask[0]}")
print("  (Notice the padding tokens at the end)")

# Banner announcing the next notebook section.
banner = "=" * 80
print("\n" + banner)
print("SECTION 3: PYTORCH INTEGRATION")
print(banner)

In [None]:
print("\n--- Pipeline with Multiple Inputs ---")
# Four short reviews; the manual PyTorch inference cell reuses this list.
x_train = [
    "This gadget sparks joy and eats my chores for breakfast.",
    "This update is a dumpster fire wearing a bow tie.",
    "Meh, it works, like decaf coffee.",
    "Absolute masterpiece; I would frame the receipt."
]
# Passing a list to the pipeline yields one prediction dict per input, in order.
results = classifier(x_train)
print("Sentiment analysis results:")
preview_chars = 50
for text_item, result in zip(x_train, results):
    # Add the ellipsis only when the text was actually cut; the previous
    # version appended "..." to every line, including untruncated ones.
    if len(text_item) > preview_chars:
        preview = f"{text_item[:preview_chars]}..."
    else:
        preview = text_item
    print(f"  '{preview}' -> {result['label']} ({result['score']:.3f})")


In [None]:
print("\n--- Manual PyTorch Inference ---")
# Step 1: tokenize the whole list into one padded tensor batch.
batch = tokenizer(
    x_train,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)
print("Tokenized batch:")
print(f"  Type: {type(batch)}")
print(f"  Keys: {batch.keys()}")
print(f"  Input IDs shape: {batch['input_ids'].shape}")

# Step 2: put model and inputs on the same device (GPU when available).
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
batch = {name: tensor.to(device) for name, tensor in batch.items()}

# Step 3: forward pass in eval mode with gradient tracking switched off.
model.eval()
with torch.no_grad():
    outputs = model(**batch)
    logits = outputs.logits
    probabilities = logits.softmax(dim=-1)
    predictions = probabilities.argmax(dim=-1)

print("\nManual inference results:")
print(f"  Logits shape: {logits.shape}")
print(f"  Probabilities: {probabilities}")
print(f"  Predicted labels: {predictions}")

# Step 4: map class ids to label names using the model's own config.
id2label = model.config.id2label
for number, (pred, probs) in enumerate(zip(predictions, probabilities), start=1):
    label = id2label[pred.item()]
    confidence = probs.max().item()
    print(f"  {number}. {label} ({confidence:.3f})")

In [None]:
print("\n--- Transformers Models ARE PyTorch Models ---")
# The loaded model is checked against torch.nn.Module directly.
is_torch_module = isinstance(model, torch.nn.Module)
print(f"Is model a PyTorch Module? {is_torch_module}")
print(f"Model class: {type(model)}")
# Total element count across every parameter tensor.
parameter_count = sum(param.numel() for param in model.parameters())
print(f"Number of parameters: {parameter_count:,}")
print("\nPyTorch operations work:")
print("  model.train() - sets training mode")
print("  model.eval() - sets evaluation mode")
print("  model.parameters() - get parameters for optimizer")
print("  model.to(device) - move to GPU/CPU")

print("\n" + "="*80)
print("SECTION 4: SAVING AND LOADING MODELS")
print("="*80)

print("\n--- Saving Model and Tokenizer ---")
import os  # local import: only this section needs a directory listing

# Tokenizer and model are written to the same folder so that a single
# from_pretrained(save_directory) call can restore either one later.
save_directory = "./my_saved_model"
tokenizer.save_pretrained(save_directory)
print(f"✓ Tokenizer saved to {save_directory}")
model.save_pretrained(save_directory)
print(f"✓ Model saved to {save_directory}")

# The exact file set depends on the installed transformers version (weights
# may be written as model.safetensors or pytorch_model.bin), so report what
# is really on disk instead of a hard-coded list of names.
file_notes = {
    "config.json": "model configuration",
    "model.safetensors": "model weights",
    "pytorch_model.bin": "model weights",
    "tokenizer.json": "tokenizer configuration",
    "vocab.txt": "vocabulary",
}
print("\nSaved files include:")
for filename in sorted(os.listdir(save_directory)):
    note = file_notes.get(filename)
    if note:
        print(f"  - {filename} ({note})")
    else:
        print(f"  - {filename}")


In [None]:

print("\n--- Loading Model and Tokenizer ---")
# from_pretrained accepts the local directory in place of a Hub model name.
loaded_tokenizer = AutoTokenizer.from_pretrained(save_directory)
print(f"✓ Tokenizer loaded from {save_directory}")
loaded_model = AutoModelForSequenceClassification.from_pretrained(save_directory)
print(f"✓ Model loaded from {save_directory}")

# Smoke test: classify one sentence with the reloaded pair.
test_text = "This reloaded model performs like it had an extra espresso."
loaded_inputs = loaded_tokenizer(test_text, return_tensors="pt")
loaded_model.eval()
with torch.no_grad():
    outputs = loaded_model(**loaded_inputs)
prediction = outputs.logits.argmax(dim=-1)
label = loaded_model.config.id2label[prediction.item()]
print("\nTest with loaded model:")
print(f"  Text: '{test_text}'")
print(f"  Prediction: {label}")

# Banner announcing the next notebook section.
banner = "=" * 80
print("\n" + banner)
print("SECTION 5: USING THE MODEL HUB")
print(banner)


In [None]:

print("\n--- Using Models from the Hub ---")
print("\nExample 1: Summarization")
# Any Hub checkpoint can be selected by passing its repository name as `model`.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
article = """
The HuggingFace Transformers library provides thousands of pretrained models
to perform tasks on different modalities such as text, vision, and audio.
These models can be applied on text for tasks like classification, information
extraction, question answering, summarization, translation, and text generation
in over 100 languages. The library also provides easy-to-use APIs that allow
you to quickly download and use those pretrained models on a given text,
fine-tune them on your own datasets, and share them with the community.
"""
# do_sample=False turns sampling off; the length bounds are passed alongside it.
summary = summarizer(article, do_sample=False, min_length=25, max_length=50)
article_length = len(article)
print(f"Original length: {article_length} characters")
print(f"Summary: {summary[0]['summary_text']}")

print("\nExample 2: German Sentiment Analysis")
# Same task name as before, but with a community model chosen for German text.
german_classifier = pipeline(
    "sentiment-analysis", model="oliverguhr/german-sentiment-bert"
)
german_text = "Dieses Tutorial ist so gut, dass mein Kaffee Applaus spendet!"
result = german_classifier(german_text)
print(f"Text: '{german_text}'")
print(f"Result: {result}")

In [None]:
# Prints static, human-readable instructions for browsing the Model Hub;
# nothing is downloaded or computed in this cell.
print("\n--- Finding Models on the Hub ---")
print("""
To find models:
1. Go to huggingface.co/models
2. Filter by task, library, language, or dataset
3. Use the search bar
4. Click on a model to see:
   - Model card (documentation)
   - Code examples
   - Demo widgets
5. Copy the model name and use it in your code!
""")


APIs & Mechanics

This section contrasts the ease of Hugging Face pipelines with the more granular PyTorch implementations of the same tasks.

In [None]:
#------ APIs & Mechanics: contrasting the ease of Hugging Face pipelines with the more granular PyTorch implementations.
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoModelForMaskedLM,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM
)
import logging

# Raise the transformers logger threshold to ERROR so lower-severity
# messages from the library do not clutter the cell output.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Single device object shared by every section of this cell: CUDA when
# torch reports it as available, otherwise CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}\n")

banner = "=" * 80
print(banner)
print("ANALOGY 1: SEQUENCE CLASSIFICATION (Sentiment Analysis)")
print(banner)

# One tokenizer/model pair is shared by the pipeline and the manual path so
# the two outputs are directly comparable.
MODEL_NAME_CLS = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer_cls = AutoTokenizer.from_pretrained(MODEL_NAME_CLS)
model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_CLS)
model_cls = model_cls.to(DEVICE)

text_cls = "Hugging Face makes NLP incredibly easy!"

print("\n--- 1.1 Pipeline (High-Level Abstraction) ---")
classifier = pipeline(
    "sentiment-analysis",
    model=model_cls,
    tokenizer=tokenizer_cls,
    device=DEVICE,
)
result_pipe = classifier(text_cls)
print(f"Input: '{text_cls}'")
print(f"Output: {result_pipe}")

print("\n--- 1.2 PyTorch Equivalent (The Mechanics) ---")
# tokenize -> forward pass -> softmax -> argmax -> label lookup
model_cls.eval()
inputs_cls = tokenizer_cls(text_cls, return_tensors="pt").to(DEVICE)
with torch.no_grad():
    outputs_cls = model_cls(**inputs_cls)
logits_cls = outputs_cls.logits
probabilities_cls = logits_cls.softmax(dim=-1)
predicted_class_id = probabilities_cls.argmax(dim=-1).item()
predicted_label = model_cls.config.id2label[predicted_class_id]
# Row 0 is the only sequence in the batch.
confidence = probabilities_cls[0][predicted_class_id].item()
print(f"Logits Shape: {logits_cls.shape}")
print(f"Predicted Label: {predicted_label}")
print(f"Confidence: {confidence:.4f}")

print("\n" + "="*80)
print("ANALOGY 2: MASKED LANGUAGE MODELING (Fill-Mask)")
print("="*80)

# One tokenizer/model pair feeds both the pipeline and the manual path.
MODEL_NAME_MLM = "bert-base-uncased"
tokenizer_mlm = AutoTokenizer.from_pretrained(MODEL_NAME_MLM)
model_mlm = AutoModelForMaskedLM.from_pretrained(MODEL_NAME_MLM).to(DEVICE)
model_mlm.eval()

text_mlm = "The capital of France is [MASK]."

print("\n--- 2.1 Pipeline (High-Level Abstraction) ---")
fill_mask = pipeline("fill-mask", model=model_mlm, tokenizer=tokenizer_mlm, device=DEVICE)
result_mlm_pipe = fill_mask(text_mlm, top_k=3)
print(f"Input: '{text_mlm}'")
print(f"Output: {result_mlm_pipe}")

print("\n--- 2.2 PyTorch Equivalent (The Mechanics) ---")
inputs_mlm = tokenizer_mlm(text_mlm, return_tensors="pt").to(DEVICE)
# torch.where on the 2-D id matrix returns (row, column) indices; [1] keeps
# the column, i.e. the position of [MASK] within the sequence.
mask_token_index = torch.where(inputs_mlm["input_ids"] == tokenizer_mlm.mask_token_id)[1]
with torch.no_grad():
    outputs_mlm = model_mlm(**inputs_mlm)
logits_mlm = outputs_mlm.logits
# Keep only the vocabulary scores at the masked position(s) of sequence 0.
mask_token_logits = logits_mlm[0, mask_token_index, :]
probabilities_mlm = torch.nn.functional.softmax(mask_token_logits, dim=-1)
top_k = 3
# A single topk over the probabilities yields both the token ids and their
# probabilities, so the two lists are aligned by construction. Previously
# the ids came from the logits and the values from a second topk call.
top_k_tokens = torch.topk(probabilities_mlm, top_k, dim=1)
top_k_ids = top_k_tokens.indices[0].tolist()
top_k_probs = top_k_tokens.values[0].tolist()
print(f"Logits Shape (Full Sequence): {logits_mlm.shape}")
print(f"Logits Shape (Masked Position): {mask_token_logits.shape}")
print("\nTop predictions:")
for token_id, prob in zip(top_k_ids, top_k_probs):
    token_str = tokenizer_mlm.decode([token_id])
    print(f"- {token_str} (Probability: {prob:.4f})")

banner = "=" * 80
print("\n" + banner)
print("ANALOGY 3: TOKEN CLASSIFICATION (NER)")
print(banner)

MODEL_NAME_NER = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer_ner = AutoTokenizer.from_pretrained(MODEL_NAME_NER)
model_ner = AutoModelForTokenClassification.from_pretrained(MODEL_NAME_NER)
model_ner = model_ner.to(DEVICE)
model_ner.eval()

text_ner = "My name is Wolfgang and I live in Berlin."

print("\n--- 3.1 Pipeline (High-Level Abstraction) ---")
ner = pipeline(
    "ner",
    model=model_ner,
    tokenizer=tokenizer_ner,
    device=DEVICE,
    aggregation_strategy="simple",
)
result_ner_pipe = ner(text_ner)
print(f"Input: '{text_ner}'")
print(f"Output: {result_ner_pipe}")

print("\n--- 3.2 PyTorch Equivalent (The Mechanics) ---")
inputs_ner = tokenizer_ner(text_ner, return_tensors="pt").to(DEVICE)
# Recover the token strings so each prediction can be printed next to its token.
tokens = tokenizer_ner.convert_ids_to_tokens(inputs_ner["input_ids"][0])
with torch.no_grad():
    outputs_ner = model_ner(**inputs_ner)
logits_ner = outputs_ner.logits
# One class id per token; [0] selects the single sequence in the batch.
predictions_ner = logits_ner.argmax(dim=-1)[0]
print(f"Logits Shape: {logits_ner.shape}")
print("\nToken-level predictions:")
special_tokens = set(tokenizer_ner.all_special_tokens)
for token, prediction_id in zip(tokens, predictions_ner):
    # Skip the tokenizer's special tokens; only real input tokens are shown.
    if token in special_tokens:
        continue
    label = model_ner.config.id2label[prediction_id.item()]
    print(f"{token:<15} : {label}")

banner = "=" * 80
print("\n" + banner)
print("ANALOGY 4: TEXT GENERATION (Causal LM)")
print(banner)

MODEL_NAME_CLM = "gpt2"
tokenizer_clm = AutoTokenizer.from_pretrained(MODEL_NAME_CLM)
model_clm = AutoModelForCausalLM.from_pretrained(MODEL_NAME_CLM)
model_clm = model_clm.to(DEVICE)
model_clm.eval()
# If the tokenizer defines no pad token, reuse the end-of-sequence token.
if tokenizer_clm.pad_token is None:
    tokenizer_clm.pad_token = tokenizer_clm.eos_token

text_clm = "Transformers revolutionized NLP by"

print("\n--- 4.1 Hugging Face Generate (High-Level Abstraction) ---")
inputs_clm = tokenizer_clm(text_clm, return_tensors="pt").to(DEVICE)
with torch.no_grad():
    # do_sample=False turns sampling off, so this path can be compared with
    # the manual argmax loop in 4.2.
    output_sequences = model_clm.generate(
        **inputs_clm,
        do_sample=False,
        max_new_tokens=20,
        pad_token_id=tokenizer_clm.eos_token_id,
    )
generated_text_gen = tokenizer_clm.decode(output_sequences[0], skip_special_tokens=True)
print(f"Input: '{text_clm}'")
print(f"Output:\n{generated_text_gen}")

print("\n--- 4.2 PyTorch Equivalent (The Autoregressive Loop) ---")
current_input_ids = tokenizer_clm(text_clm, return_tensors="pt").input_ids.to(DEVICE)
max_new_tokens = 20
print("Starting manual autoregressive loop (Greedy Search):")
with torch.no_grad():
    for _ in range(max_new_tokens):
        # Re-run the model on everything generated so far and read the
        # logits at the last position only.
        outputs_clm = model_clm(input_ids=current_input_ids)
        next_token_logits = outputs_clm.logits[:, -1, :]
        # Greedy choice, kept 2-D so it can be appended along the sequence axis.
        next_token_id = next_token_logits.argmax(dim=-1, keepdim=True)
        current_input_ids = torch.cat((current_input_ids, next_token_id), dim=1)
        if next_token_id.item() == tokenizer_clm.eos_token_id:
            break
generated_text_manual = tokenizer_clm.decode(current_input_ids[0], skip_special_tokens=True)
print(f"\nGenerated Output:\n{generated_text_manual}")

# Analogy 5: translate one sentence with T5 twice — once through
# generate(), once by driving the encoder and decoder by hand.
print("\n" + "="*80)
print("ANALOGY 5: TRANSLATION (Seq2Seq)")
print("="*80)

MODEL_NAME_S2S = "t5-small"
tokenizer_s2s = AutoTokenizer.from_pretrained(MODEL_NAME_S2S, model_max_length=512)
model_s2s = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME_S2S).to(DEVICE)
model_s2s.eval()

# The task is expressed as a text prefix in front of the sentence.
text_s2s = "translate English to German: The house is wonderful."

print("\n--- 5.1 Hugging Face Generate (High-Level Abstraction) ---")
inputs_s2s = tokenizer_s2s(text_s2s, return_tensors="pt", truncation=True).to(DEVICE)
with torch.no_grad():
    # num_beams=1 disables beam search so the result can be compared with the
    # manual argmax loop in 5.2.
    # NOTE(review): max_length here presumably counts the decoder start token,
    # whereas the manual loop below appends up to 20 new tokens — confirm if
    # the two paths must agree on long outputs.
    output_ids = model_s2s.generate(
        inputs_s2s.input_ids,
        max_length=20,
        num_beams=1,
    )
translation_gen = tokenizer_s2s.decode(output_ids[0], skip_special_tokens=True)
print(f"Input: '{text_s2s}'")
print(f"Output:\n{translation_gen}")

print("\n--- 5.2 PyTorch Equivalent (Encoder-Decoder Interaction) ---")
# The encoder runs exactly once; its hidden states are then reused by every
# decoder step in the loop below.
encoder_inputs = tokenizer_s2s(text_s2s, return_tensors="pt", truncation=True).to(DEVICE)
with torch.no_grad():
    encoder_outputs = model_s2s.get_encoder()(**encoder_inputs)
encoder_hidden_states = encoder_outputs.last_hidden_state
# The decoder input starts as a 1x1 tensor holding the configured start token id.
decoder_input_ids = torch.tensor([[model_s2s.config.decoder_start_token_id]], device=DEVICE)
max_length = 20
print("Starting manual Seq2Seq loop:")
for _ in range(max_length):
    with torch.no_grad():
        # Decoder sees everything generated so far plus the encoder states.
        decoder_outputs = model_s2s.get_decoder()(
            input_ids=decoder_input_ids,
            encoder_hidden_states=encoder_hidden_states
        )
    sequence_output = decoder_outputs[0]
    # When the config ties word embeddings, the decoder output is rescaled by
    # model_dim ** -0.5 before being projected to vocabulary logits.
    if model_s2s.config.tie_word_embeddings:
        sequence_output = sequence_output * (model_s2s.model_dim**-0.5)
    lm_logits = model_s2s.lm_head(sequence_output)
    # Greedy step: take the argmax at the last position and append it.
    next_token_logits = lm_logits[:, -1, :]
    next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)
    decoder_input_ids = torch.cat([decoder_input_ids, next_token_id], dim=1)
    # Stop as soon as the end-of-sequence token is produced.
    if next_token_id.item() == tokenizer_s2s.eos_token_id:
        break
translation_manual = tokenizer_s2s.decode(decoder_input_ids[0], skip_special_tokens=True)
print(f"\nEncoder Hidden States Shape: {encoder_hidden_states.shape}")
print(f"Generated Output:\n{translation_manual}")


In [None]:
%pip install evaluate datasets transformers accelerate

In [None]:
import torch
import numpy as np
import logging
import shutil
from datasets import load_dataset
import evaluate
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Quiet logs: raise the transformers logger threshold to ERROR.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Devices: `device` is a torch.device used for models and tensors, while
# `pipe_device` is the integer form passed to pipeline() in this notebook
# (0 when CUDA is available, -1 otherwise).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe_device = 0 if torch.cuda.is_available() else -1
print(f"Using device for computation: {device}")

# ===== BERT (Encoder-Only) =====
MODEL_NAME_BERT = "bert-base-uncased"
tokenizer_bert = AutoTokenizer.from_pretrained(MODEL_NAME_BERT)
model_bert_mlm = AutoModelForMaskedLM.from_pretrained(MODEL_NAME_BERT)
model_bert_mlm = model_bert_mlm.to(device)
model_bert_mlm.eval()

# Part 1: predict the masked word by hand from the raw logits.
text = "The quick brown [MASK] jumps over the lazy dog."
inputs = tokenizer_bert(text, return_tensors="pt").to(device)
print(f"Input text: '{text}'")
print(f"Tokens: {tokenizer_bert.convert_ids_to_tokens(inputs['input_ids'][0])}")

# Column index of the [MASK] token inside the sequence.
is_mask = inputs["input_ids"] == tokenizer_bert.mask_token_id
mask_token_index = torch.where(is_mask)[1]
with torch.no_grad():
    logits = model_bert_mlm(**inputs).logits

mask_token_logits = logits[0, mask_token_index, :]
top_5_tokens_ids = mask_token_logits.topk(5, dim=1).indices[0].tolist()
print("\nBERT's top predictions for [MASK]:")
for token_id in top_5_tokens_ids:
    print(f"- {tokenizer_bert.decode([token_id])}")

# Part 2: the same model through a pipeline, with the mask placed in two
# different contexts.
fill_mask = pipeline("fill-mask", model=model_bert_mlm, tokenizer=tokenizer_bert, device=pipe_device)
text1 = "We sat by the river [MASK] and enjoyed the view."
text2 = "I need to deposit money at the [MASK] before the meeting."


def _print_top_fills(sentence):
    """Print the three best fill-mask candidates for *sentence*."""
    for pred in fill_mask(sentence, top_k=3):
        print(f"- {pred['token_str']} (Score: {pred['score']:.4f})")


print(f"\nContext 1 (Riverside): '{text1}'")
_print_top_fills(text1)
print(f"\nContext 2 (Financial): '{text2}'")
_print_top_fills(text2)


In [None]:

# ===== GPT (Decoder-Only) =====
# Three uses of the same GPT-2 checkpoint: free-form continuation, a
# zero-shot prompt, and a few-shot prompt.
MODEL_NAME_GPT = "gpt2"
tokenizer_gpt = AutoTokenizer.from_pretrained(MODEL_NAME_GPT)
model_gpt = AutoModelForCausalLM.from_pretrained(MODEL_NAME_GPT).to(device)
model_gpt.eval()

prompt = "The breakthrough of Transformers in NLP revolutionized the field by"
inputs_gpt = tokenizer_gpt(prompt, return_tensors="pt").to(device)
print(f"\nPrompt: '{prompt}'")
print("Generating continuation...")
with torch.no_grad():
    # Unpack the full encoding so generate() receives the attention mask
    # along with the input ids. pad_token_id is set to the EOS id here, so
    # passing the mask explicitly avoids leaving generate() to infer it.
    # Sampling (do_sample=True) means this output differs between runs.
    output_sequences = model_gpt.generate(
        **inputs_gpt,
        max_new_tokens=50,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer_gpt.eos_token_id,
    )
generated_text = tokenizer_gpt.decode(output_sequences[0], skip_special_tokens=True)
print("\nGenerated Text:")
print(generated_text)

print("\n--- Zero-Shot Prompt (Sentiment) ---")
# NOTE(review): the prompt ends with a newline after "Sentiment:", so the
# model continues on a new line rather than right after the colon — confirm
# whether that is intended.
prompt_zero_shot = """
Analyze the sentiment of the following movie review.

Review: The acting was incredible and the story was moving.
Sentiment:
"""
inputs_zs = tokenizer_gpt(prompt_zero_shot, return_tensors="pt").to(device)
with torch.no_grad():
    # do_sample=False turns sampling off; only a few tokens are requested.
    output_zs = model_gpt.generate(
        **inputs_zs,
        max_new_tokens=5,
        do_sample=False,
        pad_token_id=tokenizer_gpt.eos_token_id,
    )
print(tokenizer_gpt.decode(output_zs[0], skip_special_tokens=True))

print("\n--- Few-Shot Prompt (Translation) ---")
# Two worked examples precede the query so the model can pick up the pattern.
prompt_few_shot = """
Translate English to French:

English: Hello
French: Bonjour

English: Goodbye
French: Au revoir

English: Thank you
French:
"""
inputs_fs = tokenizer_gpt(prompt_few_shot, return_tensors="pt").to(device)
with torch.no_grad():
    output_fs = model_gpt.generate(
        **inputs_fs,
        max_new_tokens=5,
        do_sample=False,
        pad_token_id=tokenizer_gpt.eos_token_id,
    )
print(tokenizer_gpt.decode(output_fs[0], skip_special_tokens=True))


In [None]:

# ===== T5 (Encoder-Decoder) =====
MODEL_NAME_T5 = "t5-small"
tokenizer_t5 = AutoTokenizer.from_pretrained(MODEL_NAME_T5, model_max_length=512)
model_t5 = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME_T5)
model_t5 = model_t5.to(device)
model_t5.eval()

# The task is selected by the text prefix in front of the sentence.
input_text_translation = "translate English to German: Hello, welcome to the DS6050."
print(f"\nInput (Translation): {input_text_translation}")
encoded_translation = tokenizer_t5(input_text_translation, return_tensors="pt")
input_ids_t5 = encoded_translation.input_ids.to(device)
with torch.no_grad():
    outputs_t5 = model_t5.generate(input_ids_t5, max_length=20)
decoded_output = tokenizer_t5.decode(outputs_t5[0], skip_special_tokens=True)
print(f"Output: {decoded_output}")

# Passage to summarize; the triple-quoted literal contains line breaks.
long_text = """
The Transformer architecture was introduced in 2017.
It revolutionized NLP by moving away from RNNs and LSTMs, relying
solely on attention mechanisms. This allowed for much better parallelization.
In 2018, BERT utilized the Transformer encoder and pre-training to achieve
state-of-the-art results on many understanding tasks.
"""
# Flatten the line breaks *before* formatting: a backslash inside an f-string
# replacement field is a SyntaxError on Python versions earlier than 3.12.
flattened_long_text = long_text.replace("\n", " ")
# The "summarize: " prefix selects the task for the model.
input_text_summary = f"summarize: {flattened_long_text}"
print("\nInput (Summarization): (See long_text variable)")
# Tokenize with truncation so the encoded input is capped at 512 tokens.
encoded_summary_input = tokenizer_t5(
    input_text_summary,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
input_ids_sum = encoded_summary_input.input_ids.to(device)
with torch.no_grad():
    # Beam search with four beams; early_stopping is enabled as well.
    outputs_sum = model_t5.generate(
        input_ids_sum,
        num_beams=4,
        early_stopping=True,
        max_length=60,
    )
decoded_summary = tokenizer_t5.decode(outputs_sum[0], skip_special_tokens=True)
print("Output:")
print(decoded_summary)


In [None]:
# ===== Full end-to-end fine-tuning FFT =====
FT_MODEL_NAME = "distilbert-base-uncased"
print("\n--- Loading and Preparing Data (IMDB Sentiment) ---")
# Shuffle with a fixed seed, then keep small slices so the demo runs quickly.
dataset = load_dataset("imdb").shuffle(seed=42)
train_rows = range(300)
eval_rows = range(100)
small_train_dataset = dataset["train"].select(train_rows)
small_eval_dataset = dataset["test"].select(eval_rows)
print(f"Loaded IMDB dataset. Training samples: {len(small_train_dataset)}, Eval samples: {len(small_eval_dataset)}")

print("\n--- Tokenization ---")
ft_tokenizer = AutoTokenizer.from_pretrained(FT_MODEL_NAME)


def tokenize_function(examples):
    """Tokenize the "text" column, padding/truncating every row to 128 tokens."""
    return ft_tokenizer(
        examples["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )


def _prepare_for_trainer(split):
    """Rename "label" to "labels" if present and expose the model columns as tensors."""
    if "label" in split.column_names:
        split = split.rename_column("label", "labels")
    split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return split


# batched=True hands tokenize_function a batch of rows per call.
tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
tokenized_eval = small_eval_dataset.map(tokenize_function, batched=True)

# Ensure Trainer sees 'labels' and tensors
tokenized_train = _prepare_for_trainer(tokenized_train)
tokenized_eval = _prepare_for_trainer(tokenized_eval)

print("Tokenization complete.")

print(f"\n--- Loading Pre-trained Model ({FT_MODEL_NAME}) ---")
# Give the classification head human-readable class names. Without
# id2label/label2id the config falls back to generic "LABEL_0"/"LABEL_1",
# which is what the inference pipeline would then print. IMDB encodes
# negative reviews as 0 and positive reviews as 1.
ft_id2label = {0: "NEGATIVE", 1: "POSITIVE"}
ft_label2id = {name: class_id for class_id, name in ft_id2label.items()}
ft_model = AutoModelForSequenceClassification.from_pretrained(
    FT_MODEL_NAME,
    num_labels=2,
    id2label=ft_id2label,
    label2id=ft_label2id,
).to(device)
print("Model loaded.")

print("\n--- Defining Metrics ---")
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn (logits, labels) into the loaded accuracy metric's result.

    The predicted class of each row is the argmax over the last logits axis.
    """
    logits, labels = eval_pred
    predicted_classes = np.asarray(logits).argmax(axis=-1)
    return metric.compute(predictions=predicted_classes, references=labels)


print("Metrics defined (Accuracy).")

# Configure the Trainer and run fine-tuning on the small IMDB slices.
print("\n--- Configuring and Running Trainer ---")
training_args = TrainingArguments(
    # Checkpoint directory; the cell removes it again after inference.
    output_dir="distilbert_ft_demo_lecture",
    # Evaluate and save once per epoch; both strategies are set to "epoch",
    # alongside load_best_model_at_end below.
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # "accuracy" is the key returned by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Passes "none" as the reporting target.
    report_to="none",
)

trainer = Trainer(
    model=ft_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

print("Starting Fine-Tuning...")
trainer.train()
print("\nFine-tuning complete!")

print("\n--- Evaluation and Inference ---")
# Evaluate on the eval split given to the Trainer and report accuracy.
eval_results = trainer.evaluate()
final_accuracy = eval_results["eval_accuracy"]
print(f"Final Evaluation Accuracy: {final_accuracy:.4f}")

# Wrap the fine-tuned model and its tokenizer in a pipeline for a quick check.
finetuned_classifier = pipeline(
    "sentiment-analysis",
    tokenizer=ft_tokenizer,
    model=ft_model,
    device=pipe_device,
)

test_sentence = "This lectutorial was incredibly insightful and well-structured!"
print(f"\nInference Test: '{test_sentence}'")
prediction = finetuned_classifier(test_sentence)
print(f"Prediction: {prediction}")

# Clean artifacts
checkpoint_dir = "distilbert_ft_demo_lecture"
shutil.rmtree(checkpoint_dir, ignore_errors=True)
