In [1]:
# MUST RUN FIRST
import os, sys, platform, torch
os.environ["TRANSFORMERS_NO_TF"] = "1"              # avoid TF/Keras import issues
os.environ["HF_HUB_DISABLE_TF_WARNING"] = "1"

print("Python:", sys.version)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("BF16 supported:", torch.cuda.is_bf16_supported())


Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
CUDA available: True
GPU: NVIDIA A100-SXM4-40GB
BF16 supported: True


In [2]:
!pip -q install "transformers>=4.43,<4.47" "accelerate>=0.30" datasets==2.21.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m152.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m126.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.6.1 which is incompatible.[0m[31m
[0m

In [4]:
import torch, os
torch.backends.cuda.matmul.allow_tf32 = True  # faster matmul on A100
torch.set_float32_matmul_precision("high")    # ok with BF16/TF32

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # fewer OOMs

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [8]:
!pip install -q flash-attn --no-build-isolation

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone


In [9]:
base = AutoModelForCausalLM.from_pretrained(
    "gpt2-medium",                          # or "gpt2" if you prefer small
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2" # A100-optimized
)
base.gradient_checkpointing_enable()
base.config.use_cache = False
print("GPT-2-medium loaded on A100 ✅")


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT-2-medium loaded on A100 ✅


In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Clean re-load with SDPA (fast and stable on A100)
tok = AutoTokenizer.from_pretrained("gpt2-medium")
tok.pad_token = tok.eos_token

base = AutoModelForCausalLM.from_pretrained(
    "gpt2-medium",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa",   # <- force SDPA; avoids FA2 runtime errors
)
base.gradient_checkpointing_enable()
base.config.use_cache = False
print("GPT-2-medium (SDPA) loaded ✅")


GPT-2-medium (SDPA) loaded ✅


In [12]:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("gpt2-medium"); tok.pad_token = tok.eos_token

prompt = "You are a helpful assistant. Q: What is RoPE in transformers? A:"
inputs = tok(prompt, return_tensors="pt").to("cuda")
out = base.generate(**inputs, max_new_tokens=64, do_sample=True, top_p=0.9)
print(tok.decode(out[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You are a helpful assistant. Q: What is RoPE in transformers? A: RoPE is a data transformation tool that lets you transform an RGBA image to a 2D image, using standard Photoshop features like blend modes and masking. Here's a demonstration of how to use RoPE:


Q: I'm a novice user of RoPE but I need help with other images that use


In [13]:
prompt = "You are a helpful assistant. Q: What is RoPE in transformers? A:"
inputs = tok(prompt, return_tensors="pt").to("cuda")
with torch.inference_mode():
    out = base.generate(**inputs, max_new_tokens=64, do_sample=True, top_p=0.9)
print(tok.decode(out[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You are a helpful assistant. Q: What is RoPE in transformers? A: RoPE is the method of storing and manipulating data for a system of transformers and motors. RoPE is a general purpose library for data-processing, such as data structures and algorithms for handling variable-width data (e.g. graphics). In this chapter, we cover the basics of the basic concepts of Ro
