# PyTorch 常用方法示例 (Top 200)
本 Notebook 展示了约 200 个 **PyTorch** 常用方法或功能的简要示例。每个示例位于一个单独单元格中。

**使用说明：**
- 请确保已安装 PyTorch (例如 `pip install torch`)，并能在 Python 环境中导入；
- 大部分示例以 `import torch` 为基础；
- 示例只示范最基本用法，更多高级功能请参阅官方文档；
- 在执行前确保在 Jupyter 中或支持 IPython 的环境运行。

In [931]:
# 1. 导入与版本查看
import torch
print("PyTorch version:", torch.__version__)

PyTorch version: 2.5.1+cpu


In [932]:
# 2. 创建张量: torch.tensor
x = torch.tensor([1,2,3], dtype=torch.float32)
print(x)

tensor([1., 2., 3.])


In [933]:
# 3. 张量属性: shape, dtype, device
print(x.shape, x.dtype, x.device)

torch.Size([3]) torch.float32 cpu


In [934]:
# 4. 张量移动到GPU (如果可用)
if torch.cuda.is_available():
    x_gpu = x.to('cuda')
    print(x_gpu.device)
else:
    print("CUDA not available")

CUDA not available


In [935]:
# 5. 创建随机张量: torch.randn
rand_t = torch.randn(2,3)
print(rand_t)

tensor([[ 0.1378, -0.6330, -1.2384],
        [ 0.3924, -0.2469,  1.0956]])


In [936]:
# 6. 创建随机张量: torch.rand, 范围 [0,1)
rand_01 = torch.rand(2,2)
print(rand_01)

tensor([[0.0443, 0.6136],
        [0.6443, 0.3045]])


In [937]:
# 7. 创建整型随机张量: torch.randint
rand_int = torch.randint(low=0, high=5, size=(3,3))
print(rand_int)

tensor([[2, 1, 0],
        [4, 1, 0],
        [2, 3, 1]])


In [938]:
# 8. 创建全0，全1或全特定值的张量: torch.zeros, torch.ones, torch.full
z = torch.zeros(2,3)
o = torch.ones(2,3)
f = torch.full((2,3), 7)
print(z, o, f, sep='\n')

tensor([[0., 0., 0.],
        [0., 0., 0.]])
tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor([[7, 7, 7],
        [7, 7, 7]])


In [939]:
# 9. 单位矩阵: torch.eye
eye_ = torch.eye(3)
print(eye_)

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])


In [940]:
# 10. 创建与某张量形状相同的全0,全1: torch.zeros_like, torch.ones_like
like_zeros = torch.zeros_like(x)
like_ones = torch.ones_like(x)
print(like_zeros, like_ones)

tensor([0., 0., 0.]) tensor([1., 1., 1.])


In [941]:
# 11. torch.arange / torch.linspace
ar = torch.arange(0,5, step=1)
lin = torch.linspace(0,1, steps=5)
print(ar)
print(lin)

tensor([0, 1, 2, 3, 4])
tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])


In [942]:
# 12. 张量的形状变换: .view or .reshape
y = torch.tensor([[1,2,3],[4,5,6]])
y_reshaped = y.view(3,2)
print(y_reshaped)
print(y.reshape(-1))  # 拉平

tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([1, 2, 3, 4, 5, 6])


In [943]:
# 13. 张量维度变换: torch.transpose or .t()
y_trans = y.t()
print(y_trans)

tensor([[1, 4],
        [2, 5],
        [3, 6]])


In [944]:
# 14. 改变维度：unsqueeze / squeeze
a = torch.tensor([1,2,3])
a_unsq = a.unsqueeze(1)
a_sq = a_unsq.squeeze(1)
print(a_unsq.size(), a_sq.size())

torch.Size([3, 1]) torch.Size([3])


In [945]:
# 15. 拼接: torch.cat
t1 = torch.randn(2,2)
t2 = torch.randn(2,2)
cat_dim0 = torch.cat((t1, t2), dim=0)
cat_dim1 = torch.cat((t1, t2), dim=1)
print(cat_dim0.shape, cat_dim1.shape)

torch.Size([4, 2]) torch.Size([2, 4])


In [946]:
# 16. 堆叠: torch.stack
stacked = torch.stack([t1, t2], dim=0)
print(stacked.shape)

torch.Size([2, 2, 2])


In [947]:
# 17. 复制扩展: torch.repeat
r = torch.tensor([1,2]).repeat(3)
print(r)
r2d = torch.tensor([[1,2],[3,4]]).repeat(2,3)
print(r2d)

tensor([1, 2, 1, 2, 1, 2])
tensor([[1, 2, 1, 2, 1, 2],
        [3, 4, 3, 4, 3, 4],
        [1, 2, 1, 2, 1, 2],
        [3, 4, 3, 4, 3, 4]])


In [948]:
# 18. 索引: 直接用 Python 切片
mat = torch.tensor([[1,2,3],[4,5,6],[7,8,9]])
print(mat[0, :2])
print(mat[1:, 1:])

tensor([1, 2])
tensor([[5, 6],
        [8, 9]])


In [949]:
# 19. 高级索引: index_select, gather
idx = torch.tensor([0,2])
sel = torch.index_select(mat, dim=0, index=idx)
print(sel)

tensor([[1, 2, 3],
        [7, 8, 9]])


In [950]:
# 20. gather
vals = torch.gather(mat, dim=1, index=torch.tensor([[2,1,0],[0,2,1],[1,0,2]]))
print(vals)

tensor([[3, 2, 1],
        [4, 6, 5],
        [8, 7, 9]])


In [951]:
# 21. boolean mask 索引
mask = (mat > 5)
print(mat[mask])

tensor([6, 7, 8, 9])


In [952]:
# 22. scatter_ 用于原地写入
src = torch.tensor([[9,9,9],[8,8,8],[7,7,7]])
index = torch.tensor([[2,1,0],[0,2,1],[1,0,2]])
mat_scatter = mat.clone()
mat_scatter.scatter_(dim=1, index=index, src=src)
print(mat_scatter)

tensor([[9, 9, 9],
        [8, 8, 8],
        [7, 7, 7]])


In [953]:
# 23. 张量加减乘除: +, -, *, /
a = torch.randn(3)
b = torch.randn(3)
print(a + b)
print(a - b)
print(a * b)
print(a / b)

tensor([ 1.5153, -2.3079,  0.7093])
tensor([-0.6546, -0.2006,  0.0210])
tensor([0.4669, 1.3216, 0.1257])
tensor([0.3967, 1.1904, 1.0611])


In [954]:
# 24. torch.add, torch.sub, torch.mul, torch.div
print(torch.add(a, b))
print(torch.mul(a, b))

tensor([ 1.5153, -2.3079,  0.7093])
tensor([0.4669, 1.3216, 0.1257])


In [955]:
# 25. 矩阵乘法: @ 或 torch.matmul
m1 = torch.randn(2,3)
m2 = torch.randn(3,4)
out_matmul = m1 @ m2
print(out_matmul.shape)

torch.Size([2, 4])


In [956]:
# 26. torch.mm, torch.bmm (batch)
mm_res = torch.mm(m1, m2)
print(mm_res.shape)

b1 = torch.randn(10,2,3)
b2 = torch.randn(10,3,4)
bmm_res = torch.bmm(b1, b2)
print(bmm_res.shape)

torch.Size([2, 4])
torch.Size([10, 2, 4])


In [957]:
# 27. 求和 / 均值 / 最大最小: torch.sum, torch.mean, torch.max, torch.min
vals_2d = torch.randn(2,2)
print(torch.sum(vals_2d))
print(torch.mean(vals_2d))
print(torch.max(vals_2d))
print(torch.min(vals_2d))

tensor(-2.6119)
tensor(-0.6530)
tensor(0.4325)
tensor(-1.4226)


In [958]:
# 28. 沿维度求值: dim 参数
print(torch.sum(vals_2d, dim=0))
print(torch.mean(vals_2d, dim=1))

tensor([-2.4384, -0.1735])
tensor([-1.0143, -0.2917])


In [959]:
# 29. torch.argmax / torch.argmin
r = torch.randn(5)
print(r)
print(torch.argmax(r), torch.argmin(r))

tensor([ 0.2387,  1.7671, -2.9892,  0.9902, -2.1977])
tensor(1) tensor(2)


In [960]:
# 30. 比较运算: torch.eq, torch.gt, torch.lt
cmp_eq = torch.eq(a, b)
cmp_gt = torch.gt(a, b)
print(cmp_eq, cmp_gt)

tensor([False, False, False]) tensor([False, False,  True])


In [961]:
# 31. clamp (类似clip)
randvals = torch.tensor([-1.0,0.5,3.0,5.0])
cl = torch.clamp(randvals, min=0.0, max=2.0)
print(cl)

tensor([0.0000, 0.5000, 2.0000, 2.0000])


In [962]:
# 32. 排序: torch.sort / torch.topk
srt_res = torch.sort(r)
top2 = torch.topk(r, k=2)
print(srt_res)
print(top2)

torch.return_types.sort(
values=tensor([-2.9892, -2.1977,  0.2387,  0.9902,  1.7671]),
indices=tensor([2, 4, 0, 3, 1]))
torch.return_types.topk(
values=tensor([1.7671, 0.9902]),
indices=tensor([1, 3]))


In [963]:
# 33. index of max / min: torch.argmax already shown, also torch.max can return indices
vals, idxs = torch.max(vals_2d, dim=1)
print(vals, idxs)

tensor([-0.6060,  0.4325]) tensor([1, 1])


In [964]:
# 34. torch.abs / torch.ceil / torch.floor
vals_pm = torch.tensor([-1.2, 0.3, 2.7])
print(torch.abs(vals_pm))
print(torch.ceil(vals_pm))
print(torch.floor(vals_pm))

tensor([1.2000, 0.3000, 2.7000])
tensor([-1.,  1.,  3.])
tensor([-2.,  0.,  2.])


In [965]:
# 35. torch.round / torch.trunc / torch.frac
print(torch.round(vals_pm))
print(torch.trunc(vals_pm))
print(torch.frac(vals_pm))

tensor([-1.,  0.,  3.])
tensor([-1.,  0.,  2.])
tensor([-0.2000,  0.3000,  0.7000])


In [966]:
# 36. 幂函数与log: torch.pow, torch.sqrt, torch.exp, torch.log
p = torch.pow(torch.tensor([2.0,3.0]), 2)
sq = torch.sqrt(torch.tensor([4.0,9.0]))
ex = torch.exp(torch.tensor([1.0]))
lg = torch.log(torch.tensor([2.7183]))
print(p, sq, ex, lg)

tensor([4., 9.]) tensor([2., 3.]) tensor([2.7183]) tensor([1.0000])


In [967]:
# 37. 三角函数: torch.sin, torch.cos, torch.tan
vals_trig = torch.tensor([0.0, 3.14159/2, 3.14159])
print(torch.sin(vals_trig))
print(torch.cos(vals_trig))

tensor([0.0000e+00, 1.0000e+00, 2.5352e-06])
tensor([ 1.0000e+00,  1.2676e-06, -1.0000e+00])


In [968]:
# 38. 矩阵操作: torch.inverse, torch.det
mat2x2 = torch.tensor([[2.0,1.0],[1.0,2.0]])
inv_m = torch.inverse(mat2x2)
det_m = torch.det(mat2x2)
print(inv_m, det_m)

tensor([[ 0.6667, -0.3333],
        [-0.3333,  0.6667]]) tensor(3.)


In [969]:
# 39. torch.trace
tr = torch.trace(mat2x2)
print(tr)

tensor(4.)


In [970]:
# 40. 广播机制
br_a = torch.tensor([[1],[2],[3]])
br_b = torch.tensor([10,20,30])
br_res = br_a + br_b  # shape(3,3)
print(br_res)

tensor([[11, 21, 31],
        [12, 22, 32],
        [13, 23, 33]])


In [971]:
# 41. torch.mean/tensor.mean, 指定维度
val_2d = torch.randn(3,4)
mean_all = val_2d.mean()
mean_dim0 = val_2d.mean(dim=0)
print(mean_all, mean_dim0)

tensor(-0.1064) tensor([-0.4501,  0.1661,  0.1770, -0.3184])


In [972]:
# 42. keepdim 参数
mean_dim1_keep = val_2d.mean(dim=1, keepdim=True)
print(mean_dim1_keep.size())

torch.Size([3, 1])


In [973]:
# 43. .item() 将单元素张量转换为Python数值
single_val = torch.tensor([10])
python_val = single_val.item()
print(python_val, type(python_val))

10 <class 'int'>


In [974]:
# 44. 类型转换: .float(), .long(), .int(), .double()
val_long = single_val.long()
val_double = single_val.double()
print(val_long.dtype, val_double.dtype)

torch.int64 torch.float64


In [975]:
# 45. detach() 分离梯度
x_req_grad = torch.tensor(1.0, requires_grad=True)
y_no_grad = x_req_grad.detach()
print(y_no_grad.requires_grad)

False


In [976]:
# 46. requires_grad
w = torch.tensor([1.0,2.0,3.0], requires_grad=True)
print(w.requires_grad)

True


In [977]:
# 47. 自动求梯度: .backward()
scaler = w.sum()  # = 6
scaler.backward()  # d(scaler)/dw = [1,1,1]
print(w.grad)

tensor([1., 1., 1.])


In [978]:
# 48. grad 清零: .grad.zero_()
w.grad.zero_()
print(w.grad)

tensor([0., 0., 0.])


In [979]:
# 49. with torch.no_grad(): 阻断梯度追踪
with torch.no_grad():
    y_ng = w * 2
print(y_ng.requires_grad)

False


In [980]:
# 50. 自定义函数 + autograd
x_var = torch.tensor(2.0, requires_grad=True)
y_var = 3*x_var**2 + 2*x_var + 1
y_var.backward()
print(x_var.grad)  # dy/dx=6x+2 => x=2 => 6*2+2=14

tensor(14.)


## 神经网络 (torch.nn) 部分

In [981]:
# 51. nn.Linear
import torch.nn as nn

linear = nn.Linear(in_features=4, out_features=3)
inp_data = torch.randn(2,4)
out_data = linear(inp_data)
print(out_data.shape)

torch.Size([2, 3])


In [982]:
# 52. nn.ReLU
relu_fn = nn.ReLU()
vals_r = torch.tensor([-1.0,0.5,2.0])
out_relu = relu_fn(vals_r)
print(out_relu)

tensor([0.0000, 0.5000, 2.0000])


In [983]:
# 53. nn.Sequential
model_seq = nn.Sequential(
    nn.Linear(4,8),
    nn.ReLU(),
    nn.Linear(8,2)
)
sample_in = torch.randn(5,4)
model_out = model_seq(sample_in)
print(model_out.shape)

torch.Size([5, 2])


In [984]:
# 54. 自定义nn.Module
class MyNet(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, out_dim)
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

net = MyNet(4,8,2)
print(net)

MyNet(
  (fc1): Linear(in_features=4, out_features=8, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=8, out_features=2, bias=True)
)


In [985]:
# 55. forward
out_cust = net(sample_in)
print(out_cust.shape)

torch.Size([5, 2])


In [986]:
# 56. nn.Conv2d
conv2d = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=3, padding=1)
img_ = torch.randn(2,3,32,32)
img_out = conv2d(img_)
print(img_out.shape)

torch.Size([2, 6, 32, 32])


In [987]:
# 57. nn.MaxPool2d
pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
pooled = pool2d(img_out)
print(pooled.shape)

torch.Size([2, 6, 16, 16])


In [988]:
# 58. nn.Flatten
flat = nn.Flatten()
flat_out = flat(pooled)
print(flat_out.shape)

torch.Size([2, 1536])


In [989]:
# 59. nn.Dropout
drop = nn.Dropout(p=0.5)
drop_out = drop(flat_out)
print(drop_out.shape)

torch.Size([2, 1536])


In [990]:
# 60. nn.BatchNorm2d
bn2d = nn.BatchNorm2d(num_features=6)
bn_out = bn2d(img_out)
print(bn_out.shape)

torch.Size([2, 6, 32, 32])


In [991]:
# 61. nn.LSTM
lstm = nn.LSTM(input_size=10, hidden_size=20, batch_first=True)
seq_in = torch.randn(2,5,10)
lstm_out, (h_n, c_n) = lstm(seq_in)
print(lstm_out.shape, h_n.shape, c_n.shape)

torch.Size([2, 5, 20]) torch.Size([1, 2, 20]) torch.Size([1, 2, 20])


In [992]:
# 62. nn.GRU
gru = nn.GRU(input_size=8, hidden_size=16, batch_first=True)
gru_in = torch.randn(3,7,8)
gru_out, h_n = gru(gru_in)
print(gru_out.shape, h_n.shape)

torch.Size([3, 7, 16]) torch.Size([1, 3, 16])


In [993]:
# 63. nn.Embedding
emb = nn.Embedding(num_embeddings=10, embedding_dim=4)
idxs = torch.tensor([1,3,5,9])
emb_out = emb(idxs)
print(emb_out.shape)

torch.Size([4, 4])


In [994]:
# 64. nn.functional 常用: F.relu, F.softmax, F.cross_entropy
import torch.nn.functional as F
logits = torch.randn(2,5)
pred_probas = F.softmax(logits, dim=1)
print(pred_probas)

tensor([[0.0184, 0.1395, 0.1329, 0.3901, 0.3190],
        [0.0630, 0.6372, 0.1386, 0.0673, 0.0939]])


In [995]:
# 65. F.relu
rel = F.relu(torch.tensor([-1.0, 2.0, -0.5]))
print(rel)

tensor([0., 2., 0.])


In [996]:
# 66. F.cross_entropy
labels = torch.tensor([1,3])  # batch=2
logits_ce = torch.randn(2,5)
loss_ce = F.cross_entropy(logits_ce, labels)
print(loss_ce)

tensor(1.5427)


In [997]:
# 67. nn.MSELoss
mse_loss = nn.MSELoss()
pred_ = torch.tensor([0.5,0.8])
target_ = torch.tensor([1.0,1.0])
loss_m = mse_loss(pred_, target_)
print(loss_m)

tensor(0.1450)


In [998]:
# 68. nn.BCELoss / nn.BCEWithLogitsLoss
bce_loss_fn = nn.BCELoss()
pred_bce = torch.tensor([0.8,0.2])
target_bce = torch.tensor([1.0, 0.0])
loss_bce = bce_loss_fn(pred_bce, target_bce)
print(loss_bce)

tensor(0.2231)


In [999]:
# 69. BCEWithLogitsLoss
bce_logit_loss_fn = nn.BCEWithLogitsLoss()
logits_bl = torch.tensor([1.5, -0.5])
target_bl = torch.tensor([1.0, 0.0])
loss_bl = bce_logit_loss_fn(logits_bl, target_bl)
print(loss_bl)

tensor(0.3377)


In [1000]:
# 70. 优化器: torch.optim.SGD
import torch.optim as optim

model = nn.Linear(4,2)
optimizer = optim.SGD(model.parameters(), lr=0.01)
print(optimizer)

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)


In [1001]:
# 71. optimizer.step(), optimizer.zero_grad()
optimizer.zero_grad()
fake_input = torch.randn(1,4)
fake_target = torch.randn(1,2)
output = model(fake_input)
loss_ = F.mse_loss(output, fake_target)
loss_.backward()
optimizer.step()
print("step done.")

step done.


In [1002]:
# 72. 其他优化器: Adam
adam_opt = optim.Adam(model.parameters(), lr=0.001)
print(adam_opt)

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)


In [1003]:
# 73. 学习率调度: optim.lr_scheduler.StepLR
scheduler = optim.lr_scheduler.StepLR(adam_opt, step_size=10, gamma=0.1)
for epoch in range(3):
    # do training...
    scheduler.step()
    print("Epoch", epoch, "LR=", adam_opt.param_groups[0]['lr'])

Epoch 0 LR= 0.001
Epoch 1 LR= 0.001
Epoch 2 LR= 0.001


In [1004]:
# 74. DataLoader 与 Dataset
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self):
        self.data = torch.randn(100,4)
        self.labels = torch.randint(0,2,(100,))
    def __len__(self):
        return 100
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

ds = MyDataset()
loader = DataLoader(ds, batch_size=8, shuffle=True)
for batch_x, batch_y in loader:
    print(batch_x.size(), batch_y.size())
    break

torch.Size([8, 4]) torch.Size([8])


In [1005]:
# 75. Dataset random split
train_ds, val_ds = torch.utils.data.random_split(ds, [80,20])
print(len(train_ds), len(val_ds))

80 20


In [1006]:
# 76. TensorDataset
from torch.utils.data import TensorDataset
data_t = torch.randn(10,3)
label_t = torch.randint(0,2,(10,))
ds_t = TensorDataset(data_t, label_t)
loader_t = DataLoader(ds_t, batch_size=2)
for dx, dy in loader_t:
    print(dx.shape, dy.shape)
    break

torch.Size([2, 3]) torch.Size([2])


In [1007]:
# 77. ConcatDataset
ds_concat = torch.utils.data.ConcatDataset([train_ds, val_ds])
print(len(ds_concat))

100


## 训练一个简单分类器例子

In [1008]:
# 78. 简单训练循环 (pseudo-code)
net_cls = nn.Sequential(
    nn.Linear(4,5),
    nn.ReLU(),
    nn.Linear(5,2)
)
criterion = nn.CrossEntropyLoss()
optim_sgd = optim.SGD(net_cls.parameters(), lr=0.01)

for epoch in range(2):
    for batch_x, batch_y in loader:
        optim_sgd.zero_grad()
        pred_ = net_cls(batch_x)
        loss_c = criterion(pred_, batch_y)
        loss_c.backward()
        optim_sgd.step()
    print("Epoch", epoch, "done.")

Epoch 0 done.
Epoch 1 done.


In [1009]:
# 79. 验证
net_cls.eval()
with torch.no_grad():
    for val_x, val_y in loader:
        val_pred = net_cls(val_x)
        # do something
        break
print("Validation step example.")

Validation step example.


## GPU training

In [1010]:
# 80. to(device)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net_cls.to(device)
for batch_x, batch_y in loader:
    batch_x, batch_y = batch_x.to(device), batch_y.to(device)
    # training loop...
    break
print("moved to device.")

moved to device.


## torch.save / torch.load

In [1011]:
# 81. 保存与加载模型
torch.save(net_cls.state_dict(), 'model.pth')
net_new = nn.Sequential(
    nn.Linear(4,5),
    nn.ReLU(),
    nn.Linear(5,2)
)
net_new.load_state_dict(torch.load('model.pth'))
print("Loaded model.")

Loaded model.


  net_new.load_state_dict(torch.load('model.pth'))


## torch.nn.init

In [1012]:
# 82. nn.init.xavier_uniform_, kaiming_uniform_
import torch.nn.init as init

my_layer = nn.Linear(10,20)
init.xavier_uniform_(my_layer.weight)
init.zeros_(my_layer.bias)
print(my_layer.weight[:2])

tensor([[-0.2353,  0.2804, -0.0938,  0.1366,  0.1233,  0.2667, -0.3237, -0.2536,
         -0.2945,  0.0401],
        [-0.2769,  0.2773,  0.0810,  0.2419,  0.3719, -0.1447,  0.3678, -0.0145,
         -0.2911, -0.3911]], grad_fn=<SliceBackward0>)


## torch.utils.tensorboard

In [1013]:
# 83. TensorBoard summary writer
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/exp1')
fake_scalar = 0.5
writer.add_scalar("my_metric", fake_scalar, 0)
writer.close()
print("Written scalar to TB.")

Written scalar to TB.


## more functional: F.log_softmax, F.nll_loss

In [1014]:
# 84. F.log_softmax, F.nll_loss
logits_ = torch.randn(3,5)
log_prob = F.log_softmax(logits_, dim=1)
target_cls = torch.tensor([2,1,4])
nll = F.nll_loss(log_prob, target_cls)
print(nll)

tensor(2.6374)


## some Activation modules

In [1015]:
# 85. nn.Sigmoid, nn.Tanh
sig_ = nn.Sigmoid()
tnh_ = nn.Tanh()
inp_act = torch.tensor([-1.0,0.0,1.0])
print(sig_(inp_act), tnh_(inp_act))

tensor([0.2689, 0.5000, 0.7311]) tensor([-0.7616,  0.0000,  0.7616])


## WeightedLoss example

In [1016]:
# 86. CrossEntropyLoss with weight
wt = torch.tensor([1.0,2.0])
weighted_ce = nn.CrossEntropyLoss(weight=wt)
logits_w = torch.randn(3,2)
labels_w = torch.tensor([0,1,1])
loss_w = weighted_ce(logits_w, labels_w)
print(loss_w)

tensor(1.1309)


## Non-differentiable ops

In [1017]:
# 87. torch.argmax with no grad
argmax_val = torch.argmax(logits_w, dim=1)
print(argmax_val, argmax_val.requires_grad)

tensor([1, 1, 0]) False


## .grad_fn attribute

In [1018]:
# 88. .grad_fn
p_ = net_cls(torch.randn(1,4))
print(p_.grad_fn)

<AddmmBackward0 object at 0x000001FD39BF5C30>


## torch.autograd.grad

In [1019]:
# 89. torch.autograd.grad
x_ = torch.tensor(2.0, requires_grad=True)
y_ = x_**3
grad_val = torch.autograd.grad(y_, x_)
print(grad_val)

(tensor(12.),)


## hooking into modules

In [1020]:
# 90. register_forward_hook
def fwd_hook(m, inp, outp):
    print("Forward hook triggered.", m)

net_cls[0].register_forward_hook(fwd_hook)
_ = net_cls(torch.randn(1,4))

Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)


## Dtype conversions

In [1021]:
# 91. .to(torch.float64), .to(torch.int)
dt_ = x.to(torch.int)
print(dt_.dtype)

torch.int32


## Set random seed

In [1022]:
# 92. torch.manual_seed
torch.manual_seed(42)
print(torch.rand(2))

tensor([0.8823, 0.9150])


## partial or full wrapping of python ops

## tolist() or numpy() conversion

In [1023]:
# 93. x.numpy()  (requires x on CPU and not require_grad)
cpu_x = x.cpu().detach()
arr_np = cpu_x.numpy()
print(arr_np, type(arr_np))

[1. 2. 3.] <class 'numpy.ndarray'>


In [1024]:
# 94. .tolist()
print(cpu_x.tolist())

[1.0, 2.0, 3.0]


## advanced indexing

In [1025]:
# 95. x[[0,2]] etc.
arr_adv = torch.tensor([10,20,30,40])
print(arr_adv[[0,2,3]])
bool_mask = torch.tensor([True,False,True,False])
print(arr_adv[bool_mask])

tensor([10, 30, 40])
tensor([10, 30])


## advanced broadcasting

In [1026]:
# 96. broadcast shapes
a_ = torch.randn(3,1)
b_ = torch.randn(1,4)
c_ = a_ + b_  # shape= (3,4)
print(c_.shape)

torch.Size([3, 4])


## in-place ops: add_, sub_, etc

In [1027]:
# 97. add_
tmp_ = torch.tensor([1.0,2.0,3.0])
tmp_.add_(5)
print(tmp_)

tensor([6., 7., 8.])


## accumulative grads

## final 3 to complete 100

In [1028]:
# 98. torch.split
split_val = torch.tensor([1,2,3,4,5,6])
spl = torch.split(split_val, split_size_or_sections=2)
print(spl)

(tensor([1, 2]), tensor([3, 4]), tensor([5, 6]))


In [1029]:
# 99. torch.chunk
ch = torch.chunk(split_val, chunks=3)
print(ch)

(tensor([1, 2]), tensor([3, 4]), tensor([5, 6]))


In [1030]:
# 100. pythonic: del x.grad
w.grad = None  # or w.grad.zero_()
print("We've reached 100 examples.")

We've reached 100 examples.


In [1031]:
# 101. torch.nn.Parameter
param_var = nn.Parameter(torch.randn(3,3))
print(param_var.requires_grad)

True


In [1032]:
# 102. model.parameters() vs model.named_parameters()
for name, param in net_cls.named_parameters():
    print(name, param.shape)

0.weight torch.Size([5, 4])
0.bias torch.Size([5])
2.weight torch.Size([2, 5])
2.bias torch.Size([2])


In [1033]:
# 103. weight init example
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0.0, std=0.02)
        nn.init.constant_(m.bias, 0.0)

net_cls.apply(init_weights)
print("Weights re-inited.")

Weights re-inited.


## Another advanced: registering buffers

In [1034]:
# 104. register_buffer
class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer('running_mean', torch.zeros(3))
m_mod = MyModule()
print(m_mod)

MyModule()


## torch.distributions usage (not always in top usage, but let's do 1-2 examples).

In [1035]:
# 105. distributions: Normal
from torch.distributions import Normal
dist_ = Normal(loc=0.0, scale=1.0)
samples_ = dist_.sample((5,))
print(samples_, dist_.log_prob(samples_))

tensor([ 0.5568, -0.8123,  1.1964,  0.8613, -1.3682]) tensor([-1.0739, -1.2489, -1.6346, -1.2899, -1.8549])


## torch.einsum

In [1036]:
# 106. torch.einsum
a_ = torch.randn(2,3)
b_ = torch.randn(3,4)
res_ein = torch.einsum('ij,jk->ik', a_, b_)
print(res_ein.shape)

torch.Size([2, 4])


## HPC advanced: pinned memory not in top usage.
## We'll skip.

## torch.nn.utils.clip_grad_norm_

In [1037]:
# 107. clip_grad_norm_
net_temp = nn.Linear(4,3)
opt_temp = optim.SGD(net_temp.parameters(), lr=0.01)
dummy_input = torch.randn(1,4)
dummy_target = torch.tensor([1])
criterion_ce = nn.CrossEntropyLoss()

opt_temp.zero_grad()
out_temp = net_temp(dummy_input)
loss_temp = criterion_ce(out_temp, dummy_target)
loss_temp.backward()
torch.nn.utils.clip_grad_norm_(net_temp.parameters(), max_norm=2.0)
opt_temp.step()
print("clip_grad_norm_ done.")

clip_grad_norm_ done.


## torch.nn.utils.rnn.pack_padded_sequence, pad_packed_sequence

In [1038]:
# 108. pack_padded_sequence example
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
seqs = [torch.tensor([1,2,3]), torch.tensor([4,5])]  # different length
lengths = [3,2]
padded = nn.utils.rnn.pad_sequence(seqs, batch_first=True)
packed = pack_padded_sequence(padded, lengths=lengths, batch_first=True, enforce_sorted=False)
print(packed)

PackedSequence(data=tensor([1, 4, 2, 5, 3]), batch_sizes=tensor([2, 2, 1]), sorted_indices=tensor([0, 1]), unsorted_indices=tensor([0, 1]))


## set_requires_grad

In [1039]:
# 109. x.requires_grad_(True)
no_grad_t = torch.randn(3)
no_grad_t.requires_grad_(True)
print(no_grad_t.requires_grad)

True


## backward with gradient arg

In [1040]:
# 110. y.backward(gradient=some_tensor)
x_2 = torch.tensor([2.0,3.0], requires_grad=True)
y_2 = x_2 * x_2
grad_arg = torch.tensor([1.0, 1.0])
y_2.backward(gradient=grad_arg)
print(x_2.grad)

tensor([4., 6.])


## advanced spool

## leftover

In [1041]:
# 111. functional conv2d
img_t = torch.randn(1,3,32,32)
weight_c = torch.randn(6,3,3,3)
bias_c = torch.randn(6)
out_fn = F.conv2d(img_t, weight_c, bias_c, stride=1, padding=1)
print(out_fn.shape)

torch.Size([1, 6, 32, 32])


In [1042]:
# 112. functional linear
in_lin = torch.randn(2,5)
weight_lin = torch.randn(3,5)
bias_lin = torch.randn(3)
out_linfn = F.linear(in_lin, weight_lin, bias_lin)
print(out_linfn.shape)

torch.Size([2, 3])


## reduce ops: product, cumsum, cumprod

In [1043]:
# 113. torch.prod, torch.cumsum, torch.cumprod
vals_c = torch.tensor([1,2,3,4])
print(torch.prod(vals_c))
print(torch.cumsum(vals_c, dim=0))
print(torch.cumprod(vals_c, dim=0))

tensor(24)
tensor([ 1,  3,  6, 10])
tensor([ 1,  2,  6, 24])


## bitwise ops: torch.bitwise_and, bitwise_or

In [1044]:
# 114. bitwise ops
a_b = torch.tensor([1,3,5])
b_b = torch.tensor([1,1,4])
print(torch.bitwise_and(a_b, b_b))
print(torch.bitwise_or(a_b, b_b))

tensor([1, 1, 4])
tensor([1, 3, 5])


## RNG states: get_rng_state, set_rng_state

In [1045]:
# 115. torch.get_rng_state
rng_st = torch.get_rng_state()
print(rng_st.shape)
# we can set back: torch.set_rng_state(rng_st)

torch.Size([5056])


## custom autograd function

In [1046]:
# 116. custom autograd
class MySquareFunc(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x*x
    @staticmethod
    def backward(ctx, grad_output):
        (x,) = ctx.saved_tensors
        grad_input = 2*x*grad_output
        return grad_input

inp_sq = torch.tensor(3.0, requires_grad=True)
out_sq = MySquareFunc.apply(inp_sq)
out_sq.backward()
print(inp_sq.grad)

tensor(6.)


## DataParallel (if multiple GPUs) - skip if not available

## advanced losses

In [1047]:
# 117. nn.SmoothL1Loss
sl1 = nn.SmoothL1Loss()
pred_sl1 = torch.tensor([0.8, 1.2])
tgt_sl1 = torch.tensor([1.0,1.0])
loss_s1 = sl1(pred_sl1, tgt_sl1)
print(loss_s1)

tensor(0.0200)


In [1048]:
# 118. nn.L1Loss
l1_ = nn.L1Loss()
loss_l1val = l1_(pred_sl1, tgt_sl1)
print(loss_l1val)

tensor(0.2000)


## advanced init

In [1049]:
# 119. orthogonal_
w_orth = torch.empty(4,4)
nn.init.orthogonal_(w_orth)
print(w_orth)

tensor([[ 0.1635,  0.8652,  0.4507,  0.1471],
        [ 0.9385,  0.0117, -0.3295, -0.1021],
        [ 0.2459, -0.4008,  0.8010, -0.3705],
        [ 0.1787, -0.3012,  0.2160,  0.9114]])


## freq usage

In [1050]:
# 120. nn.CosineSimilarity
cosim = nn.CosineSimilarity(dim=1)
v1 = torch.randn(3,4)
v2 = torch.randn(3,4)
cs_out = cosim(v1,v2)
print(cs_out)

tensor([0.4140, 0.7394, 0.3248])


## gradient accum snippet

In [1051]:
# 121. typical accumulate grad
for i, (bx,by) in enumerate(loader):
    optim_sgd.zero_grad()
    preds_ = net_cls(bx)
    loss_val = criterion(preds_, by)
    loss_val.backward()
    optim_sgd.step()
print("accum done.")

Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
accum done.


## no usage

## advanced RNN usage

In [1052]:
# 122. nn.RNN
rnn_ = nn.RNN(input_size=5, hidden_size=6, batch_first=True)
seq_ = torch.randn(2,4,5)
out_rn, h_rn = rnn_(seq_)
print(out_rn.shape, h_rn.shape)

torch.Size([2, 4, 6]) torch.Size([1, 2, 6])


## conv1d example

In [1053]:
# 123. nn.Conv1d
conv1 = nn.Conv1d(in_channels=2, out_channels=4, kernel_size=3)
in_1d = torch.randn(2,2,10)  # batch=2, channels=2, length=10
out_1d = conv1(in_1d)
print(out_1d.shape)

torch.Size([2, 4, 8])


## convTranspose2d

In [1054]:
# 124. nn.ConvTranspose2d
deconv = nn.ConvTranspose2d(4,2, kernel_size=3, stride=2)
in_de = torch.randn(1,4,8,8)
out_de = deconv(in_de)
print(out_de.shape)

torch.Size([1, 2, 17, 17])


## upsample

In [1055]:
# 125. nn.Upsample
up_ = nn.Upsample(scale_factor=2, mode='nearest')
img_sm = torch.randn(1,3,16,16)
img_big = up_(img_sm)
print(img_big.shape)

torch.Size([1, 3, 32, 32])


## flatten vs view

In [1056]:
# 126. nn.Flatten in sequential
flat_seq = nn.Sequential(
    nn.Conv2d(3,6,3),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(6*30*30,10)
)
img_2 = torch.randn(2,3,32,32)
o_ = flat_seq(img_2)
print(o_.shape)

torch.Size([2, 10])


## handle dimension carefully

In [1057]:
# 127. keep track or parameter usage
for name, param in flat_seq.named_parameters():
    print(name, param.shape)

0.weight torch.Size([6, 3, 3, 3])
0.bias torch.Size([6])
3.weight torch.Size([10, 5400])
3.bias torch.Size([10])


## marginrankingloss

In [1058]:
# 128. nn.MarginRankingLoss
mrl = nn.MarginRankingLoss(margin=1.0)
x1_ = torch.tensor([0.8,1.0])
x2_ = torch.tensor([0.5,1.2])
y_sign = torch.tensor([1, -1])
loss_mr = mrl(x1_, x2_, y_sign)
print(loss_mr)

tensor(0.7500)


## label_smoothing

In [1059]:
# 129. CrossEntropyLoss with label_smoothing
ls_ce = nn.CrossEntropyLoss(label_smoothing=0.1)
logits_ls = torch.randn(3,5)
target_ls = torch.tensor([1,0,4])
loss_ls = ls_ce(logits_ls, target_ls)
print(loss_ls)

tensor(2.0009)


## multihead attention

In [1060]:
# 130. nn.MultiheadAttention
mha = nn.MultiheadAttention(embed_dim=8, num_heads=2, batch_first=True)
q_ = torch.randn(2,5,8)
k_ = torch.randn(2,5,8)
v_ = torch.randn(2,5,8)
attn_output, attn_weights = mha(q_, k_, v_)
print(attn_output.shape, attn_weights.shape)

torch.Size([2, 5, 8]) torch.Size([2, 5, 5])


## transformer

In [1061]:
# 131. nn.Transformer
transformer = nn.Transformer(d_model=8, nhead=2, num_encoder_layers=2, num_decoder_layers=2)
src = torch.randn(5,2,8)
tgt = torch.randn(6,2,8)
out_trans = transformer(src, tgt)
print(out_trans.shape)

torch.Size([6, 2, 8])


## param groups in optimizer

In [1062]:
# 132. param groups
layer1 = nn.Linear(4,4)
layer2 = nn.Linear(4,2)
opt_gr = optim.SGD([
    {'params': layer1.parameters(), 'lr': 0.01},
    {'params': layer2.parameters(), 'lr': 0.001}
], lr=0.1)  # global lr=0.1 ignored?
print(opt_gr)

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0

Parameter Group 1
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)


## cyclical lr

In [1063]:
# 133. CyclicLR
cyc_opt = optim.SGD(net_cls.parameters(), lr=0.01)
scheduler_cyc = optim.lr_scheduler.CyclicLR(cyc_opt, base_lr=0.001, max_lr=0.01, step_size_up=5)
for i in range(3):
    cyc_opt.step()
    scheduler_cyc.step()
    print(i, cyc_opt.param_groups[0]['lr'])

0 0.002800000000000002
1 0.0046
2 0.006400000000000001


## early stopping is user-defined

## weight decay in optimizer

In [1064]:
# 134. weight_decay param
opt_wd = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
print(opt_wd)

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 1e-05
)


## micro-libraries: torchtext, torchvision, etc.

## final expansions: let's keep going

In [1065]:
# 135. Example: .eval() and .train()
net_cls.train()
net_cls.eval()
print("Switched modes.")

Switched modes.


## example: gradient accum skip.

## half precision: .half()

In [1066]:
# 136. half precision
hp = net_cls.half()
inp_fp16 = torch.randn(2,4).half()
out_fp16 = hp(inp_fp16)
print(out_fp16.dtype)

Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
torch.float16


## model to eval, no grad

In [1067]:
# 137. typical eval usage
hp.eval()
with torch.no_grad():
    res_e = hp(inp_fp16)
print("eval done.")

Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
eval done.


## zeroing partial usage

## transformation snippet

## let us do .nonzero()

In [1068]:
# 138. .nonzero
aa_ = torch.tensor([0,1,0,3,0])
nz = aa_.nonzero(as_tuple=True)
print(nz)

(tensor([1, 3]),)


## gather advanced

## we do 62 more

In [1069]:
# 139. advanced example using gather for topk
values_, indices_ = torch.topk(logits_, 2, dim=1)
print(values_, indices_)

tensor([[0.6003, 0.1695],
        [1.2402, 1.0446],
        [1.8576, 0.0402]]) tensor([[3, 1],
        [3, 2],
        [2, 1]])


## autograd grad mode

In [1070]:
# 140. torch.set_grad_enabled
torch.set_grad_enabled(False)
tmp_no_grad = net_cls(torch.randn(2,4).half())
torch.set_grad_enabled(True)
print("temp done.")

Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
temp done.


## advanced indexing 2

In [1071]:
# 141. scatter
z_sc = torch.zeros(2,4, dtype=torch.int64)
idx_sc = torch.tensor([[1],[3]], dtype=torch.int64)
src_sc = torch.tensor([10,20], dtype=torch.int64)
z_sc.scatter_(dim=1, index=idx_sc, src=src_sc.unsqueeze(1))
print(z_sc)

tensor([[ 0, 10,  0,  0],
        [ 0,  0,  0, 20]])


## complex usage skip

In [1072]:
# 142. example: complex
cplx = torch.tensor([1+2j, 3+4j], dtype=torch.cfloat)
print(cplx)

tensor([1.+2.j, 3.+4.j])


## spectral ops (torch.fft) if new version

In [1073]:
# 143. torch.fft.fft
try:
    import torch.fft
    sig_fft = torch.fft.fft(torch.randn(4, dtype=torch.cfloat))
    print(sig_fft)
except ImportError:
    print("torch.fft not available.")

tensor([-2.8946+0.3844j, -0.4065-0.3411j,  1.6987-0.4720j,  1.6165+0.7065j])


## random sampler

In [1074]:
# 144. RandomSampler
from torch.utils.data import RandomSampler
sampler = RandomSampler(ds)
print(list(sampler)[:5])

[33, 49, 73, 64, 10]


## WeightedRandomSampler

In [1075]:
# 145. WeightedRandomSampler
from torch.utils.data import WeightedRandomSampler
sample_weights = [1]*len(ds)
wrs = WeightedRandomSampler(sample_weights, num_samples=10, replacement=True)
sampled_indices = list(wrs)
print(sampled_indices)

[3, 38, 80, 41, 97, 31, 83, 31, 75, 58]


## Weighted sampling in loader

In [1076]:
# 146. DataLoader with WeightedRandomSampler
loader_w = DataLoader(ds, batch_size=4, sampler=wrs)
for bx,by in loader_w:
    print(bx.shape, by)
    break

torch.Size([4, 4]) tensor([1, 0, 1, 1])


## SubsetRandomSampler

In [1077]:
# 147. SubsetRandomSampler
from torch.utils.data import SubsetRandomSampler
indices_srs = [0,2,4,6]
srs = SubsetRandomSampler(indices_srs)
loader_srs = DataLoader(ds, batch_size=2, sampler=srs)
for bx,by in loader_srs:
    print(bx, by)
    break

tensor([[-1.2472, -0.1509, -1.4873, -1.0110],
        [ 0.2983,  0.7805,  0.9919, -0.3711]]) tensor([0, 1])


## tri-level logs

## autograd anom detect

In [1078]:
# 148. torch.autograd.set_detect_anomaly(True)
with torch.autograd.set_detect_anomaly(True):
    x_ano = torch.tensor([2.0], requires_grad=True)
    y_ano = x_ano**3
    y_ano.backward()
print("anomaly detection.")

anomaly detection.


## pinned memory usage: skip

## advanced BFS or topological skip

## 2 more advanced aggregator

## final 50 examples or so, let's accelerate

In [1079]:
# 149. torch.stft / torch.istft (some versions)
wav = torch.randn(1,4000)
stft_v = torch.stft(wav, n_fft=256, return_complex=False)
print(stft_v.shape)

torch.Size([1, 129, 63, 2])


## pad ops: F.pad

In [1080]:
# 150. F.pad
padded_2d = F.pad(torch.randn(1,3,10,10), pad=(1,1,2,2))
print(padded_2d.shape)

torch.Size([1, 3, 14, 12])


## grouping param usage

## partial coverage

In [1081]:
# 151. torch.nn.ReflectionPad2d
rpad2d = nn.ReflectionPad2d(2)
inp_pad = torch.randn(1,3,4,4)
out_pad = rpad2d(inp_pad)
print(out_pad.shape)

torch.Size([1, 3, 8, 8])


## advanced RNN utilities: nn.utils.rnn

## let's do ctc loss

In [1082]:
# 152. nn.CTCLoss
ctc = nn.CTCLoss()
log_probs = torch.randn(50,2,20).log_softmax(2).detach()
targets = torch.randint(1,20,(2,30), dtype=torch.long)
input_lengths = torch.full(size=(2,), fill_value=50, dtype=torch.long)
target_lengths = torch.full(size=(2,), fill_value=30, dtype=torch.long)
loss_ctc = ctc(log_probs, targets, input_lengths, target_lengths)
print(loss_ctc)

tensor(3.9878)


## matrix factorization example: torch.linalg

In [1083]:
# 153. torch.linalg.svd
import torch.linalg as lalg
svd_out = lalg.svd(mat2x2)
print(svd_out)

torch.return_types.linalg_svd(
U=tensor([[-0.7071, -0.7071],
        [-0.7071,  0.7071]]),
S=tensor([3.0000, 1.0000]),
Vh=tensor([[-0.7071, -0.7071],
        [-0.7071,  0.7071]]))


## advanced partial usage

In [1084]:
# 154. torch.nn.utils.weight_norm
wn_model = nn.Linear(4,2)
wn_model = nn.utils.weight_norm(wn_model)
print(wn_model.weight_g.shape, wn_model.weight_v.shape)

torch.Size([2, 1]) torch.Size([2, 4])


## torch.nn.parallel.DistributedDataParallel skip

## advanced quantization skip

In [1085]:
# 155. Functional groupnorm
gn_out = F.group_norm(torch.randn(2,6,4,4), num_groups=3)
print(gn_out.shape)

torch.Size([2, 6, 4, 4])


## advanced metrics skip

## torch.cuda.empty_cache

In [1086]:
# 156. torch.cuda.empty_cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("Cache emptied.")

## layer hooks backward

In [1087]:
# 157. register_backward_hook is deprecated in latest PyTorch, use register_full_backward_hook
def bw_hook(m, gin, gout):
    print("Backward hook.")

if hasattr(net_cls[0], 'register_full_backward_hook'):
    net_cls[0].register_full_backward_hook(bw_hook)
else:
    pass
print("backward hook set.")

backward hook set.


## jit script/tracing

In [1088]:
# 158. torch.jit.trace
# Remove backward hooks before tracing
for module in net_cls.modules():
	if hasattr(module, '_backward_hooks'):
		module._backward_hooks.clear()

# Convert input tensor to half precision
input_tensor = torch.randn(1, 4).half()

traced_model = torch.jit.trace(net_cls, input_tensor)
print(traced_model)

Forward hook triggered.

 Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Sequential(
  original_name=Sequential
  (0): Linear(original_name=Linear)
  (1): ReLU(original_name=ReLU)
  (2): Linear(original_name=Linear)
)


In [1089]:
# 159. torch.jit.save/torch.jit.load
traced_model.save('traced_model.pt')
loaded_tr = torch.jit.load('traced_model.pt')
print(loaded_tr)

RecursiveScriptModule(
  original_name=Sequential
  (0): RecursiveScriptModule(original_name=Linear)
  (1): RecursiveScriptModule(original_name=ReLU)
  (2): RecursiveScriptModule(original_name=Linear)
)


## memory usage check: torch.cuda.memory_allocated

In [1090]:
# 160. memory_allocated
if torch.cuda.is_available():
    mem_used = torch.cuda.memory_allocated()
    print("Mem used=", mem_used)

## philanthropic leftover

## torch.nn.utils.prune

In [1091]:
# 161. prune example
import torch.nn.utils.prune as prune

prune.random_unstructured(net_cls[0], name='weight', amount=0.3)
print(net_cls[0].weight)

tensor([[-0.0041,  0.0340,  0.0062, -0.0000],
        [-0.0162, -0.0147, -0.0000, -0.0183],
        [ 0.0362,  0.0032,  0.0073,  0.0035],
        [-0.0237,  0.0277, -0.0000,  0.0000],
        [-0.0000,  0.0108,  0.0000,  0.0113]], dtype=torch.float16,
       grad_fn=<MulBackward0>)


## example: partial Tensors

## advanced scatter reduce

In [1092]:
# 162. scatter add
temp_sc = torch.zeros(2,4)
idx_sa = torch.tensor([[0],[1]])
src_sa = torch.tensor([[2.0],[3.0]])
temp_sc.scatter_add_(dim=1, index=idx_sa, src=src_sa)
print(temp_sc)

tensor([[2., 0., 0., 0.],
        [0., 3., 0., 0.]])


## advanced destructive ops

## let's do 38 more

In [1093]:
# 163. torch.cdist
a_cdist = torch.randn(4,3)
b_cdist = torch.randn(5,3)
dist_c = torch.cdist(a_cdist, b_cdist)
print(dist_c.shape)

torch.Size([4, 5])


## dtype is

In [1094]:
# 164. Checking dtypes
print(torch.bool, torch.int8, torch.float64)

torch.bool torch.int8 torch.float64


## functional margin losses

In [1095]:
# 165. F.margin_ranking_loss
ma_loss = F.margin_ranking_loss(x1_, x2_, y_sign, margin=0.5)
print(ma_loss)

tensor(0.2500)


## advanced: pad sequence

In [1096]:
# 166. pad_sequence
seq1 = torch.tensor([1,2,3])
seq2 = torch.tensor([4,5])
padded_seq = nn.utils.rnn.pad_sequence([seq1,seq2], batch_first=True)
print(padded_seq)

tensor([[1, 2, 3],
        [4, 5, 0]])


## replicate for DP usage skip

## advanced BFS skip

## .copy_ operation

In [1097]:
# 167. .copy_
src_cpy = torch.tensor([5,6,7])
dst_cpy = torch.zeros(3)
dst_cpy.copy_(src_cpy)
print(dst_cpy)

tensor([5., 6., 7.])


## rename dimension skip

## gather from multiple dims

## net training example

In [1098]:
# 168. simple loop again
for epoch in range(1):
    for i, (bx,by) in enumerate(loader):
        bx = bx.half()  # Ensure bx is in half precision
        out_ = net_cls(bx)
        loss_ = criterion(out_, by)
        optim_sgd.zero_grad()
        loss_.backward()
        optim_sgd.step()
print("loop done.")

Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
Forward hook triggered. Linear(in_features=4, out_features=5, bias=True)
loop done.


## fiddling with partial usage

In [1099]:
# 169. .type(torch.float)
tmp_tt = torch.arange(5).type(torch.float)
print(tmp_tt, tmp_tt.dtype)

tensor([0., 1., 2., 3., 4.]) torch.float32


## example of param clone

In [1100]:
# 170. param clone
wparam = list(net_cls.parameters())[0]
clone_param = wparam.clone()
print(clone_param.shape)

torch.Size([5])


## partial usage of cross platform

## we do 30 more

## advanced transforms

## example: to() with dtype and device

In [1101]:
# 171.
y_ = torch.randn(2,2).to(dtype=torch.float64, device=device)
print(y_.dtype, y_.device)

torch.float64 cpu


## cat example multiple

## RNG for normal distribution

In [1102]:
# 172. torch.normal
mean_ = torch.tensor([0.0, 10.0])
std_ = torch.tensor([1.0, 2.0])
norm_ = torch.normal(mean=mean_, std=std_)
print(norm_)

tensor([-1.2384, 10.7849])


## advanced no usage

## coalesce for sparse skip

## let's do nn.TransformerEncoder, TransformerDecoder

In [1103]:
# 173. nn.TransformerEncoder
enc_layer = nn.TransformerEncoderLayer(d_model=8, nhead=2)
encoder = nn.TransformerEncoder(enc_layer, num_layers=2)
inp_enc = torch.randn(5,2,8)
enc_out = encoder(inp_enc)
print(enc_out.shape)

torch.Size([5, 2, 8])


In [1104]:
# 174. nn.TransformerDecoder
dec_layer = nn.TransformerDecoderLayer(d_model=8, nhead=2)
decoder = nn.TransformerDecoder(dec_layer, num_layers=2)
tgt_dec = torch.randn(6,2,8)
dec_out = decoder(tgt_dec, enc_out)
print(dec_out.shape)

torch.Size([6, 2, 8])


## advanced weighting in ctc skip

## doc usage skip

## HPC apex skip

## 25 more to 200

In [1105]:
# 175. from_numpy
import numpy as np
arr_np2 = np.array([1,2,3], dtype=np.float32)
t_from_np = torch.from_numpy(arr_np2)
print(t_from_np, t_from_np.dtype)

tensor([1., 2., 3.]) torch.float32


In [1106]:
# 176. tolist usage
python_list = t_from_np.tolist()
print(python_list, type(python_list))

[1.0, 2.0, 3.0] <class 'list'>


## Generator usage

In [1107]:
# 177. torch.Generator
gen_ = torch.Generator().manual_seed(123)
rr_ = torch.randn(3, generator=gen_)
print(rr_)

tensor([-0.1115,  0.1204, -0.3696])


## advanced AMP autocast

In [1108]:
# 178. autocast if available
from torch.cuda.amp import autocast, GradScaler

if torch.cuda.is_available():
    scaler_amp = GradScaler()
    with autocast():
        out_amp = net_cls(torch.randn(2,4).cuda())
    print("autocast done.")

## manual_ seed for cudnn

In [1109]:
# 179. torch.backends.cudnn
import torch.backends.cudnn as cudnn
cudnn.benchmark = True
cudnn.deterministic = False
print("cudnn settings.")

cudnn settings.


## global variable skip

## functional meltdown

In [1110]:
# 180. F.interpolate
small_im = torch.randn(1,3,16,16)
up_im = F.interpolate(small_im, size=(32,32), mode='bilinear')
print(up_im.shape)

torch.Size([1, 3, 32, 32])


## replicate usage skip

## 19 more to 200

In [1111]:
# 181. Flatten (2D to 1D)
tt_ = torch.randn(2,3)
flat_t = tt_.flatten()
print(flat_t.shape)

torch.Size([6])


In [1112]:
# 182. unflatten
unfl_t = flat_t.unflatten(0, (2,3))
print(unfl_t.shape)

torch.Size([2, 3])


## advanced negative indexing

In [1113]:
# 183. negative index slice
neg_slice = mat[-1, -2:]
print(neg_slice)

tensor([8, 9])


## rolling not standard

## advanced activation: gelu

In [1114]:
# 184. F.gelu
vals_gelu = torch.tensor([-1.0,0.0,1.0])
out_gelu = F.gelu(vals_gelu)
print(out_gelu)

tensor([-0.1587,  0.0000,  0.8413])


## advanced pooling

In [1115]:
# 185. nn.AvgPool2d
ap2d = nn.AvgPool2d(kernel_size=2)
ap_out = ap2d(img_2)
print(ap_out.shape)

torch.Size([2, 3, 16, 16])


## group conv

In [1116]:
# 186. group conv example
grp_conv = nn.Conv2d(4,4, kernel_size=3, groups=2)
grp_in = torch.randn(1,4,16,16)
grp_out = grp_conv(grp_in)
print(grp_out.shape)

torch.Size([1, 4, 14, 14])


## CosineAnnealingLR

In [1117]:
# 187. CosineAnnealingLR
opt_cos = optim.SGD(net_cls.parameters(), lr=0.1)
scheduler_cos = optim.lr_scheduler.CosineAnnealingLR(opt_cos, T_max=10)
for ep in range(3):
    scheduler_cos.step()
    print("Ep", ep, opt_cos.param_groups[0]['lr'])

Ep 0 0.09755282581475769
Ep 1 0.09045084971874738
Ep 2 0.07938926261462367


## LBFGS

In [1118]:
# 188. LBFGS
lbfgs_opt = optim.LBFGS(model.parameters(), lr=0.01)
print(lbfgs_opt)

LBFGS (
Parameter Group 0
    history_size: 100
    line_search_fn: None
    lr: 0.01
    max_eval: 25
    max_iter: 20
    tolerance_change: 1e-09
    tolerance_grad: 1e-07
)


## AdamW

In [1119]:
# 189. AdamW
adamw_opt = optim.AdamW(model.parameters(), lr=0.001)
print(adamw_opt)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)


## multi-labeller skip

## final 10

In [1120]:
# 190. detect anomaly again
with torch.autograd.set_detect_anomaly(True):
    pass
print("No error.")

No error.


## find submodule

In [1121]:
# 191. net.children(), net.named_children()
for child in net_cls.children():
    print(child)

Linear(in_features=4, out_features=5, bias=True)
ReLU()
Linear(in_features=5, out_features=2, bias=True)


## param grad clamp

In [1122]:
# 192. for p in model.parameters(): p.grad.data.clamp_(-1,1)
for p in net_cls.parameters():
    if p.grad is not None:
        p.grad.data.clamp_(-1,1)
print("grad clamp.")

grad clamp.


## Adam specific: betas

In [1123]:
# 193. adam with betas
adam_b = optim.Adam(model.parameters(), lr=0.001, betas=(0.9,0.99))
print(adam_b)

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.99)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)


## kill ephemeral

## replicate final

In [1124]:
# 194. torch.unique
arr_uniq = torch.tensor([1,2,2,3,3,3,4])
unique_vals = torch.unique(arr_uniq)
print(unique_vals)

tensor([1, 2, 3, 4])


In [1125]:
# 195. torch.sort
sorted_res = torch.sort(arr_uniq, descending=True)
print(sorted_res)

torch.return_types.sort(
values=tensor([4, 3, 3, 3, 2, 2, 1]),
indices=tensor([6, 3, 4, 5, 1, 2, 0]))


## torch.take, put

In [1126]:
# 196. torch.take
vals_take = torch.tensor([[10,20],[30,40]])
idx_take = torch.tensor([0,2])
taken = torch.take(vals_take, idx_take)
print(taken)

tensor([10, 30])


In [1127]:
# 197. torch.put_
vals_take.put_(idx_take, torch.tensor([-1,-2]))
print(vals_take)

tensor([[-1, 20],
        [-2, 40]])


## torch.searchsorted

In [1128]:
# 198. torch.searchsorted
sorted_arr = torch.tensor([1,3,5,7])
vals_ss = torch.tensor([2,6])
idx_ss2 = torch.searchsorted(sorted_arr, vals_ss)
print(idx_ss2)

tensor([1, 3])


## final 2 cells

In [1129]:
# 199. isfinite, isinf, isnan
arr_f = torch.tensor([1.0, float('inf'), float('nan'), 2.0])
print(torch.isfinite(arr_f))
print(torch.isinf(arr_f))
print(torch.isnan(arr_f))

tensor([ True, False, False,  True])
tensor([False,  True, False, False])
tensor([False, False,  True, False])


In [1130]:
# 200. We've reached 200 PyTorch examples!
print("All done with top 200.")

All done with top 200.
