In [1]:
import torch

In [2]:
print(torch.cuda.is_available())

True


In [3]:
!nvidia-smi

Mon Jan  5 18:31:26 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 13.0     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          On  |   00000000:47:00.0 Off |                   On |
| N/A   33C    P0             50W /  400W |                  N/A   |     N/A      Default |
|                                         |                        |              Enabled |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [4]:
### Setting Device Agnostic Code

device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
device

'cuda'

### Introduction to Tensors
* A tensor is a multidimensional array specially designed for mathematical and computational efficiency
* * Scalars( 0 Dimension) like 1
* * Vectors( 1 Dimension) like [1,2,4]
* * Matrices( 2 Dimension) like [[1,2],[3,4]]
* * 3D Tensors like RGB Images
* * 4D Tensors like Batches of RGB Images
* * 5D Tensors like Video Data till n dimensional tensors...

In [6]:
torch.__version__

'2.10.0a0+b558c986e8.nv25.11'

In [7]:
print(f"Name of the GPU is {torch.cuda.get_device_name(0)}")

Name of the GPU is NVIDIA A100-SXM4-40GB MIG 1g.5gb


In [8]:
#Creating a Tensor

tensor1 = torch.tensor([1,2])
tensor1.dtype

torch.int64

In [9]:
tensor2 = torch.zeros(2,3)

In [10]:
tensor2

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [11]:
torch.manual_seed(42)
tensor3 = torch.rand(2,3)

In [12]:
tensor3

tensor([[0.8823, 0.9150, 0.3829],
        [0.9593, 0.3904, 0.6009]])

In [13]:
X = torch.tensor([[1,2,3],[3,4,5]])

In [14]:
X.shape

torch.Size([2, 3])

In [15]:
y = torch.tensor([1.98,2.9,3.4], dtype=torch.int32)

In [16]:
y

tensor([1, 2, 3], dtype=torch.int32)

In [17]:
y.to(torch.float16)

tensor([1., 2., 3.], dtype=torch.float16)

### Inplace Operations

* Suppose I want to sum two tensors but by default when we sum two tensors, there is a formation of third tensor to store the result which occupies space...
* to avoid we use inplace operations like add m + n and store it in m
* Syntax is m.add_(n)
* eg. m.sub_(n)

In [18]:
m = torch.tensor([1,2,4])
n = torch.tensor([4,5,7])

In [19]:
m.add_(n)

tensor([ 5,  7, 11])

In [20]:
m

tensor([ 5,  7, 11])

In [21]:
print( n == n.sub_(m))

tensor([True, True, True])


In [22]:
# copying a tensor

a = torch.tensor([1,2])
b = a.clone()

In [23]:
b

tensor([1, 2])

In [24]:
a = torch.tensor([1])
b

tensor([1, 2])

### Tensor operations on GPU

In [25]:
a.to(device)

tensor([1], device='cuda:0')

In [26]:
a + 5

tensor([6])

In [32]:
# Checking matrix multiplication on CPU and GPU

import time

size = 10000

matrix_cpu1 = torch.rand(size,size)
matrix_cpu2 = torch.rand(size,size)

start_time = time.time()
result_cpu = torch.matmul(matrix_cpu1, matrix_cpu2)
end_time = time.time()
cpu_time = end_time - start_time
print(f"Time taken using CPU is {end_time - start_time} seconds")

Time taken using CPU is 1.0675685405731201 seconds


In [31]:
matrix_cpu1 = torch.rand(size,size).to(device)
matrix_cpu2 = torch.rand(size,size).to(device)

start_time = time.time()
result_cpu = torch.matmul(matrix_cpu1, matrix_cpu2).to(device)
end_time = time.time()
gpu_time = end_time - start_time
print(f"Time taken using GPU is {end_time - start_time} seconds")

Time taken using GPU is 0.0017480850219726562 seconds


In [34]:
print(f"Speed of GPU to calculate matrix multiplication of {size,size} is {cpu_time / gpu_time}x faster than CPU")

Speed of GPU to calculate matrix multiplication of (10000, 10000) is 610.7074468085107x faster than CPU


In [36]:
# converting a tensor to Numpy array and vice versa
import numpy as np
c = a.numpy()

In [38]:
c.dtype

dtype('int64')

In [39]:
a.dtype

torch.int64

In [40]:
d = torch.from_numpy(c)

In [41]:
d.dtype

torch.int64