In [1]:
import torch
print(torch.__version__)

2.8.0+cu128


In [2]:
if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Name of the GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available. Using CPU.")

GPU is available!
Name of the GPU: NVIDIA GeForce RTX 2070


## Creating a Tensor

In [3]:
# using empty to create a tensor
a = torch.empty(2, 3)

In [4]:
# check type
type(a)

torch.Tensor

In [5]:
# using zeros
torch.zeros(2, 3) # [ Rows X Columns ]

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [6]:
# using ones
torch.ones(2, 3)

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [7]:
# using rand
torch.rand(2, 3)

tensor([[0.9242, 0.0463, 0.8434],
        [0.9984, 0.8827, 0.0187]])

In [8]:
# use of rand gives random values between 0 and 1
torch.rand(2, 3)

tensor([[0.1542, 0.8421, 0.2154],
        [0.1468, 0.7689, 0.0921]])

In [9]:
# Using manual_seed to get the same random values
torch.manual_seed(100)
torch.rand(2, 3)

tensor([[0.1117, 0.8158, 0.2626],
        [0.4839, 0.6765, 0.7539]])

In [10]:
torch.manual_seed(100)
torch.rand(2, 3)

tensor([[0.1117, 0.8158, 0.2626],
        [0.4839, 0.6765, 0.7539]])

In [11]:
# using tensor to create a tensor from data
torch.tensor([[1,2,3],
              [4,5,6]])

tensor([[1, 2, 3],
        [4, 5, 6]])

In [12]:
# arange to create a tensor with a range of values
print("using arange ->", torch.arange(0, 10, 2))

# using linspace to create a tensor with linearly spaced values
print("using linspace ->", torch.linspace(0, 10, 10))

# using eye to create a 2D tensor with ones on the diagonal
print("using eye ->", torch.eye(5))

# using full to create a tensor filled with a specific value
print("using full ->", torch.full((3, 3), 5))

using arange -> tensor([0, 2, 4, 6, 8])
using linspace -> tensor([ 0.0000,  1.1111,  2.2222,  3.3333,  4.4444,  5.5556,  6.6667,  7.7778,
         8.8889, 10.0000])
using eye -> tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])
using full -> tensor([[5, 5, 5],
        [5, 5, 5],
        [5, 5, 5]])


## Tensor Shapes

In [13]:
x = torch.tensor([[1,2,3],[4,5,6]])
x

tensor([[1, 2, 3],
        [4, 5, 6]])

In [14]:
x.shape

torch.Size([2, 3])

In [15]:
# using empty_like to create a tensor with same shape as x
torch.empty_like(x)

tensor([[127309019458944, 127309019458944,               0],
        [ 99638745661696,  99638718228160, 127303583616912]])

In [16]:
# using zeros_like to create a tensor with same shape as x filled with zeros.
torch.zeros_like(x)

tensor([[0, 0, 0],
        [0, 0, 0]])

In [17]:
torch.ones_like(x)

tensor([[1, 1, 1],
        [1, 1, 1]])

In [18]:
# using dtype argument to specify data type cause by default it takes the data type of the input tensor.
torch.rand_like(x, dtype = torch.float32)

tensor([[0.2627, 0.0428, 0.2080],
        [0.1180, 0.1217, 0.7356]])

## Tensor Data Types

In [19]:
# find data type
x.dtype

torch.int64

In [20]:
# assign data type
torch.tensor([1.0, 2.0, 3.0], dtype=torch.int32)

tensor([1, 2, 3], dtype=torch.int32)

In [21]:
torch.tensor([1,2,3], dtype=torch.float64)

tensor([1., 2., 3.], dtype=torch.float64)

In [22]:
# using to()
x.to(torch.float32)

tensor([[1., 2., 3.],
        [4., 5., 6.]])

| **Data Type**             | **Dtype**         | **Description**                                                                                                                                                                |
|---------------------------|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **32-bit Floating Point** | `torch.float32`   | Standard floating-point type used for most deep learning tasks. Provides a balance between precision and memory usage.                                                         |
| **64-bit Floating Point** | `torch.float64`   | Double-precision floating point. Useful for high-precision numerical tasks but uses more memory.                                                                               |
| **16-bit Floating Point** | `torch.float16`   | Half-precision floating point. Commonly used in mixed-precision training to reduce memory and computational overhead on modern GPUs.                                            |
| **BFloat16**              | `torch.bfloat16`  | Brain floating-point format with reduced precision compared to `float16`. Used in mixed-precision training, especially on TPUs.                                                |
| **8-bit Floating Point**  | `torch.float8`    | Ultra-low-precision floating point. Used for experimental applications and extreme memory-constrained environments (less common).                                               |
| **8-bit Integer**         | `torch.int8`      | 8-bit signed integer. Used for quantized models to save memory and computation in inference.                                                                                   |
| **16-bit Integer**        | `torch.int16`     | 16-bit signed integer. Useful for special numerical tasks requiring intermediate precision.                                                                                    |
| **32-bit Integer**        | `torch.int32`     | Standard signed integer type. Commonly used for indexing and general-purpose numerical tasks.                                                                                  |
| **64-bit Integer**        | `torch.int64`     | Long integer type. Often used for large indexing arrays or for tasks involving large numbers.                                                                                  |
| **8-bit Unsigned Integer**| `torch.uint8`     | 8-bit unsigned integer. Commonly used for image data (e.g., pixel values between 0 and 255).                                                                                    |
| **Boolean**               | `torch.bool`      | Boolean type, stores `True` or `False` values. Often used for masks in logical operations.                                                                                      |
| **Complex 64**            | `torch.complex64` | Complex number type with 32-bit real and 32-bit imaginary parts. Used for scientific and signal processing tasks.                                                               |
| **Complex 128**           | `torch.complex128`| Complex number type with 64-bit real and 64-bit imaginary parts. Offers higher precision but uses more memory.                                                                 |
| **Quantized Integer**     | `torch.qint8`     | Quantized signed 8-bit integer. Used in quantized models for efficient inference.                                                                                              |
| **Quantized Unsigned Integer** | `torch.quint8` | Quantized unsigned 8-bit integer. Often used for quantized tensors in image-related tasks.                                                                                     |


## Mathematical operations

### 1. Scalar operation

In [23]:
x = torch.rand(2,2)
x

tensor([[0.7118, 0.7876],
        [0.4183, 0.9014]])

In [24]:
# addition
x + 2
# substraction
x - 2
# multiplication
x * 3
# division
x / 3
# int division
(x * 100)//3
# mod
((x * 100)//3)%2
# power
x**2

tensor([[0.5066, 0.6203],
        [0.1750, 0.8125]])

### 2. Element wise operation

In [25]:
a = torch.rand(2,3)
b = torch.rand(2,3)

print(a)
print(b)

tensor([[0.9969, 0.7565, 0.2239],
        [0.3023, 0.1784, 0.8238]])
tensor([[0.5557, 0.9770, 0.4440],
        [0.9478, 0.7445, 0.4892]])


In [26]:
# add
a + b
# sub
a - b
# multiply
a * b
# division
a / b
# power
a ** b
# mod
a % b

tensor([[0.4411, 0.7565, 0.2239],
        [0.3023, 0.1784, 0.3346]])

In [27]:
c = torch.tensor([1, -2, 3, -4])

In [28]:
# abs
torch.abs(c)

tensor([1, 2, 3, 4])

In [29]:
# negative
torch.neg(c)

tensor([-1,  2, -3,  4])

In [30]:
d = torch.tensor([1.9, 2.3, 3.7, 4.4])

In [31]:
# round
torch.round(d)

tensor([2., 2., 4., 4.])

In [32]:
# ceil
torch.ceil(d)

tensor([2., 3., 4., 5.])

In [33]:
# floor
torch.floor(d)

tensor([1., 2., 3., 4.])

In [34]:
# clamp
torch.clamp(d, min=2, max=3)

tensor([2.0000, 2.3000, 3.0000, 3.0000])

### 3. Reduction operation

In [35]:
e = torch.randint(size=(2,3), low=0, high=10, dtype=torch.float32)
e

tensor([[8., 0., 7.],
        [0., 0., 9.]])

In [36]:
# sum
torch.sum(e)
# sum along columns
torch.sum(e, dim=0)
# sum along rows
torch.sum(e, dim=1)

tensor([15.,  9.])

In [37]:
# mean
torch.mean(e)
# mean along col
torch.mean(e, dim=0)

tensor([4., 0., 8.])

In [38]:
# median
torch.median(e)

tensor(0.)

In [39]:
# max and min
torch.max(e)
torch.min(e)

tensor(0.)

In [40]:
# product
torch.prod(e)

tensor(0.)

In [41]:
# standard deviation
torch.std(e)

tensor(4.4272)

In [42]:
# variance
torch.var(e)

tensor(19.6000)

In [43]:
# argmax
torch.argmax(e)

tensor(5)

In [44]:
# argmin
torch.argmin(e)

tensor(1)

### 4. Matrix operations

In [45]:
f = torch.randint(size=(2,3), low=0, high=10)
g = torch.randint(size=(3,2), low=0, high=10)

print(f)
print(g)

tensor([[5, 7, 3],
        [9, 4, 0]])
tensor([[5, 7],
        [5, 9],
        [9, 7]])


In [46]:
# matrix multiplcation
torch.matmul(f, g)

tensor([[ 87, 119],
        [ 65,  99]])

In [47]:
vector1 = torch.tensor([1, 2])
vector2 = torch.tensor([3, 4])

# dot product
torch.dot(vector1, vector2)

tensor(11)

In [48]:
# transpose
torch.transpose(f, 0, 1)

tensor([[5, 9],
        [7, 4],
        [3, 0]])

In [49]:
h = torch.randint(size=(3, 3), low = 0, high=10, dtype = torch.float32)
h

tensor([[5., 9., 8.],
        [9., 7., 9.],
        [2., 6., 7.]])

In [50]:
# determinant
torch.det(h)

tensor(-110.)

In [51]:
# inverse
torch.inverse(h)

tensor([[ 0.0455,  0.1364, -0.2273],
        [ 0.4091, -0.1727, -0.2455],
        [-0.3636,  0.1091,  0.4182]])

### 5. Comparison operations

In [52]:
i = torch.randint(size=(2,3), low=0, high=10)
j = torch.randint(size=(2,3), low=0, high=10)

print(i)
print(j)

tensor([[7, 8, 3],
        [6, 1, 5]])
tensor([[5, 0, 4],
        [3, 8, 8]])


In [53]:
# greater than
i > j
# less than
i < j
# equal to
i == j
# not equal to
i != j
# greater than equal to
# less than equal to

tensor([[True, True, True],
        [True, True, True]])

### 6. Special functions

In [54]:
k = torch.randint(size=(2,3), low=0, high=10, dtype=torch.float32)
k

tensor([[3., 3., 5.],
        [0., 6., 4.]])

In [55]:
# log
torch.log(k)

tensor([[1.0986, 1.0986, 1.6094],
        [  -inf, 1.7918, 1.3863]])

In [56]:
# exp
torch.exp(k)

tensor([[ 20.0855,  20.0855, 148.4132],
        [  1.0000, 403.4288,  54.5981]])

In [57]:
# sqrt
torch.sqrt(k)

tensor([[1.7321, 1.7321, 2.2361],
        [0.0000, 2.4495, 2.0000]])

In [58]:
# sigmoid
torch.sigmoid(k)

tensor([[0.9526, 0.9526, 0.9933],
        [0.5000, 0.9975, 0.9820]])

In [59]:
# softmax
torch.softmax(k, dim=0)

tensor([[0.9526, 0.0474, 0.7311],
        [0.0474, 0.9526, 0.2689]])

In [60]:
# relu
torch.relu(k)

tensor([[3., 3., 5.],
        [0., 6., 4.]])

## Inplace Operations

In [61]:
m = torch.rand(2,3)
n = torch.rand(2,3)

print(m)
print(n)

tensor([[0.6574, 0.3451, 0.0453],
        [0.9798, 0.5548, 0.6868]])
tensor([[0.4920, 0.0748, 0.9605],
        [0.3271, 0.0103, 0.9516]])


In [62]:
m.add_(n)

tensor([[1.1494, 0.4199, 1.0058],
        [1.3069, 0.5650, 1.6384]])

In [63]:
m

tensor([[1.1494, 0.4199, 1.0058],
        [1.3069, 0.5650, 1.6384]])

In [64]:
n

tensor([[0.4920, 0.0748, 0.9605],
        [0.3271, 0.0103, 0.9516]])

In [65]:
torch.relu(m)

tensor([[1.1494, 0.4199, 1.0058],
        [1.3069, 0.5650, 1.6384]])

In [66]:
m.relu_()

tensor([[1.1494, 0.4199, 1.0058],
        [1.3069, 0.5650, 1.6384]])

In [67]:
m

tensor([[1.1494, 0.4199, 1.0058],
        [1.3069, 0.5650, 1.6384]])

## Copying a Tensor

In [68]:
a = torch.rand(2,3)
a

tensor([[0.2855, 0.2324, 0.9141],
        [0.7668, 0.1659, 0.4393]])

In [69]:
b = a

In [70]:
b

tensor([[0.2855, 0.2324, 0.9141],
        [0.7668, 0.1659, 0.4393]])

In [71]:
a[0][0] = 0

In [72]:
a

tensor([[0.0000, 0.2324, 0.9141],
        [0.7668, 0.1659, 0.4393]])

In [73]:
b

tensor([[0.0000, 0.2324, 0.9141],
        [0.7668, 0.1659, 0.4393]])

In [74]:
id(a)

127303584015968

In [75]:
id(b)

127303584015968

In [76]:
b = a.clone()

In [77]:
a

tensor([[0.0000, 0.2324, 0.9141],
        [0.7668, 0.1659, 0.4393]])

In [78]:
b

tensor([[0.0000, 0.2324, 0.9141],
        [0.7668, 0.1659, 0.4393]])

In [79]:
a[0][0] = 10

In [80]:
a

tensor([[10.0000,  0.2324,  0.9141],
        [ 0.7668,  0.1659,  0.4393]])

In [81]:
b

tensor([[0.0000, 0.2324, 0.9141],
        [0.7668, 0.1659, 0.4393]])

In [82]:
id(a)

127303584015968

In [83]:
id(b)

127303583957936

## Tensors Operations in GPU

In [84]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU


In [85]:
# Creating a new tensor on the GPU

torch.rand(2, 3, device = device)

tensor([[0.3563, 0.0303, 0.7088],
        [0.2009, 0.0224, 0.9896]], device='cuda:0')

In [86]:
# creating a tensor on CPU and then transferring it to GPU
a = torch.rand(2,3)
a

tensor([[0.2243, 0.8935, 0.0497],
        [0.1780, 0.3011, 0.1893]])

In [87]:
# transfer to GPU
b = a.to(device)
b

tensor([[0.2243, 0.8935, 0.0497],
        [0.1780, 0.3011, 0.1893]], device='cuda:0')

In [88]:
b + 5

tensor([[5.2243, 5.8935, 5.0497],
        [5.1780, 5.3011, 5.1893]], device='cuda:0')

In [89]:
# CPU vs GPU computation speed comparison
import time
import torch
# CPU computation

# Defining a large tensor size
size = (10000)
# Creating two large tensors on CPU
x_cpu = torch.rand(size, size)
y_cpu = torch.rand(size, size)
# Measuring time for addition on CPU
start_cpu = time.time()
result_cpu = torch.matmul(x_cpu, y_cpu)
end_cpu = time.time()
cpu_time = end_cpu - start_cpu
print(f"Time taken for matrix multiplication on CPU: {cpu_time:.4f} seconds.")

# GPU computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu.")
# Transferring tensors to GPU
X_GPU = x_cpu.to(device)
Y_GPU = y_cpu.to(device)
# Measuring time for addition on GPU
start_gpu = time.time()
result_gpu = torch.matmul(X_GPU, Y_GPU)
end_gpu = time.time()
gpu_time = end_gpu - start_gpu
print(f"Time taken for matrix multiplication on GPU: {gpu_time:.4f} seconds.")

# Result Comparison
print(f"GPU is: {cpu_time / gpu_time:.4f}x faster then CPU.")

Time taken for matrix multiplication on CPU: 5.2710 seconds.
Time taken for matrix multiplication on GPU: 0.0186 seconds.
GPU is: 283.2712x faster then CPU.


## Reshaping Tensors

In [90]:
a = torch.ones(4,4)
a

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [91]:
# reshape
a.reshape(2,2,2,2)

tensor([[[[1., 1.],
          [1., 1.]],

         [[1., 1.],
          [1., 1.]]],


        [[[1., 1.],
          [1., 1.]],

         [[1., 1.],
          [1., 1.]]]])

In [92]:
# flatten
a.flatten()

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [93]:
# Permuting dimensions
b = torch.randn(2, 3, 4)
b

tensor([[[-0.9356, -1.5977,  0.1933, -0.1624],
         [ 1.5556, -0.3431, -0.8697, -0.1950],
         [-0.7548, -1.5893,  1.7431,  0.6176]],

        [[ 0.8178,  0.6571,  1.0683,  0.1637],
         [-0.3231,  0.4631,  0.8596, -0.8124],
         [-0.1069,  0.2653,  0.9565, -1.5644]]])

In [94]:
b.permute(2, 0, 1)

tensor([[[-0.9356,  1.5556, -0.7548],
         [ 0.8178, -0.3231, -0.1069]],

        [[-1.5977, -0.3431, -1.5893],
         [ 0.6571,  0.4631,  0.2653]],

        [[ 0.1933, -0.8697,  1.7431],
         [ 1.0683,  0.8596,  0.9565]],

        [[-0.1624, -0.1950,  0.6176],
         [ 0.1637, -0.8124, -1.5644]]])

In [95]:
b.permute(2, 0, 1).shape

torch.Size([4, 2, 3])

In [96]:
# Unsqueezing
c = torch.rand(256, 256, 3)
c

tensor([[[0.8801, 0.4203, 0.8186],
         [0.0463, 0.7306, 0.2009],
         [0.3265, 0.6684, 0.3509],
         ...,
         [0.3562, 0.6196, 0.2052],
         [0.9444, 0.0385, 0.4012],
         [0.8760, 0.2689, 0.7486]],

        [[0.9468, 0.9516, 0.2972],
         [0.8308, 0.8938, 0.9509],
         [0.3025, 0.8195, 0.0917],
         ...,
         [0.1863, 0.8468, 0.6018],
         [0.0538, 0.0189, 0.9054],
         [0.3617, 0.3941, 0.7456]],

        [[0.3345, 0.5882, 0.2287],
         [0.2625, 0.9309, 0.0712],
         [0.6732, 0.6364, 0.9203],
         ...,
         [0.8705, 0.9539, 0.5377],
         [0.8492, 0.1176, 0.9881],
         [0.6234, 0.9691, 0.9766]],

        ...,

        [[0.3198, 0.4319, 0.1581],
         [0.3474, 0.4804, 0.6456],
         [0.5491, 0.7875, 0.6221],
         ...,
         [0.4005, 0.3181, 0.6964],
         [0.2300, 0.6768, 0.1062],
         [0.4556, 0.2257, 0.4327]],

        [[0.2503, 0.1219, 0.2062],
         [0.9130, 0.8486, 0.7581],
         [0.

In [97]:
c.unsqueeze(0).shape

torch.Size([1, 256, 256, 3])

In [98]:
# Squeezing
d = torch.rand(1, 256, 256, 3)
d

tensor([[[[0.5058, 0.0110, 0.9448],
          [0.0895, 0.4037, 0.5097],
          [0.2816, 0.7860, 0.1566],
          ...,
          [0.3483, 0.7965, 0.3163],
          [0.2564, 0.1395, 0.5789],
          [0.0860, 0.4074, 0.6638]],

         [[0.0800, 0.2696, 0.2378],
          [0.5578, 0.6793, 0.1440],
          [0.9403, 0.2297, 0.7370],
          ...,
          [0.8332, 0.3943, 0.9177],
          [0.1261, 0.2188, 0.1474],
          [0.1955, 0.6822, 0.5779]],

         [[0.1160, 0.5696, 0.7157],
          [0.3957, 0.2071, 0.9547],
          [0.2486, 0.7636, 0.8120],
          ...,
          [0.5408, 0.2199, 0.2131],
          [0.3142, 0.2201, 0.3451],
          [0.8051, 0.6446, 0.8832]],

         ...,

         [[0.1471, 0.3860, 0.8286],
          [0.8340, 0.1069, 0.0334],
          [0.2324, 0.8673, 0.7299],
          ...,
          [0.2802, 0.2591, 0.2299],
          [0.0152, 0.9752, 0.2129],
          [0.1397, 0.9821, 0.2340]],

         [[0.3041, 0.1512, 0.0832],
          [0.1312

In [99]:
d.squeeze(0)

tensor([[[0.5058, 0.0110, 0.9448],
         [0.0895, 0.4037, 0.5097],
         [0.2816, 0.7860, 0.1566],
         ...,
         [0.3483, 0.7965, 0.3163],
         [0.2564, 0.1395, 0.5789],
         [0.0860, 0.4074, 0.6638]],

        [[0.0800, 0.2696, 0.2378],
         [0.5578, 0.6793, 0.1440],
         [0.9403, 0.2297, 0.7370],
         ...,
         [0.8332, 0.3943, 0.9177],
         [0.1261, 0.2188, 0.1474],
         [0.1955, 0.6822, 0.5779]],

        [[0.1160, 0.5696, 0.7157],
         [0.3957, 0.2071, 0.9547],
         [0.2486, 0.7636, 0.8120],
         ...,
         [0.5408, 0.2199, 0.2131],
         [0.3142, 0.2201, 0.3451],
         [0.8051, 0.6446, 0.8832]],

        ...,

        [[0.1471, 0.3860, 0.8286],
         [0.8340, 0.1069, 0.0334],
         [0.2324, 0.8673, 0.7299],
         ...,
         [0.2802, 0.2591, 0.2299],
         [0.0152, 0.9752, 0.2129],
         [0.1397, 0.9821, 0.2340]],

        [[0.3041, 0.1512, 0.0832],
         [0.1312, 0.4697, 0.4004],
         [0.

## NumPy and PyTorch

In [100]:
import numpy as np

a = torch.tensor([1, 2, 3, 4, 5])
a

tensor([1, 2, 3, 4, 5])

In [101]:
b = a.numpy()
b

array([1, 2, 3, 4, 5])

In [102]:
type(b)

numpy.ndarray

In [103]:
c = np.array([1, 2, 3, 4, 5])
c

array([1, 2, 3, 4, 5])

In [104]:
torch.from_numpy(c)

tensor([1, 2, 3, 4, 5])

In [1]:
# language: python
import time
import tensorflow as tf

size = 10000  # reduce (e.g. 2048 or 4096) if you get OOM

def matmul_time(device_name, size, warmups=1):
    try:
        with tf.device(device_name):
            a = tf.random.uniform((size, size), dtype=tf.float32)
            b = tf.random.uniform((size, size), dtype=tf.float32)
            # warmup(s)
            for _ in range(warmups):
                _ = tf.matmul(a, b)
                _.numpy()  # force execution & move result to host
            # timed run
            start = time.perf_counter()
            c = tf.matmul(a, b)
            c.numpy()  # block until finished
            end = time.perf_counter()
            return end - start
    except tf.errors.ResourceExhaustedError as e:
        print(f"OOM on {device_name}: {e}")
        return None

# CPU
cpu_time = matmul_time('/CPU:0', size, warmups=1)
if cpu_time is not None:
    print(f"Time taken for matrix multiplication on CPU: {cpu_time:.4f} s")

# GPU (if available)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    gpu_time = matmul_time('/GPU:0', size, warmups=1)
    if gpu_time is not None:
        print(f"Time taken for matrix multiplication on GPU: {gpu_time:.4f} s")
        if cpu_time is not None:
            print(f"GPU is: {cpu_time / gpu_time:.4f}x faster than CPU")
else:
    print("No GPU detected. Using CPU only.")

2025-12-05 11:58:21.381907: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1764916106.825291 1037729 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6231 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2070, pci bus id: 0000:01:00.0, compute capability: 7.5
2025-12-05 11:58:26.839799: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 400000000 exceeds 10% of free system memory.
2025-12-05 11:58:26.954422: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 400000000 exceeds 10% of free system memory.
2025-12-05 11:58:27.074307: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 400000000 exceeds 10% of free system memor

Time taken for matrix multiplication on CPU: 6.9979 s


2025-12-05 11:58:41.408720: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 400000000 exceeds 10% of free system memory.


Time taken for matrix multiplication on GPU: 0.5230 s
GPU is: 13.3793x faster than CPU
