Submission Guideline
* Do not clear your outputs. This notebook has no autograder, and we will not download and run your solution.
* You have until 11:59 pm tonight to finish.

In [71]:
import torch
# Seed PyTorch's global RNG so the random tensors drawn in this notebook are
# repeatable across fresh kernel runs.
torch.manual_seed(0)
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Why do we offload computations to GPU?

In [72]:
# Benchmark operands: (10000, 20000) @ (20000, 10000) -> (10000, 10000).
# torch.randn allocates them on the CPU by default.
random_matrix_a, random_matrix_b = (
    torch.randn((10000, 20000)),
    torch.randn((20000, 10000)),
)

In [73]:
%%timeit -n 1 -r 1
#CPU version
out = torch.matmul(random_matrix_a, random_matrix_b)
# out = random_matrix_a @ random_matrix_b
# out = torch.einsum('ij,jk->ik', random_matrix_a, random_matrix_b)
print(out.shape)

torch.Size([10000, 10000])
16.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [74]:
#Move to GPU
# Transfer both operands to the selected device (the GPU when one is available).
random_matrix_a, random_matrix_b = (
    operand.to(device) for operand in (random_matrix_a, random_matrix_b)
)

In [75]:
%%timeit -n 1 -r 1
#CPU version
out = torch.matmul(random_matrix_a, random_matrix_b)
# out = random_matrix_a @ random_matrix_b
# out = torch.einsum('ij,jk->ik', random_matrix_a, random_matrix_b)
print(out.shape)

torch.Size([10000, 10000])
230 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [76]:
# Drop the references to both matmul operands now that the benchmark is done.
del random_matrix_a
del random_matrix_b

### 1) Use `%%timeit` to compare the performance of the [inverse](https://docs.pytorch.org/docs/stable/generated/torch.inverse.html) and [mean](https://docs.pytorch.org/docs/stable/generated/torch.mean.html) functions on the CPU and the GPU.

In [77]:
# Square 10000 x 10000 random matrix used as the input to the inverse benchmark.
random_matrix_inv = torch.randn((10000, 10000))
# Input to the mean benchmark: averaging across rows maps (5, 20000) -> (20000).
random_matrix_mean = torch.randn((5, 20000))

In [78]:
%%timeit -n 1 -r 1
#CPU Inverse
print(random_matrix_inv.device)
out = torch.inverse(random_matrix_inv)
print(out.shape)

cpu
torch.Size([10000, 10000])
11.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [79]:
# Move random_matrix_inv to GPU
# Transfer the square matrix to the selected device for the GPU inverse benchmark.
random_matrix_inv = random_matrix_inv.to(device=device)

In [41]:
%%timeit -n 1 -r 1
#GPU Inverse
out = torch.inverse(random_matrix_inv)
print(out.shape)

torch.Size([10000, 10000])
137 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [42]:
%%timeit -n 1 -r 1
#CPU Mean
out = torch.mean(random_matrix_mean)
print(out)

tensor(0.0029)
20.2 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [43]:
#Move random_matrix_mean to GPU
# Transfer the mean-benchmark input to the selected device.
random_matrix_mean = random_matrix_mean.to(device=device)

In [44]:
%%timeit -n 1 -r 1
#GPU Mean
out = torch.mean(random_matrix_mean)
print(out)

tensor(0.0029, device='cuda:0')
542 μs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### 2) Complete the following code-blocks to generate a classifier for predicting 1000 classes.

Assume you have input features $\in \mathbb{R}^{512}$. Complete the shapes of the weight matrices and the einsum strings so that the number of features is [1024, 2048, 1000] for your three-layer network with no bias and ReLU activation.

In [56]:
# One batch: 64 samples, each with 1024 pixels carrying 512 features.
input_features = torch.randn((64, 1024, 512))
print(input_features.shape)

# Bias-free weights, one (in_features, out_features) matrix per layer:
# 512 -> 1024 -> 2048 -> 1000.
weight_1, weight_2, weight_3 = (
    torch.randn(fan_in, fan_out)
    for fan_in, fan_out in ((512, 1024), (1024, 2048), (2048, 1000))
)



torch.Size([64, 1024, 512])


In [70]:
# Layer 1: project 512 -> 1024 features per pixel; ReLU zeroes the negatives.
layer_1 = torch.einsum('bnd,dh->bnh', input_features, weight_1).relu()
# Layer 2: project 1024 -> 2048 features per pixel, again followed by ReLU.
layer_2 = torch.einsum('bnh,hm->bnm', layer_1, weight_2).relu()
# Layer 3: project 2048 -> 1000 class scores per pixel (no activation).
output = torch.einsum('bnm,mq->bnq', layer_2, weight_3)
# Average over the pixel axis (dim 1, size 1024): (64, 1024, 1000) -> (64, 1000).
# Same reduction via einsum: torch.einsum('bnq->bq', output) / output.shape[1]
output = output.mean(dim=1)
print(output.shape)

torch.Size([64, 1000])
