# 1 Installation


In [3]:
# One-time environment setup: uncomment the next line to install PyTorch via conda.
#!conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch

In [4]:
import torch

# Print the installed PyTorch version; the cell's value (last expression)
# then shows whether a CUDA device can be used.
print(torch.__version__)
torch.cuda.is_available()

1.2.0


True

# 2 Tensor
## 2.1 Create a tensor

In [5]:
# Uninitialized tensor: torch.empty only allocates a 5x3 tensor, it does not set
# the values, so the printed numbers are arbitrary and change between runs.
x = torch.empty(5, 3)
print(x)

tensor([[2.0165e-16, 4.5558e-41, 2.0165e-16],
        [4.5558e-41, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.5556e-41, 7.7052e+31, 7.2148e+22],
        [2.5226e-18, 2.5930e-09, 1.0299e-11]])


In [6]:
# Randomly initialized tensors.
# torch.rand  -> samples from the uniform distribution on [0, 1)
# torch.randn -> samples from the standard normal distribution (mean 0, variance 1)
x = torch.rand(5, 3)
y = torch.randn(5, 3)
print(x, y, sep="\n")

tensor([[0.7974, 0.4379, 0.5271],
        [0.0137, 0.1592, 0.2382],
        [0.0525, 0.2752, 0.8075],
        [0.1118, 0.5983, 0.4822],
        [0.4174, 0.4503, 0.8434]])
tensor([[ 0.6997, -1.3750, -0.2327],
        [ 0.1116, -1.4076,  0.5490],
        [ 0.9344,  0.3409, -0.1898],
        [-0.3784, -0.2884,  0.2256],
        [-1.0037, -0.2776,  0.9279]])


In [7]:
# From a Python list: the list elements become the tensor's values
# (this is a 1-D tensor holding 5 and 3, not a 5x3 tensor).
x = torch.tensor([5, 3])
print(x)

tensor([5, 3])


In [8]:
# From an existing tensor: rand_like reuses the shape of x,
# while the explicit dtype argument overrides x's float64.
x = torch.ones(5, 3, dtype=torch.float64)
y = torch.rand_like(x, dtype=torch.float)
print(x, y, sep="\n")

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)
tensor([[0.7728, 0.1372, 0.2402],
        [0.2752, 0.8795, 0.8423],
        [0.4405, 0.5696, 0.4664],
        [0.2972, 0.8016, 0.2430],
        [0.2186, 0.4146, 0.4985]])


## 2.2 Properties

In [9]:
# `size()` (method) and `shape` (attribute) report the same torch.Size
x = torch.ones(5, 3)
print(x.size(), x.shape, sep="\n")

torch.Size([5, 3])
torch.Size([5, 3])


In [10]:
# Addition, written four equivalent ways
x = torch.rand(5, 3)
y = torch.rand(5, 3)

# 1) operator form
print(x + y)

# 2) functional form writing into an output tensor that must already exist
result = torch.empty(5, 3)
torch.add(x, y, out=result)
print(result)

# 3) method form: returns a new tensor, y is left unchanged
print(y.add(x))

# 4) in-place form (trailing underscore): y itself now holds the sum
y.add_(x)
print(y)

tensor([[0.9397, 0.4407, 1.1115],
        [0.9903, 1.0324, 1.1561],
        [1.8697, 1.1561, 0.3952],
        [1.3803, 1.5323, 1.1323],
        [0.6961, 1.5515, 1.4617]])
tensor([[0.9397, 0.4407, 1.1115],
        [0.9903, 1.0324, 1.1561],
        [1.8697, 1.1561, 0.3952],
        [1.3803, 1.5323, 1.1323],
        [0.6961, 1.5515, 1.4617]])
tensor([[0.9397, 0.4407, 1.1115],
        [0.9903, 1.0324, 1.1561],
        [1.8697, 1.1561, 0.3952],
        [1.3803, 1.5323, 1.1323],
        [0.6961, 1.5515, 1.4617]])
tensor([[0.9397, 0.4407, 1.1115],
        [0.9903, 1.0324, 1.1561],
        [1.8697, 1.1561, 0.3952],
        [1.3803, 1.5323, 1.1323],
        [0.6961, 1.5515, 1.4617]])


In [11]:
# Indexing and slicing return a view onto the original tensor's data,
# not a copy, so no new data memory is allocated.
x = torch.ones(5, 3)
y = x[0, :]
y += 1  # in-place update through the view ...
print(x)  # ... is visible in x: its first row is now 2

tensor([[2., 2., 2.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])


In [12]:
# reshape: `view()` returns a new tensor object with a different shape, but it does NOT copy the data; the result shares its data with the original tensor
x = torch.ones(5, 3)
z = x.view(-1, 5)
x[0, 0] -= 1 # changing x is also visible through z (and vice versa), since both share the same data
print(z)

tensor([[0., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])


In [13]:
# Reshape without aliasing: clone() first, so the reshaped tensor owns its data
x = torch.ones(5, 3)
x_cp = x.clone().view(-1, 5)
x -= 1  # only x changes; the clone keeps the original values
print(x, x_cp, sep="\n")

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])
tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])


In [14]:
# Convert a one-element tensor to a plain Python scalar with item()
x = torch.randn(1)
print(x, x.item(), sep="\n")

tensor([1.2776])
1.2775917053222656


## 2.3 Broadcasting
The broadcasting mechanism expands one or both tensors by (conceptually) copying elements along the appropriate axes, so that after this transformation the two tensors have the same shape and can be combined elementwise.

In [15]:
# x has shape (1, 2) and y has shape (3, 1); their sum is broadcast to (3, 2)
x = torch.arange(2).view(1, 2)
y = torch.arange(3).view(3, 1)
print(x, y, x + y, sep="\n")

tensor([[0, 1]])
tensor([[0],
        [1],
        [2]])
tensor([[0, 1],
        [1, 2],
        [2, 3]])


## 2.4 Saving Memory
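In CPython, `id()` returns the memory address of the referenced Python object, so comparing the id of `y` before and after an operation tells us whether `y` still names the same tensor object or a newly created one.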

In [16]:
x, y = torch.tensor([1, 2]), torch.tensor([3, 4])
id_before = id(y)
# `x + y` builds a brand-new tensor and the name y is rebound to it
y = x + y
print(id(y) == id_before)

False


In [17]:
x, y = torch.tensor([1, 2]), torch.tensor([3, 4])
id_before = id(y)
# Slice assignment writes the sum into the existing y; y keeps naming the same object
y[:] = x + y
print(id(y) == id_before)

True


In [18]:
x, y = torch.tensor([1, 2]), torch.tensor([3, 4])
id_before = id(y)
# `out=y` makes torch.add store the result in the existing y
torch.add(x, y, out=y)
print(id(y) == id_before)


True


In [19]:
x, y = torch.tensor([1, 2]), torch.tensor([3, 4])
id_before = id(y)
# Augmented assignment updates y in place; y keeps naming the same object
y += x
print(id(y) == id_before)

True


In [20]:
x, y = torch.tensor([1, 2]), torch.tensor([3, 4])
id_before = id(y)
# add_ (trailing underscore) is the in-place variant of add
y.add_(x)
print(id(y) == id_before)

True


## 2.5 Conversion to Numpy and vice versa

In [21]:
# Tensor -> numpy.ndarray via numpy(): both objects SHARE the same memory
a = torch.ones(5)
b = a.numpy()
print(a, b)

a += 1  # in-place change on the tensor shows up in the array
print(a, b)

b += 1  # and an in-place change on the array shows up in the tensor
print(a, b)

tensor([1., 1., 1., 1., 1.]) [1. 1. 1. 1. 1.]
tensor([2., 2., 2., 2., 2.]) [2. 2. 2. 2. 2.]
tensor([3., 3., 3., 3., 3.]) [3. 3. 3. 3. 3.]


In [22]:
# numpy.ndarray -> tensor via `torch.from_numpy()`: both objects SHARE the same memory
import numpy as np

a = np.ones(5)
b = torch.from_numpy(a)
print(a, b)

a += 1  # in-place change on the array shows up in the tensor
print(a, b)

b += 1  # and an in-place change on the tensor shows up in the array
print(a, b)


[1. 1. 1. 1. 1.] tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
[2. 2. 2. 2. 2.] tensor([2., 2., 2., 2., 2.], dtype=torch.float64)
[3. 3. 3. 3. 3.] tensor([3., 3., 3., 3., 3.], dtype=torch.float64)


In [23]:
# numpy.ndarray -> tensor via `torch.tensor()`: the data is COPIED, nothing is shared
a = np.ones(5)
b = torch.tensor(a)
print(a, b)

a += 1  # only the array changes
print(a, b)

b += 2  # only the tensor changes
print(a, b)

[1. 1. 1. 1. 1.] tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
[2. 2. 2. 2. 2.] tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
[2. 2. 2. 2. 2.] tensor([3., 3., 3., 3., 3.], dtype=torch.float64)


## 2.6 Tensor on GPU

In [24]:
x = torch.ones(5, 3)
# Everything below is skipped on machines without a usable CUDA device.
if torch.cuda.is_available():
    device = torch.device("cuda")
    # Create y directly on the GPU, with the same shape and dtype as x
    y = torch.ones_like(x, device=device)
    # Move x to the GPU; equivalent to `x = x.to("cuda")`
    x = x.to(device)
    z = x + y
    print(z)
    # `to` can move the tensor and change its dtype in a single call
    print(z.to("cpu", torch.double))

tensor([[2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.]], device='cuda:0')
tensor([[2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.]], dtype=torch.float64)


# 3 Data Preprocessing
## 3.1 Reading the dataset

In [25]:
# Write a tiny CSV dataset to ./data/house_tiny.csv
import os

data_dir = os.path.join(".", "data")
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, "house_tiny.csv")

# First entry is the header (column names); each following entry is one sample.
# "NA" marks a missing value.
rows = (
    "NumRooms,Alley,Price\n",
    "NA,Pave,127500\n",
    "2,NA,106000\n",
    "4,NA,178100\n",
    "NA,NA,140000\n",
)
with open(data_file, "w") as f:
    f.writelines(rows)

In [26]:
# Read the dataset written by the previous cell into a DataFrame.
# The "NA" entries of the CSV show up as NaN in the printed frame.
import pandas as pd
data = pd.read_csv(data_file)
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000


## 3.2 Handle Missing Data

In [27]:
# Split the frame: the first two columns are the inputs, the third column is the target.
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# Replace missing NUMERICAL values by the column mean.
# numeric_only=True restricts the mean to numeric columns: without it,
# pandas >= 2.0 raises a TypeError on the string column "Alley".
inputs = inputs.fillna(inputs.mean(numeric_only=True))
print(inputs)

   NumRooms Alley
0       3.0  Pave
1       2.0   NaN
2       4.0   NaN
3       3.0   NaN


In [28]:
# For categorical or discrete values, we treat NaN as a category of its own and expand column "Alley"
# into one indicator column per category.
# dtype=int keeps the indicators as 0/1 integers. pandas >= 2.0 would otherwise produce booleans, and a
# frame mixing float and bool columns yields an object array that torch.tensor(inputs.values) cannot convert.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
print(inputs)

   NumRooms  Alley_Pave  Alley_nan
0       3.0           1          0
1       2.0           0          1
2       4.0           0          1
3       3.0           0          1


## 3.3 Conversion to the Tensor Format

In [29]:
# Convert the preprocessed inputs and the targets from pandas to tensors
X = torch.tensor(inputs.values)
y = torch.tensor(outputs.values)
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

# 4 Autograd
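Attributes and methods of `Tensor`:
- `requires_grad`: set to `True` when operations on the tensor should be tracked so that gradients can be computed
- `grad_fn`: the `Function` that created this tensor (`None` for leaf tensors created directly by the user)
- `grad`: the gradient accumulated by calls to `backward()`; it is summed over successive calls until it is cleared
- `backward()`: runs backpropagation and accumulates the result into the `grad` attribute of the leaf tensors
- `detach()`: returns a new tensor that shares the same data but is cut off from the computation graph; it does not modify the tensor it is called on
- `data`: returns a tensor that shares memory with the original (it is not a copy). Computations on `data` are not tracked, which means we can change the values without autograd noticing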


Or we can detach temporarily:
```python
with torch.no_grad():
    # Computation here will not be tracked
```

## 4.1 `requires_grad` and `grad_fn`

In [30]:
# A tensor created directly by the user is called a `leaf`; a leaf has no `grad_fn`
x = torch.ones(2, 2, requires_grad=True)
print(x, x.is_leaf, x.grad_fn, sep="\n")

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
True
None


In [31]:
# y is the result of an addition, so it is not a leaf and carries `AddBackward` as its `grad_fn`
y = x + 2
print(y, y.is_leaf, y.grad_fn, sep="\n")

tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)
False
<AddBackward0 object at 0x7efea6500400>


In [32]:
# Continue the graph: z = 3 * y^2 elementwise, out = mean of z (a scalar)
z = y * y * 3
out = z.mean()
print(z, out, sep="\n")

tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>)
tensor(27., grad_fn=<MeanBackward0>)


In [33]:
# Switching `requires_grad` on after creation
a = torch.rand(2, 2)  # `requires_grad` defaults to False
a = (a * 3) / (a - 1)
print(a.requires_grad)

a.requires_grad_(True)  # trailing underscore: flips the flag in place
print(a.requires_grad)

# Operations performed from now on are tracked, so b gets a grad_fn
b = (a * a).sum()
print(b.grad_fn)

False
True
<SumBackward0 object at 0x7efea650dba8>


## 4.2 `backward()`
### 4.2.1 Scalar
For `loss.backward()`, if `loss` is a scalar, we do not need to add a parameter for `backward()`

In [34]:
# Build a small graph that ends in a scalar: out = mean(3 * (x + 2)^2)
x = torch.ones(2, 2, requires_grad=True)
y = x + 2
z = y * y * 3
out = z.mean()

In [35]:
# `out` is a scalar, so backward() needs no argument.
# With out = mean(3 * (x + 2)^2) over 4 elements, d out / d x = 6 * (x + 2) / 4 = 4.5 at x = 1.
out.backward()
print(x.grad)

tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])


In [36]:
# Gradients ACCUMULATE: every backward() call adds its result to x.grad
out2 = x.sum()
out2.backward()
print("d out/d x + d out2 / d x: \n", x.grad)

out3 = x.sum()
# Reset the accumulated gradient in place before the next backward pass.
# Calling zero_() on x.grad directly avoids the discouraged `.data` access.
x.grad.zero_()
out3.backward()
print("d out3/dx:\n", x.grad)

d out/d x + d out2 / d x: 
 tensor([[5.5000, 5.5000],
        [5.5000, 5.5000]])
d out3/dx:
 tensor([[1., 1.],
        [1., 1.]])


### 4.2.2 Tensor
`backward()` differentiates a scalar; it does not compute the derivative of a tensor with respect to a tensor. So when we call `y.backward(w)`, where `y` and `w` are tensors of the same shape, we are effectively doing:
```python
l = torch.sum(y * w) # l is a scalar
l.backward()
```

In [37]:
# z is a non-scalar (2x2) result: y = 2 * x, reshaped into a matrix
x = torch.tensor([1.0, 2.0, 3.0, 4.0], requires_grad=True)
y = 2 * x
z = y.view(2, 2)
print(z)

tensor([[2., 4.],
        [6., 8.]], grad_fn=<ViewBackward>)


In [38]:
# v has the same shape as z and weights each element of z,
# so x.grad receives d sum(z * v) / d x = 2 * v (flattened)
v = torch.tensor([[1.0, 0.1], [0.01, 0.001]], dtype=torch.float)
z.backward(v)
print(x.grad)

tensor([2.0000, 0.2000, 0.0200, 0.0020])


## 4.3 Detach

In [39]:
x = torch.tensor(1.0, requires_grad=True)
y1 = x ** 2
with torch.no_grad():
    y2 = x ** 3; # computed under no_grad, so y2 is not tracked (y2.requires_grad is False)
y3 = y1 + y2
# NOTE: detach() is NOT in-place. It returns a new, detached tensor, and that
# return value is discarded here, so y3 itself stays attached to the graph.
# To really stop tracking, the returned tensor would have to be used,
# e.g. `y3_detached = y3.detach()`.
y3.detach()
y4 = 2 * y3 # built from the still-attached y3, so y4 is tracked as well
print(x.requires_grad)
print(y1, y1.requires_grad)
print(y2, y2.requires_grad)
print(y3, y3.requires_grad)
print(y4, y4.requires_grad)

True
tensor(1., grad_fn=<PowBackward0>) True
tensor(1.) False
tensor(2., grad_fn=<AddBackward0>) True
tensor(4., grad_fn=<MulBackward0>) True


In [40]:
#y2.backward() # Error: y2 was computed under torch.no_grad(), so it does not require grad
# NOTE(review): y4 was built from the non-detached y3 (the result of y3.detach()
# was discarded in the previous cell) and has a grad_fn, so the "Error" remark
# below looks inaccurate -- verify before relying on it.
#y4.backward() # Error, although y4.requires_grad is true
# Only y1 = x ** 2 contributes to the gradient (y2 is untracked): d y3 / d x = 2 * x = 2 at x = 1
y3.backward()
print(x.grad) 


tensor(2.)


## 4.4 `data`
Change the value without being tracked

In [41]:
x = torch.ones(1, requires_grad=True)
# `.data` exposes the same values, but the returned tensor does not require grad
print(x.data, x.data.requires_grad, sep="\n")

y = 2 * x
# Modifies x's values through shared memory; autograd does not record this change
x.data *= 100

y.backward()
# x now holds 100, while the gradient of y = 2 * x is still 2
print(x, x.grad, sep="\n")

tensor([1.])
False
tensor([100.], requires_grad=True)
tensor([2.])
