## ch2. Preliminaries

### 2.1. Data Manipulation

In [None]:
import torch

In [None]:
x = torch.arange(12, dtype=torch.float32)
x

In [None]:
x.numel()

In [None]:
x.shape

In [None]:
x = x.reshape(3,4)
x

In [None]:
torch.zeros((2,3,4))

In [None]:
torch.ones((2,3,4))

In [None]:
torch.tensor([[2,1,4,3], [1,2,3,4], [4,3,2,1]])

In [None]:
x[-1], x[1:3]

In [None]:
x[1, 2] = 17
x

In [None]:
x[:2, :] = 12
x

In [None]:
torch.exp(x)

In [None]:
x = torch.tensor([1.0, 2, 4, 8])
y = torch.tensor([2,2,2,2])

print(x // y, x % y)    ## floor division and modulo are applied element-wise as well
x+y, x-y, x*y, x/y, x**y

In [None]:
x = torch.arange(12, dtype=torch.float32).reshape(3,4)
y = torch.tensor([[2,1,4,3], [1,2,3,4], [4,3,2,1]])
torch.cat((x,y), dim=0), torch.cat((x,y), dim=1)

In [None]:
x == y

In [None]:
x.sum()

In [None]:
## Broadcasting procedure:
## (i) expand one or both tensors by copying elements along axes of length 1, so that both end up with the same shape
## (ii) perform the element-wise operation on the expanded tensors.

a = torch.arange(3).reshape((3,1))
b = torch.arange(2).reshape(1,2)      ## the tuple parentheses are optional; reshape((1,2)) just states the target shape more explicitly

print(a,b)
a + b

In [None]:
a = torch.tensor([[0,1,2], [0,1,2]])
b = torch.tensor([[0,1,2]*2])
a, b
## `[0,1,2]*2` is evaluated on the plain Python list before torch.tensor sees it, so `*` is list
## repetition here (-> [0,1,2,0,1,2]), not element-wise multiplication: b has shape (1, 6), a has shape (2, 3).

In [None]:
before = id(y)
y = y + x
print(id(y) == before)  ## False: `y + x` builds a new tensor and the name y is rebound to it

z = torch.zeros_like(y)
print(f'id(z): {id(z)}')
z[:] = x + y            ## slice assignment writes the result into the existing tensor z
print(f'id(z): {id(z)}')    ## same id as before: z is still the same object

before = id(x)
x += y      ## augmented assignment updates x in place
id(x) == before

In [None]:
A = x.numpy()
B = torch.from_numpy(A)
type(A), type(B)

In [None]:
a = torch.tensor([3.5])
a, a.item(), float(a), int(a)

### 2.2. Data preprocessing

In [None]:
import os, torch

os.makedirs(os.path.join('..', 'data'), exist_ok=True)
data_file = os.path.join('..', 'data', 'house_tiny.csv')
with open(data_file, 'w') as f:
    f.write('''NumRooms,RoofType,Price
NA,NA,127500
2,NA,106000
4,Slate,178100
NA,NA,140000''')

In [None]:
import pandas as pd

data = pd.read_csv(data_file)
print(data)

In [None]:
inputs, targets = data.iloc[:, 0:2], data.iloc[:, 2]
inputs = pd.get_dummies(inputs, dummy_na = True)
print(inputs)

In [None]:
inputs = inputs.fillna(inputs.mean())
print(inputs)

In [None]:
X = torch.tensor(inputs.to_numpy(dtype=float))
y = torch.tensor(targets.to_numpy(dtype=float))
X, y

#### Discussions
- Is it always reasonable to fill missing values (N/A) with the column mean?
- Real-world datasets contain a lot of outliers, so we have to handle them carefully.

### 2.3. Linear Algebra

In [None]:
import torch

x = torch.tensor(3.0)
y = torch.tensor(2.0)
x+y, x*y, x/y, x**y

In [None]:
x = torch.arange(3)
x

In [None]:
x[2], len(x), x.shape, x.size()

In [None]:
A = torch.arange(6).reshape(3,2)
A

In [None]:
A.T

In [None]:
A = torch.tensor([[1,2,3], [2,0,4], [3,4,5]])
A == A.T

In [None]:
torch.arange(24).reshape(2,3,4)

In [None]:
A = torch.arange(6, dtype=torch.float32).reshape(2,3)
B = A.clone()
A, A+B

In [None]:
A * B

In [None]:
a = 2
X = torch.arange(24).reshape(2,3,4)
a + X, (a * X).shape

In [None]:
x = torch.arange(3, dtype=torch.float32)
x, x.sum()

In [None]:
A.shape, A.sum()

In [None]:
A.shape, A.sum(axis=0).shape

In [None]:
A.sum(axis=[0,1]) == A.sum()

In [None]:
A.mean(), A.sum() / A.numel()

In [None]:
A.mean(axis=0), A.sum(axis=0) / A.shape[0]

In [None]:
print(A)
sum_A = A.sum(axis=1, keepdims=False)
sum_A, sum_A.shape

In [None]:
sum_A = A.sum(axis=1, keepdims=True)
sum_A, sum_A.shape

In [None]:
A / sum_A

In [None]:
A.cumsum(axis=0)

In [None]:
y = torch.ones(3, dtype=torch.float32)
x, y, torch.dot(x, y)

In [None]:
torch.sum(x * y)    ## dot product == element-wise multiplication & summing up

In [None]:
A.shape, x.shape, torch.mv(A, x), A@x, torch.matmul(A, x)

In [None]:
B = torch.ones(3,4)
torch.mm(A,B), A@B

In [None]:
u = torch.tensor([3.0, -4.0])
torch.norm(u)

In [None]:
torch.abs(u).sum()

In [None]:
torch.norm(torch.ones(4,9))

#### Discussions & Questions
- matrices can be decomposed into factors for machine-learning usage?!
> Norms capture various notions of the magnitude of a vector or matrix, and are commonly applied to the difference of two vectors to measure their distance apart.
>> other distance metrics?
- Questions
1. What is the difference between `@`, `matmul`, `dot`, and `mv`?
2. L1 norm vs. L2 norm: which one should be used in which case?

### 2.5. Automatic Differentiation

In [None]:
import torch

In [None]:
x = torch.arange(4.0)
x.requires_grad_(True)
print(x)
x.grad

In [None]:
y = 2 * torch.dot(x, x)
y

In [None]:
y.backward()
x.grad

In [None]:
x.grad == 4 * x

In [None]:
x.grad.zero_()  ## gradients accumulate across backward() calls, so clear the previous one first
print(x.grad)

y = x.sum()     ## y = sum_i x_i, so dy/dx_i = 1 for every i: the gradient is a vector of ones
y.backward()
x.grad

In [None]:
print(x.grad)
x.grad.zero_()
print(x.grad)   ## zero_() has reset the accumulated gradient to zeros in place

y = x * x
## y is a vector here, so backward() is given an explicit `gradient` vector of ones
y.backward(gradient=torch.ones(len(y)))
x.grad

In [None]:
x.grad.zero_()
y = x * x           ## element-wise square of x itself (lowercase x, not the unrelated tensor X)
u = y.detach()      ## same values as y, but cut off from the graph: autograd treats u as a constant
z = u * x

z.sum().backward()
x.grad == u         ## dz/dx = u, because no gradient flows back through u into x

In [None]:
x.grad.zero_()
y.sum().backward()
x.grad == 2 * x

In [None]:
def f(a):
    """Rescale ``a`` through data-dependent Python control flow.

    ``a`` is doubled once, then doubled again for as long as its norm stays
    below 1000.  The result is returned unchanged when its sum is positive
    and multiplied by 100 otherwise, so every path returns a scalar multiple
    of ``a``.
    """
    scaled = a * 2
    while scaled.norm() < 1000:
        scaled = scaled * 2
    # Guard-clause form of the final if/else.
    if scaled.sum() > 0:
        return scaled
    return 100 * scaled

In [None]:
a = torch.randn(size=(), requires_grad=True)
d = f(a)
d.backward()

In [None]:
a.grad == d / a

#### Discussions
- Autograd lets us design massive DL models!
- How autograd works
  1. attach gradients to variables
  2. record the computation
  3. execute the backpropagation function
  4. access the resulting gradient.

## ch3

### 3.1. Linear Regression

In [None]:
import math, time
import numpy as np
import torch

# !pip install d2l
from d2l import torch as d2l

In [None]:
n = 10_000
a = torch.ones(n)
b = torch.ones(n)

c = torch.zeros(n)
t = time.time()


## addition done one element at a time in a Python loop
for i in range(n):
    c[i] = a[i] + b[i]
print(f'{time.time() - t:.5f} sec')


## the same addition as a single vectorized tensor operation
t = time.time()
d = a + b
print(f'{time.time() - t:.5f} sec')     ## expected to be far faster than the loop above

In [None]:
def normal(x, mu, sigma):
    """Density of the normal distribution N(mu, sigma**2) evaluated at ``x``.

    ``x`` may be a scalar or a NumPy array; ``mu`` and ``sigma`` are scalars.
    """
    variance = sigma**2
    norm_const = 1 / math.sqrt(2 * math.pi * variance)
    return norm_const * np.exp(-0.5 * (x - mu)**2 / variance)

In [None]:
x = np.arange(-7, 7, 0.01)

params = [(0,1), (0,2), (3,1)]

d2l.plot(x, [normal(x, mu, sigma) for mu, sigma in params], 
         xlabel='x', ylabel='p(x)', figsize=(4.5, 2.5),
         legend=[f'mean {mu} std {sigma}' for mu, sigma in params])

### 3.2. Object-oriented design

In [None]:
import time
import numpy as np
import torch
from torch import nn
from d2l import torch as d2l

In [None]:
def add_to_class(Class):
    """Return a decorator that attaches a function to ``Class`` as an attribute.

    The decorated object is registered on ``Class`` under its own
    ``__name__`` and is also returned, so the module-level name keeps
    referring to the function instead of being rebound to ``None``.
    """
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
        # Whatever a decorator returns is what the decorated name gets bound to.
        return obj
    return wrapper

class A:
    """Toy class whose only state is the attribute ``b``."""
    def __init__(self):
        self.b = 1

a = A()

In [None]:
@add_to_class(A)
def do(self):
    """Print the instance attribute ``b``; attached to class ``A`` by the decorator."""
    print("Class attribute 'b' is", self.b)
    
a.do()

In [None]:
class HyperParameters:
    """Interface stub for the hyperparameter-saving mixin.

    Only the signature is sketched here; the cells below use
    ``d2l.HyperParameters``, not this stub.
    """
    def save_hyperparameters(self, ignore=[]):
        """Placeholder for saving constructor arguments as attributes.

        Raises:
            NotImplementedError: always; this stub has no implementation.
        """
        # `NotImplemented` is a sentinel value for binary-operator methods, not
        # an exception; raising it fails with a TypeError.
        raise NotImplementedError
    
class B(d2l.HyperParameters):
    """Demo of ``save_hyperparameters`` with an ``ignore`` list."""
    def __init__(self, a, b, c):
        # The prints below rely on this call having set self.a and self.b,
        # while 'c' (listed in `ignore`) is expected to stay unset.
        self.save_hyperparameters(ignore=['c'])
        print('self.a =', self.a, 'self.b =', self.b)
        print('There is no self.c =', not hasattr(self, 'c'))

b = B(a=1, b=2, c=3)

In [None]:
class ProgressBoard(d2l.HyperParameters):
    """Interface stub for a board that plots data points incrementally.

    Only the constructor signature and the ``draw`` hook are sketched here;
    the demo cell below uses ``d2l.ProgressBoard`` for actual plotting.
    """
    def __init__(self, xlabel=None, ylabel=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 ls=['-', '--', '-.', ':'], colors=['C0', 'C1', 'C2', 'C3'],
                 fig=None, axes=None, figsize=(3.5, 2.5), display=True):
        # NOTE(review): expected to store the constructor arguments as
        # attributes (see class B above) — confirm against the d2l docs.
        self.save_hyperparameters()

    def draw(self, x, y, label, every_n=1):
        """Placeholder for adding the point (x, y) to the curve named ``label``.

        Raises:
            NotImplementedError: always; this stub has no implementation.
        """
        # Raise the exception class; the `NotImplemented` sentinel is not
        # raisable and would produce a TypeError instead.
        raise NotImplementedError

In [None]:
board = d2l.ProgressBoard('x')
for x in np.arange(0, 10, 0.1):
    board.draw(x, np.sin(x), 'sin', every_n=2)
    board.draw(x, np.cos(x), 'cos', every_n=10)

In [None]:
## define Module class!
class Module(nn.Module, d2l.HyperParameters):
    """Base model class: an ``nn.Module`` with loss, plotting and step hooks.

    Subclasses are expected to override ``loss`` and ``configure_optimizers``
    and either set ``self.net`` or override ``forward``.  ``plot`` requires
    that ``self.trainer`` has been attached to the model.
    """
    def __init__(self, plot_train_per_epoch=2, plot_valid_per_epoch=1):
        super().__init__()
        self.save_hyperparameters()
        self.board = ProgressBoard()

    def loss(self, y_hat, y):
        """Loss hook; must be overridden by subclasses."""
        # NotImplementedError is the exception class; the `NotImplemented`
        # sentinel cannot be raised (it fails with a TypeError).
        raise NotImplementedError

    def forward(self, X):
        """Apply ``self.net`` to ``X``; asserts that ``net`` has been defined."""
        assert hasattr(self, 'net'), "Neural net is NOT defined"
        return self.net(X)

    def plot(self, key, value, train):
        """Draw ``value`` on the board as a point of the curve for ``key``.

        Args:
            key: metric name; prefixed with 'train_' or 'val_' for the label.
            value: tensor to plot; moved to CPU and converted to NumPy.
            train: True for a training point, False for a validation point.
        """
        assert hasattr(self, 'trainer'), "Trainer is not inited"
        self.board.xlabel = 'epoch'

        if train:
            # x position: processed training batches expressed in epochs.
            x = self.trainer.train_batch_idx / \
                self.trainer.num_train_batches
            # every_n is chosen so that `plot_train_per_epoch` points are
            # drawn per epoch.
            n = self.trainer.num_train_batches / \
                self.plot_train_per_epoch
        else:
            x = self.trainer.epoch + 1
            n = self.trainer.num_val_batches / \
                self.plot_valid_per_epoch

        self.board.draw(x, value.to(d2l.cpu()).detach().numpy(),
                        ('train_' if train else 'val_') + key,
                        every_n=int(n))

    def training_step(self, batch):
        """Compute, plot and return the loss for one training batch.

        The last element of ``batch`` is the target; all preceding elements
        are passed to the model as inputs.
        """
        l = self.loss(self(*batch[:-1]), batch[-1])
        self.plot('loss', l, train=True)
        return l

    def validation_step(self, batch):
        """Compute and plot the loss for one validation batch (returns None)."""
        l = self.loss(self(*batch[:-1]), batch[-1])
        self.plot('loss', l, train=False)


    def configure_optimizers(self):
        """Optimizer hook; must be overridden by subclasses."""
        raise NotImplementedError
    

In [None]:
class Trainer(d2l.HyperParameters):
    """Training-loop skeleton: wires a model to its data and iterates over epochs.

    ``fit_epoch`` is left as a hook and raises ``NotImplementedError`` here.
    """
    def __init__(self, max_epochs, num_gpus=0, gradient_clip_val=0):
        self.save_hyperparameters()
        assert num_gpus == 0, "No GPU support yet"

    def prepare_data(self, data):
        """Fetch the data loaders from ``data`` and record their batch counts."""
        self.train_dataloader = data.train_dataloader()
        self.val_dataloader = data.val_dataloader()
        self.num_train_batches = len(self.train_dataloader)
        # A missing validation loader counts as zero validation batches.
        if self.val_dataloader is None:
            self.num_val_batches = 0
        else:
            self.num_val_batches = len(self.val_dataloader)

    def prepare_model(self, model):
        """Link model and trainer, and size the plot's x-axis to the epoch count."""
        model.trainer = self
        model.board.xlim = [0, self.max_epochs]
        self.model = model

    def fit(self, model, data):
        """Train ``model`` on ``data`` for ``max_epochs`` epochs."""
        self.prepare_data(data)
        self.prepare_model(model)
        self.optim = model.configure_optimizers()
        # Reset the progress counters before the first epoch.
        self.epoch = 0
        self.train_batch_idx = 0
        self.val_batch_idx = 0
        for epoch in range(self.max_epochs):
            self.epoch = epoch
            self.fit_epoch()

    def fit_epoch(self):
        """Hook for one epoch of training; must be provided elsewhere."""
        raise NotImplementedError

#### Questions
- Is it always good to use OOP when constructing a DL model? I suspect there may be some performance drawback.
- I'm not that familiar with OOP in Python. What are `raise NotImplemented` (which should really be `raise NotImplementedError`) and `setattr()`? I need to study these more.

### 3.4. Linear regression from scratch

In [None]:
import torch
from d2l import torch as d2l

In [None]:
class LinearRegressionScratch(d2l.Module):
    """Linear regression model with hand-managed weight and bias tensors."""
    def __init__(self, num_inputs, lr, sigma=0.01):
        super().__init__()
        # Later methods read self.lr, so this call is expected to store the
        # constructor arguments as attributes.
        self.save_hyperparameters()
        self.W = torch.normal(0, sigma, (num_inputs, 1), requires_grad=True)    ## weight: (num_inputs, 1), drawn from N(0, sigma^2)
        self.b = torch.zeros(1, requires_grad=True)     ## bias: a single value initialised to zero
        

@d2l.add_to_class(LinearRegressionScratch)  ## register as the model's forward pass
def forward(self, X):
    """Affine prediction: ``X`` times the weights, plus the bias."""
    weighted = torch.matmul(X, self.W)
    return weighted + self.b

@d2l.add_to_class(LinearRegressionScratch)  ## register as the model's loss function
def loss(self, Y_hat, Y):
    """Half the squared error between prediction and target, averaged over all elements."""
    residual = Y_hat - Y
    return (residual ** 2 / 2).mean()

class SGD(d2l.HyperParameters):
    """Minimal stochastic gradient descent over a list of tensors.

    Args:
        params: iterable of tensors to update.
        lr: learning rate, i.e. the step size applied to each gradient.
    """
    def __init__(self, params, lr):
        self.save_hyperparameters()

    def step(self):
        """Update every parameter in place: ``param -= lr * param.grad``.

        Parameters without a gradient are skipped, mirroring the ``None``
        check in ``zero_grad``.
        """
        # In-place updates of tensors that require grad are only legal with
        # gradient tracking disabled; doing it here makes step() safe to call
        # whether or not the caller is already inside torch.no_grad().
        with torch.no_grad():
            for param in self.params:
                if param.grad is None:
                    continue
                param -= self.lr * param.grad

    def zero_grad(self):
        """Reset the gradient of every parameter that has one."""
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()
                
@d2l.add_to_class(LinearRegressionScratch)
def configure_optimizers(self):
    """Build the from-scratch SGD optimizer over the weight and the bias."""
    trainable = [self.W, self.b]
    return SGD(trainable, self.lr)

In [None]:
@d2l.add_to_class(d2l.Trainer)
def prepare_batch(self, batch):
    """Return ``batch`` unchanged; called by ``fit_epoch`` on every batch before use."""
    return batch

@d2l.add_to_class(d2l.Trainer)
def fit_epoch(self):
    """Run one training pass and, if a validation loader exists, one validation pass."""
    # --- training pass ---
    self.model.train()
    for train_batch in self.train_dataloader:
        step_loss = self.model.training_step(self.prepare_batch(train_batch))
        self.optim.zero_grad()
        with torch.no_grad():
            step_loss.backward()
            if self.gradient_clip_val > 0:
                self.clip_gradients(self.gradient_clip_val, self.model)
            self.optim.step()
        self.train_batch_idx += 1

    # --- validation pass (skipped when there is no validation loader) ---
    if self.val_dataloader is not None:
        self.model.eval()
        for val_batch in self.val_dataloader:
            with torch.no_grad():
                self.model.validation_step(self.prepare_batch(val_batch))
            self.val_batch_idx += 1

In [None]:
model = LinearRegressionScratch(2, lr=0.03)
data = d2l.SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2)
trainer = d2l.Trainer(max_epochs=3)
trainer.fit(model, data)

with torch.no_grad():
    print(f'error in estimating W: {data.w - model.W.reshape(data.w.shape)}')
    print(f'error in estimating b: {data.b - model.b}')

## ch4

### 4.1. Softmax Regression

- Discussions
    - One-hot encoding is usually used for classification.
    - The derivative of the cross-entropy function is easy to calculate. (And that's why it is used as a loss function!)
  

### 4.2. Image classification

In [None]:
import time
import torch
import torchvision
from torchvision import transforms
from d2l import torch as d2l

d2l.use_svg_display()

In [None]:
class FashionMNIST(d2l.DataModule):
    """Fashion-MNIST training and validation datasets wrapped as a d2l DataModule."""
    def __init__(self, batch_size=64, resize=(28, 28)):
        super().__init__()
        # NOTE(review): d2l's save_hyperparameters is understood to snapshot the
        # caller's local variables, so keep it ahead of locals such as `trans`
        # — confirm against the d2l docs before reordering.
        self.save_hyperparameters()
        # Resize every image to `resize`, then convert it to a tensor.
        trans = transforms.Compose([transforms.Resize(resize),
                                    transforms.ToTensor()])
        # self.root is not assigned in this class; presumably it is set by
        # d2l.DataModule.__init__ — TODO confirm.
        self.train = torchvision.datasets.FashionMNIST(
            root=self.root, train=True, transform=trans, download=True)
        # The train=False split is what this class exposes as validation data.
        self.val = torchvision.datasets.FashionMNIST(
            root=self.root, train=False, transform=trans, download=True)

In [None]:
data = FashionMNIST(resize=(32, 32))
print(len(data.train), len(data.val))
print(data.train[0][0].shape)

In [None]:
@d2l.add_to_class(FashionMNIST)
def text_labels(self, indices):
    """Translate numeric class indices into their text names."""
    names = ('t-shirt', 'trouser', 'pullover', 'dress', 'coat',
             'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot')
    return [names[int(idx)] for idx in indices]

@d2l.add_to_class(FashionMNIST)
def get_dataloader(self, train):
    """Return a DataLoader over the training split (shuffled) or the validation split."""
    if train:
        dataset = self.train
    else:
        dataset = self.val
    return torch.utils.data.DataLoader(dataset, batch_size=self.batch_size,
                                       shuffle=train,
                                       num_workers=self.num_workers)

In [None]:
X, y = next(iter(data.train_dataloader()))
print(X.shape, X.dtype, y.shape, y.dtype)

tic = time.time()
for X, y in data.train_dataloader():
    continue
f'{time.time() - tic:.2f} second'

In [None]:
def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):
    """Signature sketch only; always raises NotImplementedError.

    The ``visualize`` method below calls ``d2l.show_images``, not this stub.
    """
    raise NotImplementedError

@d2l.add_to_class(FashionMNIST)
def visualize(self, batch, nrows=1, ncols=8, labels=[]):
    """Show the images of ``batch`` in a grid, titled with their labels.

    When ``labels`` is empty, the titles are derived from the batch targets
    via ``text_labels``.
    """
    X, y = batch
    titles = labels if labels else self.text_labels(y)
    d2l.show_images(X.squeeze(1), nrows, ncols, titles=titles)
    
batch = next(iter(data.val_dataloader()))
data.visualize(batch)

### 4.3. The base classification model

In [None]:
import torch
from d2l import torch as d2l

In [None]:
class Classifier(d2l.Module):
    """Base class for classification models: plots validation loss and accuracy."""
    def validation_step(self, batch):
        """Plot loss and accuracy for one validation batch.

        The last element of ``batch`` is the label tensor; all preceding
        elements are passed to the model as inputs.
        """
        Y_hat = self(*batch[:-1])

        self.plot('loss', self.loss(Y_hat, batch[-1]), train=False)
        self.plot('acc', self.accuracy(Y_hat, batch[-1]), train=False)

    def accuracy(self, Y_hat, Y, averaged=True):
        """Fraction of correct predictions.

        Defined here because ``validation_step`` calls it and no other
        definition is visible in this notebook.

        Args:
            Y_hat: class scores; the last axis indexes the classes.
            Y: integer class labels, one per example.
            averaged: if True return the mean accuracy, otherwise the
                per-example 0/1 correctness tensor.
        """
        # Collapse any leading axes so that each row is one example.
        Y_hat = Y_hat.reshape((-1, Y_hat.shape[-1]))
        preds = Y_hat.argmax(axis=1).type(Y.dtype)
        compare = (preds == Y.reshape(-1)).type(torch.float32)
        return compare.mean() if averaged else compare

In [None]:
@d2l.add_to_class(d2l.Module)
def configure_optimizers(self):
    """Create a torch SGD optimizer over ``self.parameters()`` with learning rate ``self.lr``."""
    print('optimizer configured!')
    optimizer = torch.optim.SGD(self.parameters(), lr=self.lr)
    return optimizer

### 4.4. Softmax Regression from Scratch

In [None]:
import torch
from d2l import torch as d2l

In [None]:
X = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
X.sum(0, keepdims=True), X.sum(1, keepdims=True)

In [None]:
def softmax(X):
    """Row-wise softmax of a 2-D tensor.

    Each row of the result is non-negative and sums to 1.

    Args:
        X: tensor of shape (num_rows, num_classes) holding the logits.

    Returns:
        Tensor of the same shape holding the per-row probabilities.
    """
    # Softmax is unchanged by subtracting a per-row constant.  Shifting by the
    # row maximum keeps every exponent <= 0, so exp() cannot overflow to inf
    # (which would otherwise produce inf / inf = nan for large logits).
    shifted = X - X.max(dim=1, keepdim=True).values
    X_exp = torch.exp(shifted)
    partition = X_exp.sum(1, keepdims=True)
    return X_exp / partition

X = torch.rand((2, 5))
X_prob = softmax(X)
X_prob, X_prob.sum(1)

In [None]:
class SoftmaxRegressionScratch(d2l.Classifier):
    """Softmax regression with hand-managed weight and bias tensors."""
    def __init__(self, num_inputs, num_outputs, lr, sigma=0.01):
        super().__init__()
        self.save_hyperparameters()
        # Weights: (num_inputs, num_outputs), drawn from N(0, sigma^2).
        self.W = torch.normal(0, sigma, size=(num_inputs, num_outputs),
                              requires_grad=True)
        # Biases: one per output class, initialised to zero.
        self.b = torch.zeros(num_outputs, requires_grad=True)

    def parameters(self):
        """Return the trainable tensors as a list: the weight, then the bias."""
        # W and b are plain tensors rather than nn.Parameter objects, so they
        # are listed explicitly here.
        return [self.W, self.b]
    
@d2l.add_to_class(SoftmaxRegressionScratch)
def forward(self, X):
    """Flatten each example, apply the affine map, and return softmax probabilities."""
    num_inputs = self.W.shape[0]
    flat = X.reshape((-1, num_inputs))
    logits = torch.matmul(flat, self.W) + self.b
    return softmax(logits)

In [None]:
y = torch.tensor([0, 2])
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y_hat[[0, 1], y]

In [None]:
y = torch.tensor([0, 2])
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y_hat[[0, 1], y]

def cross_entropy(y_hat, y):
    """Mean negative log-probability assigned to the true classes.

    Args:
        y_hat: predicted probabilities, one row per example.
        y: integer class label of each example.
    """
    row_ids = list(range(len(y_hat)))
    # Pair row i with column y[i] to select the probability of the true class.
    true_class_probs = y_hat[row_ids, y]
    return -torch.log(true_class_probs).mean()

cross_entropy(y_hat, y)

@d2l.add_to_class(SoftmaxRegressionScratch)
def loss(self, y_hat, y):
    """Loss of the model: delegates to the ``cross_entropy`` function defined above."""
    return cross_entropy(y_hat, y)

In [None]:
data = d2l.FashionMNIST(batch_size=256)
model = SoftmaxRegressionScratch(num_inputs=784, num_outputs=10, lr=0.1)
trainer = d2l.Trainer(max_epochs=10)
trainer.fit(model, data)

## ch5

### 5.1. MLP

In [None]:
import torch
from d2l import torch as d2l

In [None]:
x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
y = torch.relu(x)
d2l.plot(x.detach(), y.detach(), 'x', 'relu(x)', figsize=(5, 2.5))

In [None]:
y.backward(torch.ones_like(x), retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of relu', figsize=(5, 2.5))

In [None]:
y = torch.sigmoid(x)
d2l.plot(x.detach(), y.detach(), 'x', 'sigmoid(x)', figsize=(5, 2.5))

In [None]:
x.grad.data.zero_()     ## gradients clear!
y.backward(torch.ones_like(x),retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of sigmoid', figsize=(5, 2.5))

In [None]:
y = torch.tanh(x)
d2l.plot(x.detach(), y.detach(), 'x', 'tanh(x)', figsize=(5, 2.5))

In [None]:
x.grad.data.zero_()
y.backward(torch.ones_like(x),retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of tanh', figsize=(5, 2.5))

#### Discussions
- ReLU incorporates non-linearities to build expressive multilayer NNs.
- ReLU is more amenable to optimization than the sigmoid or tanh function.
  - However, there is further research on activation functions like GELU!

### 5.2. Implementation of MLP

In [None]:
import torch
from torch import nn
from d2l import torch as d2l

In [None]:
class MLPScratch(d2l.Classifier):
    """One-hidden-layer MLP whose weights and biases are explicit ``nn.Parameter`` tensors."""
    def __init__(self, num_inputs, num_outputs, num_hiddens, lr, sigma=0.01):
        super().__init__()
        self.save_hyperparameters()
        # Hidden layer: weights are standard-normal samples scaled by sigma, biases are zero.
        self.W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens) * sigma)
        self.b1 = nn.Parameter(torch.zeros(num_hiddens))
        # Output layer, initialised the same way.
        self.W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs) * sigma)
        self.b2 = nn.Parameter(torch.zeros(num_outputs))

In [None]:
def relu(X):
    """Element-wise rectified linear unit: the maximum of ``X`` and zero."""
    return torch.maximum(X, torch.zeros_like(X))

@d2l.add_to_class(MLPScratch)
def forward(self, X):
    """Flatten the input, apply the ReLU hidden layer, and return the output-layer logits."""
    flat = X.reshape((-1, self.num_inputs))
    pre_activation = torch.matmul(flat, self.W1) + self.b1
    hidden = relu(pre_activation)
    return torch.matmul(hidden, self.W2) + self.b2

In [None]:
model = MLPScratch(num_inputs=784, num_outputs=10, num_hiddens=256, lr=0.1)
data = d2l.FashionMNIST(batch_size=256)
trainer = d2l.Trainer(max_epochs=10)
trainer.fit(model, data)

In [None]:
class MLP(d2l.Classifier):
    """One-hidden-layer MLP built from ``torch.nn`` layers."""
    def __init__(self, num_outputs, num_hiddens, lr):
        super().__init__()
        self.save_hyperparameters()
        # Flatten -> linear(num_hiddens) -> ReLU -> linear(num_outputs).  The lazy
        # layers take no input-size argument here, only the output size.
        # NOTE(review): no forward() is defined in this class; presumably the
        # inherited forward applies self.net — confirm against d2l.Module.
        self.net = nn.Sequential(nn.Flatten(), nn.LazyLinear(num_hiddens),
                                 nn.ReLU(), nn.LazyLinear(num_outputs))

### 5.3. Forward Prop, Back Prop, and computational graphs

- Discussions!