# 2.2. Data Preprocessing

In [1]:
import os

# Make sure the ../data directory exists, then (re)create a tiny CSV dataset in it.
os.makedirs(os.path.join('..','data'), exist_ok=True)
data_file = os.path.join('..','data','house_tiny.csv')

with open(data_file, 'w') as f:
    # The first entry is the header row (column names); every later entry is
    # one data example. 'NA' marks a missing value.
    f.writelines([
        'NumRooms,Alley,Price\n',
        'NA,Pave,127500\n',
        '2,NA,106000\n',
        '4,NA,178100\n',
        'NA,NA,140000\n',
    ])

In [2]:
import pandas as pd
# Load the CSV file; pandas parses the 'NA' entries as missing values (NaN).
data = pd.read_csv(data_file)
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000


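To handle missing data, typical methods include *imputation* and *deletion*:

* Imputation replaces missing values with substituted ones.
* Deletion ignores (drops) the missing values.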

In [3]:
# Split the frame by position: the first two columns (NumRooms, Alley) are the
# inputs and the third column (Price) is the target.
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# Impute missing numeric entries with their column mean. `numeric_only=True` is
# required on pandas >= 2.0, where `DataFrame.mean()` raises a TypeError when a
# non-numeric column such as `Alley` is present instead of silently skipping it.
# The string column is left untouched because the resulting Series of means has
# no `Alley` entry for `fillna` to use.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
print(inputs)

   NumRooms Alley
0       3.0  Pave
1       2.0   NaN
2       4.0   NaN
3       3.0   NaN


Only `NumRooms`, the numeric column, is filled with its mean value; the non-numeric `Alley` column still contains `NaN`.

In [4]:
# One-hot encode the categorical `Alley` column. `dummy_na=True` adds an
# `Alley_nan` indicator column so that missing values become their own category.
# `dtype=int` keeps the indicators as 0/1 integers: pandas >= 2.0 defaults to
# `bool`, and a mixed float/bool frame converts to an object array that
# `torch.tensor` cannot consume.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
print(inputs)

   NumRooms  Alley_Pave  Alley_nan
0       3.0           1          0
1       2.0           0          1
2       4.0           0          1
3       3.0           0          1


In [5]:
outputs  # the Price column that was split off as the target

0    127500
1    106000
2    178100
3    140000
Name: Price, dtype: int64

In [6]:
import torch
# Convert the preprocessed inputs and the target column to tensors.
# `to_numpy(dtype=float)` forces a single numeric dtype: the one-hot columns
# produced by `get_dummies` may be `bool` (the pandas >= 2.0 default), in which
# case `inputs.values` is an object array and `torch.tensor` raises a TypeError.
X, y = torch.tensor(inputs.to_numpy(dtype=float)), torch.tensor(outputs.to_numpy())
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

# Linear Algebra

In [7]:
import torch
# Scalars are represented as tensors that hold a single element.
x, y = torch.tensor(3.0), torch.tensor(2.0)
# Sum, product, quotient and power of the two scalars.
x + y, x * y, x / y, x**y

(tensor(5.), tensor(6.), tensor(1.5000), tensor(9.))

In [8]:
# A vector: a 1-D tensor holding the integers 0, 1, 2, 3.
x = torch.arange(4)
x

tensor([0, 1, 2, 3])

In [9]:
x[3]  # indexing a vector returns a 0-D tensor, not a plain Python number

tensor(3)

In [10]:
len(x)  # number of elements in the vector

4

In [11]:
x.shape  # for a 1-D tensor the shape has a single entry: its length

torch.Size([4])

Note that the word “dimension” tends to get overloaded in these contexts and this tends to confuse
people. To clarify, we use the dimensionality of a vector or an axis to refer to its length, i.e., the
number of elements of a vector or an axis. However, we use the dimensionality of a tensor to refer
to the number of axes that a tensor has. In this sense, the dimensionality of some axis of a tensor
will be the length of that axis.

In [12]:
# A matrix: the integers 0..19 laid out row by row in 5 rows and 4 columns.
A = torch.arange(20).reshape(5, 4)
A

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19]])

In [13]:
A.T  # transpose: rows and columns are swapped, giving a 4x5 matrix

tensor([[ 0,  4,  8, 12, 16],
        [ 1,  5,  9, 13, 17],
        [ 2,  6, 10, 14, 18],
        [ 3,  7, 11, 15, 19]])

In [14]:
# A tensor with three axes of lengths 2, 3 and 4 (two stacked 3x4 matrices).
X = torch.arange(24).reshape(2, 3, 4)
X

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])

In [15]:
# Rebuild `A` with float32 entries (the earlier `A` held integers).
A = torch.arange(20, dtype=torch.float32).reshape(5, 4)
B = A.clone() # Assign a copy of `A` to `B` by allocating new memory
# Show `A` next to the elementwise sum `A + B`.
A, A + B

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [12., 13., 14., 15.],
         [16., 17., 18., 19.]]),
 tensor([[ 0.,  2.,  4.,  6.],
         [ 8., 10., 12., 14.],
         [16., 18., 20., 22.],
         [24., 26., 28., 30.],
         [32., 34., 36., 38.]]))

In [16]:
A * B  # elementwise (Hadamard) product, not a matrix product

tensor([[  0.,   1.,   4.,   9.],
        [ 16.,  25.,  36.,  49.],
        [ 64.,  81., 100., 121.],
        [144., 169., 196., 225.],
        [256., 289., 324., 361.]])

In [17]:
A ** 2  # elementwise square; matches `A * B` above because `B` is a copy of `A`

tensor([[  0.,   1.,   4.,   9.],
        [ 16.,  25.,  36.,  49.],
        [ 64.,  81., 100., 121.],
        [144., 169., 196., 225.],
        [256., 289., 324., 361.]])

In [18]:
a = 2
X = torch.arange(24).reshape(2, 3, 4)
# Adding or multiplying by a scalar acts on every element and keeps the shape.
a + X, (a * X).shape # scalar, tensor operation

(tensor([[[ 2,  3,  4,  5],
          [ 6,  7,  8,  9],
          [10, 11, 12, 13]],
 
         [[14, 15, 16, 17],
          [18, 19, 20, 21],
          [22, 23, 24, 25]]]),
 torch.Size([2, 3, 4]))

In [19]:
# Rebind `x` as a float32 vector and show it next to the sum of its elements.
x = torch.arange(4, dtype=torch.float32)
x, x.sum()

(tensor([0., 1., 2., 3.]), tensor(6.))

In [22]:
A  # redisplay the 5x4 float matrix used in the reductions that follow

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.],
        [12., 13., 14., 15.],
        [16., 17., 18., 19.]])

In [21]:
A.shape, A.sum()  # with no axis given, sum() adds up every element into a scalar

(torch.Size([5, 4]), tensor(190.))

In [23]:
A_sum_axis0 = A.sum(axis=0)  # reduce along axis 0 (the rows): one sum per column
A_sum_axis0, A_sum_axis0.shape

(tensor([40., 45., 50., 55.]), torch.Size([4]))

In [24]:
A_sum_axis1 = A.sum(axis=1)  # reduce along axis 1 (the columns): one sum per row
A_sum_axis1, A_sum_axis1.shape

(tensor([ 6., 22., 38., 54., 70.]), torch.Size([5]))

In [25]:
A.sum(axis=[0, 1])  # reducing over both axes gives the same result as `A.sum()`

tensor(190.)

In [26]:
A.mean(), A.sum() / A.numel()  # the mean equals the total divided by the element count

(tensor(9.5000), tensor(9.5000))

In [27]:
A.mean(axis=0), A.sum(axis=0) / A.shape[0]  # column means: column sums divided by the number of rows

(tensor([ 8.,  9., 10., 11.]), tensor([ 8.,  9., 10., 11.]))

In [28]:
# keepdims=True retains the reduced axis with length 1, so the row sums come
# back as a 5x1 column instead of a flat vector of length 5.
sum_A = A.sum(axis=1, keepdims=True)
sum_A

tensor([[ 6.],
        [22.],
        [38.],
        [54.],
        [70.]])

In [33]:
A/sum_A  # broadcasting: each row of `A` is divided by its own row sum

tensor([[0.0000, 0.1667, 0.3333, 0.5000],
        [0.1818, 0.2273, 0.2727, 0.3182],
        [0.2105, 0.2368, 0.2632, 0.2895],
        [0.2222, 0.2407, 0.2593, 0.2778],
        [0.2286, 0.2429, 0.2571, 0.2714]])

In [36]:
A  # redisplay `A` for comparison with the cumulative results below

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.],
        [12., 13., 14., 15.],
        [16., 17., 18., 19.]])

In [37]:
A.cumsum(axis=0)  # cumulative sum along axis 0 (running total down each column)

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  6.,  8., 10.],
        [12., 15., 18., 21.],
        [24., 28., 32., 36.],
        [40., 45., 50., 55.]])

In [39]:
A.cumprod(axis=0)  # cumulative product along axis 0 (running product down each column)

tensor([[0.0000e+00, 1.0000e+00, 2.0000e+00, 3.0000e+00],
        [0.0000e+00, 5.0000e+00, 1.2000e+01, 2.1000e+01],
        [0.0000e+00, 4.5000e+01, 1.2000e+02, 2.3100e+02],
        [0.0000e+00, 5.8500e+02, 1.6800e+03, 3.4650e+03],
        [0.0000e+00, 9.9450e+03, 3.0240e+04, 6.5835e+04]])

In [40]:
# Rebind `y` as a vector of four ones.
y = torch.ones(4)
# Dot product: the sum of the elementwise products of `x` and `y`.
x,y,torch.dot(x,y)

(tensor([0., 1., 2., 3.]), tensor([1., 1., 1., 1.]), tensor(6.))

In [41]:
torch.sum(x*y)  # same value as `torch.dot(x, y)`: multiply elementwise, then sum

tensor(6.)

## `torch.mv`: matrix-vector products

In [44]:
help(torch.mv)  # print the built-in documentation for torch.mv

Help on built-in function mv:

mv(...)
    mv(input, vec, *, out=None) -> Tensor
    
    Performs a matrix-vector product of the matrix :attr:`input` and the vector
    :attr:`vec`.
    
    If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of
    size :math:`m`, :attr:`out` will be 1-D of size :math:`n`.
    
    .. note:: This function does not :ref:`broadcast <broadcasting-semantics>`.
    
    Args:
        input (Tensor): matrix to be multiplied
        vec (Tensor): vector to be multiplied
    
    Keyword args:
        out (Tensor, optional): the output tensor.
    
    Example::
    
        >>> mat = torch.randn(2, 3)
        >>> vec = torch.randn(3)
        >>> torch.mv(mat, vec)
        tensor([ 1.0404, -0.6361])



In [45]:
A.shape, x.shape, torch.mv(A,x)  # a (5, 4) matrix times a length-4 vector gives a length-5 vector

(torch.Size([5, 4]), torch.Size([4]), tensor([ 14.,  38.,  62.,  86., 110.]))

## `torch.mm`: matrix-matrix products

In [47]:
help(torch.mm)  # print the built-in documentation for torch.mm

Help on built-in function mm:

mm(...)
    mm(input, mat2, *, out=None) -> Tensor
    
    Performs a matrix multiplication of the matrices :attr:`input` and :attr:`mat2`.
    
    If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`mat2` is a
    :math:`(m \times p)` tensor, :attr:`out` will be a :math:`(n \times p)` tensor.
    
    .. note:: This function does not :ref:`broadcast <broadcasting-semantics>`.
              For broadcasting matrix products, see :func:`torch.matmul`.
    
    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
    
    Args:
        input (Tensor): the first matrix to be multiplied
        mat2 (Tensor): the second matrix to be multiplied
    
    Keyword args:
        out (Tensor, optional): the output tensor.
    
    Example::
    
        >>> mat1 = torch.randn(2, 3)
        >>> mat2 = torch.randn(3, 3)
        >>> torch.mm(mat1, mat2)
        tensor([[ 0.4851,  0.5037, -0.3633],
                [-0.0760, -3.6705,  2.4784]])



In [46]:
# Note: this rebinds `B`, which earlier held a copy of `A`.
B = torch.ones(4,3)
torch.mm(A,B)  # (5, 4) times (4, 3) gives a (5, 3) matrix

tensor([[ 6.,  6.,  6.],
        [22., 22., 22.],
        [38., 38., 38.],
        [54., 54., 54.],
        [70., 70., 70.]])

In [49]:
u = torch.tensor([3.0, -4.0])
# L2 (Euclidean) norm: the square root of the sum of squared elements.
# `torch.linalg.norm` is used because the PyTorch documentation marks
# `torch.norm` as deprecated.
torch.linalg.norm(u)

tensor(5.)

In [50]:
torch.abs(u).sum()  # L1 norm: the sum of the absolute values of the elements

tensor(7.)

## Frobenius norm of matrix $X \in \mathbb{R}^{m \times n}$

$$ \left \| X \right \|_F = \sqrt{\sum_{i=1}^{m}\sum_{j=1}^{n} x_{ij}^2} $$

In [52]:
# Frobenius norm of the matrix `A`: the square root of the sum of its squared
# entries. `torch.linalg.norm` is used because the PyTorch documentation marks
# `torch.norm` as deprecated.
torch.linalg.norm(A, ord='fro')

tensor(49.6991)

In [53]:
# Frobenius norm of a 4x9 matrix of ones: sqrt(36) = 6. `torch.linalg.norm`
# is used because the PyTorch documentation marks `torch.norm` as deprecated.
torch.linalg.norm(torch.ones(4,9), ord='fro')

tensor(6.)