# 1. Data Manipulation and Preprocessing

Update: May 23, 2024

Author: Languisher Lin

In [1]:
import torch

## Tensor

### Basic Attributes

In [2]:
### --- Tensor attributes and characteristics --- ###

# Build tensors whose values are known up front:
#   x  -> the twelve floats 0.0 .. 11.0 along a single dimension
#   xx -> a 2 x 5 integer matrix taken from a nested Python list
x = torch.arange(0, 12, dtype=torch.float32)
xx = torch.tensor(
    [
        [2, 3, 4, 5, 6],
        [1, 2, 3, 4, 5],
    ]
)

x, xx

(tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.]),
 tensor([[2, 3, 4, 5, 6],
         [1, 2, 3, 4, 5]]))

In [3]:
# Total number of elements stored in the tensor (12 for x)
num_elements = x.numel()

# Shape of the tensor: one entry per axis, outermost axis first.
# xx is built from 2 inner lists of 5 values each, so its shape is (2, 5):
# 2 rows (i.e. 2 elements in each column) and 5 columns.
xx_shape = xx.shape

num_elements, xx_shape

(12, torch.Size([2, 5]))

In [4]:
# Change the shape without altering the number of elements or their values
# e.g. (12, ) -> (3, 4)
X = x.reshape(3, 4)

# Passing -1 lets reshape infer ONE component of the shape from the element
# count: 12 elements / 4 columns -> 3 rows (same result as the explicit call)
Y = x.reshape(-1, 4)

X, Y

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]]),
 tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]]))

In [5]:
# Create tensors filled with all 0s or all 1s
zero_tensor = torch.zeros((2, 3, 4))
one_tensor = torch.ones((2, 5))

# Create a tensor with random values drawn from a standard Gaussian (normal)
# distribution with mean 0 and standard deviation 1.
# No seed is set in this cell, so the values change on every run.
normal_tensor = torch.randn((3, 4))

zero_tensor, one_tensor, normal_tensor

(tensor([[[0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]],
 
         [[0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]]]),
 tensor([[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]),
 tensor([[-1.1151,  0.2723,  0.7684,  0.8018],
         [ 1.1924, -2.4827, -0.0274, -0.1840],
         [-0.5955, -0.4199,  0.1617,  0.9762]]))

In [6]:
# Indexing: similar to Python lists

# Read values out of the tensor
last_line = X[-1]
last_two_lines = X[1:3] # rows 1 and 2; the end index 3 is excluded

# Assign a single element, then multiple elements, with the same value
Y = X.clone() # Clone so that X itself is left untouched (like a "deepcopy")
Y[1, 2] = 17 # Single-element write
Y[:2, :] = 12 # Overwrites all of rows 0 and 1, including the 17 written just above

X, last_line, last_two_lines, Y

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]]),
 tensor([ 8.,  9., 10., 11.]),
 tensor([[ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]]),
 tensor([[12., 12., 12., 12.],
         [12., 12., 12., 12.],
         [ 8.,  9., 10., 11.]]))

In [7]:
# Unary operations: an element-wise function f: R -> R is applied to every
# entry, so the result has the same shape as the input
exp_x = torch.exp(x)

# Binary operations: applied element-wise between two tensors
a = torch.tensor([2, 3])
b = torch.tensor([3, 4])
# NOTE(review): despite the name "atimesb", `**` is element-wise exponentiation
# (2**3 = 8, 3**4 = 81), not multiplication; consider renaming, e.g. to a_pow_b.
atimesb = a ** b

# Element-wise equality test: yields a boolean tensor
aequalb = a == b

x, exp_x, atimesb, aequalb

(tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.]),
 tensor([1.0000e+00, 2.7183e+00, 7.3891e+00, 2.0086e+01, 5.4598e+01, 1.4841e+02,
         4.0343e+02, 1.0966e+03, 2.9810e+03, 8.1031e+03, 2.2026e+04, 5.9874e+04]),
 tensor([ 8, 81]),
 tensor([False, False]))

In [8]:
# Concatenating several tensors end to end
#   dim=0 -> join along rows:    (3, 4) and (3, 4) -> (6, 4)
#   dim=1 -> join along columns: (3, 4) and (3, 4) -> (3, 8)

X = torch.arange(0, 12, dtype=torch.float32).reshape(3, 4)
Y = torch.tensor(
    [
        [2.0, 1, 4, 3],
        [1, 2, 3, 4],
        [4, 3, 2, 1],
    ]
)
row_concate, col_concate = (torch.cat([X, Y], dim=axis) for axis in (0, 1))

row_concate, col_concate

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [ 2.,  1.,  4.,  3.],
         [ 1.,  2.,  3.,  4.],
         [ 4.,  3.,  2.,  1.]]),
 tensor([[ 0.,  1.,  2.,  3.,  2.,  1.,  4.,  3.],
         [ 4.,  5.,  6.,  7.,  1.,  2.,  3.,  4.],
         [ 8.,  9., 10., 11.,  4.,  3.,  2.,  1.]]))

In [9]:
# Broadcasting
# a has shape (3, 1) and b has shape (1, 2). To make the two shapes identical
# for the element-wise addition, both are expanded to (3, 2):
#   a: (3, 1) -> (3, 2) by copying its single column
#   b: (1, 2) -> (3, 2) by copying its single row
a = torch.arange(0, 3).reshape(3, 1)
b = torch.arange(0, 2).reshape(1, 2)

a, b, a + b

(tensor([[0],
         [1],
         [2]]),
 tensor([[0, 1]]),
 tensor([[0, 1],
         [1, 2],
         [2, 3]]))

In [10]:
# Memory saving
# Take this as an example: we want to update the values of an existing tensor
# without binding its name to a new tensor, i.e. by overwriting (replacing)
# the original data

Z = torch.zeros((3, 4))
id_orig = id(Z)

# X and Y are two names for the same tensor object here
X = Y = torch.ones((3, 4))
Z[:] = X + Y # "[:]" assigns into the existing tensor Z instead of rebinding the name Z
id_new = id(Z)

# True: Z is still the same Python object after the update
id_orig == id_new

True

### Conversion to other Python objects

In [11]:
X = torch.arange(12, dtype=torch.float32).reshape((3, 4))

# Torch -> NumPy: the numpy() method of the tensor
# NumPy -> Torch: the torch.from_numpy() function
# Attention: the two directions use DIFFERENT call styles
# (a method on the tensor vs. a function in the torch namespace)
# NOTE(review): per the PyTorch docs both conversions share memory with their
# source instead of copying -- confirm before mutating A or B in place.
A = X.numpy()
B = torch.from_numpy(A)

type(A), type(B)

(numpy.ndarray, torch.Tensor)

### Linear Algebra

Mathematically, a second-order tensor can be written as a matrix with $m$ rows and $n$ columns:

$$
\mathrm{A} = \begin{bmatrix} a_{11} & \cdots & a_{1n} \\ \vdots & & \vdots \\ a_{m1} & \cdots & a_{mn}\end{bmatrix}
$$

In [12]:
# A.T is the transpose of A: rows and columns are swapped, (3, 2) -> (2, 3)
A = torch.arange(6).reshape(3, 2) # 3 is the number of rows, thus m=3 and (m, n) = (3, 2)
A, A.T

(tensor([[0, 1],
         [2, 3],
         [4, 5]]),
 tensor([[0, 2, 4],
         [1, 3, 5]]))

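Hadamard product: for two matrices $\mathrm{A}$ and $\mathrm{B}$ of the same shape $m \times n$, denote by $c_{ij}$ the elements of the result matrix; then for every pair of indices $(i, j)$,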
$$
\forall i, j \in [\![1, m]\!] \times [\![1, n]\!],\quad c_{ij} = a_{ij} \times b_{ij}
$$

In [13]:
# B is an independent copy of A, so A * B multiplies each entry of A by itself
# (element-wise / Hadamard product)
B = A.clone()
A, A * B

(tensor([[0, 1],
         [2, 3],
         [4, 5]]),
 tensor([[ 0,  1],
         [ 4,  9],
         [16, 25]]))

Dot product (inner product) of two vectors $\mathrm{x}, \mathrm{y} \in \mathbb{R}^d$: $$ \langle x, y \rangle = \mathrm{x}^T \mathrm{y} = \sum_{i=1}^{d} x_i y_i $$

In [14]:
# Inputs are vectors in R^d, given here as 1-D tensors of equal length
# (shape (3,)), not as (d, 1) column matrices.
# The result is a 0-dimensional (scalar) tensor.
x = torch.ones(3, dtype=torch.float32)
y = torch.ones(3, dtype=torch.float32)

x, y, torch.dot(x, y)

(tensor([1., 1., 1.]), tensor([1., 1., 1.]), tensor(3.))

Matrix-Vector (MV) Products and Matrix-Matrix (MM) Products

In [15]:
# Check that the shapes are compatible first: A is (2, 3), v is (3,) and
# B is (3, 4), so A @ v has shape (2,) and A @ B has shape (2, 4).

A = torch.arange(0, 6, dtype=torch.float32).reshape(2, 3)
B = torch.ones(3, 4)
v = torch.ones(3)

# The @ operator handles both cases; torch.mv (matrix-vector) and
# torch.mm (matrix-matrix) are the explicit function forms.
A @ v, torch.mv(A, v), A @ B, torch.mm(A, B)

(tensor([ 3., 12.]),
 tensor([ 3., 12.]),
 tensor([[ 3.,  3.,  3.,  3.],
         [12., 12., 12., 12.]]),
 tensor([[ 3.,  3.,  3.,  3.],
         [12., 12., 12., 12.]]))

Sum (Reduction or Non-Reduction)

In [16]:
# Sum of all elements (reduces the whole tensor to a scalar tensor)
total_sum = A.sum()

# axis=0 sums down the rows, collapsing that axis: one sum per column
# axis=1 sums across the columns, collapsing that axis: one sum per row
col_sum = A.sum(axis=0)
line_sum = A.sum(axis=1)


A, total_sum, col_sum, line_sum

(tensor([[0., 1., 2.],
         [3., 4., 5.]]),
 tensor(15.),
 tensor([3., 5., 7.]),
 tensor([ 3., 12.]))

Norm: $l_2$ norm and Frobenius norm

In [17]:
# l_2 norm of a vector and Frobenius norm of a matrix.
# torch.norm is marked as deprecated in the PyTorch documentation; the
# torch.linalg functions below are its documented replacements and compute
# exactly these two norms by default.
u = torch.tensor([3.0, -4.0])
X = torch.ones((4, 9))

u_l2_norm = torch.linalg.vector_norm(u)         # sqrt(3^2 + (-4)^2) = 5
X_frobenius_norm = torch.linalg.matrix_norm(X)  # sqrt(36 * 1^2) = 6

u_l2_norm, X_frobenius_norm

(tensor(5.), tensor(6.))

## Reading the Dataset

In [18]:
import os

# Contents of the toy dataset; "NA" marks a missing entry.
csv_text = '''NumRooms,RoofType,Price
NA,NA,127500
2,NA,106000
4,Slate,178100
NA,NA,140000'''

# Make sure ../data exists, then write the CSV into it.
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'house_tiny.csv')
with open(data_file, 'w') as f:
    f.write(csv_text)

In [19]:
import pandas as pd

In [20]:
# Load the CSV into a DataFrame; the "NA" entries of the file show up as
# missing values in the displayed frame
data = pd.read_csv(data_file)
data

Unnamed: 0,NumRooms,RoofType,Price
0,,,127500
1,2.0,,106000
2,4.0,Slate,178100
3,,,140000


First, we should deal with the missing ("NaN") values.
One simple way is to encode a categorical column as 0/1 indicator columns (including one indicator for "missing"), and to replace missing numeric values with the mean of the corresponding column.

In [21]:
# Split the frame by position: the first two columns are the features,
# the third column is the prediction target.
target = data.iloc[:, 2]
features_raw = data.iloc[:, 0:2]

# One-hot encode the categorical column; dummy_na=True adds a separate
# indicator column for missing values (RoofType_nan).
features_encoded = pd.get_dummies(features_raw, dummy_na=True)

# Fill the remaining missing numeric values with the mean of their column.
inputs = features_encoded.fillna(features_encoded.mean())
inputs

Unnamed: 0,NumRooms,RoofType_Slate,RoofType_nan
0,3.0,False,True
1,2.0,False,True
2,4.0,True,False
3,3.0,False,True


Then, we could convert it to a tensor using the techniques we have already discussed.

In [22]:
# Conversion route:
# pandas DataFrame / Series -> NumPy array -> torch tensor
# dtype=float makes NumPy produce float64 arrays, so the boolean indicator
# columns become 0.0 / 1.0 and both tensors end up as torch.float64.
X = torch.tensor(inputs.to_numpy(dtype=float))
y = torch.tensor(target.to_numpy(dtype=float))
X, y

(tensor([[3., 0., 1.],
         [2., 0., 1.],
         [4., 1., 0.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500., 106000., 178100., 140000.], dtype=torch.float64))

## References

- [2.1 Data Manipulation](https://d2l.ai/chapter_preliminaries/ndarray.html)
- [2.2 Data Preprocessing](https://d2l.ai/chapter_preliminaries/pandas.html#)
- [2.3 Linear Algebra](https://d2l.ai/chapter_preliminaries/linear-algebra.html)