In [2]:
import torch
import numpy as np
import time
import torch.nn as nn
import torch.nn.functional as F
# Pick the compute device once, up front: CUDA when PyTorch can see a GPU,
# otherwise fall back to the CPU. Later cells move tensors with `.to(device)`.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# My own machine has no GPU, so this prints 'cpu' here; I relied on cloud GPUs
# whenever I needed one.
print(device)

# What's the objective?
# Here I'm learning a few methods for creating different types of tensors that may
# help control the data a Large Language Model uses.
# Basically, we're revisiting some high-school math (matrices) in code, although the
# concept of a tensor goes well beyond high school.

cpu


In [5]:
# Timing a cell by hand.
# The `%%time` cell magic prints the execution time of a whole cell, e.g.
#   CPU times: user 1.21 ms, sys: 44 µs, total: 1.25 ms
#   Wall time: 3.03 ms
# It did not work in my environment. A likely reason: IPython only recognises
# a cell magic when it is the very first line of the cell (nothing, not even a
# comment, may come before it).

# time.perf_counter() is the clock meant for measuring short intervals: it is
# monotonic and uses the highest-resolution timer available, whereas
# time.time() is the wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
# matrix operations
zeros = torch.zeros(1, 1)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"{elapsed_time:.8f}")
# OBS: While developing this code on Google Colab I got a very small elapsed
# time for this cell. Here I usually don't (the first PyTorch call in a fresh
# kernel presumably pays some one-time start-up cost).

0.01927161


In [18]:
# Testing GPU speed

# Shape of every operand: a 100 x 100 batch of 100 x 100 matrices (a 4-D tensor).
# Each tensor holds 100**4 = 10**8 float32 values (about 400 MB), so this cell
# needs roughly 2.4 GB for the four operands plus the two results.
BENCH_SHAPE = (100, 100, 100, 100)


def _wait_for(tensor):
    """Block until queued CUDA work is finished; does nothing for CPU tensors.

    CUDA kernels are launched asynchronously, so without this the timer would
    only measure how long it takes to *queue* the multiplication.
    """
    if tensor.is_cuda:
        torch.cuda.synchronize()


# creating torch matrices on the selected device:
torch_rand1 = torch.rand(BENCH_SHAPE).to(device)
torch_rand2 = torch.rand(BENCH_SHAPE).to(device)
# As I said, my current laptop does not have a GPU, so device == 'cpu' here,
# but the results on Google Colab indicate that the GPU is faster for
# parallel work such as multiplying these 4-dimensional tensors.

# CPU operands (on my laptop both pairs live on the CPU, but on Google Colab
# this made a difference). Despite the "np_" prefix these are torch tensors.
np_rand1 = torch.rand(BENCH_SHAPE)
np_rand2 = torch.rand(BENCH_SHAPE)

_wait_for(torch_rand1)  # make sure the host-to-device copies are done before timing
start_time = time.perf_counter()

rand = torch_rand1 @ torch_rand2  # `@` is Python's matrix-multiplication operator.
# `@` only works on objects whose library implements matrix multiplication
# (the __matmul__ method), such as torch tensors or NumPy arrays.
# The same logic can be rebuilt with nested Python lists, but it is far slower.
# Torch does the work in compiled C++ code on contiguous arrays.

_wait_for(rand)  # wait for the GPU to actually finish before stopping the clock
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"{elapsed_time:.8f}")

start_time = time.perf_counter()

# land = np.multiply(np_rand1, np_rand2) - this is element-wise, not matrix,
# multiplication. Professor made a mistake here. You can use PyTorch's matrix
# multiplication directly: just skip ".to(device)" (when device == 'cuda')
# and you'll measure the CPU speed.

land = np_rand1 @ np_rand2

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"{elapsed_time:.8f}")
# NOTE: when device == 'cpu' both timings run the same CPU operation, so a gap
# between them is presumably warm-up / memory-allocation noise, not a real
# device difference.

# IMPORTANT: torch.rand(x, y, z, ...) creates a tensor with N dimensions, N being
# the number of sizes passed to ".rand()".
# The number of elements inside the tensor = x * y * z * ... (the product of
# all sizes).
# PyTorch's core is written in C++, and its default dtype (float32) costs
# 4 bytes per element.
# For a 6D tensor like (100, 100, 100, 100, 100, 100) the memory used would be:
# 4 (bytes) x 10^12 (elements) = 4 TB. So, take care!

10.26713490
2.16649580


In [26]:
# This part of the code is, basically, the structure that sustains the
# mutation rate of the NEAT algorithm.

# First step: define a float tensor of weights, one per outcome
# (index 0 gets weight 0.1, index 1 gets weight 0.9).
probabilities = torch.tensor([0.1, 0.9])
# The weights don't need to sum to 1 (torch.multinomial only requires them to
# be non-negative with a non-zero sum), but writing real probabilities is
# clearer to read.

# A dedicated, seeded generator makes the draw reproducible: re-running the
# cell (or the whole notebook) yields the same samples, and the global RNG
# used by other cells is left untouched.
sampling_rng = torch.Generator().manual_seed(0)

# Second step: draw the samples. Each sample is the INDEX of the chosen weight.
# This could be the part of the NEAT algorithm that initializes the mutation
# rate of some parameters.
samples = torch.multinomial(
    probabilities, num_samples=10, replacement=True, generator=sampling_rng
)
# Generic torch.multinomial:
# torch.multinomial(weights, num_samples, replacement, generator=None)
# OBS: if replacement=False, num_samples must be <= the number of non-zero
# weights (each index can then be drawn only once), or it raises an error.
print(samples)

tensor([1, 1, 1, 0, 1, 1, 1, 1, 1, 1])


In [30]:
# torch.cat joins tensors end-to-end along an existing dimension, which for
# 1-D tensors behaves like appending one to the other.
tensor = torch.tensor([1, 2, 3, 4])  # one-dimensional tensor
extra = torch.tensor([2, 3])         # values to append

output = torch.cat([tensor, extra], dim=0)
output
# Note: Jupyter displays the value of the last expression in a cell, which is
# why print(output) is not needed here (a rough explanation).

tensor([1, 2, 3, 4, 2, 3])

In [12]:
# Lower-triangular matrix: ones on and below the main diagonal, zeros above it.
out = torch.ones(5, 5).tril()
out

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [20]:
# Upper-triangular matrix: ones on and above the main diagonal, zeros below it.
torch.ones(5, 5).triu()

tensor([[1., 1., 1., 1., 1.],
        [0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.]])

In [23]:
# Important
# Breaking down the code, step by step:

# 1) The mask: a boolean 5 x 5 tensor that is True on and above the main
#    diagonal (the upper triangle of ones) and False everywhere else.
mask = torch.ones(5, 5).triu().bool()
# 2) The tensor to modify: 5 x 5, filled with zeros.
out = torch.zeros(5, 5)
# 3) ".masked_fill_(mask, value)" modifies the tensor IN PLACE (that is what the
#    trailing underscore means in PyTorch) and also returns that same tensor.
out.masked_fill_(mask, float('-inf'))
# The two parameters are:
# mask  --> a boolean tensor whose shape must be broadcastable to the shape of
#           the tensor being filled (here it is simply the same 5 x 5 shape).
#           Positions where the mask is True are overwritten; positions where
#           it is False stay as they are.
# value --> the new value written into every "True" position. You can choose
#           any value; here it is -inf.
# Note: There's also masked_fill (without the trailing underscore), which
# returns a new tensor instead of modifying the original one. In this cell the
# choice makes no visible difference, because the result ends up in the same
# variable either way.
out

tensor([[-inf, -inf, -inf, -inf, -inf],
        [0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf]])

In [24]:
# Raises Euler's number e (approximately 2.718) to the power of every element
# of the tensor, element by element.
out.exp()
# Here e^-inf = 0 and e^0 = 1, so every -inf entry becomes 0 and every 0 entry
# becomes 1. That's the result:

tensor([[0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.]])

In [39]:
# Transposing a tensor: swapping two of its dimensions.
inp = torch.zeros(2, 3, 4)  # 2 matrices, each with 3 rows and 4 columns
out = torch.transpose(inp, 0, 2)  # 0 and 2 are dimension indexes
# Breaking down "transpose":
# The two arguments are indexes into the tensor's shape, here (2, 3, 4).
# "0" is the first dimension and "2" is the third, so those two sizes trade
# places and the shape (2, 3, 4) becomes (4, 3, 2).
# The result is 4 matrices (the new first dimension), each with 3 rows and
# 2 columns.
out.shape  # displays the shape of the tensor (not the tensor itself)

torch.Size([4, 3, 2])

In [44]:
# Three 1-D integer tensors of the same length: [1, 2, 3], [4, 5, 6], [7, 8, 9].
tensor1 = torch.arange(1, 4)
tensor2 = torch.arange(4, 7)
tensor3 = torch.arange(7, 10)

print(tensor1, tensor2, tensor3)

# torch.stack joins the tensors along a NEW dimension (here dim 0), so three
# tensors of shape (3,) become one tensor of shape (3, 3).
# Important: all tensors must have the same size.
stacked_tensor = torch.stack((tensor1, tensor2, tensor3), dim=0)
stacked_tensor



tensor([1, 2, 3]) tensor([4, 5, 6]) tensor([7, 8, 9])


tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [79]:
sample = torch.full((3,), 10.)  # input tensor: three features, all equal to 10
print(sample)
# Imagine a NEAT algorithm.
# For any input there is a transformation function, and ".Linear" is one such
# function: it applies a linear transformation.
# Its weights are generated randomly (you can also set them manually), and
# the layer then multiplies WEIGHT x INPUT. The formula is:
# L(input) = input . weight^T + bias   (in this case there is no bias)
# Note: weight^T is the transpose of the weight matrix.
linear = nn.Linear(in_features=3, out_features=3, bias=False)
print(linear(sample))
# After that we would normally apply an activation function, like tanh.
# How to read it:
# nn.Linear(in_features, out_features, bias)
# out_features is chosen freely and does not depend on the number of inputs.
# in_features must match the exact number of input values (3 here).

tensor([10., 10., 10.])
tensor([-10.7175,   4.5652,  -4.3617], grad_fn=<SqueezeBackward4>)


In [69]:


tensor1 = torch.tensor([1.0, 2.0, 3.0])
print(tensor1)
# Softmax turns the given scores into probabilities (torch.softmax computes the
# same thing as F.softmax).
# The higher the element, the higher its probability,
# and the outputs always sum to 1.
softmax_output = torch.softmax(tensor1, dim=0)
print(softmax_output)

tensor([1., 2., 3.])
tensor([0.0900, 0.2447, 0.6652])


In [3]:
# Embedding table for a vocabulary of 10 'words', each stored as a vector of
# 5 numbers.
embedding_layer = nn.Embedding(num_embeddings=10, embedding_dim=5)
print(embedding_layer)

# Integer word IDs to look up (ID 1 appears twice on purpose).
word_ids = torch.tensor([1, 2, 4, 5, 1])

# Calling the layer returns one row of the table per ID, in the same order.
word_embeddings = embedding_layer(word_ids)

print(word_embeddings)
# In Python terms, each 5-dimensional vector is like a list with 5 numbers.

# Why Do We Use It?
# "nn.Embedding" is a lookup table that turns each word ID into a vector of
# numbers, which is a form the computer can do math with. In this case,
# we're creating a dictionary for 10 words. In this dictionary, every word has
# 5 values (the 5-dimensional vector). The numbers start out random and,
# as the model gets more and more trained, they become closer for words that
# have similar meanings.
# For example (an illustration only: the model learns these values by itself,
# and real dimensions rarely have such a clean, human-readable meaning):
# Parameter 1 = Semantic meaning - the strict dictionary definition.
# Parameter 2 = Emotional meaning.
# Parameter 3 = Level of formality vs. informality.
# Parameter 4 = Length of the word.
# Parameter 5 = Usage (how frequently the word is used).
# For the model, these parameters are just numbers, but with proper training,
# the model would give longer words closer values for parameter 4,
# while "love" and "hate" would be further apart in parameter 2, and so on.



Embedding(10, 5)
tensor([[ 1.8071,  1.4398,  0.9471, -0.0209, -0.6883],
        [-2.2881,  0.6217, -1.0466,  1.4214,  0.7874],
        [-0.4318,  0.5044, -1.4173, -0.0445, -0.2961],
        [ 0.9745, -1.8651, -0.1608,  0.3834,  1.1423],
        [ 1.8071,  1.4398,  0.9471, -0.0209, -0.6883]],
       grad_fn=<EmbeddingBackward0>)


In [None]:
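# note: PyTorch matrix multiplication (`@` / torch.matmul) requires both tensors
# to have the same dtype: an integer tensor @ a float tensor raises a RuntimeError.
# (Element-wise `*` is fine, because PyTorch promotes the integer to float.)
# Therefore, while building the Large Language Model, prefer floating-point tensors
# (e.g. create them with torch.rand or convert with .float()) over integer ones.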

In [29]:
# Unpacking a tensor's dimensions and then reshaping it.
a = torch.rand(2, 3, 5)  # tensor dimensions
print(a.shape)
x, y, z = a.size()  # .size() is the method form of .shape
# view() re-reads the same 30 elements with a new shape; it is a reshape,
# not a transpose (the element order in memory does not change).
a = a.view(z, y, x)
print(x, y, z)
print(a.shape)
a = a.view(-1, 1, 1, 1, 1, 1)  # -1 lets PyTorch infer the size (30 here)
print(a.shape)
# Not possible:
# You can't reshape into dimensions that hold fewer elements than the tensor
# started with, nor more elements.
# But you can add neutral dimensions (of size 1, basically).

# Other possibilities:
a = torch.rand(1, 2, 3, 4, 5, 10)
l = a.size(5)  # size of a single dimension, picked by index
print(l)
l, m, *_ = a.shape  # keep the first two sizes, ignore the rest
print(l, m)


torch.Size([2, 3, 5])
2 3 5
torch.Size([5, 3, 2])
torch.Size([30, 1, 1, 1, 1, 1])
10
1 2
