# **Exercise 1**

This exercise will help you get familiar with NumPy library in python. The goal of this exercise
is for you to explore the documentation of different NumPy libraries and their implementation
details.

In [None]:
import numpy as np

np.random.seed(1) # Makes results reproducible

# Write a Feed Forward pass of a simple Neural Network:
# – Randomly generate a 4x16 input matrix X
X = np.random.random((4, 16))
print("Shape of X: ", X.shape)
print("X = \n\n", X)

# – Randomly generate three weight matrices W1, W2, W3 of sizes: 16x16, 32x16 2x32
W1 = np.random.random((16, 16))
W2 = np.random.random((32, 16))
W3 = np.random.random((2, 32))

# – Perform the forward pass: (((𝑋 ⋅ 𝑊1𝑇 ) ⋅ 𝑊2𝑇 ) ⋅ 𝑊3𝑇 )
F = np.matmul(np.matmul(np.matmul(X, W1.T), W2.T), W3.T)
print("\nShape of F: ", F.shape)
print("F = \n\n", F)

Shape of X:  (4, 16)
X = 

 [[4.17022005e-01 7.20324493e-01 1.14374817e-04 3.02332573e-01
  1.46755891e-01 9.23385948e-02 1.86260211e-01 3.45560727e-01
  3.96767474e-01 5.38816734e-01 4.19194514e-01 6.85219500e-01
  2.04452250e-01 8.78117436e-01 2.73875932e-02 6.70467510e-01]
 [4.17304802e-01 5.58689828e-01 1.40386939e-01 1.98101489e-01
  8.00744569e-01 9.68261576e-01 3.13424178e-01 6.92322616e-01
  8.76389152e-01 8.94606664e-01 8.50442114e-02 3.90547832e-02
  1.69830420e-01 8.78142503e-01 9.83468338e-02 4.21107625e-01]
 [9.57889530e-01 5.33165285e-01 6.91877114e-01 3.15515631e-01
  6.86500928e-01 8.34625672e-01 1.82882773e-02 7.50144315e-01
  9.88861089e-01 7.48165654e-01 2.80443992e-01 7.89279328e-01
  1.03226007e-01 4.47893526e-01 9.08595503e-01 2.93614148e-01]
 [2.87775339e-01 1.30028572e-01 1.93669579e-02 6.78835533e-01
  2.11628116e-01 2.65546659e-01 4.91573159e-01 5.33625451e-02
  5.74117605e-01 1.46728575e-01 5.89305537e-01 6.99758360e-01
  1.02334429e-01 4.14055988e-01 6.94400

In [None]:
np.random.seed(1) # Makes results reproducible

# (Bonus) Generate a random 4x4 matrix and use NumPy library to compute eigenvalues and eigenvectors of the randomly
# generated matrix.
A = np.random.random((4, 4))
eigenvalues, eigenvectors = np.linalg.eig(A)

print("\n(Bonus)\nEigenvalues (each value is an eigenvalue): ", eigenvalues)
print("\nEigenvectors (each row is an eigenvector):\n\n", eigenvectors)


(Bonus)
Eigenvalues (each value is an eigenvalue):  [ 1.35666782  0.25641859  0.19491643 -0.2089802 ]

Eigenvectors (each row is an eigenvector):

 [[-0.38804596 -0.43214848  0.18691558  0.46649225]
 [-0.28996723 -0.12110532 -0.20205441 -0.62691493]
 [-0.70725243 -0.72791773 -0.89755477 -0.33272554]
 [-0.51491101  0.51837918  0.34442998  0.52787913]]


# **Exercise 2**

This exercise will help you get familiar with PyTorch Dataset and Dataloader classes. The goal
of the exercise is to implement your own Dataset and Dataloader classes for different types of data: tabular and sequential.

### **Part 1**

In [69]:
# According to https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
import os
import json
import numpy as np
from torch.utils.data import DataLoader, Dataset
from torch import tensor, stack

class CustomPhrasesDataset(Dataset):
    def __init__(self, data_json_filepath, train=True, train_size=0.8, shuffle_seed=0, transform=None, target_transform=None):
        with open(data_json_filepath) as f:
            self.token_data = json.load(f)
        self.shuffle(shuffle_seed)

        # Checks that it's between 0 and 1, representing the % of the entire data that is training data
        assert 0 <= train_size <= 1

        # Partitions the data accordingly
        if train:
            self.token_data = self.token_data[:int(len(self.token_data) * train_size)]
        else:
            self.token_data = self.token_data[int(len(self.token_data) * train_size):]
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.token_data)

    def __getitem__(self, idx):
        tokens = tensor(self.token_data[idx]['tokens'])
        label = tensor(self.token_data[idx]['label'])
        if self.transform:
            tokens = self.transform(tokens)
        if self.target_transform:
            label = self.target_transform(label)

        return tokens, label

    def shuffle(self, seed):
        np.random.seed(seed)
        np.random.shuffle(self.token_data)

# The tokens are in different sizes, we get the max and pad the missing
def pad(input_tensor, desired_size):
    return F.pad(input_tensor, (0, desired_size - input_tensor.size(0)), value=0)

class CustomPhrasesDataLoader(DataLoader):
    def __init__(self, data, batch_size, shuffle = False, shuffle_seed = 0):
        # calculate the desired size for padding
        len_tokens = max(len(ts) for ts, ls in data)

        if (shuffle):
            data.shuffle(shuffle_seed)

        self.batches = [(
            # data
            stack([pad(tensor(data[j][0]), len_tokens) for j in range(i, min(i + batch_size, len(data)))]),
            # labels
            stack([tensor(data[j][1]) for j in range(i, min(i + batch_size, len(data)))])
            )
        for i in range(0, len(data), batch_size)]

    def __len__(self):
        return len(self.batches)

    def __getitem__(self, itx):
        return self.batches[itx]

    def __iter__(self):
        for batch in self.batches:
            yield batch

# TEST
# data2.csv should be in the current working directory
data_json_filepath = os.path.join(os.getcwd(), 'data1.json')

training_data = CustomPhrasesDataset(
    data_json_filepath,
    train=True
)

test_data = CustomPhrasesDataset(
    data_json_filepath,
    train=False
)

train_dataloader = CustomPhrasesDataLoader(training_data, batch_size=64, shuffle=False)
test_dataloader = CustomPhrasesDataLoader(test_data, batch_size=64, shuffle=False)
f, l = next(iter(train_dataloader))
print(f"\nFeature batch shape: {f.size()}\n")
print(f"Labels batch shape: {l.size()}\n")


  stack([pad(tensor(data[j][0]), len_tokens) for j in range(i, min(i + batch_size, len(data)))]),
  stack([tensor(data[j][1]) for j in range(i, min(i + batch_size, len(data)))])



Feature batch shape: torch.Size([64, 238])

Labels batch shape: torch.Size([64])



### **Part 2**

In [72]:
# According to https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
import os
import json
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch import tensor, stack

class CustomPrognosisDataset(Dataset):
    def __init__(self, data_csv_filepath, train=True, train_size=0.8, shuffle_seed=0, transform=None, target_transform=None):
        self.raw_df = pd.read_csv(data_csv_filepath)

        # From https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows
        # Shuffles the dataframe
        self.raw_df = self.raw_df.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

        # Builts the features and labels arrays
        # Each position represents the name of the symptom in the corresponding position in symptoms array
        # That means, len(self.symptoms_classes) == self.symptoms.shape[-1]
        self.symptoms_classes = self.raw_df.drop(columns=['id', 'prognosis']).columns.to_numpy(dtype='U')
        self.symptoms = self.raw_df.drop(columns=['id', 'prognosis']).to_numpy(dtype='bool')
        # One-hot encoding of prognosis column
        self.raw_output = self.raw_df['prognosis']
        one_hot_encoding = pd.get_dummies(self.raw_output, columns = ['prognosis'])
        self.labels_classes = one_hot_encoding.columns.to_numpy(dtype='U')
        self.labels = one_hot_encoding.to_numpy(dtype='bool')

        # Checks that it's between 0 and 1, representing the % of the entire data that is training data
        assert 0 <= train_size <= 1

        # Partitions the data accordingly
        if train:
            self.symptoms = self.symptoms[:int(len(self.symptoms) * train_size)]
        else:
            self.symptoms = self.symptoms[int(len(self.symptoms) * train_size):]
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.symptoms)

    def __getitem__(self, idx):
        symptoms = tensor(self.symptoms[idx])
        label = tensor(self.labels[idx])
        if self.transform:
            symptoms = self.transform(symptoms)
        if self.target_transform:
            label = self.target_transform(label)

        return symptoms, label

    # From https://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison
    def shuffle(self, shuffle_seed=0):
        np.random.seed(shuffle_seed)
        p = np.random.permutation(len(self.symptoms))
        self.symptoms = self.symptoms[p]
        self.labels = self.labels[p]
        self.symptoms_classes = self.symptoms_classes[p]
        self.labels_classes = self.labels_classes[p]


class CustomPrognosisDataLoader(DataLoader):
    def __init__(self, data, batch_size, shuffle=False, shuffle_seed=0):
        if shuffle:
            data.shuffle(shuffle_seed)

        self.batches = [(
            # data
            stack([tensor(data[j][0]) for j in range(i, min(i + batch_size, len(data)))]),
            # labels
            stack([tensor(data[j][1]) for j in range(i, min(i + batch_size, len(data)))])
            )
        for i in range(0, len(data), batch_size)]

    def __len__(self):
        return len(self.batches)

    def __getitem__(self, itx):
        return self.batches[itx]

    def __iter__(self):
        for batch in self.batches:
            yield batch

# TEST
# data2.csv should be in the current working directory
data_json_filepath = os.path.join(os.getcwd(), 'data2.csv')

training_data = CustomPrognosisDataset(
    data_json_filepath,
    train=True
)

test_data = CustomPrognosisDataset(
    data_json_filepath,
    train=False
)

train_dataloader = CustomPrognosisDataLoader(training_data, batch_size=64, shuffle=False)
test_dataloader = CustomPrognosisDataLoader(test_data, batch_size=64, shuffle=False)
f, l = next(iter(train_dataloader))
print(f"\nFeature batch shape: {f.size()}\n")
print(f"Labels batch shape: {l.size()}\n")


Feature batch shape: torch.Size([64, 64])

Labels batch shape: torch.Size([64, 11])



  stack([tensor(data[j][0]) for j in range(i, min(i + batch_size, len(data)))]),
  stack([tensor(data[j][1]) for j in range(i, min(i + batch_size, len(data)))])
