In [None]:
import numpy as np
import gensim.downloader as api
from gensim.models import FastText, KeyedVectors
from tqdm import tqdm

from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re

In [None]:
# Width of the one-hot target vectors and upper bound on the word ranks kept when decoding.
vocab_size = 1000
# Sequence-length cutoff passed to imdb.load_data; also the padded length of every sample.
maxlen = 100

In [None]:
# Load the IMDB reviews as sequences of integer word ids; the sentiment labels are discarded.
# num_words is vocab_size + 3 — presumably to leave room for Keras' reserved ids, which the
# later `- 3` shifts undo; TODO confirm against the imdb.load_data documentation.
(x_train, _), (x_test, _) = imdb.load_data(num_words=vocab_size + 3, maxlen=maxlen)

In [None]:
# One-hot targets of shape (num_samples, maxlen, vocab_size). Positions past the end
# of a sequence are left as all-zero rows.
x_train_one_hot_encoded = np.zeros((x_train.shape[0], maxlen, vocab_size))

for sample_idx, token_ids in enumerate(x_train):
    # Shift every id down by 3 and clamp anything that becomes negative to class 0.
    class_ids = np.maximum(np.asarray(token_ids, dtype=int) - 3, 0)
    # Set one entry per time step: (sample, step, class) = 1.
    x_train_one_hot_encoded[sample_idx, np.arange(len(class_ids)), class_ids] = 1

In [None]:
# Turn the integer ids back into word strings so each word can be looked up in the
# pretrained embedding model.
word_index = imdb.get_word_index()
# Invert word -> rank into rank -> word, keeping only ranks below vocab_size.
index_word = {rank: word for word, rank in word_index.items() if rank < vocab_size}

# Ids are shifted by 3 before the lookup; anything without an entry becomes "unk".
x_train_words = [
    [index_word.get(token_id - 3, "unk") for token_id in token_ids]
    for token_ids in x_train
]
x_test_words = [
    [index_word.get(token_id - 3, "unk") for token_id in token_ids]
    for token_ids in x_test
]

In [None]:
# Fetch the "glove-twitter-50" vectors through gensim's downloader (may download on first use).
embedding_model = api.load("glove-twitter-50")
# Dimensionality of each word vector, as reported by the loaded model.
embedding_size = embedding_model.vector_size

In [None]:
def embed_sequences(sequences, embedding_model, maxlen, embedding_size):
    """Convert word sequences into fixed-length sequences of embedding vectors.

    Args:
        sequences: iterable of word lists.
        embedding_model: mapping-like object supporting ``word in model`` and
            ``model[word]`` (returns the vector for ``word``).
        maxlen: number of time steps every output sequence is padded/truncated to.
        embedding_size: length of the zero vector used for unknown words and padding.

    Returns:
        ``np.ndarray`` built from one ``maxlen``-long list of vectors per sequence.
    """
    embedded = []
    for tokens in sequences:
        vectors = []
        for token in tokens:
            if token in embedding_model:
                vectors.append(embedding_model[token])
            else:
                # Out-of-vocabulary words are represented by a zero vector.
                vectors.append(np.zeros(embedding_size))

        shortfall = maxlen - len(vectors)
        if shortfall > 0:
            # Too short: pad the tail with zero vectors.
            vectors.extend(np.zeros(embedding_size) for _ in range(shortfall))
        elif shortfall < 0:
            # Too long: keep only the first maxlen steps.
            vectors = vectors[:maxlen]

        embedded.append(vectors)
    return np.array(embedded)

In [None]:
# Next-word prediction: the input at step t is word t and the target is word t + 1,
# so inputs drop the last step and targets drop the first one.
x_train_embeddings = embed_sequences(
    x_train_words, embedding_model, maxlen, embedding_size
)[:, :maxlen - 1, :]
y_train_outputs = x_train_one_hot_encoded[:, -(maxlen - 1):, :]

In [None]:
def softmax(x: np.ndarray, axis=None) -> np.ndarray:
    """Numerically stable softmax.

    Args:
        x: array of scores (logits).
        axis: axis to normalise over. The default ``None`` normalises over the
            whole array, which is the behaviour existing callers rely on.

    Returns:
        Array of the same shape as ``x`` whose entries along ``axis`` sum to 1.
        An empty input is returned as an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        return np.exp(x)

    # Subtracting the maximum leaves the result unchanged mathematically but keeps
    # np.exp from overflowing to inf (and the ratio from turning into NaN).
    shifted = x - np.max(x, axis=axis, keepdims=True)
    ex = np.exp(shifted)
    return ex / np.sum(ex, axis=axis, keepdims=True)

In [None]:
class InputLayer:
    """Input-to-hidden projection for one input sequence.

    Holds the sequence itself (``inputs``), the projection matrix of shape
    ``(hidden_size, n_features)`` and the gradient accumulated for it.
    """

    inputs: np.ndarray
    weights: np.ndarray = None
    delta_weights: np.ndarray = None

    def __init__(self, inputs: np.ndarray, hidden_size: int) -> None:
        """Store ``inputs`` and draw the weights from a Glorot/Xavier uniform range."""
        self.inputs = inputs

        n_features = len(inputs[0])
        bound = np.sqrt(6 / (n_features + hidden_size))
        self.weights = np.random.uniform(low=-bound, high=bound, size=(hidden_size, n_features))

        self.__reset_deltas__()

    def __reset_deltas__(self):
        """Zero the accumulated weight gradient."""
        self.delta_weights = np.zeros_like(self.weights)

    def get_input(self, time_step: int) -> np.ndarray:
        """Return the input at ``time_step`` as a column vector."""
        return self.inputs[time_step][:, None]

    def forward(self, time_step: int) -> np.ndarray:
        """Project the input at ``time_step`` into hidden space (column vector)."""
        return self.weights @ self.get_input(time_step)

    def backward(
        self, time_step: int, delta_weights: np.ndarray
    ) -> None:
        """Accumulate the weight gradient for ``time_step``.

        ``delta_weights`` is the column-vector delta coming from the hidden layer;
        its outer product with the step's input is added to ``self.delta_weights``.
        """
        self.delta_weights += delta_weights @ self.get_input(time_step).T

    def update_parameters(self, learning_rate: float) -> None:
        """Take one gradient-descent step, then clear the accumulated gradient."""
        self.weights -= learning_rate * self.delta_weights
        self.__reset_deltas__()

In [None]:
class HiddenLayer:
    """Recurrent tanh layer trained with back-propagation through time.

    For step t the layer computes ``h_t = tanh(W h_{t-1} + weighted_input + b)``
    and stores ``h_t`` in ``states[t]``. ``backward`` must be called from the last
    step towards the first so the gradient flowing back from step t+1 is available.
    """

    # Hidden state per time step, shape (max_num_time_steps, size, 1).
    states: np.ndarray = None
    # Recurrent weight matrix, shape (size, size), and its accumulated gradient.
    weights: np.ndarray = None
    delta_weights: np.ndarray = None
    # Bias column vector, shape (size, 1), and its accumulated gradient.
    bias: np.ndarray = None
    delta_bias: np.ndarray = None
    # Gradient w.r.t. the hidden state of the step currently being processed that
    # arrives from the following time step (W^T times that step's delta).
    next_delta_hidden_state_activation: np.ndarray = None

    def __init__(self, max_num_time_steps: int, size: int) -> None:
        """Allocate state storage and randomly initialise weights and bias."""
        limit = np.sqrt(6 / (size + size))
        self.weights = np.random.uniform(low=-limit, high=limit, size=(size, size))

        self.bias = np.random.uniform(low=-0.1, high=0.1, size=(size, 1))
        self.states = np.zeros(shape=(max_num_time_steps, size, 1))
        self.next_delta_hidden_state_activation = np.zeros(shape=(size, 1))
        self.delta_bias = np.zeros_like(self.bias)
        self.delta_weights = np.zeros_like(self.weights)

    def __reset_deltas__(self):
        """Zero all accumulated gradients, including the carried-back one."""
        self.delta_bias = np.zeros_like(self.bias)
        self.delta_weights = np.zeros_like(self.weights)
        self.next_delta_hidden_state_activation = np.zeros_like(self.next_delta_hidden_state_activation)

    def __reset_states__(self):
        """Zero the stored hidden states."""
        self.states = np.zeros_like(self.states)

    def get_hidden_state(self, time_step: int) -> np.ndarray:
        """Return the stored state for ``time_step``; zeros for any step before 0."""
        if time_step < 0:
            return np.zeros_like(self.states[0])
        return self.states[time_step]

    def set_state(self, time_step: int, prediction: np.ndarray) -> None:
        """Store ``prediction`` as the hidden state of ``time_step``."""
        self.states[time_step] = prediction

    def forward(self, weighted_input: np.ndarray, time_step: int) -> np.ndarray:
        """Compute, store and return the hidden state for ``time_step``.

        Args:
            weighted_input: already-projected input for this step, shape (size, 1).
            time_step: index of the step being computed.
        """
        prev = self.get_hidden_state(time_step - 1)
        weighted = np.dot(self.weights, prev) + weighted_input + self.bias
        hidden = np.tanh(weighted)
        self.set_state(time_step, hidden)
        return hidden

    def backward(
        self, time_step: int, delta_output: np.ndarray
    ) -> np.ndarray:
        """Accumulate gradients for ``time_step`` and return the pre-activation delta.

        Args:
            time_step: index of the step being back-propagated.
            delta_output: gradient of the loss w.r.t. ``h_t`` coming from the
                output layer at this step, shape (size, 1).

        Returns:
            Gradient of the loss w.r.t. the pre-activation of ``h_t``. This is also
            the gradient w.r.t. the weighted input added in ``forward``, so it is
            what the input layer needs for its own weight gradient.
        """
        curr = self.get_hidden_state(time_step)
        prev = self.get_hidden_state(time_step - 1)

        # Total gradient reaching h_t: from this step's output plus from step t+1
        # through the recurrent weights.
        delta_state = delta_output + self.next_delta_hidden_state_activation
        # tanh'(a) = 1 - tanh(a)^2 must scale the *whole* gradient, including the
        # recurrent part.
        delta_hidden = delta_state * (1 - np.square(curr))

        self.delta_weights += np.dot(delta_hidden, prev.T)
        self.delta_bias += delta_hidden

        # Carry the gradient w.r.t. h_{t-1} back to the previous time step.
        self.next_delta_hidden_state_activation = np.dot(self.weights.T, delta_hidden)

        return delta_hidden

    def update_parameters(self, learning_rate: float) -> None:
        """Take one gradient-descent step, then clear the accumulated gradients."""
        self.weights -= learning_rate * self.delta_weights
        self.bias -= learning_rate * self.delta_bias

        self.__reset_deltas__()


In [None]:
class OutputLayer:
    """Hidden-to-vocabulary softmax layer.

    Stores one prediction column vector per time step together with the weights
    ``(size, hidden_size)``, the bias ``(size, 1)`` and their accumulated gradients.
    """

    predictions: np.ndarray = None
    weights: np.ndarray = None
    bias: np.ndarray = None
    delta_bias: np.ndarray = None
    delta_weights: np.ndarray = None

    def __init__(self, max_num_time_steps: int, size: int, hidden_size: int) -> None:
        """Randomly initialise parameters and allocate prediction storage."""
        bound = np.sqrt(6 / (size + hidden_size))
        self.weights = np.random.uniform(low=-bound, high=bound, size=(size, hidden_size))
        self.bias = np.random.uniform(low=-0.1, high=0.1, size=(size, 1))
        self.predictions = np.zeros(shape=(max_num_time_steps, size, 1))
        self.__reset_deltas__()

    def __reset_predictions__(self):
        """Zero the stored predictions."""
        self.predictions = np.zeros_like(self.predictions)

    def __reset_deltas__(self):
        """Zero the accumulated bias and weight gradients."""
        self.delta_bias = np.zeros_like(self.bias)
        self.delta_weights = np.zeros_like(self.weights)

    def forward(self, hidden_state: np.ndarray, time_step: int) -> np.ndarray:
        """Compute, store and return the softmax prediction for ``time_step``."""
        logits = self.weights @ hidden_state + self.bias
        # softmax works on the flat vector; restore the column shape afterwards.
        probabilities = softmax(logits.ravel()).reshape(-1, 1)
        self.set_prediction(time_step, probabilities)
        return probabilities

    def get_prediction(self, time_step: int) -> np.ndarray:
        """Return the stored prediction for ``time_step``."""
        return self.predictions[time_step]

    def set_prediction(self, time_step: int, prediction: np.ndarray) -> None:
        """Store ``prediction`` for ``time_step``."""
        self.predictions[time_step] = prediction

    def backward(
        self,
        expected: np.ndarray,
        hidden_state: np.ndarray,
        time_step: int,
    ) -> np.ndarray:
        """Accumulate gradients for ``time_step`` and return the hidden-state gradient.

        Args:
            expected: target vector for this step (1-D targets are turned into a column).
            hidden_state: hidden state that produced the stored prediction.
            time_step: index of the step being back-propagated.

        Returns:
            ``weights.T @ (prediction - expected)``, a column of length ``hidden_size``.
        """
        target = expected.reshape(-1, 1) if expected.ndim == 1 else expected
        hidden_row = hidden_state.reshape(1, -1)

        # prediction - target is the error term used for both bias and weights.
        error = self.get_prediction(time_step) - target
        self.delta_bias = self.delta_bias + error
        self.delta_weights += error @ hidden_row

        return self.weights.T @ error

    def update_parameters(self, learning_rate: float) -> None:
        """Take one gradient-descent step, then clear the accumulated gradients."""
        self.weights -= learning_rate * self.delta_weights
        self.bias -= learning_rate * self.delta_bias
        self.__reset_deltas__()

In [None]:
from typing import List

class RNN:
    """Single-layer recurrent network for next-word prediction.

    Wires an ``InputLayer``, a ``HiddenLayer`` and an ``OutputLayer`` together and
    trains them one sequence at a time with back-propagation through time.
    """

    hidden_layer: HiddenLayer
    output_layer: OutputLayer
    learning_rate: float
    # Created lazily on the first forward pass, once the input width is known.
    input_layer: InputLayer = None

    def __init__(self, vocab_size: int, hidden_size: int, max_num_time_steps: int, learning_rate: float) -> None:
        """Create the hidden and output layers.

        Args:
            vocab_size: size of the output (prediction) vectors.
            hidden_size: number of hidden units.
            max_num_time_steps: longest sequence the layers allocate storage for.
            learning_rate: step size used by every layer's parameter update.
        """
        self.hidden_layer = HiddenLayer(max_num_time_steps, hidden_size)
        self.output_layer = OutputLayer(max_num_time_steps, vocab_size, hidden_size)
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate

    def feed_forward(self, inputs: np.ndarray) -> OutputLayer:
        """Run one sequence through the network and return the output layer.

        The input layer is created on first use and afterwards only re-pointed at
        the new sequence, so its learned weights survive from sample to sample.
        It is rebuilt only if the width of the input vectors changes.
        """
        if self.input_layer is None or self.input_layer.weights.shape[1] != len(inputs[0]):
            self.input_layer = InputLayer(inputs, self.hidden_size)
        else:
            self.input_layer.inputs = inputs

        for step in range(len(inputs)):
            weighted_input = self.input_layer.forward(step)
            activation = self.hidden_layer.forward(weighted_input, step)
            self.output_layer.forward(activation, step)

        return self.output_layer

    def backpropagation(self, expected: np.ndarray) -> None:
        """Back-propagate through every time step of ``expected`` and update all layers.

        Must be called after ``feed_forward`` on the matching input sequence.
        Steps are visited last-to-first, including the final step.
        """
        for step_number in reversed(range(len(expected))):
            delta_output = self.output_layer.backward(
                expected[step_number],
                self.hidden_layer.get_hidden_state(step_number),
                step_number,
            )
            delta_weighted_sum = self.hidden_layer.backward(
                step_number, delta_output
            )
            self.input_layer.backward(step_number, delta_weighted_sum)

        self.output_layer.update_parameters(self.learning_rate)
        self.hidden_layer.update_parameters(self.learning_rate)
        self.input_layer.update_parameters(self.learning_rate)

    def loss(self, y_hat: List[np.ndarray], y: List[np.ndarray]) -> float:
        """Mean cross-entropy over the time steps of one sequence.

        Predictions are clipped away from zero so ``log`` never yields ``-inf``/NaN.
        """
        return -np.mean(
            [np.sum(y[i] * np.log(np.clip(y_hat[i], 1e-12, None))) for i in range(len(y))]
        )

    def _find_end_of_seq(self, expected: np.ndarray) -> int:
        """Return the index of the first all-zero target row, or ``len(expected)``."""
        for idx, vector in enumerate(expected):
            if np.all(vector == 0):
                return idx
        return len(expected)

    def _reset_states(self):
        """Clear stored predictions and hidden states between sequences."""
        self.output_layer.__reset_predictions__()
        self.hidden_layer.__reset_states__()

    def train(self, inputs: np.ndarray, expected: np.ndarray, epochs: int) -> None:
        """Train on every sequence for ``epochs`` passes, printing running losses.

        Args:
            inputs: per-sample input sequences (time steps x features).
            expected: per-sample target sequences; an all-zero row marks the end
                of the real sequence.
            epochs: number of passes over ``inputs``.
        """
        for epoch in range(epochs):
            loss_list = []
            for idx, sequence in enumerate(tqdm(inputs)):
                end_idx = self._find_end_of_seq(expected[idx])
                if end_idx == 0:
                    # Nothing to predict for this sample; skip it instead of crashing.
                    continue

                targets = expected[idx][:end_idx]
                y_hats = self.feed_forward(sequence[:end_idx, :])
                self.backpropagation(targets)

                round_loss = self.loss(y_hats.predictions[:end_idx, :, 0], targets)
                loss_list.append(round_loss)
                self._reset_states()

                if idx % 100 == 99:
                    print(f"Average Training Loss of Last 100 samples: {np.mean(np.array(loss_list[-100:]))}")

            print(
                f"Epoch Loss: {np.mean(np.array(loss_list))}"
            )


In [None]:
# Build the RNN: 32 hidden units, maxlen - 1 time steps (inputs and targets were each
# shortened by one step), learning rate 1e-3.
rnn = RNN(vocab_size=vocab_size, hidden_size=32, max_num_time_steps=maxlen - 1, learning_rate=1e-3)
# Two passes over the training sequences; losses are printed as training runs.
rnn.train(x_train_embeddings, y_train_outputs, epochs=2)

In [None]:
# Transformer hyper-parameters used by the parameter-count cells that follow.
# Note that n_heads * d_heads == d_model (25 * 64 == 1600).
n_layers = 48  # the number of transformer layers (aka. transformer blocks)
n_heads = 25   # the number of attention heads in each layer
d_model = 1600 # the model dimension
d_ffn = 6400   # the FFN (aka. MLP) dimension
d_heads = 64   # the attn head dimension
n_vocab = 50257 # vocabulary size
n_ctx = 1024    # the maximum sequence length the model can process

In [None]:
# Token-embedding parameters: one d_model-sized vector per vocabulary entry.
token_embeddings = n_vocab * d_model
print(token_embeddings)

In [None]:
# Position-embedding parameters: one d_model-sized vector per context position.
position_embeddings =  n_ctx * d_model
print(position_embeddings)

In [None]:
# Query projection: a d_model x d_model matrix in each layer (no bias term counted).
attn_q = n_layers * d_model**2
print(attn_q)

In [None]:
# Key projection: a d_model x d_model matrix in each layer (no bias term counted).
attn_k = n_layers * d_model**2
print(attn_k)

In [None]:
# Value projection: a d_model x d_model matrix in each layer (no bias term counted).
attn_v = n_layers * d_model**2
print(attn_v)

In [None]:
# Attention output projection: a d_model x d_model matrix in each layer (no bias term counted).
attn_o = n_layers * d_model**2
print(attn_o)

In [None]:
# First FFN projection: a d_model x d_ffn matrix in each layer (no bias term counted).
ffn1 = n_layers * d_model * d_ffn
print(ffn1)

In [None]:
# Second FFN projection: a d_ffn x d_model matrix in each layer (no bias term counted).
ffn2 = n_layers * d_ffn * d_model
print(ffn2)

In [None]:
# Output (un-embedding) projection: same size as the token-embedding matrix.
output_embeddings = n_vocab * d_model
print(output_embeddings)

In [None]:
# Total of the embedding, attention and FFN counts, printed in billions.
# output_embeddings is deliberately absent from this sum — presumably because the output
# projection shares its weights with the token embeddings; TODO confirm.
# Bias and layer-norm parameters are not part of any summed term.
n_total = token_embeddings + position_embeddings + attn_q + attn_k + attn_v + attn_o + ffn1 + ffn2
print(f'{n_total/10**9:.3f}B')

In [None]:
# Share of the total parameter count that sits in the two FFN projections.
print(f'{(ffn1+ffn2)/n_total:.1%}')

In [None]:
%%script echo skipping
class GPT2MLP(nn.Module):
    """Two-layer feed-forward block: linear -> activation -> linear -> dropout.

    Illustrative only: the notebook cell holding this class is skipped at run time.
    """

    def __init__(self, config, d_model, d_ffn):
        super().__init__()
        # Expand d_model -> d_ffn, then project back d_ffn -> d_model.
        self.fn1 = nn.Linear(d_model, d_ffn)
        self.fn2 = nn.Linear(d_ffn, d_model)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        """Apply the feed-forward transformation to ``hidden_states``."""
        expanded = self.act(self.fn1(hidden_states))
        projected = self.fn2(expanded)
        return self.dropout(projected)


class GPT2Block(nn.Module):
    """Transformer block with sequential sub-layers.

    Attention and the MLP each get their own layer norm and their own residual
    connection; the MLP sees the output of the attention sub-layer.
    Illustrative only: the notebook cell holding this class is skipped at run time.
    """

    def __init__(self, config, d_model, d_ffn, layer_idx=None):
        super().__init__()
        self.ln_1 = nn.LayerNorm(d_model, eps=config.layer_norm_epsilon)
        self.attn = GPT2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
        self.ln_2 = nn.LayerNorm(d_model, eps=config.layer_norm_epsilon)
        self.mlp = GPT2MLP(config, d_model, d_ffn)

    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        attention_mask: Optional[torch.FloatTensor] = None,
    ):
        """Return ``hidden_states`` after the attention and MLP sub-layers."""
        # Sub-layer 1: attention on the normalised input, plus residual.
        attn_output = self.attn(
            self.ln_1(hidden_states),
            attention_mask=attention_mask,
        )
        after_attention = attn_output + hidden_states

        # Sub-layer 2: MLP on the normalised attention result, plus residual.
        mlp_output = self.mlp(self.ln_2(after_attention))
        return after_attention + mlp_output

In [None]:
%%script echo skipping
class GPTJBlock(nn.Module):
    """Transformer block with parallel sub-layers.

    Attention and the MLP both read the same layer-normalised input, and their
    outputs are added to the residual in a single step.
    Illustrative only: the notebook cell holding this class is skipped at run time.
    """

    def __init__(self, config, d_model, d_ffn, layer_idx=None):
        super().__init__()
        self.ln_1 = nn.LayerNorm(d_model, eps=config.layer_norm_epsilon)
        self.attn = GPTJ_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
        self.mlp = GPTJMLP(config, d_model, d_ffn)

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        attention_mask: Optional[torch.FloatTensor] = None,
    ):
        """Return ``hidden_states`` plus the attention and MLP outputs."""
        normed = self.ln_1(hidden_states)

        # Both branches consume the same normalised tensor.
        attn_output = self.attn(
            hidden_states=normed,
            attention_mask=attention_mask,
        )
        mlp_output = self.mlp(normed)

        # Single residual connection over both branches.
        return attn_output + mlp_output + hidden_states

In [None]:
%%script echo skipping
class LlamaMLP(nn.Module):
    """Gated feed-forward block: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``.

    Illustrative only: the notebook cell holding this class is skipped at run time.
    """

    def __init__(self, config, d_model, d_ffn):
        super().__init__()
        use_bias = config.mlp_bias
        self.gate_proj = nn.Linear(d_model, d_ffn, bias=use_bias)
        self.up_proj = nn.Linear(d_model, d_ffn, bias=use_bias)
        self.down_proj = nn.Linear(d_ffn, d_model, bias=use_bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        # x: [BS, d_model]
        gate = self.act_fn(self.gate_proj(x))
        # The activated gate scales the up-projection element-wise.
        gated = gate * self.up_proj(x)
        return self.down_proj(gated)

In [None]:
!pip install datasets

In [3]:
from sklearn.model_selection import train_test_split
# write your code here, make sure you use the name defined below.

# NOTE(review): `imdb_dataset` is not defined in any cell visible in this notebook; it
# presumably comes from a dataset-loading cell that is missing here — confirm, otherwise
# this cell fails on a fresh kernel.
train_data = imdb_dataset['train']

# Hold out 10% of the training reviews as a dev set; the split is stratified on the label
# and seeded so it is reproducible.
splitter = train_test_split(train_data['text'], train_data['label'], test_size=0.1, random_state=42, stratify=train_data['label'])

# train_test_split returns (train texts, dev texts, train labels, dev labels) in this order.
train_x, dev_x, train_y, dev_y = splitter



In [None]:
# Show the first training review as a quick look at the raw text.
print(train_x[0])

In [None]:
from transformers import AutoTokenizer, AutoModel
# Tokenizer matching the 'distilbert/distilbert-base-uncased' checkpoint (may download on first use).
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')


In [6]:
from torch.utils.data import Dataset, DataLoader

import torch


class SentimentAnalysisDataset(Dataset):
    """Torch dataset that tokenizes all texts up front.

    ``data`` is a dict with the texts under ``'input'`` and the labels under
    ``'label'``. Each item is a dict with ``input_ids``, ``labels`` and
    ``attention_mask`` tensors.
    """

    def __init__(self, data, tokenizer, max_len=512):
        self.input_texts = data['input']
        self.labels = data['label']
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.prepare()

    def prepare(self):
        """Tokenize every text once, padded/truncated to ``max_len``, and cache the tensors."""
        encoded = self.tokenizer(
            self.input_texts,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        self.input_ids = encoded['input_ids']
        self.len = len(self.input_texts)
        self.attention_masks = encoded['attention_mask']

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        """Return the cached tensors for sample ``idx`` plus its label as a long tensor."""
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return {
            'input_ids': self.input_ids[idx],
            'labels': label,
            'attention_mask': self.attention_masks[idx],
        }

# Example usage.
# GPU note: because GPU time on Colab is limited, only the first 20 training samples are
# used in this homework. If you have access to a GPU, running on the whole dataset is
# strongly recommended.
train = {'input':train_x[:20], 'label':train_y[:20]}
train_dataset = SentimentAnalysisDataset(train, tokenizer)

# Mini-batches of 4, reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size = 4, shuffle = True)


In [7]:
# Sanity check: the attention mask of the first training example must sum to 149.
assert sum(train_dataset.attention_masks[0])==149

In [None]:
from torch import nn

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Pretrained encoder that the classifier wraps (may download on first use).
bertmodel = AutoModel.from_pretrained("distilbert/distilbert-base-uncased")

class ClassificationModel(nn.Module):
    """Sequence classifier: base encoder -> first-token vector -> dropout -> linear -> softmax.

    ``forward`` returns class probabilities (each row sums to 1), not raw logits.
    """

    def __init__(self, base_model, num_classes):
        super().__init__()
        # Seed before the classifier is created so its random initialisation is reproducible.
        torch.manual_seed(42)
        self.base_model = base_model
        self.dropout = nn.Dropout(p=0.5)
        hidden_dim = base_model.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return softmax class probabilities for the given token ids and mask."""
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at position 0 is used as the summary of the whole sequence.
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.softmax(self.classifier(self.dropout(first_token)))

In [9]:
# Two-class (binary sentiment) classifier on top of the pretrained encoder.
model = ClassificationModel(base_model = bertmodel, num_classes = 2 )

In [10]:
#test case
# Run the untrained classifier on the first training example only.
# NOTE(review): input_ids / attention_mask are taken by row index here, so they are passed
# without an explicit batch dimension — confirm the base model accepts that.
input_ids = train_dataset.input_ids[0]
attention_mask = train_dataset.attention_masks[0]
# eval() switches dropout off; no_grad() skips gradient tracking for this forward pass.
model.eval()
with torch.no_grad():
  predicts = model(input_ids,attention_mask)


In [11]:
# Reference check on the model's output for the first example.
# NOTE(review): this compares floats for exact equality, so it presumably only holds for the
# exact library versions/hardware the reference values were produced with — confirm.
assert predicts.tolist() == [[0.4507104158401489, 0.5492895841598511]]

In [None]:
import torch
from torch import nn
# transformers' own AdamW is deprecated in favour of the PyTorch implementation.
from torch.optim import AdamW

# weight_decay=0.0 keeps the default of the optimizer this replaces (torch's default is 0.01).
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)


def loss_fn(probabilities, labels):
    """Mean negative log-likelihood of the true class.

    The model's forward pass already returns softmax probabilities, so the loss
    takes their log directly. nn.CrossEntropyLoss expects raw logits and would
    apply a second softmax on top.

    Args:
        probabilities: tensor of shape (batch, num_classes) whose rows sum to 1.
        labels: tensor of shape (batch,) holding class indices.

    Returns:
        Scalar loss tensor.
    """
    # Clamp away from zero so log() never produces -inf.
    log_probabilities = torch.log(probabilities.clamp_min(1e-12))
    return nn.functional.nll_loss(log_probabilities, labels)


In [None]:
from tqdm import tqdm
epochs = 10 # don't change

# Small dev set (first 20 held-out samples), evaluated in a fixed order.
dev = {'input': dev_x[:20], 'label': dev_y[:20]}
dev_dataset = SentimentAnalysisDataset(dev, tokenizer)
dev_dataloader = DataLoader(dev_dataset, batch_size=4, shuffle=False)

num_training_steps = epochs * len(train_dataloader)
# Snapshot of the parameters that achieved the lowest dev loss so far.
best_model = None

best_loss = float('inf')

# The batches are moved to `device` below, so the model has to live there too.
model.to(device)

with tqdm(total=num_training_steps, desc='Finetuning:') as pbar:
  for epoch in range(epochs):
    # ---- training ----
    model.train()
    train_loss = 0
    for batch in train_dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      # Standard step: clear old gradients, forward, backward, update.
      optimizer.zero_grad()
      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      loss = loss_fn(outputs, labels)
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

      pbar.update(1)

    # Report the mean loss per batch.
    train_loss /= len(train_dataloader)

    print(f'Epoch {epoch}: train loss is {train_loss}')

    # ---- evaluation ----
    model.eval()
    # Initialise once per epoch so the loss accumulates over all dev batches.
    dev_loss = 0
    with torch.no_grad():
      for batch in dev_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        dev_loss += loss_fn(outputs, labels).item()

    dev_loss /= len(dev_dataloader)

    print(f'Epoch {epoch}: dev loss is {dev_loss}')
    if dev_loss < best_loss:
      best_loss = dev_loss
      # state_dict() returns references to the live tensors; clone them so later
      # training steps cannot overwrite the saved checkpoint.
      best_model = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
      print(f'The best loss is {dev_loss}. Saving checkpoint!')

