In [152]:
import re

import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader, TensorDataset

In [153]:
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [155]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [156]:
df.sentiment = df.sentiment.apply(lambda x: 0 if x == "positive" else 1)
df.review = df.review.apply(lambda x: " ".join(re.sub(r"[^A-Za-z0-9]+", " ", x.lower()).split()))

In [157]:
train = df.sample(frac=0.8, random_state=3)
val = df.drop(train.index)

In [158]:
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

print(train.shape[0], val.shape[0])

40000 10000


In [195]:
class BaselineModel(nn.Module):
    def __init__(self, vocab_size, seq_len, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.flatten = nn.Flatten()
        self.fc_out = nn.Linear(in_features=embed_dim * seq_len, out_features=1)
    
    def forward(self, x):
        x = self.embedding(x)
        x = self.flatten(x)
        x = self.fc_out(x)
        return x.unsqueeze(-1)

In [160]:
def crude_tokenizer(corpus: list[str]):
    word_to_idx = {}
    counter = 1
    
    for sentences in corpus:
        words = sentences.split()
        for word in words:
            if word not in word_to_idx.keys():
                word_to_idx[word] = counter
                counter +=1
                
    return word_to_idx

In [161]:
tokenizer_dict = crude_tokenizer(train.review.values)

In [162]:
tokenized_train = list(map(lambda x: [tokenizer_dict[y] for y in x if y != ' '], train.review.values))
tokenized_val = list(map(lambda x: [tokenizer_dict[y] for y in x if y != ' '], val.review.values))

In [163]:
# Get min and max sequence length

max([len(x) for x in tokenized_train])

# Lets just cap it at 1024 and left-pad to there

10877

In [164]:
itd = [i for i, _ in enumerate(tokenized_train) if len(_) > 1024]

tokenized_train = [x for x in tokenized_train if len(x) <= 1024]
train = train.drop(itd).reset_index(drop=True)

max([len(x) for x in tokenized_train])

1024

In [165]:
itd = [i for i, _ in enumerate(tokenized_val) if len(_) > 1024]

tokenized_val = [x for x in tokenized_val if len(x) <= 1024]
val = val.drop(itd).reset_index(drop=True)

In [166]:
# padding

def left_pad_sequences(sequences, target_length, padding_value=0):
    """
    Left pads each sequence in the list of sequences with the padding_value
    to ensure each sequence has the same length, which is target_length.

    :param sequences: List of lists, where each inner list is a sequence.
    :param target_length: The desired length for all sequences.
    :param padding_value: The value to use for padding. Default is 0.
    :return: List of sequences, all of the same length.
    """
    padded_sequences = []
    for sequence in sequences:
        # Calculate the number of padding elements needed
        padding_length = max(0, target_length - len(sequence))
        # Create the padded sequence and add it to the result list
        padded_sequence = [padding_value] * padding_length + sequence
        padded_sequences.append(padded_sequence)
    
    return padded_sequences

In [167]:
train_tokens_padded = left_pad_sequences(tokenized_train, 1024)
val_tokens_padded = left_pad_sequences(tokenized_val, 1024)

In [168]:
train_loader = DataLoader(
    dataset=TensorDataset(torch.tensor(train_tokens_padded, dtype=torch.int64), torch.tensor(train.sentiment.values, dtype=torch.float32)),
    shuffle=True,
    batch_size=8
)

val_loader = DataLoader(
    dataset=TensorDataset(torch.tensor(val_tokens_padded, dtype=torch.int64), torch.tensor(val.sentiment.values, dtype=torch.float32)),
    shuffle=False,
    batch_size=16
)

In [196]:
def train_on_epoch(epoch, model, optimizer, criterion, loader, device):
    model.train()
    running_loss = 0.
    running_accuracy = 0.
    
    for step, (tokens, labels) in enumerate(loader):
        tokens = tokens.to(device)
        labels = labels.unsqueeze(-1).to(device)
        
        optimizer.zero_grad()
        
        outputs = model(tokens)
                
        loss = criterion(outputs.squeeze(-1), labels)
        
        optimizer.step()
        
        # Just for good measure
        
        with torch.no_grad():
            running_loss += loss
            
            print(f"Epoch {epoch + 1} -- step {step + 1} -- bce_loss: {running_loss / (step + 1)}", end="\r")
    print()

In [197]:
def validate_epoch(epoch, model, optimizer, criterion, loader, device):
    model.eval()
    
    with torch.no_grad():
        running_loss = 0.

        for step, (tokens, labels) in enumerate(loader):
            tokens = tokens.to(device)
            labels = labels.unsqueeze(-1).to(device)

            outputs = model(tokens)

            loss = criterion(outputs.squeeze(-1), labels)
            running_loss += loss

        print(f"Validation Epoch {epoch + 1} -- val_bce_loss: {running_loss / len(loader)}")

In [198]:
bm = BaselineModel(len(tokenizer_dict.keys()), 1024, 64)
optimizer = optim.Adam(bm.parameters(), lr=1e-4)
criterion = nn.BCEWithLogitsLoss()

In [199]:
for epoch in range(25):
    train_on_epoch(epoch, bm, optimizer, criterion, train_loader, "cpu")
    validate_epoch(epoch, bm, optimizer, criterion, val_loader, "cpu")

Epoch 1 -- step 3330 -- bce_loss: 0.7177275419235236
Validation Epoch 1 -- val_bce_loss: 0.7176714539527893
Epoch 2 -- step 3330 -- bce_loss: 0.7177346348762512
Validation Epoch 2 -- val_bce_loss: 0.7176714539527893
Epoch 3 -- step 3330 -- bce_loss: 0.7177257537841797
Validation Epoch 3 -- val_bce_loss: 0.7176714539527893
Epoch 4 -- step 3330 -- bce_loss: 0.7177298069000244
Validation Epoch 4 -- val_bce_loss: 0.7176714539527893
Epoch 5 -- step 3330 -- bce_loss: 0.7177303433418274
Validation Epoch 5 -- val_bce_loss: 0.7176714539527893
Epoch 6 -- step 3330 -- bce_loss: 0.7177314758300781
Validation Epoch 6 -- val_bce_loss: 0.7176714539527893
Epoch 7 -- step 3330 -- bce_loss: 0.7177283763885498
Validation Epoch 7 -- val_bce_loss: 0.7176714539527893
Epoch 8 -- step 3330 -- bce_loss: 0.7177239656448364
Validation Epoch 8 -- val_bce_loss: 0.7176714539527893
Epoch 9 -- step 675 -- bce_loss: 0.7146227359771729

KeyboardInterrupt: 

In [181]:
class LayerOfSin(nn.Module):
    def __init__(self, input_sz, output_sz):
        super(LayerOfSin, self).__init__()
        self.amplitude = nn.Parameter(torch.randn(output_sz, 1))
        self.phase = nn.Parameter(torch.randn(input_sz, output_sz))
        self.bias = nn.Parameter(torch.ones(output_sz,))

    def forward(self, x):
        x = torch.sin(x @ self.phase)
        x = x @ self.amplitude + self.bias
        return x

In [182]:
class LayerOfCos(nn.Module):
    def __init__(self, input_sz, output_sz):
        super(LayerOfCos, self).__init__()
        self.amplitude = nn.Parameter(torch.randn(output_sz, 1))
        self.phase = nn.Parameter(torch.randn(input_sz, output_sz))
        self.bias = nn.Parameter(torch.ones(output_sz,))

    def forward(self, x):
        x = torch.cos(x @ self.phase)
        x = x @ self.amplitude + self.bias
        return x

In [200]:
class TrigonometricModel(nn.Module):
    def __init__(self, vocab_size, seq_len, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoding = nn.Embedding(seq_len, embed_dim)
        self.seq_len = seq_len
        self.lsin_1 = LayerOfSin(seq_len, 128)
        self.lcos_1 = LayerOfCos(embed_dim, 128)
        self.lsin_2 = LayerOfSin(embed_dim, 128)
        self.lcos_2 = LayerOfCos(seq_len, 128)
        self.conv = nn.Conv1d(in_channels=128, out_channels=1, kernel_size=2)
        self.ln = nn.LayerNorm(4)
        self.linear = nn.Linear(3, 1)
    
    def forward(self, x):
        x_e = self.embedding(x)
        
        pos = torch.arange(start=0, end=self.seq_len, device=x.device)
        
        encoded = self.pos_encoding(pos).repeat(x_e.shape[0], 1, 1)
                                
        x_a = self.lsin_1(x_e[:, :, :1].squeeze(-1))
        x_b = self.lcos_1(x_e[:, :1, :].squeeze(1))
        
        x_c = self.lcos_2(encoded[:, :, :1].squeeze(-1))
        x_d = self.lsin_2(encoded[:, :1, :].squeeze(1))
        
        stacked = torch.stack([x_a, x_b, x_c, x_d], dim=-1)
        normed = self.ln(stacked)
        
        normed = self.conv(normed)
                
        return self.linear(normed)

In [203]:
tm = TrigonometricModel(len(tokenizer_dict.keys()), 1024, 32)
optimizer = optim.SGD(tm.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [204]:
for epoch in range(25):
    train_on_epoch(epoch, tm, optimizer, criterion, train_loader, "cpu")
    validate_epoch(epoch, tm, optimizer, criterion, val_loader, "cpu")

Epoch 1 -- step 535 -- bce_loss: 0.7133740782737732

KeyboardInterrupt: 

In [None]:
# Given I only had < 2-3 hours to port over my idea of using sine waves as
# I have succesfully applied to basic regression, I figured the challenge
# may exceed the time needed although I am still convinced exploiting 
# wave and particle like behaiviour through use of trigonometric and geometric
# relationships is the key to pushing LLMs to the next level.

In [None]:
# I was planning to demonstrate how straight forward it is to fine-tune a pretrained
# model such as BERT or RoBERTa etc. however I figure it doens't really make
# sense to reinvent the wheel so I will just give a quick conceptual overview

# 1. Import transformers & datasets if needed
# 2. Load tokenizer and model either directly from huggingface or from disk
# 3. Tokenize your dataset using the datasets library or however you wish,
# so long as it ends up as streamable from disk as would by the case with 
# a pyarrow dataset (native to huggingface effectiely) or a straight-forward
# tensor dataset in torch or TF format depending on your framework of choice
# 4. Use HF's trainer appropriate for the task at hand, this should correspond
# to your '.from_pretrained' selection, i.e. your SequenceClassification choice
# in your model loading step should mirror your trainers objective 
# (alternatively, and as I usually prefer, you can build the training loop yourself
# including the Datasets behvaiour, i.e. on-the-fly tokenziation, streaming, or pre-tokenize
# in memory, usually only a good choice with fine-tuning on smaller datasets)
# 5. Train and validate like any other scenario. This is not really unique to
# an NLP problem. 
# 6. If you weren't opting for checkpointing, save your pre-trained model, again
# this is conditional on your framework of choice, also consider how you will be deploying
# your model with regards to the format you choose. Ideally you want to save models
# in a format that is compatible across both HF and your DL framework, and it is best to
# choose a method that preserves the architecture as well as the weights if storage
# is not a concern. 

# Points to consider:
# - As you may have noticed, anyone who can read API documentation, and also read
# books in general can and will be able to perform these steps given that 
# the tooling is open-source (as are the instruction manuals).

# - Challenges are usually not with tooling. Bugs are dealt with by the library
# maintainers very quickly. Problems arise when outcomes are radically different
# to expectations, particulary amplified when the tooling and instructions are followed
# closely. Interesting examples to ponder is if I gave you a million row CSV
# with moview reviews except I assigned the labels using a uniform distribution 
# with a random seed based on the current datetime multiplied by the amount grass blades
# in my backyard normalized to be between 1-5. So in essence, one cannot rely
# on tooling alone. Garbage in, garbage out.