In [2]:
import numpy as np

from utils import DatasetReader
from transformers import AutoTokenizer, AutoModel, RobertaModel, RobertaTokenizerFast
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

## Load embeddings

In [3]:
class Model(nn.Module):
    """Text encoder: a pretrained RoBERTa backbone followed by two stacked LSTM blocks.

    ``forward`` tokenizes raw strings, runs them through the transformer, passes the
    per-token states through ``lstm_blocks_1`` and ``lstm_blocks_2``, and returns the
    final hidden state of every layer of ``lstm_blocks_2`` concatenated per sample.
    """

    def __init__(self, pretrained_name='distilroberta-base'):
        """Load the tokenizer and backbone, then build the LSTM head.

        Args:
            pretrained_name: Name or path handed to ``from_pretrained`` for both the
                tokenizer and the RoBERTa model.
        """
        super().__init__()

        self.tokenizer = RobertaTokenizerFast.from_pretrained(pretrained_name)
        self.model = RobertaModel.from_pretrained(pretrained_name)

        # Width of the backbone's token embeddings; it sizes both LSTM blocks.
        self.embedding_size = self.model.embeddings.word_embeddings.embedding_dim

        # First block widens the representation to twice the embedding size ...
        self.lstm_blocks_1 = nn.LSTM(input_size=self.embedding_size, hidden_size=self.embedding_size * 2,
                                     num_layers=10, batch_first=True)

        # ... and the second block narrows it back to the embedding size.
        self.lstm_blocks_2 = nn.LSTM(input_size=self.embedding_size * 2, hidden_size=self.embedding_size,
                                     num_layers=5, batch_first=True)

    def forward(self, x: list[str]):
        """Encode a batch of texts into fixed-size vectors.

        Args:
            x: Texts to encode. A single bare string is treated as a batch of one.

        Returns:
            Tensor of shape ``(batch, embedding_size * lstm_blocks_2.num_layers)``
            whose row ``i`` holds the final hidden states of all ``lstm_blocks_2``
            layers for text ``i``.
        """
        # Without this, len(x) below would count characters rather than samples.
        if isinstance(x, str):
            x = [x]

        encoded_batch = self.tokenizer(x, padding=True, truncation=True, return_tensors='pt')

        # The tokenizer returns CPU tensors; move them next to the backbone's weights
        # so the module keeps working after it has been moved with ``.to(device)``.
        target_device = next(self.model.parameters()).device
        encoded_batch = {name: tensor.to(target_device) for name, tensor in encoded_batch.items()}

        outputs = self.model(**encoded_batch).last_hidden_state

        # NOTE(review): padded positions still flow through both LSTMs, so the final
        # state of a short text depends on the longest text in the batch. Consider
        # packing the sequences with the attention mask if that matters.
        lstm_output, _ = self.lstm_blocks_1(outputs)
        _, (final_hidden, _) = self.lstm_blocks_2(lstm_output)

        # h_n is laid out as (num_layers, batch, hidden) even with batch_first=True.
        # Bring the batch axis to the front before flattening; reshaping directly
        # would interleave hidden states belonging to different samples.
        per_sample = final_hidden.transpose(0, 1)
        return per_sample.reshape(len(x), self.embedding_size * self.lstm_blocks_2.num_layers)


# Smoke test: push a handful of sample sentences through a freshly built model
# and display the shape of the resulting batch of embeddings.
text_batch = [
    "Hello, world!",
    "Transformers are amazing.",
    "One more text",
    "And here are one one more text",
    "ya eby sobak",
]

b = Model()
b(text_batch).shape

torch.Size([5, 3840])

## Load dataset

In [5]:
# Read the cleaned dataset from disk and display it as the cell output.
# NOTE(review): the rendered table shows an "Unnamed: 0" column, which suggests the
# CSV was written together with its index - confirm whether the reader should drop it.
data = DatasetReader().read(
    '../data/csv/clean_data.csv'
)
data

Unnamed: 0,Questions,Topic
0,define the term brand,marketing_mix_and_strategy
1,explain one risk jack ma may have taken when s...,entreprenuers_and_leaders
2,analyse two factors that may have increased de...,market
3,discuss if profit maximisation is the main bus...,entreprenuers_and_leaders
4,assess the advantages of a paternalistic style...,managing_people
...,...,...
259,evaluate the likely value of each of the follo...,meeting_customer_needs
260,evaluate the likely value to mike watson of us...,meeting_customer_needs
261,briefly explain two reasons why levi roots con...,meeting_customer_needs
262,assess the likely implications for reggae regg...,meeting_customer_needs


In [None]:
class TextDataset(Dataset):
    """Map-style dataset that pairs every sample with its label.

    Args:
        data: Sequence of samples; converted to a NumPy array on construction.
        labels: Sequence of labels, one per sample, stored as given.

    Raises:
        ValueError: If ``data`` and ``labels`` do not have the same length.
    """

    def __init__(self, data, labels):
        super().__init__()
        self.data = np.array(data)
        self.labels = labels

        # Fail early: a mismatch would otherwise only surface mid-epoch as an
        # out-of-range lookup in __getitem__.
        if len(self.data) != len(self.labels):
            raise ValueError(
                f"data and labels must have the same length, "
                f"got {len(self.data)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at position ``index``."""
        labels = self.labels
        # pandas objects resolve ``labels[index]`` by index label rather than by
        # position, so use positional access whenever the container offers it.
        label = labels.iloc[index] if hasattr(labels, "iloc") else labels[index]
        return self.data[index], label

## Data preprocessing

## Splitting the data

## Training the model

## Evaluating the model