### Imports

In [None]:
!pip install datasets transformers torch  # install required packages
!pip install transformers faiss-cpu

import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import math
from datasets import load_dataset
import torch.utils.data as d
from transformers import BertTokenizer, BertModel
from torchtext.data import get_tokenizer

from torch import nn as nn
from typing import List, Tuple, Any, Optional
from tqdm import tqdm


### Parameters

In [2]:
BATCH_SIZE=16
NUM_BATCHES=64
HIDDEN_DIM=768
SEQ_LEN=20

### Dataset Dataloader modified (corrected number of batches and seqlen)

In [4]:

class MyIterableDataset(d.IterableDataset):
    def __init__(self, dataset, tokenizer, seq_len, article_indices):
        super(MyIterableDataset).__init__()
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.article_indices = article_indices
        self.num_articles = len(article_indices)

    def __iter__(self):
        def helper(start, end):
            for i in range(start, end):
                article = self.dataset[i]["text"]
                tokenized = self.tokenizer(article)
                length = len(tokenized)
                index = 0
                while index < length:
                    yield tokenized[index:index + self.seq_len]
                    index += self.seq_len

        worker_info = d.get_worker_info()
        if worker_info is None:
            start = 0
            end = self.num_articles
        else:
            per_worker = int(math.ceil(self.num_articles / float(worker_info.num_workers)))
            worker_id = worker_info.id
            start = worker_id * per_worker
            end = min(start + per_worker, self.num_articles)
        return helper(start, end)

def give_dataloader_words(development=True, batch_size=16,num_batches=64):
    if development:
        wiki_huggingface_dataset = load_dataset("wikipedia", "20220301.simple")["train"]
    else:
        wiki_huggingface_dataset = load_dataset("wikipedia", "20220301.en")["train"]

    if num_batches is None:
        article_indices = range(wiki_huggingface_dataset.num_rows)
    else:
        article_indices = range(num_batches * batch_size)
    tokenizer = get_tokenizer("basic_english")
    ds = MyIterableDataset(wiki_huggingface_dataset, tokenizer, 20, article_indices=range(wiki_huggingface_dataset.num_rows))
    return d.DataLoader(ds, batch_size=batch_size, collate_fn=lambda x: x)

# Example usage:

data_loader = give_dataloader_words(development=True, batch_size=BATCH_SIZE,num_batches=NUM_BATCHES)

sample = next(iter(data_loader))
num_batches = sum(1 for _ in data_loader)
# Print the shape of the sample batch to verify the dimensions
print("Number of batches in a dataloader:", num_batches)
print("Batch Size", len(sample))
print("Seq Length:", len(sample[0]))
print("Sample", sample)

Number of batches in a dataloader: 132011
Batch Size 16
Seq Length: 20
Sample [['april', 'is', 'the', 'fourth', 'month', 'of', 'the', 'year', 'in', 'the', 'julian', 'and', 'gregorian', 'calendars', ',', 'and', 'comes', 'between', 'march', 'and'], ['may', '.', 'it', 'is', 'one', 'of', 'four', 'months', 'to', 'have', '30', 'days', '.', 'april', 'always', 'begins', 'on', 'the', 'same', 'day'], ['of', 'week', 'as', 'july', ',', 'and', 'additionally', ',', 'january', 'in', 'leap', 'years', '.', 'april', 'always', 'ends', 'on', 'the', 'same', 'day'], ['of', 'the', 'week', 'as', 'december', '.', 'april', "'", 's', 'flowers', 'are', 'the', 'sweet', 'pea', 'and', 'daisy', '.', 'its', 'birthstone', 'is'], ['the', 'diamond', '.', 'the', 'meaning', 'of', 'the', 'diamond', 'is', 'innocence', '.', 'the', 'month', 'april', 'comes', 'between', 'march', 'and', 'may', ','], ['making', 'it', 'the', 'fourth', 'month', 'of', 'the', 'year', '.', 'it', 'also', 'comes', 'first', 'in', 'the', 'year', 'out', 'o

### Dataloader with Bert Tokenizer

In [None]:
import torch
import torch.utils.data as d
from transformers import BertTokenizer, BertModel, BertConfig
from datasets import load_dataset
import math

class MyIterableDataset(d.IterableDataset):
    def __init__(self, dataset, tokenizer, embedding_layer, seq_len, article_indices):
        super(MyIterableDataset).__init__()
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.embedding_layer = embedding_layer
        self.seq_len = seq_len
        self.article_indices = article_indices

    def __iter__(self):
        def helper(start, end):
            for i in range(start, end):
                article = self.dataset[i]["text"]
                tokenized = self.tokenizer(article, padding='max_length', truncation=True, max_length=self.seq_len, return_tensors='pt')

                # Get embeddings from the embedding layer
                input_ids = tokenized['input_ids']
                with torch.no_grad():
                    embeddings = self.embedding_layer(input_ids).squeeze(0).numpy()  # [seq_len, hidden_size]

                yield embeddings

        worker_info = d.get_worker_info()
        if worker_info is None:
            start = 0
            end = len(self.article_indices)
        else:
            per_worker = int(math.ceil(len(self.article_indices) / float(worker_info.num_workers)))
            worker_id = worker_info.id
            start = worker_id * per_worker
            end = min(start + per_worker, len(self.article_indices))
        return helper(start, end)

def give_dataloader(development=True, batch_size=64, seq_len=20, num_batches=None):
    if development:
        wiki_huggingface_dataset = load_dataset("wikipedia", "20220301.simple")["train"]
    else:
        wiki_huggingface_dataset = load_dataset("wikipedia", "20220301.en")["train"]

    if num_batches is None:
        article_indices = range(wiki_huggingface_dataset.num_rows)
    else:
        article_indices = range(num_batches * batch_size)

    # Load tokenizer and embedding layer ### CHANGE TOKENIZER IF U WISH TO HAVE SMALLER EMBEDDING DIM THAN 768
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    config = BertConfig.from_pretrained('bert-base-uncased')
    embedding_layer = BertModel(config).embeddings.word_embeddings  # Only load the embedding layer

    ds = MyIterableDataset(wiki_huggingface_dataset, tokenizer, embedding_layer, seq_len, article_indices=article_indices)
    return d.DataLoader(ds, batch_size=batch_size, collate_fn=lambda x: x)


data_loader = give_dataloader(development=True, batch_size=BATCH_SIZE, num_batches=NUM_BATCHES)

sample = next(iter(data_loader))
num_batches = sum(1 for _ in data_loader)
# Print the shape of the sample batch to verify the embedding dimensions
print("Number of batches in a dataloader:", num_batches)
print("Batch Size", len(sample))
print("Seq Length:", len(sample[0]))
print("Embedding size (hidden size):", len(sample[0][0]))  # Expecting: (number of batches, seq_len, hidden_size)
