In [3]:
import torch
import torch.nn as nn
import pandas as pd
import spacy
from torch.utils.data import Dataset, DataLoader

In [None]:
# Toy corpus: three short sentences held in a single 'Sentence' column
data = dict(
    Sentence=[
        'Hello, how are you?',
        'I am learning about AI!',
        'Transformers are interesting.',
    ]
)

# Wrap the corpus in a DataFrame and persist it as CSV (row index omitted)
df = pd.DataFrame(data)
df.to_csv("sentences.csv", index=False)

In [7]:
# Load the spaCy English pipeline; the tokenizer used below is taken from it
nlp = spacy.load("en_core_web_sm")

# spaCy-backed tokenization helper
def tokenize(text):
    """Split `text` with the spaCy tokenizer and return the lower-cased token strings as a list."""
    lowered = []
    for tok in nlp.tokenizer(text):
        lowered.append(tok.text.lower())
    return lowered

# Example sentences, keyed by the column name they will occupy in the frame
data = {
    'Sentence': [
        'Hello, how are you?',
        'I am learning about AI!',
        'Transformers are interesting.',
    ],
}
# Each dict key becomes a column; the frame is also written to disk without its index
df = pd.DataFrame.from_dict(data)
df.to_csv(path_or_buf="sentences.csv", index=False)

# Build the token -> index vocabulary by hand
def build_vocab(dataframe, column='Sentence', tokenizer=None):
    """Map every distinct token found in a text column to a unique integer id.

    Ids are handed out in order of first appearance, starting at 0, so the
    result is deterministic for a given row order.

    Args:
        dataframe: Object indexable by column name (e.g. a pandas DataFrame)
            whose `column` entry iterates over text strings.
        column: Name of the column holding the text. Defaults to 'Sentence'.
        tokenizer: Callable taking one string and returning an iterable of
            tokens. When None, the module-level `tokenize` function is used.

    Returns:
        dict mapping each token to its integer id.

    Raises:
        KeyError: If `column` is not present in `dataframe`.
    """
    if tokenizer is None:
        # Looked up at call time so the default follows the current `tokenize`.
        tokenizer = tokenize

    vocab = {}
    for sentence in dataframe[column]:
        for token in tokenizer(sentence):
            # setdefault keeps the id of an already-seen token; a new token
            # receives the next free id (the current vocabulary size).
            vocab.setdefault(token, len(vocab))
    return vocab

# Embedding dimension setting (independent of the vocabulary built below)
embedding_dim = 64

# Token -> index mapping for the corpus, plus the number of distinct tokens
vocab = build_vocab(df)
vocab_size = len(vocab)


In [10]:
# Display the token -> index mapping built by build_vocab
vocab

{'hello': 0,
 ',': 1,
 'how': 2,
 'are': 3,
 'you': 4,
 '?': 5,
 'i': 6,
 'am': 7,
 'learning': 8,
 'about': 9,
 'ai': 10,
 '!': 11,
 'transformers': 12,
 'interesting': 13,
 '.': 14}

In [11]:

# Dataset that serves each sentence as a tensor of vocabulary ids
class CustomDataset(Dataset):
    """Torch dataset over the 'Sentence' column of a DataFrame.

    Each item is the sentence at the given row position, tokenized with
    `tokenize` and converted to a 1-D `torch.long` tensor of vocabulary ids.
    """

    def __init__(self, dataframe, vocab):
        """Keep references to the sentence frame and the token -> id mapping."""
        self.vocab = vocab
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows (sentences) in the frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the ids of the sentence at positional index `idx`.

        Raises KeyError if a token of the sentence is missing from the vocabulary.
        """
        row = self.dataframe.iloc[idx]
        sentence = row['Sentence']
        ids = []
        for tok in tokenize(sentence):
            ids.append(self.vocab[tok])
        return torch.tensor(ids, dtype=torch.long)

# Wrap the sentence frame in the dataset, then serve it one sample per batch in shuffled order
dataset = CustomDataset(dataframe=df, vocab=vocab)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=1)

In [14]:
# Pull the first batch from a fresh pass over the loader and display it
batch_iter = iter(dataloader)
token = next(batch_iter)
token

tensor([[12,  3, 13, 14]])

In [16]:
# `vocab` maps token -> index, so indexing it with an integer id raises KeyError.
# Invert the mapping to look a token up by its id instead.
index_to_token = {index: word for word, index in vocab.items()}
index_to_token[12]

'transformers'