<a href="https://colab.research.google.com/github/Juxtpawan/AIML/blob/main/pytorch3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip uninstall -y torch torchtext torchvision torchaudio
!pip install torch==2.1.0+cpu torchvision==0.16.0+cpu torchaudio==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu
!pip install torchtext==0.16.0 --no-dependencies

In [None]:
import pandas
import torch
import torchtext
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm

In [None]:
# Define custom dataset class
class CustomDataset(Dataset):
    """Dataset of ``(name, quote, score)`` rows read from a CSV file.

    Indexing returns ``(name, token_ids, score)`` where ``token_ids`` is a
    ``torch.long`` tensor of exactly ``max_seq_length`` entries (the quote's
    token ids, truncated or right-padded with zeros) and ``score`` is a
    ``torch.float`` scalar tensor.

    NOTE: the padding value 0 is also the index of ``<unk>`` in a vocabulary
    built by :meth:`build_vocab` (specials are placed first), so padding and
    unknown tokens are indistinguishable downstream.

    Args:
        data_path: Anything ``pandas.read_csv`` accepts; the CSV must have
            ``name``, ``quote`` and ``score`` columns.
        max_seq_length: Fixed length of every returned token-id tensor.
        vocab: Optional pre-built vocabulary (any object supporting
            ``vocab[token] -> int``). Pass the training set's ``vocab`` when
            creating a validation/test set so token ids agree between them.
            When omitted, a vocabulary is built from this file's quotes.
    """

    def __init__(self, data_path, max_seq_length, vocab=None):
        self.data = []
        self.tokenizer = get_tokenizer('basic_english')
        self.vocab = vocab
        # Remember whether the vocabulary came from the caller; only an
        # internally owned vocabulary is (re)built when data is loaded.
        self._uses_external_vocab = vocab is not None
        self.max_seq_length = max_seq_length

        self.load_data(data_path)

    def load_data(self, data_path):
        """Append the rows of the CSV at ``data_path`` to ``self.data``.

        Rows are stored as plain ``(name, quote, score)`` tuples. Unless a
        vocabulary was supplied to the constructor, the vocabulary is rebuilt
        from all quotes loaded so far.

        Raises:
            KeyError: If a required column is missing from the CSV.
        """
        df = pandas.read_csv(data_path)

        # itertuples avoids building a Series per row, which iterrows does.
        records = df[["name", "quote", "score"]].itertuples(index=False, name=None)
        self.data.extend(records)

        if not self._uses_external_vocab:
            self.build_vocab()

    def build_vocab(self):
        """Build ``self.vocab`` from the tokenized quotes in ``self.data``."""
        token_stream = (self.tokenizer(quote) for _, quote, _ in self.data)
        self.vocab = build_vocab_from_iterator(token_stream, specials=["<unk>"])
        # Without a default index, looking up an unseen token raises; map
        # out-of-vocabulary tokens to <unk> instead.
        self.vocab.set_default_index(self.vocab["<unk>"])

    def __len__(self):
        """Return the number of loaded rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(name, padded_token_ids, score)`` for row ``idx``."""
        name, quote, score = self.data[idx]

        # Truncate first so over-long quotes do not pay for unused lookups.
        tokens = self.tokenizer(quote)[:self.max_seq_length]
        token_ids = [self.vocab[token] for token in tokens]

        # Start from an all-zero (padding) tensor and overwrite the prefix.
        padded_quote = torch.zeros(self.max_seq_length, dtype=torch.long)
        if token_ids:
            padded_quote[:len(token_ids)] = torch.tensor(token_ids, dtype=torch.long)

        score = torch.tensor(score, dtype=torch.float)

        return name, padded_quote, score

In [None]:
# Wrap the datasets in DataLoaders; only the training loader is shuffled.
# NOTE(review): train_dataset / test_dataset are not defined in the cells
# shown here -- presumably CustomDataset instances created elsewhere; confirm.
batch_size = 2
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
)

In [None]:
# Demo: iterate the training loader once and dump each batch's fields,
# one per line. Replace the print with the real training step.
for name, quote, score in train_loader:
    print(name, quote, score, sep="\n")