# Dataset Builder  
This notebook tests the tokenizer, the dataset class, and the DataLoader before they are used in training.


In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import DistilBertTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [2]:
# Load the pretrained DistilBERT tokenizer; this `tokenizer` global is reused by later cells.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Encode one sentence as a sanity check. return_tensors="pt" requests PyTorch
# tensors; the displayed result carries a leading batch axis of size 1.
sample_text = "AI-generated text is becoming harder to detect."
tokens = tokenizer(sample_text, return_tensors="pt")

# Bare last expression so the notebook displays input_ids and attention_mask.
tokens


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


{'input_ids': tensor([[  101,  9932,  1011,  7013,  3793,  2003,  3352,  6211,  2000, 11487,
          1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [3]:
# Preview the processed training split (columns shown: text, label).
# NOTE(review): the displayed "Unnamed: 0" column suggests the CSV was written
# with its index included — confirm against the preprocessing step.
df = pd.read_csv("../data/processed/train.csv")
df.head()


Unnamed: 0,text,label
0,"I’m not sure. I could mean some kind of mean, ...",1
1,Michael Lesy’s Wisconsin Death Trip: Photograp...,0
2,What is the average price of a house in san fr...,0
3,Why don't scientists trust atoms? Because they...,1
4,"ed in, I traced back to him. It ’ s buried, bu...",0


In [5]:
class TextDataset(Dataset):
    """
    Custom PyTorch Dataset for AI vs Human text classification.

    The CSV is read once at construction; each ``__getitem__`` call tokenizes a
    single row, truncating/padding it to ``max_length`` tokens.
    """

    def __init__(self, csv_path, tokenizer, max_length=256):
        """
        Args:
            csv_path (str): Path to the CSV file (train/val/test). It must
                contain a ``text`` column and a ``label`` column.
            tokenizer: Callable tokenizer (DistilBERT in this notebook). It is
                called with ``truncation``, ``padding``, ``max_length`` and
                ``return_tensors`` keyword arguments and must return a mapping
                with ``"input_ids"`` and ``"attention_mask"`` tensors.
            max_length (int): Max token length for DistilBERT input.

        Raises:
            AssertionError: If ``csv_path`` does not exist.
        """
        assert os.path.exists(csv_path), f"File not found: {csv_path}"

        self.data = pd.read_csv(csv_path)
        # astype(str) guarantees the tokenizer always receives a str.
        # Note that a missing text value therefore becomes the literal "nan".
        self.texts = self.data["text"].astype(str).tolist()
        self.labels = self.data["label"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (one sample per CSV row)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Tokenize and return the sample at position ``idx``.

        Return:
            dictionary containing:
                - input_ids: 1-D tensor of length ``max_length``
                - attention_mask: 1-D tensor of length ``max_length``
                - labels: 0-d ``torch.long`` tensor holding the class id
        """
        text = self.texts[idx]
        label = int(self.labels[idx])

        # Tokenize using DistilBERT tokenizer
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        # return_tensors="pt" yields tensors shaped (1, max_length). Drop only
        # that leading batch axis: a bare squeeze() would also remove the
        # sequence axis whenever max_length == 1 and break default collation.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [6]:
# Smoke test: build the dataset over the full training CSV with the tokenizer
# loaded in an earlier cell.
train_dataset = TextDataset("../data/processed/train.csv", tokenizer, max_length=256)

# Show the number of samples together with the first encoded sample.
len(train_dataset), train_dataset[0]


(1519222,
 {'input_ids': tensor([ 101, 1045, 1521, 1049, 2025, 2469, 1012, 1045, 2071, 2812, 2070, 2785,
          1997, 2812, 1010, 2066, 1523, 1996, 2779, 2166, 8487, 1997, 1996, 3586,
          2017, 1524, 1010, 2030, 1523, 1996, 2779, 2166, 8487, 1997, 1037, 3586,
          2017, 1524, 1012, 2052, 2008, 2022, 2017, 1029,  102,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,

In [7]:
def get_dataloaders(
    train_path,
    val_path,
    tokenizer_name="distilbert-base-uncased",
    batch_size=16,
    max_length=256,
    num_workers=2
):
    """
    Build the tokenizer plus the training and validation DataLoaders.

    Progress messages are printed as each stage completes.

    Args:
        train_path (str): CSV path handed to TextDataset for the training split.
        val_path (str): CSV path handed to TextDataset for the validation split.
        tokenizer_name (str): Name passed to DistilBertTokenizerFast.from_pretrained.
        batch_size (int): Batch size used by both loaders.
        max_length (int): Token length forwarded to both datasets.
        num_workers (int): Worker count used by both loaders.

    Returns:
        train_loader, val_loader, tokenizer
    """
    print(f"Loading tokenizer: {tokenizer_name}")
    tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_name)

    print("Creating training dataset...")
    train_split = TextDataset(train_path, tokenizer, max_length=max_length)

    print("Creating validation dataset...")
    val_split = TextDataset(val_path, tokenizer, max_length=max_length)

    # Both loaders share every setting except shuffling: only the training
    # loader shuffles, the validation loader keeps the CSV order.
    shared_options = {
        "batch_size": batch_size,
        "num_workers": num_workers,
        "pin_memory": True,
    }
    train_loader = DataLoader(train_split, shuffle=True, **shared_options)
    val_loader = DataLoader(val_split, shuffle=False, **shared_options)

    print(
        "DataLoaders ready:",
        f"- Train batches: {len(train_loader)}",
        f"- Val batches: {len(val_loader)}",
        sep="\n",
    )

    return train_loader, val_loader, tokenizer


In [8]:
# Minimal loader built directly on the dataset from the earlier cell
# (no num_workers / pin_memory given, so the DataLoader defaults apply).
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Pull a single batch and inspect its keys, the input_ids shape, and the labels.
batch = next(iter(train_loader))
batch.keys(), batch["input_ids"].shape, batch["labels"]


(dict_keys(['input_ids', 'attention_mask', 'labels']),
 torch.Size([4, 256]),
 tensor([1, 1, 1, 0]))

In [9]:
# End-to-end check of get_dataloaders. tokenizer_name and num_workers are not
# passed, so the function's defaults are used.
# Note: this rebinds `train_loader` and `tokenizer` from earlier cells.
train_loader, val_loader, tokenizer = get_dataloaders(
    "../data/processed/train.csv",
    "../data/processed/val.csv",
    batch_size=8,
    max_length=256
)


Loading tokenizer: distilbert-base-uncased




Creating training dataset...
Creating validation dataset...
DataLoaders ready:
- Train batches: 189903
- Val batches: 40694


The dataset and dataloaders are now working correctly.  
The next step is to build the model and train it.
