In [1]:
from sklearn.datasets import make_classification
import torch

## **Creating a synthetic classification dataset**

In [4]:
# Generate a tiny two-class toy problem: 10 samples with 2 features each.
# x receives the feature matrix and y the class labels.
x, y = make_classification(
    n_samples=10,
    n_classes=2,
    n_features=2,
    # number of informative features
    n_informative=2,
    # number of redundant features
    n_redundant=0,
    random_state=38,
)

In [10]:
# Display the feature matrix.
# NOTE(review): the saved output below is a torch tensor, and this cell's
# execution count (In [10]) is later than the conversion cell's (In [8]), so it
# appears to have been re-run after `x` was converted. On a fresh
# Restart & Run All, `x` is presumably still the array returned by
# make_classification at this point -- confirm and re-run to refresh the output.
x

tensor([[ 0.0521, -0.3096],
        [ 1.5251,  0.8699],
        [ 2.1065,  1.2934],
        [-2.3315,  0.7055],
        [-0.9892, -0.8807],
        [-1.2207,  0.7804],
        [ 2.8130,  1.5992],
        [-0.4337,  0.0847],
        [-2.7708,  0.7429],
        [ 1.7008, -0.5261]])

In [7]:
# Display the class labels produced by make_classification.
y

array([0, 0, 1, 1, 0, 1, 1, 0, 1, 0])

In [8]:
# Convert features and labels to float32 tensors.
# torch.as_tensor is used instead of torch.tensor so this cell is safe to
# re-run: when x / y are already float32 tensors they are returned as-is,
# whereas torch.tensor(<tensor>) emits a copy-construct UserWarning.
x = torch.as_tensor(x, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.float32)

In [11]:
# Display the features after conversion to a tensor.
x

tensor([[ 0.0521, -0.3096],
        [ 1.5251,  0.8699],
        [ 2.1065,  1.2934],
        [-2.3315,  0.7055],
        [-0.9892, -0.8807],
        [-1.2207,  0.7804],
        [ 2.8130,  1.5992],
        [-0.4337,  0.0847],
        [-2.7708,  0.7429],
        [ 1.7008, -0.5261]])

In [12]:
# Display the labels after conversion to a float tensor.
y

tensor([0., 0., 1., 1., 0., 1., 1., 0., 1., 0.])

## **Dataset & DataLoader**

**Dataset**

In [13]:
from torch.utils.data import Dataset, DataLoader

In [14]:
class CustomDataset(Dataset):
  """Map-style dataset that pairs each feature row with its label.

  Args:
    features: Indexable collection of samples exposing ``.shape``; sample
      ``i`` is ``features[i]`` and ``features.shape[0]`` is the row count.
    labels: Indexable, sized collection of targets aligned with ``features``.

  Raises:
    ValueError: If ``features`` and ``labels`` hold a different number of rows.
  """

  def __init__(self, features, labels):
    # Fail fast on misaligned inputs instead of surfacing an IndexError (or
    # silently ignoring surplus labels) in the middle of an epoch.
    n_features, n_labels = features.shape[0], len(labels)
    if n_features != n_labels:
      raise ValueError(
          f"features and labels must have the same number of rows, "
          f"got {n_features} and {n_labels}"
      )
    self.features = features
    self.labels = labels

  def __len__(self):
    """Return the number of samples (rows) in the dataset."""
    return self.features.shape[0]

  def __getitem__(self, index):
    """Return the ``(features, label)`` pair stored at ``index``."""
    # Per-sample transformations can be applied here before returning.
    return self.features[index], self.labels[index]

In [15]:
# Wrap the feature and label tensors so they can be indexed sample by sample.
dataset = CustomDataset(features=x, labels=y)

In [18]:
# Sanity check: dataset size via __len__ and one sample via __getitem__.
print(
    f"No. of rows : {len(dataset)}",
    f"3rd row  : {dataset[2]}",
    sep="\n",
)

No. of rows : 10
3rd row  : (tensor([2.1065, 1.2934]), tensor(1.))


**DataLoader**

In [19]:
# Serve the dataset in shuffled mini-batches of 2 samples.
# The shuffle is driven by an explicitly seeded generator (same seed as the
# random_state used for make_classification) so that Restart & Run All yields
# the same batch order every time instead of a different one on each run.
dataloader = DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    generator=torch.Generator().manual_seed(38),
)

**Other important DataLoader parameters**

**dataset (mandatory):**  
The Dataset from which the DataLoader will pull data.
Must be a subclass of `torch.utils.data.Dataset` that implements `__getitem__` and
`__len__`.

**batch_size:**  
How many samples per batch to load.
Default is 1.
Larger batch sizes can speed up training on GPUs but require more memory.  

**shuffle:**  
If True, the DataLoader will shuffle the dataset indices each epoch.
Helpful to avoid the model becoming too dependent on the order of samples.

**num_workers:**  
The number of worker processes used to load data in parallel.  
Setting num_workers > 0 can speed up data loading by leveraging multiple CPU
cores, especially if I/O or preprocessing is a bottleneck.

**drop_last:**  
If True, the DataLoader will drop the last incomplete batch if the total number of samples is not divisible by the batch size.  
Useful when exact batch sizes are required (for example, in some batch
normalization scenarios).

**collate_fn:**  
A callable that processes a list of samples into a batch (the default simply stacks tensors).  
Custom collate_fn can handle variable-length sequences, perform custom batching
logic, or handle complex data structures.

**sampler / batch_sampler:**  
`sampler` defines the strategy for drawing samples (e.g., for handling imbalanced
classes, or custom sampling strategies).  
`batch_sampler` works at the batch level, controlling how batches are formed.

**Creating Batches**

In [26]:
# Walk through one epoch and print every mini-batch.
# enumerate(start=1) supplies the 1-based batch number, replacing the
# hand-maintained counter; each item from the loader unpacks into
# (features, labels).
for batch_number, (batch_features, batch_labels) in enumerate(dataloader, start=1):
  print(f"Batch {batch_number}")
  print(f"\nFeatures  : {batch_features}")
  print(f"\nLabels  : {batch_labels}")
  print("-"*35)

Batch 1

Features  : tensor([[-0.4337,  0.0847],
        [-0.9892, -0.8807]])

Labels  : tensor([0., 0.])
-----------------------------------
Batch 2

Features  : tensor([[ 2.1065,  1.2934],
        [-2.3315,  0.7055]])

Labels  : tensor([1., 1.])
-----------------------------------
Batch 3

Features  : tensor([[ 0.0521, -0.3096],
        [ 1.7008, -0.5261]])

Labels  : tensor([0., 0.])
-----------------------------------
Batch 4

Features  : tensor([[ 1.5251,  0.8699],
        [-1.2207,  0.7804]])

Labels  : tensor([0., 1.])
-----------------------------------
Batch 5

Features  : tensor([[-2.7708,  0.7429],
        [ 2.8130,  1.5992]])

Labels  : tensor([1., 1.])
-----------------------------------
