### *Data Processing Pipeline Using PyTorch*

In [32]:
import torch
import torch.nn as nn
import pandas as pd
from torchvision import transforms
import os

In [33]:
# Data Load
# The dataset directory can be overridden with the DIGIT_DATA_DIR environment
# variable so the notebook is not tied to a single machine; the original
# location is kept as the default so existing runs behave exactly as before.
root_dir = os.environ.get(
    'DIGIT_DATA_DIR',
    '/Users/mahadiur/Desktop/Deep Learning Data Processing/Data_Processing_Pipeline/Data',
)
dataset_path = os.path.join(root_dir, 'DigitDataset.csv')

# First column is the label, the remaining columns are the flattened pixels
# (see the preview rendered below).
digit_data = pd.read_csv(dataset_path)
digit_data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Check Device
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    Device = torch.device('cuda')
else:
    Device = torch.device('cpu')
print(Device)

cpu


In [35]:
# Processing a single example of the dataset
# Row 0 is split into its label (first value, kept as a length-1 array) and
# its pixel values (everything after the label, cast to float).
label, pixels = (
    digit_data.iloc[0].values[:1],
    digit_data.iloc[0].values[1:].astype(float),
)
print(pixels.shape, label.shape, sep='\n')

# Fold the flat pixel vector into 28x28 images; -1 lets NumPy infer the
# leading dimension.
pixels = pixels.reshape(-1, 28, 28)
print(pixels.shape)

(784,)
(1,)
(1, 28, 28)


In [36]:
# Transformation
# ToTensor turns a PIL image / (H, W, C) ndarray into a (C, H, W) tensor
# (rescaling to [0, 1] only for uint8 input); Normalize then standardises the
# single grayscale channel as (x - mean) / std.
Tranformations = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        # One-element tuples: `(0.1307)` without the trailing comma is just a
        # float, whereas Normalize is documented to take one value per channel.
        mean=(0.1307,),
        # 0.3081 is the commonly used MNIST standard deviation that pairs with
        # the 0.1307 mean; the previous 0.3181 looked like a transposed digit.
        std=(0.3081,)
    )
])

### *Data Processing Pipeline*

In [37]:
from torch.utils.data import Dataset

class DataPipeline(Dataset):
    """Map-style dataset over a CSV file of flattened grayscale images.

    Every CSV row is expected to hold the label in its first column followed
    by the pixel values of one image in row-major order. Pixel values are
    divided by 255, i.e. they are assumed to be 8-bit intensities.

    Args:
        file_path: Path of the CSV file, loaded eagerly with ``pd.read_csv``.
        transformation: Optional callable applied to every image. It is
            called with a ``float32`` NumPy array of shape ``(H, W, 1)``
            scaled by 1/255, which is a layout
            ``torchvision.transforms.ToTensor`` accepts. Pass ``None`` to get
            the scaled image back as a ``(1, H, W)`` tensor without further
            processing.
        image_shape: ``(height, width)`` of one image. Defaults to
            ``(28, 28)``.
    """

    def __init__(self, file_path, transformation, image_shape=(28, 28)):
        super().__init__()
        self.data = pd.read_csv(file_path)
        # Preview of the loaded frame, kept so the notebook output is unchanged.
        print(self.data.head())
        self.transformation = transformation
        self.image_shape = tuple(image_shape)

    # Dataset length
    def __len__(self):
        """Return the number of rows (examples) in the CSV."""
        return len(self.data)

    # Process Data
    def __getitem__(self, index):
        """Return ``(pixels, label)`` for the example at row position ``index``.

        Returns:
            pixels: The transformed image; a ``float32`` tensor of shape
                ``(1, H, W)`` when no transformation is set.
            label: ``int64`` tensor of shape ``(1,)``.
        """
        # .iloc selects by row position. Plain ``self.data[index]`` is a
        # column lookup on a DataFrame and raises KeyError for integer indices.
        row = self.data.iloc[index].to_numpy()
        label = torch.tensor(row[0:1].astype('int64'))

        height, width = self.image_shape
        # Channel-last ndarray: ToTensor rejects torch.Tensor input, so the
        # image is handed to the transformation as (H, W, 1) NumPy data.
        image = (row[1:].astype('float32') / 255).reshape(height, width, 1)

        if self.transformation is not None:
            # Use the transformation given to this instance, not a global.
            pixels = self.transformation(image)
        else:
            pixels = torch.from_numpy(image).permute(2, 0, 1).contiguous()
        return pixels, label
    

# Build the full dataset from the CSV with the normalisation pipeline
# attached, then report how many examples it holds.
dataset = DataPipeline(dataset_path, Tranformations)
print(len(dataset))

   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8  ...  pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  \
0       0  ...         0         0         0         0         0         0   
1       0  ...         0         0         0         0         0         0   
2       0  ...         0         0         0         0         0         0   
3       0  ...         0         0         0         0         0         0   
4       0  ...         0         0         0         0         0         0   

   pixel780  pixel781  pixel782  pixel783  
0         0         0         

### *Train-Validation-Test Split*

In [38]:
# TrainDataset > BatchSize > Step
# All Batch run > 1 epoch
from torch.utils.data import random_split

# Seed the global RNG so the random split is reproducible.
torch.manual_seed(50)

# 70% train / 15% validation; the test split takes whatever remains so the
# three sizes always add up to len(dataset).
train_size, validation_size = int(0.7 * len(dataset)), int(0.15 * len(dataset))
test_size = len(dataset) - train_size - validation_size

Train_dataset, Validation_dataset, Test_dataset = random_split(
    dataset, [train_size, validation_size, test_size]
)

print(len(Train_dataset), len(Test_dataset), len(Validation_dataset), sep='\n')

29399
6301
6300


### **Train, Test & Validation Dataloader**

In [39]:
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass over the loader.
Train_Dataloader = DataLoader(dataset=Train_dataset, batch_size=32, shuffle=True)

# Evaluation loaders keep a fixed order.
Test_Dataloader = DataLoader(dataset=Test_dataset, batch_size=32, shuffle=False)

Validation_Dataloader = DataLoader(dataset=Validation_dataset, batch_size=32, shuffle=False)