# Data Representation

### --- Tabular Data
### --- Image Data
### --- Text Data
### --- Dataset and DataLoaders

In [59]:
import os

import numpy as np
import pandas as pd
import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [3]:
torch.__version__

'1.4.0'

In [4]:
# torch.cuda.is_available is a function and has to be *called*: the bare
# function object is always truthy, which would select 'cuda:0' even on a
# machine without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

### Tabular Data

Tabular data is usually stored in a csv file. This csv file may contain values other than numbers, which PyTorch tensors do not accept: PyTorch tensors only work with numerical values. So, we need to convert the attributes to numerical values and then convert them to PyTorch tensors.

In [60]:
#read data
data = pd.read_csv('./../0. Data/ILPD.csv', header=None)

In [61]:
#See data information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
0     583 non-null int64
1     583 non-null object
2     583 non-null float64
3     583 non-null float64
4     583 non-null int64
5     583 non-null int64
6     583 non-null int64
7     583 non-null float64
8     583 non-null float64
9     579 non-null float64
10    583 non-null int64
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [62]:
#We can see that attribute 1 is not numeric, let's see what is it
data[1].unique()

array(['Female', 'Male'], dtype=object)

In [63]:
# Attribute 1 only takes the values 'Female' and 'Male', so encode it as 0/1.
# The explicit astype() guarantees an integer column: replace() on an object
# column otherwise depends on implicit downcasting, which newer pandas
# deprecates, and an object column would make data.values an object array
# that torch.tensor() cannot convert.
# Re-running this cell is safe: values that are already 0/1 are left untouched.
data[1] = data[1].replace({'Female': 0, 'Male': 1}).astype('int64')

In [64]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
0     583 non-null int64
1     583 non-null int64
2     583 non-null float64
3     583 non-null float64
4     583 non-null int64
5     583 non-null int64
6     583 non-null int64
7     583 non-null float64
8     583 non-null float64
9     579 non-null float64
10    583 non-null int64
dtypes: float64(5), int64(6)
memory usage: 50.2 KB


In [65]:
#There is one more issue: some values in attribute 9 are null, so we fill them with zeros.
#NOTE(review): zero-filling is the simplest option; confirm it is an acceptable imputation for this attribute.
data = data.fillna(0)

In [66]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
0     583 non-null int64
1     583 non-null int64
2     583 non-null float64
3     583 non-null float64
4     583 non-null int64
5     583 non-null int64
6     583 non-null int64
7     583 non-null float64
8     583 non-null float64
9     583 non-null float64
10    583 non-null int64
dtypes: float64(5), int64(6)
memory usage: 50.2 KB


In [67]:
#Convert the pandas dataframe to a tensor by extracting its values and then using the tensor() function
t = torch.tensor(data.values, dtype=torch.float)

In [68]:
t

tensor([[65.0000,  0.0000,  0.7000,  ...,  3.3000,  0.9000,  1.0000],
        [62.0000,  1.0000, 10.9000,  ...,  3.2000,  0.7400,  1.0000],
        [62.0000,  1.0000,  7.3000,  ...,  3.3000,  0.8900,  1.0000],
        ...,
        [52.0000,  1.0000,  0.8000,  ...,  3.2000,  1.0000,  1.0000],
        [31.0000,  1.0000,  1.3000,  ...,  3.4000,  1.0000,  1.0000],
        [38.0000,  1.0000,  1.0000,  ...,  4.4000,  1.5000,  2.0000]])

In [69]:
t.shape

torch.Size([583, 11])

In [71]:
data_new = t.numpy()

In [74]:
data_new =  pd.DataFrame(data_new)

In [76]:
data_new.to_csv('./../0. Data/ILPD_new.csv',index=False)

In [77]:
data_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,65.0,0.0,0.700000,0.100000,187.0,16.0,18.0,6.8,3.3,0.90,1.0
1,62.0,1.0,10.900000,5.500000,699.0,64.0,100.0,7.5,3.2,0.74,1.0
2,62.0,1.0,7.300000,4.100000,490.0,60.0,68.0,7.0,3.3,0.89,1.0
3,58.0,1.0,1.000000,0.400000,182.0,14.0,20.0,6.8,3.4,1.00,1.0
4,72.0,1.0,3.900000,2.000000,195.0,27.0,59.0,7.3,2.4,0.40,1.0
5,46.0,1.0,1.800000,0.700000,208.0,19.0,14.0,7.6,4.4,1.30,1.0
6,26.0,0.0,0.900000,0.200000,154.0,16.0,12.0,7.0,3.5,1.00,1.0
7,29.0,0.0,0.900000,0.300000,202.0,14.0,11.0,6.7,3.6,1.10,1.0
8,17.0,1.0,0.900000,0.300000,202.0,22.0,19.0,7.4,4.1,1.20,2.0
9,55.0,1.0,0.700000,0.200000,290.0,53.0,58.0,6.8,3.4,1.00,1.0


In [78]:
data_new2 = pd.read_csv('./../0. Data/ILPD_new.csv')
data_new2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,65.0,0.0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.90,1.0
1,62.0,1.0,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1.0
2,62.0,1.0,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1.0
3,58.0,1.0,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.00,1.0
4,72.0,1.0,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.40,1.0
5,46.0,1.0,1.8,0.7,208.0,19.0,14.0,7.6,4.4,1.30,1.0
6,26.0,0.0,0.9,0.2,154.0,16.0,12.0,7.0,3.5,1.00,1.0
7,29.0,0.0,0.9,0.3,202.0,14.0,11.0,6.7,3.6,1.10,1.0
8,17.0,1.0,0.9,0.3,202.0,22.0,19.0,7.4,4.1,1.20,2.0
9,55.0,1.0,0.7,0.2,290.0,53.0,58.0,6.8,3.4,1.00,1.0


### Image Data

A colored image consists of 3 channels (R, G, B), so a single image is a 3-dimensional array (height × width × channels).

An image is represented as a collection of scalars arranged in a regular grid, having
a height and a width (in pixels). You might have a single scalar per grid point (the
pixel), which would be represented as a grayscale image, or multiple scalars per grid
point, which typically represent different colors or different features, such as depth
from a depth camera.

In [124]:
#read image
im = np.array(Image.open('./../0. Data/image.png'))

In [125]:
im.shape

(1080, 1080, 4)

In [131]:
#Height = 1080
#Width = 1080
#Channels = 4
# 4th channel contains only 255, so, we can ignore it
im = im[:,:,0:3]

In [132]:
im.shape

(1080, 1080, 3)

In [133]:
#pytorch expects channels in image at the beginning, so we transpose it accordingly
im = im.transpose((2,0,1))

In [134]:
im.shape

(3, 1080, 1080)

In [135]:
t = torch.tensor(im)
t.shape

torch.Size([3, 1080, 1080])

To create a dataset, the tensors corresponding to different images can be stacked into a single tensor (e.g. with `torch.stack`).

### Text Data

Text data also needs to be converted to numerical data before a tensor can be created from it. Some of the methods are:

One hot encoding

CBoW

GloVe

Language Model Embedding (e.g. BERT, ELMo etc.)



### Dataset and DataLoaders

Now that we know how to convert various kinds of data to PyTorch tensors, let us look at how to put this data into a form that is much easier to manage. PyTorch provides two powerful classes that make handling data very easy. The two classes are:

torch.utils.data.Dataset ------> An abstract class for representing a dataset.

torch.utils.data.DataLoader ---> Wraps a dataset and provides access to the underlying data. 



In [89]:
#Let us now define a class for tabular data saved in csv_file
class MyData(Dataset):
    """Dataset over a csv table whose last column is the label.

    Each row of the table is one sample: every column except the last one
    is a feature and the last column is the label.

    Args:
        csv_file: Name of the csv file, relative to ``root_dir``.
        root_dir: Directory containing ``csv_file``. A trailing path
            separator is optional.
        transform: Optional callable applied to every sample dict before
            it is returned from ``__getitem__``.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        # Read data from the csv file using pandas. os.path.join inserts the
        # separator when root_dir does not end with one; plain string
        # concatenation would silently build a wrong path in that case.
        self.data = pd.read_csv(os.path.join(root_dir, csv_file))
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at positional index ``idx``.

        Returns:
            A dict with the keys 'features' (all columns but the last) and
            'label' (the last column), or whatever ``transform`` returns
            for that dict when a transform was given.
        """
        # If the index is a tensor, convert it to a plain Python value first
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Features: every column except the last one
        features = self.data.iloc[idx, :-1]
        # Label: the last column
        label = self.data.iloc[idx, -1]
        sample = {'features': features, 'label': label}

        # Apply the transformation, if any
        if self.transform:
            sample = self.transform(sample)

        return sample


In [110]:
#class to convert data to tensors, to be used in transform
class ToTensor(object):
    """Callable transform that turns both entries of a sample into tensors."""

    def __call__(self, sample):
        """Return a new dict whose 'features' and 'label' values are the
        result of ``torch.tensor`` applied to the matching entries of
        ``sample``.
        """
        return {
            field: torch.tensor(sample[field])
            for field in ('features', 'label')
        }

In [111]:
#Use our custom dataset class to obtain the train data
train_data = MyData(csv_file = 'ILPD_new.csv',
                    root_dir = './../0. Data/',
                    transform = transforms.Compose([
                        ToTensor() ]) )

In [112]:
# Wrap the dataset in a DataLoader so it can be consumed in shuffled
# mini-batches of 32 samples.
train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=32)

In [117]:
#we can now obtain data in batches of size 32
sample = next(iter(train_loader))

In [119]:
sample['features'].shape

torch.Size([32, 10])

In [120]:
sample['label'].shape

torch.Size([32])

In [123]:
#Now, let's iterate over the whole dataset
for i, sample in enumerate(train_loader):
    print('Batch : ',i)
    print(sample['features'].shape)
    print(sample['label'].shape)

Batch :  0
torch.Size([32, 10])
torch.Size([32])
Batch :  1
torch.Size([32, 10])
torch.Size([32])
Batch :  2
torch.Size([32, 10])
torch.Size([32])
Batch :  3
torch.Size([32, 10])
torch.Size([32])
Batch :  4
torch.Size([32, 10])
torch.Size([32])
Batch :  5
torch.Size([32, 10])
torch.Size([32])
Batch :  6
torch.Size([32, 10])
torch.Size([32])
Batch :  7
torch.Size([32, 10])
torch.Size([32])
Batch :  8
torch.Size([32, 10])
torch.Size([32])
Batch :  9
torch.Size([32, 10])
torch.Size([32])
Batch :  10
torch.Size([32, 10])
torch.Size([32])
Batch :  11
torch.Size([32, 10])
torch.Size([32])
Batch :  12
torch.Size([32, 10])
torch.Size([32])
Batch :  13
torch.Size([32, 10])
torch.Size([32])
Batch :  14
torch.Size([32, 10])
torch.Size([32])
Batch :  15
torch.Size([32, 10])
torch.Size([32])
Batch :  16
torch.Size([32, 10])
torch.Size([32])
Batch :  17
torch.Size([32, 10])
torch.Size([32])
Batch :  18
torch.Size([7, 10])
torch.Size([7])
