In [None]:
import torch
import numpy as np
import pandas as pd
import os

from torch.utils.data import Dataset, DataLoader, TensorDataset, WeightedRandomSampler, random_split
from torchvision import transforms
from matplotlib.pyplot import imshow

# Simple Datasets

In [2]:
# create a tensor dataset and a data loader

# 5 samples with 2 features each, and one integer class label per sample
X, y = torch.ones(5, 2), torch.tensor([0, 1, 1, 2, 1])

# dataset: pairs each row of X with the label at the same position in y
dataset = TensorDataset(X, y)

# sampler
# WeightedRandomSampler expects ONE weight PER SAMPLE (len(weights) == len(dataset));
# the indices it yields are positions in `weights`. Passing one weight per class
# would restrict sampling to the first `n_classes` indices.
# Each sample gets the inverse of its class count, so every class is drawn
# with (roughly) the same probability regardless of how many members it has.
_classes, sample_class, class_counts = torch.unique(
    dataset.tensors[1], return_inverse=True, return_counts=True
)
weights = 1.0 / class_counts[sample_class].double()
sampler = WeightedRandomSampler(weights, num_samples=len(dataset), replacement=True)

# data loader
# `sampler` and `shuffle=True` are mutually exclusive, hence two separate examples;
# the second assignment replaces the first.
dataloader = DataLoader(dataset, batch_size=16, sampler=sampler)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True) # alternative

In [None]:
# split the dataset

# the lengths are given as fractions of the dataset (they sum to 1);
# the resulting list of subsets is the cell's displayed output
random_split(dataset=dataset, lengths=[0.3, 0.3, 0.4])

In [4]:
# create a dataset

class my_set(Dataset):
    """Toy dataset of constant tensors.

    Holds ``length`` samples; every feature row is ``[1., 1.]`` and every
    target is ``[1.]``.

    Args:
        length: number of samples to create.
        transform: optional callable applied to the ``(x, y)`` tuple of a
            sample before it is returned; skipped when falsy.
    """

    def __init__(self, length = 100, transform = None):
        self.transform = transform
        self.len = length
        self.x = torch.ones(length, 2)
        self.y = torch.ones(length, 1)

    def __len__(self):
        """Return the number of samples given at construction time."""
        return self.len

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair at ``index``, transformed if a transform is set."""
        features, target = self.x[index], self.y[index]
        if not self.transform:
            return features, target
        return self.transform((features, target))

In [None]:
# dataset functionalities

my_dataset = my_set()

# one item per line: the object's default repr, the sample at index 5,
# and the number of samples
print(my_dataset, my_dataset[5], len(my_dataset), sep="\n")

# a dataset can be iterated directly; each item unpacks into (x, y)
for x, y in my_dataset:
    pass

In [None]:
# transforms

class add_mult(object):
    """Callable transform for an ``(x, y)`` sample.

    Adds ``addx`` to ``x`` and multiplies ``y`` by ``muly``, returning new
    values and leaving the inputs untouched.

    Args:
        addx: value added to the first element of the sample.
        muly: factor the second element of the sample is multiplied by.
    """

    def __init__(self, addx = 1, muly = 3):
        self.addx = addx
        self.muly = muly

    def __call__(self, sample):
        """Return ``(x + addx, y * muly)`` for ``sample == (x, y)``."""
        x, y = sample
        # Deliberately out-of-place: my_set.__getitem__ hands out views of its
        # backing tensors, so `x += ...` / `y *= ...` would modify the dataset
        # itself and make repeated reads of the same index drift.
        return x + self.addx, y * self.muly

# dataset whose samples pass through add_mult (default arguments spelled out)
my_dataset = my_set(transform=add_mult(addx=1, muly=3))
print(my_dataset[3])

def mult(sample):
    """Return ``(sample[0] + 1, sample[1] * 10)``; a plain function usable as a transform."""
    first, second = sample[0], sample[1]
    return first + 1, second * 10

# chain several transforms: Compose calls them in list order,
# feeding each one the previous one's output
data_transform = transforms.Compose([
    add_mult(),
    mult,
])

# apply the composed transform by hand to one sample ...
print(data_transform(my_dataset[3]))

# ... or let the dataset apply it on every access
my_dataset = my_set(transform=data_transform)
print(my_dataset[3])

# sample transformation for images
transforms.Compose([
    transforms.CenterCrop(20),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor(),
])

# Pandas

In [None]:
# create a dataset

# from dict / array / numpy

# nested list: one inner list per row, default integer row and column labels
my_dataset = pd.DataFrame(data=[[50, True], [40, False]])
print(my_dataset)

# dict: keys become the column names, values the column contents
my_dataset = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}, copy=True)
print(my_dataset)

# nested list with explicit column names and row labels
my_dataset = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6]],
    index=[3, 'd'],
    columns=['a', 'b', 'c'],
)
print(my_dataset)

# from csv
# the path is assembled with os.path.join so the separator matches the platform
my_dataset = pd.read_csv(filepath_or_buffer=os.path.join("data", "train.csv"))
# first five rows, shown as the cell output
my_dataset.head(n=5)

In [None]:
# access elements

# (rows, columns) of the frame, then the single value at row position 0 / column position 3
print(my_dataset.shape, my_dataset.iat[0, 3])

In [None]:
# exploring the data - basic info

# prints a per-column overview (dtype, non-null count) plus memory usage
my_dataset.info()
# summary statistics; for a frame with mixed dtypes the default covers the
# numeric columns (ints and floats alike)
my_dataset.describe(include=None)

In [None]:
# exploring the data - value counts

# number of rows per distinct value of the "Sex" column, most frequent first
my_dataset.loc[:, "Sex"].value_counts()

In [None]:
# exploring the data - correlations

# you should do that only with train data
# numeric_only=True restricts the computation to float, int and boolean columns
corr_matrix = my_dataset.corr(numeric_only=True)

# correlation of every numeric column with "Survived", strongest positive first
print(
    corr_matrix.loc[:, "Survived"].sort_values(ascending=False)
)

# full correlation matrix as the cell output
corr_matrix