# Train & Validation

We want the *distribution* of **photos per person** and of the various **classes** to stay the *same* in both the training and the validation set.

### Define function to Download  +  Split Train Set

The first function downloads the dataset from Google Drive.
The second function splits the people's ids into training & validation.

In [None]:
import os, pandas as pd, numpy as np
from google_drive_downloader import GoogleDriveDownloader as gdd
from collections import defaultdict


def download_dataset():
  """
  Download the project dataset from Google Drive into ./project_dataset.

  The archive is saved as ./project_dataset/dataset.zip and the downloader is
  asked to unzip it (unzip=True).
  NOTE(review): whether an already downloaded archive is skipped depends on the
  google_drive_downloader defaults -- confirm against the library.
  """
  # create the destination folder; unlike `os.system('mkdir ...')` this does not
  # spawn a shell, works on every platform and is silent when the folder exists
  os.makedirs('./project_dataset', exist_ok=True)

  GDRIVE_ID = '1Re1N_Qc0884fnHfK0vFs5NvrMB5fK6lm' # id of file in google drive
  gdd.download_file_from_google_drive(file_id=GDRIVE_ID,
                                      dest_path='./project_dataset/dataset.zip',
                                      unzip=True)



def split_dataset(partitions=(.7,.3), force_at_least_one=False, rand_seed=0, verbose=False):
  """
  Split the people's ids found in the training annotations into one list per partition.

  The (column, value) combinations are processed from the rarest to the most
  common one, and the people having each combination are distributed over the
  partitions according to the requested fractions. Starting from the rarest
  combinations makes it more likely that every class shows up in every partition.

  eg.
  if there are 751 people with partitions (.5,.25,.25)
  the result will be something like: [[~370 ids], [~190 ids], [~190 ids]]

  Parameters
  ----------
  partitions:          tuple of floats that (approximately) sum to 1, one fraction per split
  force_at_least_one:  if True, one person of every (column, value) combination is first
                       put in each split, before the rest is distributed by fraction
  rand_seed:           seed used for every random sampling / shuffling
  verbose:             if True, print a message for every very rare combination

  Returns
  -------
  a list with one list of people's ids per partition (also when there is a single partition).
  When a combination has fewer people than there are partitions, those people are
  copied into several splits, so the splits can overlap and their total size can
  exceed the number of people.

  Raises
  ------
  AssertionError  if the partitions do not sum to 1
  RuntimeError    if a small class (< 30 people) has fewer unassigned people left than partitions
  """
  # check that the percentages of partitions sum to 1 (tolerance: |sum - 1| < ~0.014)
  assert (sum(partitions)-1)**2 < 0.0002, 'the partitions must sum to 1'

  # get data
  download_dataset()
  df = pd.read_csv('./project_dataset/annotations_train.csv', index_col='id')

  n_parts = len(partitions)
  if n_parts == 1:
    # SPECIAL CASE: no split is done; still return a list of lists so that the
    # result has the same shape as in every other case
    return [df.index.to_list()]

  splits = [[] for _ in range(n_parts)]    # train, validation, (validation2)

  # collect the size of every (column, value) combination.
  # Columns are visited in file order (not through a set) so that ties between
  # equally sized classes are always broken in the same way --> reproducible splits
  smallest_classes = []
  for column in df.columns:
    for value in df[column].unique():
      count = int((df[column] == value).sum())
      smallest_classes.append((count, column, value))

  # smallest_classes = [(1, 'downpurple', 2),  (7, 'age', 1),  (8, 'downyellow', 2), ... ]
  smallest_classes.sort(key=lambda x: x[0])   # stable sort: ties keep column order

  for count, column, value in smallest_classes:
    # people with this combination that have not been assigned yet
    tmp = df.loc[df[column] == value]

    if count < n_parts:
      # SPECIAL CASE: very very small class  --> eg. 'downpurple' has only 1 person
      # Risk of overfitting the purple guy instead of learning what is purple..... (hope in data augmentation)
      if tmp.empty:
        # these people were already assigned through another combination
        continue
      if verbose: print(f'- combination {column}={value} is very rare, copying this person in both splits')
      samples = tmp.index.to_list()
      for i, split in enumerate(splits):
        # reuse the last person once there are no more distinct people to hand out
        split.append(samples[min(i, len(samples)-1)])
      df = df.drop(samples)
      continue
    elif count < 30 and len(tmp) < n_parts:
      # SPECIAL CASE: we are unlucky with the previous assignments
      # this depends on the correlation between small categories...
      raise RuntimeError('we are unlucky, try again')
    elif tmp.empty:
      # SPECIAL CASE: everybody in this class was assigned already
      continue

    if len(tmp) >= n_parts and force_at_least_one:
      # manually add 1 person of that category-value to each split
      samp = tmp.sample(n=n_parts, random_state=rand_seed).index.tolist()
      for i, split in enumerate(splits):
        split.append(samp[i])
      tmp = tmp.drop(samp)
      df = df.drop(samp)

    # add the remaining samples randomly
    # if there are 100 samples with partition (.6,.2,.2) ==> cut points [60, 80]
    cut_points = [int(sum(partitions[:i])*len(tmp)) for i in range(1, n_parts)]
    shuffled = tmp.sample(frac=1, random_state=rand_seed).index.to_numpy()
    for split, chunk in zip(splits, np.split(shuffled, cut_points)):
      split += chunk.tolist()   # plain python values, like the ids added above
    df = df.drop(tmp.index)

    if len(df) == 0:
      break
  return splits

### Analysis of the Split Made By the Function in the Previous Cell

In [None]:
### TAKES THE RESULT OF SPLIT_DATASET AND RETURNS SOME INFOS ABOUT THE SPLIT ###
def print_split_stats(splits):
  """
  Print information about a split produced by split_dataset.

  For every split it prints its size; for every annotation column it prints how
  many people of each split have each value; finally it prints, per split, the
  sorted number of image files that every person has.

  :param splits: list of lists of people's ids (one list per split, any number of splits)
  :return None (everything is printed)
  """
  df = pd.read_csv('./project_dataset/annotations_train.csv', index_col='id')
  n_splits = len(splits)
  # 'train', 'validation', 'validation2', 'validation3', ...
  names = ['train', 'validation'] + [f'validation{i}' for i in range(2, n_splits)]

  print(f'n_people = {len(df)}\n------------------------------------------')
  # counters[column][value][i] = how many people of split i have column == value
  counters = defaultdict(lambda:defaultdict(lambda:[0 for _ in range(n_splits)]))
  for i, split in enumerate(splits):
    print(f'len {names[i]} = {len(split)}')
    for pid in split:
      row = df.loc[pid]
      for k in row.keys():
        counters[k][int(row[k])][i] += 1

  print('-----------------------------------------------')
  print('POSSIBLE VALUES FOR EACH COLUMN + [HOW MANY PEOPLE HAVE THAT VALUE]:')
  for k in df.keys():
    possible_values = sorted(set(df[k].tolist()))
    possible_values_and_counts = ', '.join([f'{val} {counters[k][int(val)]}'  for val in possible_values])
    # share of value 2 among the people having value 1 or 2, per split
    # (only meaningful for columns whose values are exactly 1 and 2)
    percentage = counters[k][2] / (np.array(counters[k][2])+counters[k][1])
    print(k, ' '*(10-len(k)),':', possible_values_and_counts, '       \033[94mprob_val_2=', percentage,'\033[0m')
  print('\033[94mNOTE: the values in blue should be pretty similar (distribution of classes in training/validation)\033[0m')

  # count the images of every person in a single pass over the folder.
  # The person is identified by the text before the first underscore of the file
  # name, so an id appearing elsewhere in the name is never counted by mistake
  images_per_person = {}
  for name in os.listdir('./project_dataset/train'):
    person, sep, _ = name.partition('_')
    if sep:
      images_per_person[person] = images_per_person.get(person, 0) + 1

  people = df.index.to_list()
  count_files = []
  for split in splits:
    members = set(split)
    count_files.append(sorted(images_per_person.get(str(pid).zfill(4), 0)
                              for pid in people if pid in members))

  print('-----------------------------------------------')
  print('how many files each person has: \n(good if the values in the arrays below seems to be generated by a similar pdf)')
  for cf in count_files:
    print(cf)

### Print Stats
print_split_stats(split_dataset(verbose=True))


# What is the meaning of "backpack    : 1 [386, 177], 2 [128, 61]        prob_val_2= [0.24902724 0.25630252]"   ?
#
# - "backpack" is the category
# - "1 [386, 177]" means that 386 people do not have a backpack in the training set and that 177 people do not have a backpack in the validation set
# - "2 [128, 61]"  means that 128 people have a backpack in the training set and that 61 people have a backpack in the validation set
# - "prob_val_2= [0.24902724 0.25630252]"   means that 24.902724% of the people in the training set have a backpack (while 25.630252% have a backpack in the validation set)
#
#  0.24902724 = 128/(128+386)

- combination downpurple=2 is very rare, copying this person in both splits
n_people = 751
------------------------------------------
len train = 514
len validation = 238
-----------------------------------------------
POSSIBLE VALUES FOR EACH COLUMN + [HOW MANY PEOPLE HAVE THAT VALUE]:
age         : 1 [4, 3], 2 [420, 178], 3 [85, 54], 4 [5, 3]        [94mprob_val_2= [0.99056604 0.98342541] [0m
backpack    : 1 [386, 177], 2 [128, 61]        [94mprob_val_2= [0.24902724 0.25630252] [0m
bag         : 1 [380, 178], 2 [134, 60]        [94mprob_val_2= [0.26070039 0.25210084] [0m
handbag     : 1 [453, 218], 2 [61, 20]        [94mprob_val_2= [0.11867704 0.08403361] [0m
clothes     : 1 [69, 29], 2 [445, 209]        [94mprob_val_2= [0.86575875 0.87815126] [0m
down        : 1 [202, 87], 2 [312, 151]        [94mprob_val_2= [0.60700389 0.63445378] [0m
up          : 1 [30, 14], 2 [484, 224]        [94mprob_val_2= [0.94163424 0.94117647] [0m
hair        : 1 [331, 157], 2 [183, 81]      

### From list of people's ids to list of images

In [None]:
def get_files(splits):
  """
  for each list of people's ids, retrieve the image files of those people
  :param splits: list of lists of people's ids
  :return a list of filenames for each split

  A file belongs to a person when its name starts with the zero-padded id
  followed by an underscore ('0044_...' for person 44). Within a split the
  files follow the order of the ids, and the files of one person are sorted
  by name; an id repeated inside a split is considered only once.

  eg.
  input:  [[1,44], [6]]
  output: [['0001_c1_2131.jpg', '0001_c2_7134.jpg', '0044_c2_8364.jpg'], ['0006_c1_5341.jpg', '0006_c1_5373.jpg']]
  """
  # group the files by person in one pass over the folder (sorted, so that the
  # result does not depend on the order in which the OS lists the directory).
  # The person is the text before the first underscore: matching the prefix
  # instead of any substring avoids picking up files of other people whose
  # name merely contains the same digits (eg. '0002_c1s1_000044_03.jpg')
  files_by_person = {}
  for name in sorted(os.listdir('./project_dataset/train')):
    person, sep, _ = name.partition('_')
    if sep:
      files_by_person.setdefault(person, []).append(name)

  s_files = []
  for split in splits:
    files = []
    for pid in dict.fromkeys(split):   # drop repeated ids, keep their order
      files += files_by_person.get(str(pid).zfill(4), [])
    s_files.append(files)
  return s_files




### Define Dataset

define a class for our dataset

In [None]:
import torch, torchvision
from torch.utils.data import Dataset
from PIL import Image

class ReIdentificationDataset(Dataset):
  """
  Dataset over a given list of image files.

  When the item at index idx is requested, the (PIL image, target) pair is taken
  from an in-memory cache if it was already loaded, otherwise the image is read
  from the file system and the target is looked up in the annotations by the
  person's id (the first 4 characters of the file name). The transforms are
  applied at every access, after the cache.

  Parameters
  ----------
  root:              folder containing the image files ('' means the current folder)
  files:             list of file names (relative to root) served by this dataset
  transform:         callable applied to the PIL image (default: torchvision ToTensor)
  target_transform:  callable applied to the list of annotation values (default: torch.tensor)
  annotations_path:  csv file with an 'id' column, one row of annotations per person
  """
  def __init__(self, root, files, transform=None, target_transform=None,
               annotations_path='./project_dataset/annotations_train.csv'):
    super().__init__()
    # add the final slash if there isn't one; an empty root is left untouched
    self.root = root if (not root or root.endswith('/')) else root + '/'

    # `is None` (rather than `or`) so that a falsy-but-valid callable is not replaced
    if transform is None:
      transform = torchvision.transforms.ToTensor()
    # torch.tensor itself instead of a lambda: a lambda attribute cannot be
    # pickled, which breaks DataLoader workers that are started with 'spawn'
    if target_transform is None:
      target_transform = torch.tensor
    self.transform         = transform
    self.target_transform  = target_transform

    self.files  = files
    self.df     = pd.read_csv(annotations_path, index_col='id')
    self._cache = [None for _ in range(len(files))]   # one (image, target) slot per file

  def __getitem__(self, idx):
    cached = self._cache[idx]
    if cached is None:
      # load image; Image.open is lazy, so force the pixel data to be read now:
      # the cached object is then self-contained and the file can be released
      x = Image.open(self.root + self.files[idx])
      x.load()

      # load target: annotations of the person the file belongs to
      pid = int(self.files[idx][:4])
      y = self.df.loc[pid].to_list()
      cached = self._cache[idx] = (x, y)

    x, y = cached
    return self.transform(x), self.target_transform(y)

  def __len__(self):
    return len(self.files)

### DataLoaders

Define a function that puts together everything specified above and returns DataLoaders for training and validation.

In [None]:
from torch.utils.data import DataLoader
# root folder passed to ReIdentificationDataset by get_data_loaders (the same folder that get_files lists)
train_path='./project_dataset/train'

def get_data_loaders(partitions, rand_seed=0, batch_size=16, transform=None, target_transform=None, pin_memory=False, num_workers=1):
  """
  This function is a wrapper for the functions used above:
  0. download dataset if not already in memory (done inside split_dataset())
  1. ids of the people are split in training and validation set
  2. for each split the files about the people in the split are retrieved
  3. a Dataset and a DataLoader are created

  Parameters
  ----------
  partitions:         tuple
    percentages of how to split the dataset, eg. (0.7, 0.3) will return 2 dataloaders where the first will contain 70% of the people and the second 30%
    you can use more than 2 partitions,  eg. (.5, .25, .25) ---> train(50%), validation1(25%), validation2(25%)

  rand_seed:          int (default=0)
    the seed forwarded to split_dataset --> the split of the people is reproducible
    (the per-epoch shuffling done by the DataLoaders is not seeded here)

  batch_size:         int (default=16)
    how many images are returned in a batch

  transform:          callable (default=None)
    rotations, cropping, transformations, normalizations, .... applied to the image

  target_transform:   callable (default=None)
    transformations made to the target (eg. one-hot encoding for values in age; normalizing)

  pin_memory:         bool (default=False)
    forwarded to DataLoader: batches are put in pinned (page-locked) memory, which
    speeds up the copy to the GPU; the batches are NOT moved to the device automatically

  num_workers:        int (default=1)
    forwarded to DataLoader: number of worker processes that load the data (0 = load in the main process)

  Returns
  -------
  a list with one DataLoader for each partition (usually 2, but if len(partitions)==3 three DataLoaders are returned)

  NOTE(review): the original author reports batches such as
  inputBatch= torch.Size([16, 3, 128, 64]), targetBatch=[16,30] -- this depends on the
  dataset and on the transforms, confirm before relying on it
  """
  # download dataset & split people's ids in train & validation
  splits = split_dataset(partitions, rand_seed=rand_seed)

  # get the files
  splits = get_files(splits)

  dataloaders = []
  for files in splits:
    # create dataset & dataloader for every split
    dataset = ReIdentificationDataset(root=train_path, files=files, transform=transform, target_transform=target_transform)
    # the dataset caches the loaded images inside the process that reads them; with
    # worker processes that are re-created at every epoch the cache would start empty
    # each time, so the workers are kept alive (option only valid when num_workers > 0;
    # requires a PyTorch version that supports `persistent_workers`)
    d_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers,
                          pin_memory=pin_memory, persistent_workers=num_workers > 0)
    dataloaders.append(d_loader)

  return dataloaders