# Imports

In [2]:
%matplotlib inline

from collections import namedtuple

import os
import pandas as pd
# The plotting API lives in the `pyplot` submodule; importing the top-level
# `matplotlib` package under the name `plt` would make calls such as
# `plt.subplots()` fail.
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


# Widen the text representation of DataFrames so long rows are not wrapped.
pd.set_option('display.width', 1000)

In [None]:
import torch
import torch.nn as nn
# Fixed typo: the package is `torch`, not `torn`.
import torch.nn.functional as F
from torch import optim
# The class is spelled `DataLoader` (capital L).
from torch.utils.data import Dataset, DataLoader
# `transforms` is provided by torchvision, not by torch itself.
from torchvision import transforms
from torchvision.io import read_image, ImageReadMode

import sklearn.metrics as metrics
import seaborn as sns

# Device

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"> Torch version: {torch.__version__}")
print(f"> Dispositivo configurado para usar: {device}")
# Only query the GPU name when a GPU is actually in use:
# torch.cuda.get_device_name raises on machines without CUDA, which would
# break the CPU fallback selected above.
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))

# Constants & hyperparameters

In [3]:
# constants ---

# Starts empty; presumably filled in later in the notebook (TODO confirm).
LABELS = []
# Root folders of the dataset variants, addressable by name
# (originals / faces / norm_faces).
DATASETS_PATHS = namedtuple("DatasetPath", "originals faces norm_faces")(
    "data/dataset", "data/face_images", "data/normalized_face_images")

# Fraction of the dataset assigned to each split.
TRAIN_PERCENT = .7
VAL_PERCENT = .2
TEST_PERCENT = .1
# The three fractions must cover the whole dataset (tolerance for float rounding).
assert abs(TRAIN_PERCENT + VAL_PERCENT + TEST_PERCENT - 1) < 1e-9, \
    "TRAIN_PERCENT + VAL_PERCENT + TEST_PERCENT must sum to 1"
# Fixed seed so that sampling and train/val/test splits are reproducible
# across kernel restarts (None would give a different result on every run).
RANDOM_STATE = 42

# Presumably the side length, in pixels, images are resized to (TODO confirm).
IMAGE_SIZE = 220
BATCH_SIZE = 32


# hyperparams ---

NUM_EPOCH = 100
LEARNING_RATE = .001
MOMENTUM = .9
# Presumably the number of epochs without improvement tolerated before early
# stopping / before the LR scheduler reacts (TODO confirm against the training loop).
EARLY_PATIENCE = 10
SCHEDULER_PATIENCE = 6

# Presumably the dropout probabilities of the model's dropout layers (TODO confirm).
DROPOUTS = [.5, .75]

# Data preprocessing

## Dataset creation (and adaptation)

In [4]:
# Read the image index, drop the UserRace / Age / Name columns and rename the
# remaining ones to the short names used throughout the notebook.
raw_csv_path = os.path.join(DATASETS_PATHS.originals, "selfie_id.csv")
dataset = (
    pd.read_csv(raw_csv_path)
    .drop(columns=["UserRace", "Age", "Name"])
    .rename(columns={"SetId": "setid", "FName": "imgid", "URL": "path"})
)

# Quick summary: number of rows, number of distinct sets, rows per set.
print("> num elementos:", len(dataset))
print("> num ids diferentes", dataset["setid"].nunique(dropna=False))
print(dataset["setid"].value_counts())
dataset.head()

> num elementos: 435
> num ids diferentes 29
setid
0001cc1e1e--61af8513667cba1b47349869    15
0001cc1e1e--61af86f374501f072edf6d2c    15
0001cc1e1e--61af892e1735844b198f7269    15
0001cc1e1e--61af89eea5a19325b68a2801    15
0001cc1e1e--61af8a351735844b198f7ade    15
0001cc1e1e--61af8aa9d46ff461a71462cc    15
0001cc1e1e--61af8b9119bf98074e184806    15
0001cc1e1e--61af8bd7667cba1b473506bd    15
0001cc1e1e--61af8d2aad9d9c12b4864776    15
0001cc1e1e--61af8d4045612c79df65d153    15
0001cc1e1e--61af91e464610d607e3a329e    15
0001ca9b9a--61aa90111e37d6183d870ea5    15
0001ca9b9a--61ab21f4277a1c323b746383    15
0001ca9b9a--61ab3a5fd6d182382f20a168    15
0001ca9b9a--61ab5c83d6d182382f227ccb    15
0001ca9b9a--61ab93399e265a148f85e5d0    15
0001ca9b9a--61ab9a5e686e3d72a4be5551    15
0001ca9b9a--61abceec4376670967375e12    15
0001ca9b9a--61abcfbb45612c79df4dba1f    15
0001ca9b9a--61ac93aeed797b7414edf419    15
0001ca9b9a--61ac9cfdd211124f5daebd40    15
0001ca9b9a--61acd3c5b402c12388930bcf    15
000

Unnamed: 0,setid,imgid,path
0,0001cc1e1e--61af8513667cba1b47349869,ID_1.jpg,0001cc1e1e--61af8513667cba1b47349869_age_25_na...
1,0001cc1e1e--61af8513667cba1b47349869,ID_2.jpg,0001cc1e1e--61af8513667cba1b47349869_age_25_na...
2,0001cc1e1e--61af8513667cba1b47349869,Selfie_1.jpg,0001cc1e1e--61af8513667cba1b47349869_age_25_na...
3,0001cc1e1e--61af8513667cba1b47349869,Selfie_10.jpg,0001cc1e1e--61af8513667cba1b47349869_age_25_na...
4,0001cc1e1e--61af8513667cba1b47349869,Selfie_11.jpg,0001cc1e1e--61af8513667cba1b47349869_age_25_na...


## Data balancing

In [5]:
# Number of images in the smallest setid group.
images_per_set = dataset["setid"].value_counts()
min_set_size = images_per_set.min()
print("> Cantidad de imagenes de la setid que menos tiene:", min_set_size)

> Cantidad de imagenes de la setid que menos tiene: 15


In [6]:
# Downsample every setid group to the size of the smallest one. This is not
# necessary for the current data, but it keeps the notebook usable if the
# dataset changes.
def _sample_set(group):
    """Draw `min_set_size` random rows from a single setid group."""
    return group.sample(min_set_size, random_state=RANDOM_STATE)


rows_by_set = dataset.groupby("setid")[dataset.columns]
dataset = rows_by_set.apply(_sample_set).reset_index(drop=True)

print("> Cantidad de elementos:", len(dataset), end='\n\n')
print(dataset["setid"].value_counts())
dataset.head()

> Cantidad de elementos: 435

setid
0001ca9b9a--61aa90111e37d6183d870ea5    15
0001ca9b9a--61ab21f4277a1c323b746383    15
0001ca9b9a--61ab3a5fd6d182382f20a168    15
0001ca9b9a--61ab5c83d6d182382f227ccb    15
0001ca9b9a--61ab93399e265a148f85e5d0    15
0001ca9b9a--61ab9a5e686e3d72a4be5551    15
0001ca9b9a--61abceec4376670967375e12    15
0001ca9b9a--61abcfbb45612c79df4dba1f    15
0001ca9b9a--61ac93aeed797b7414edf419    15
0001ca9b9a--61ac9cfdd211124f5daebd40    15
0001ca9b9a--61acd3c5b402c12388930bcf    15
0001ca9b9a--61acdab83e0f222c5afd75a2    15
0001ca9b9a--61ad04a074501f072eccf903    15
0001ca9b9a--61ad4051889fb241713def3a    15
0001ca9b9a--61ad582184c9021db9ea19df    15
0001ca9b9a--61adf4903e0f222c5a048507    15
0001ca9b9a--61af4a11ec7c4619a7c95ddf    15
0001ca9b9a--61af51c056ee9447b653e097    15
0001cc1e1e--61af8513667cba1b47349869    15
0001cc1e1e--61af86f374501f072edf6d2c    15
0001cc1e1e--61af892e1735844b198f7269    15
0001cc1e1e--61af89eea5a19325b68a2801    15
0001cc1e1e--61af8a

Unnamed: 0,setid,imgid,path
0,0001ca9b9a--61aa90111e37d6183d870ea5,Selfie_10.jpg,0001ca9b9a--61aa90111e37d6183d870ea5_age_34_na...
1,0001ca9b9a--61aa90111e37d6183d870ea5,Selfie_9.jpg,0001ca9b9a--61aa90111e37d6183d870ea5_age_34_na...
2,0001ca9b9a--61aa90111e37d6183d870ea5,Selfie_1.jpg,0001ca9b9a--61aa90111e37d6183d870ea5_age_34_na...
3,0001ca9b9a--61aa90111e37d6183d870ea5,Selfie_3.jpg,0001ca9b9a--61aa90111e37d6183d870ea5_age_34_na...
4,0001ca9b9a--61aa90111e37d6183d870ea5,Selfie_11.jpg,0001ca9b9a--61aa90111e37d6183d870ea5_age_34_na...


## Save / load preprocessed database

### Save database

In [7]:
# Persist the preprocessed dataset (without the index) in the originals folder.
dataset_csv_path = os.path.join(DATASETS_PATHS.originals, 'dataset.csv')
dataset.to_csv(dataset_csv_path, index=False)

### Load database

In [8]:
# Reload the preprocessed dataset from the originals folder.
dataset_csv_path = os.path.join(DATASETS_PATHS.originals, 'dataset.csv')
dataset = pd.read_csv(dataset_csv_path)
dataset.head()

Unnamed: 0,setid,imgid,path
0,0001ca9b9a--61aa90111e37d6183d870ea5,Selfie_10.jpg,0001ca9b9a--61aa90111e37d6183d870ea5_age_34_na...
1,0001ca9b9a--61aa90111e37d6183d870ea5,Selfie_9.jpg,0001ca9b9a--61aa90111e37d6183d870ea5_age_34_na...
2,0001ca9b9a--61aa90111e37d6183d870ea5,Selfie_1.jpg,0001ca9b9a--61aa90111e37d6183d870ea5_age_34_na...
3,0001ca9b9a--61aa90111e37d6183d870ea5,Selfie_3.jpg,0001ca9b9a--61aa90111e37d6183d870ea5_age_34_na...
4,0001ca9b9a--61aa90111e37d6183d870ea5,Selfie_11.jpg,0001ca9b9a--61aa90111e37d6183d870ea5_age_34_na...


## Data splitting into training, validation and test groups

In [9]:
# First cut: separate the training rows, stratified by setid so every set is
# represented proportionally.
train_dataset, tmp = train_test_split(
    dataset,
    train_size=TRAIN_PERCENT,
    stratify=dataset.setid,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Second cut: divide the remaining rows between validation and test. The test
# share is expressed relative to the remainder, not to the full dataset.
test_share_of_rest = TEST_PERCENT / (TEST_PERCENT + VAL_PERCENT)
val_dataset, test_dataset = train_test_split(
    tmp,
    test_size=test_share_of_rest,
    stratify=tmp.setid,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Report the absolute size of each split and its fraction of the whole dataset.
total_rows = len(dataset)
for split_label, split_frame in (
    ("> train", train_dataset),
    ("> val", val_dataset),
    ("> test", test_dataset),
):
    print(split_label, len(split_frame), len(split_frame) / total_rows)

> train 304 0.6988505747126437
> val 87 0.2
> test 44 0.10114942528735632


In [10]:
# Leakage check: for each pair of splits, count how many image paths of the
# first split also appear in the second one (True = shared, False = not shared).
for left_split, right_split in (
    (train_dataset, test_dataset),
    (train_dataset, val_dataset),
    (val_dataset, test_dataset),
):
    shared_paths = left_split.path.isin(right_split.path)
    print(shared_paths.value_counts(), end='\n\n')

path
False    304
Name: count, dtype: int64

path
False    304
Name: count, dtype: int64

path
False    87
Name: count, dtype: int64



## Save / load dataset splits (train, val, test)

### Save

In [11]:
# Write each split to its own CSV file (without the index) in the originals folder.
for split_filename, split_frame in (
    ('train_dataset.csv', train_dataset),
    ('val_dataset.csv', val_dataset),
    ('test_dataset.csv', test_dataset),
):
    split_frame.to_csv(os.path.join(DATASETS_PATHS.originals, split_filename), index=False)

### Load

In [12]:
# Read the three splits back from their CSV files, in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    pd.read_csv(os.path.join(DATASETS_PATHS.originals, split_filename))
    for split_filename in ('train_dataset.csv', 'val_dataset.csv', 'test_dataset.csv')
)