In [1]:
import numpy as np
import os
try:
    import ujson as json
except ModuleNotFoundError:
    ! pip install ujson -qU
    ! pip install umap-learn -qU
    import ujson as json

import requests
import pandas as pd
import random

# Decide whether the notebook runs on Google Colab by checking whether the
# string form of the active IPython shell mentions 'google.colab'.
# `on_colab` is read by the data-loading cells to choose between downloading
# the data files and reading them from a local path.
if 'google.colab' in str(get_ipython()):
    on_colab = True
    # assert os.environ['COLAB_TPU_ADDR'], 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'
    # !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl -qU
    !pip install pytorch-lightning -qU
else:
    on_colab = False

import pytorch_lightning as pl

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn



In [2]:
# Helper function to download files
def download_file(url, timeout=60):
    """Stream the resource at `url` into a file in the current working directory.

    The local file name is the last segment of the URL *path*, so a query
    string or fragment never ends up in the name.

    Args:
        url: Address of the file to fetch.
        timeout: Value forwarded to `requests.get` as its `timeout` argument,
            so a stalled server cannot block the notebook forever.

    Returns:
        The name of the local file the content was written to.

    Raises:
        ValueError: If no file name can be derived from `url`.
        requests.HTTPError: If the server answers with an error status.
    """
    # Local import keeps the helper self-contained.
    from urllib.parse import urlparse

    local_filename = urlparse(url).path.split('/')[-1]
    if not local_filename:
        # Fail before touching the network instead of at open('') afterwards.
        raise ValueError(f'Cannot derive a file name from URL: {url!r}')

    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as ff:
            # Write in chunks so the whole payload is never held in memory.
            for chunk in r.iter_content(chunk_size=8192):
                ff.write(chunk)
    return local_filename

In [None]:
# Helper function to pick random parameter from iterable or range
def random_element(parameter_iterable):
    """Draw one random value from a numeric range or from a list of choices.

    Args:
        parameter_iterable: Either a tuple whose first two items bound a
            uniform range, or a list of candidate values.

    Returns:
        For a tuple, a float drawn with `random.uniform` between the two
        bounds and rounded to 6 decimals; for a list, one of its elements.

    Raises:
        TypeError: If the argument is neither a tuple nor a list.
        ValueError: If the argument is an empty list.
    """
    # If it's a tuple treat it as an upper and lower bound
    if isinstance(parameter_iterable, tuple):
        out = random.uniform(parameter_iterable[0], parameter_iterable[1])
        # NOTE: rounding to 6 decimals maps draws below 5e-7 to 0.0.
        return round(out, 6)

    # If it's a list, return a random element from the list
    if isinstance(parameter_iterable, list):
        if not parameter_iterable:
            raise ValueError('Cannot pick a random element from an empty list.')
        return random.choice(parameter_iterable)

    # Put the explanation in the exception itself rather than printing it.
    raise TypeError(
        f'Input not a tuple or list (got {type(parameter_iterable).__name__}).')

In [None]:
def randomize_hparams():
    """Sample one hyper-parameter configuration for a random-search run.

    Tuples passed to `random_element` are treated as ranges and lists as
    discrete choices. `batch_size` is fixed rather than sampled.

    Returns:
        dict mapping hyper-parameter name to the sampled (or fixed) value.
    """
    sampled = {}

    # Entries are filled in a fixed order so the sequence of random draws
    # stays the same from call to call.
    sampled['l2_norm'] = random_element((0.05, 0.75))
    sampled['batch_size'] = 256
    sampled['learning_rate'] = random_element((0.1, 1e-3))
    sampled['lr_scheduler_factor'] = random_element((0.1, 1))
    sampled['lr_scheduler_patience'] = random_element([2, 4, 8, 16])
    sampled['lr_scheduler_min_lr'] = random_element((1e-8, 1e-2))
    sampled['hidden_layer_size'] = random_element([16, 24, 32, 64, 96, 128])
    sampled['gradient_clip_val'] = random_element((0.4, 2))
    sampled['stochastic_weight_avg'] = random_element([True, False])
    sampled['gradient_clip_algorithm'] = random_element(['norm', 'value'])

    return sampled

In [3]:
# Load x data: the JSON file holds a list of mappings that are merged into one
# dict and turned into a DataFrame with one row per key.
if not on_colab:
    data_dir = r'F:\temp\thesisdata\micro_dataset_1\micro_dataset1_resnet18_output_identity.json'
else:
    download_file('https://objectstorage.eu-frankfurt-1.oraclecloud.com/n/frwwzrj6ghal/b/thesis/o/micro_dataset1_resnet18_output_identity.json')
    data_dir = r'micro_dataset1_resnet18_output_identity.json'

with open(data_dir) as f:
    data_dict_list = json.load(f)

# Later entries overwrite earlier ones that share a key.
data_dict = {}
for element in data_dict_list:
    data_dict.update(element)

df_x = pd.DataFrame.from_dict(data_dict, orient='index')
df_x.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
3865991_3865991_691412_2935874-DSMUXGTJ-7.jpg,0.402192,0.215186,1.006052,2.535887,0.223182,0.966906,0.067344,3.101986,1.115319,0.726254,...,0.732725,0.914586,0.61877,0.791526,2.018329,0.108899,0.651192,0.192771,2.346089,1.501905
7980766_7980766_669333_7048178-XOYQRJZQ-7.jpg,1.07395,0.707743,0.106056,0.551383,0.68053,1.220285,1.024527,0.30502,1.0397,0.217051,...,0.649347,1.637149,2.630768,2.322523,0.047876,1.300324,3.735312,1.352288,0.054118,3.584239
3749936_3749936_314728_2819820-JDANXKLD-7.jpg,2.806773,0.006849,0.898165,0.802126,0.967394,0.287235,0.244238,1.446031,6.778771,0.235296,...,1.267828,0.326778,0.115728,0.466623,0.193548,1.720899,1.446586,2.53537,0.864782,0.062465
5610715_5610715_91068_4680525-LMQNOWJA-7.jpg,0.039423,0.658119,1.192224,2.684522,2.460881,0.046748,0.357242,3.366874,0.91903,0.776935,...,0.823428,0.713102,0.531813,1.427407,0.424931,2.481088,0.868538,2.333207,0.845097,1.062181
6771765_6771765_786228_5841405-PSPFNCAV-7.jpg,0.292377,0.011645,0.000842,1.337585,0.382337,0.305897,0.060697,0.592383,1.28745,0.122081,...,0.067053,0.803128,0.23425,2.015079,1.405711,0.291771,0.038078,0.586244,0.069435,0.06638


In [4]:
# Load y data
if on_colab:
    download_file('https://objectstorage.eu-frankfurt-1.oraclecloud.com/n/frwwzrj6ghal/b/thesis/o/SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv')
    data_dir = 'SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv'
else:
    data_dir = r'F:\temp\thesisdata\SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv'

# Read the table, index it by file name, replace the two continuous columns
# by their quintile index (0-4) and drop the raw values.
df_y = (
    pd.read_csv(data_dir, sep='\t')
    .set_index('FILENAME')
    .assign(
        PRICE_BIN_IDX=lambda frame: pd.qcut(
            frame['PRICE'], q=5, labels=[0, 1, 2, 3, 4]),
        LIKES_VIEWS_RATIO_BIN_IDX=lambda frame: pd.qcut(
            frame['LIKES_VIEWS_RATIO'], q=5, labels=[0, 1, 2, 3, 4]),
    )
    .astype({'PRICE_BIN_IDX': int, 'LIKES_VIEWS_RATIO_BIN_IDX': int})
    .drop(['PRICE', 'LIKES_VIEWS_RATIO'], axis=1)
)

df_y.head()

Unnamed: 0_level_0,PRICE_BIN_IDX,LIKES_VIEWS_RATIO_BIN_IDX
FILENAME,Unnamed: 1_level_1,Unnamed: 2_level_1
481029_481029_349215_257967-7.jpg,2,1
511001_511001_388280_276511-7.jpg,3,0
517326_517326_376595_282597-CCCWLPRX-7.jpg,4,0
524310_524310_342634_286117-LEMCITZY-7.jpg,4,0
5045187_5045187_7198_4115009-DDZJITSS-7.jpg,2,1


In [5]:
# Join x and y into a single dataframe on their shared index.
# The join is a left join on df_y, so a label row without a matching feature
# row is kept with NaN features.
df = df_y.join(df_x, how='left')
df.head()

Unnamed: 0,PRICE_BIN_IDX,LIKES_VIEWS_RATIO_BIN_IDX,0,1,2,3,4,5,6,7,...,502,503,504,505,506,507,508,509,510,511
1008695_1008695_16575_492565-WPTALJUX-7.jpg,4,0,0.506733,1.394197,0.142876,1.595724,0.447865,2.721004,0.549245,0.088606,...,0.514084,1.684672,2.690634,1.076642,1.014139,0.403866,1.054885,1.213333,0.08846,0.78616
1020928_1020928_20375_496298-MVOEZUTF-7.jpg,1,0,0.833585,1.465594,0.606634,0.643496,0.979372,1.374905,1.059826,0.382815,...,0.505261,1.688515,0.417936,0.20068,1.674028,1.199314,0.406038,1.510623,0.472602,1.073225
1051436_1051436_17127_506738-INLFTOGF-7.jpg,1,1,1.898964,0.537831,0.355688,1.91466,0.678449,0.122319,0.920843,0.896121,...,0.21323,0.716053,0.299384,2.024731,0.045484,1.757237,0.77597,2.071618,0.271196,1.722807
1055377_1055377_18467_508857-HYTIVNMU-7.jpg,3,0,0.959911,1.014624,0.928045,0.345727,0.310256,0.644479,0.349452,1.512597,...,0.680928,2.470096,2.451149,3.417884,1.716235,0.149804,0.0,1.423536,0.022786,0.289719
1057504_1057504_19082_509430-EKIORJVM-7.jpg,4,1,1.535582,0.951705,0.470121,2.631895,0.672225,0.481367,0.571308,0.184078,...,0.659331,0.722163,1.269953,0.223978,1.508935,0.835212,2.030264,1.891201,0.333001,2.909141


In [6]:
class SaatchiDataset(Dataset):
    """Torch dataset serving one positional split of the module-level `df`.

    Every column of `df` except the two label columns is treated as a
    feature. Exactly one of the two label columns is served as the target.

    Args:
        stage: Which split to serve: 'train', 'validation' or 'test'.
        target_selection: Which label to serve: 'price' or 'likes_view_ratio'.

    Raises:
        ValueError: If `stage` or `target_selection` is not one of the
            options above.
    """

    # Fixed positional split of `df`, evaluated once when the class body runs.
    training_set = df[:13000]
    validation_set = df[13000:14000]
    test_set = df[14000:]

    # Maps a `target_selection` value to the label column it stands for.
    _TARGET_COLUMNS = {
        'price': 'PRICE_BIN_IDX',
        'likes_view_ratio': 'LIKES_VIEWS_RATIO_BIN_IDX',
    }

    @property
    def targets(self):
        """Array of labels for the selected target."""
        return self.targets_

    @property
    def data(self):
        """Array of feature rows."""
        return self.data_

    def __init__(self, stage: str = None, target_selection=None):
        self.stage = stage
        self.target_selection = target_selection

        splits = {
            'train': self.training_set,
            'validation': self.validation_set,
            'test': self.test_set,
        }
        # Fail loudly here: continuing with an unknown stage or target only
        # postpones the failure to a less obvious place.
        if stage not in splits:
            raise ValueError(
                f'Invalid stage specified: "{stage}" , valid options are: [train, validation, test].')
        if target_selection not in self._TARGET_COLUMNS:
            raise ValueError(
                f'Invalid target selection specified: "{target_selection}"'
                f', valid options are: [price, likes_view_ratio].')

        self.dataset = splits[stage]

        # Features are everything that is not a label column.
        label_columns = list(self._TARGET_COLUMNS.values())
        self.data_ = self.dataset.drop(label_columns, axis=1).values
        self.targets_ = self.dataset[self._TARGET_COLUMNS[target_selection]].values

    def __getitem__(self, index):
        """Return (features as float tensor, label as long tensor) for `index`."""
        return torch.as_tensor(self.data_[index]).float(), torch.as_tensor(self.targets_[index]).long()

    def __len__(self):
        return len(self.data_)

In [7]:
class SaatchiDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the three `SaatchiDataset` splits.

    Args:
        batch_size: Batch size used by every dataloader.
        num_workers: Worker processes per dataloader.
        target_selection: Forwarded to `SaatchiDataset` ('price' or
            'likes_view_ratio').
    """

    # Which dataset splits each accepted `setup` stage prepares. 'train' and
    # 'validation' are accepted next to the 'fit' / 'validate' spellings.
    _STAGE_SPLITS = {
        None: ('train', 'validation', 'test'),
        'fit': ('train', 'validation'),
        'train': ('train',),
        'validate': ('validation',),
        'validation': ('validation',),
        'test': ('test',),
    }

    def __init__(self,
                 batch_size: int = 64,
                 num_workers: int = 4,
                 target_selection: str = 'price'):
        super().__init__()
        # Use the argument that was passed in rather than a notebook global.
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.target_selection = target_selection
        # Kept for existing readers: the first split prepared by the most
        # recent `setup` call.
        self.data = None
        self._split_datasets = {}

    def prepare_data(self):
        pass

    def _dataset_for(self, split):
        """Return the dataset for `split`, building it on first use."""
        if split not in self._split_datasets:
            self._split_datasets[split] = SaatchiDataset(
                stage=split, target_selection=self.target_selection)
        return self._split_datasets[split]

    def _loader(self, split, drop_last):
        """Build a dataloader over `split` with the module's settings."""
        return DataLoader(self._dataset_for(split),
                          batch_size=self.batch_size,
                          drop_last=drop_last,
                          num_workers=self.num_workers)

    def setup(self, stage: str = None):
        """Prepare the splits needed for `stage`.

        Raises:
            ValueError: If `stage` is not a key of `_STAGE_SPLITS`.
        """
        if stage not in self._STAGE_SPLITS:
            raise ValueError(
                f'Invalid stage specified: "{stage}", valid options are: '
                f'{[s for s in self._STAGE_SPLITS if s is not None]} or None.')
        splits = self._STAGE_SPLITS[stage]
        for split in splits:
            self._dataset_for(split)
        self.data = self._dataset_for(splits[0])

    def train_dataloader(self):
        # NOTE(review): batches are served in frame order (no shuffle), as before.
        return self._loader('train', drop_last=True)

    def val_dataloader(self):
        # Always the validation split, so a monitored validation loss is not
        # computed on training rows. Keep the last partial batch so no
        # evaluation sample is discarded.
        return self._loader('validation', drop_last=False)

    def test_dataloader(self):
        return self._loader('test', drop_last=False)

In [8]:
class SaatchiMLP(pl.LightningModule):
    """MLP with one hidden layer mapping 512 input features to 5 class logits.

    Args:
        hparam_values: Optional dict with the keys 'hidden_layer_size',
            'l2_norm', 'learning_rate', 'lr_scheduler_factor',
            'lr_scheduler_patience' and 'lr_scheduler_min_lr'. When omitted,
            the module-level `hparams` dict is used, as before.

    `training_step` also relies on the module-level `step_counter` and
    `accuracy_tracker` objects being defined before training starts.
    """

    def __init__(self, hparam_values=None):
        super().__init__()
        # An explicit dict avoids depending on whatever `hparams` currently
        # holds in the notebook. The global remains the default.
        values = hparams if hparam_values is None else hparam_values
        self.hparams.hidden_layer_size = values['hidden_layer_size']
        self.hparams.l2_norm = values['l2_norm']
        self.hparams.lr = values['learning_rate']
        self.hparams.lr_scheduler_factor = values['lr_scheduler_factor']
        self.hparams.lr_scheduler_patience = values['lr_scheduler_patience']
        self.hparams.lr_scheduler_min_lr = values['lr_scheduler_min_lr']

        self.layers = nn.Sequential(
            nn.Linear(512, self.hparams.hidden_layer_size),
            nn.ReLU(),
            nn.Linear(self.hparams.hidden_layer_size, 5)
        )
        self.ce = nn.CrossEntropyLoss()

    def forward(self, x):
        return self.layers(x)

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss of one batch and log it as 'train_loss'.

        On every 100th step counted by `step_counter`, the accuracy of the
        current batch is reported to `accuracy_tracker`.
        """
        x, y = batch
        x = x.view(x.size(0), -1)
        y_hat = self.layers(x)
        loss = self.ce(y_hat, y)
        self.log('train_loss', loss)

        # Logic for calculating accuracy
        step_counter.increment()
        if step_counter.step_count % 100 == 0:
            with torch.no_grad():
                pred = y_hat.argmax(dim=1)
                correct_preds = (pred == y).sum().item()
            acc = round(correct_preds / y.size(0), 4)
            accuracy_tracker.register_accuracy(accuracy_value=acc)

        return loss

    def validation_step(self, batch, batch_idx):
        """Return the cross-entropy loss of one batch and log it as 'validation_loss'."""
        x, y = batch
        x = x.view(x.size(0), -1)
        y_hat = self.layers(x)
        loss = self.ce(y_hat, y)
        self.log('validation_loss', loss)
        return loss

    def configure_optimizers(self):
        """Adam with weight decay plus a ReduceLROnPlateau scheduler on 'validation_loss'."""
        optimizer = torch.optim.Adam(self.parameters(),
                                     lr=self.hparams.lr,
                                     weight_decay=self.hparams.l2_norm)

        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            factor=self.hparams.lr_scheduler_factor,
            patience=self.hparams.lr_scheduler_patience,
            min_lr=self.hparams.lr_scheduler_min_lr,
            verbose=False)

        scheduler = {
            'scheduler': lr_scheduler,
            'monitor': 'validation_loss',
            'reduce_on_plateau': True
        }

        return [optimizer], [scheduler]

In [None]:
class AccuracyTracker(object):
    """Collects the accuracies reported during one run and remembers the best."""

    def __init__(self, run):
        self.run_number = run
        self.accuracy_list = []
        self.accuracy = 0.0

    def register_accuracy(self, accuracy_value):
        """Store `accuracy_value` under the run number and update the best value."""
        entry = {self.run_number: accuracy_value}
        self.accuracy_list.append(entry)
        # Only a strictly larger value replaces the current best.
        self.accuracy = max(self.accuracy, accuracy_value)

    @property
    def get_accuracy_value(self):
        """Best accuracy registered so far (0.0 if none)."""
        return self.accuracy

class StepCounter(object):
    """Minimal counter that starts at zero and goes up by one per `increment` call."""

    def __init__(self):
        self.step_count = 0

    def increment(self):
        """Advance the counter by one."""
        self.step_count += 1

    @property
    def get_step_count(self):
        """Number of `increment` calls made so far."""
        return self.step_count

# Initial hyper-parameter draw; the data module takes its batch size from it.
hparams = randomize_hparams()
saatchi_data = SaatchiDataModule(
    target_selection='price',
    batch_size=hparams['batch_size'],
    num_workers=1,
)

# Per-run bookkeeping filled by the search loop.
results, hparam_list = [], []

# Trainer settings shared by all runs.
num_sanity_val_steps, max_epochs = 0, 25


Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name   | Type             | Params
--------------------------------------------
0 | layers | Sequential       | 3.0 K 
1 | ce     | CrossEntropyLoss | 0     
--------------------------------------------
3.0 K     Trainable params
0         Non-trainable params
3.0 K     Total params
0.012     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [None]:
tracker_list = []

def train(run):
    """Train one `SaatchiMLP` with freshly sampled hyper-parameters.

    The sample is stored in the module-level `hparams`, because that is
    where `SaatchiMLP()` reads its settings from and what the calling loop
    records after the run. Progress is reported through the module-level
    `accuracy_tracker`.

    Args:
        run: Run number, used only in the printed summary.
    """
    # Without `global`, the assignment below would create a local that the
    # model never sees, so every run would reuse the first draw.
    global hparams
    hparams = randomize_hparams()

    saatchi_mlp = SaatchiMLP()

    # Use a GPU when one is present instead of failing on CPU-only machines.
    trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0,
                         max_epochs=max_epochs,
                         # Apply the notebook-level setting, which was
                         # defined but previously never handed to the Trainer.
                         num_sanity_val_steps=num_sanity_val_steps,
                         progress_bar_refresh_rate=0,
                         gradient_clip_val=hparams['gradient_clip_val'],
                         gradient_clip_algorithm=hparams['gradient_clip_algorithm'],
                         stochastic_weight_avg=hparams['stochastic_weight_avg'],
                         weights_summary=None
                         )
    print('Training...')
    trainer.fit(saatchi_mlp, saatchi_data)
    print(f'Run {run} best accuracy: {accuracy_tracker.get_accuracy_value * 100}%, hparams: {hparams}')

# Random search: 100 runs, each with its own step counter and accuracy tracker.
for i in range(100):
    accuracy_tracker = AccuracyTracker(i)
    step_counter = StepCounter()

    train(i)

    # Record the outcome of this run and the module-level `hparams` as it
    # stands after the run.
    results.append(accuracy_tracker.get_accuracy_value)
    tracker_list.append(accuracy_tracker)
    hparam_list.append({i: hparams})

In [None]:
# Manual check: build the training split with price-bin targets.
s = SaatchiDataset(
    stage='train',
    target_selection='price',
)

In [None]:
# Inspect one (features, label) pair through the normal indexing syntax.
s[1]

In [None]:

# Scratch example: a random 3x5 float tensor and three random integer class
# indices drawn from 0..4.
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the session.
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
input

In [None]:
# Display the randomly drawn class indices.
target

In [None]:

# Export the joined label/feature frame to an Excel workbook in the working directory.
df.to_excel(excel_writer='saatchi_micro_512d.xlsx')