In [1]:
try:
    import ujson as json
    import umap
except ModuleNotFoundError:
    ! pip install ujson -qU
    ! pip install umap-learn -qU
    import ujson as json
    import umap

import requests
import pandas as pd
import random

import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

# True when the running IPython shell identifies itself as Google Colab
on_colab = 'google.colab' in str(get_ipython())

# Booster variant read by randomize_hparams(): either 'gbtree' or 'dart'
xgb_mode = 'dart'


In [2]:
# Helper function to download files
def download_file(url, timeout=60):
    """Stream the resource at ``url`` into a file in the working directory.

    Parameters
    ----------
    url : str
        Address of the file. The text after the last '/' becomes the local
        file name.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection raises
        instead of blocking the notebook indefinitely.

    Returns
    -------
    str
        Name of the file that was written.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    local_filename = url.split('/')[-1]
    # stream=True keeps the body out of memory; it is written chunk by chunk
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

In [3]:
# Helper function to pick random parameter from iterable or range
def random_element(parameter_iterable):
    """Draw one random hyper-parameter value.

    Parameters
    ----------
    parameter_iterable : tuple or list
        * tuple -- its first two items are used as the bounds of a uniform
          draw; the result is rounded to 6 decimals.
        * list -- one of its items is returned, chosen uniformly.

    Returns
    -------
    float or object
        The rounded uniform sample (tuple input) or the chosen list item.

    Raises
    ------
    TypeError
        If the argument is neither a tuple nor a list.
    ValueError
        If the argument is an empty list.
    """
    # A tuple is treated as a (lower, upper) range
    if isinstance(parameter_iterable, tuple):
        out = random.uniform(parameter_iterable[0], parameter_iterable[1])
        return round(out, 6)

    # A list is treated as a set of discrete choices
    if isinstance(parameter_iterable, list):
        if not parameter_iterable:
            raise ValueError('Cannot pick a random element from an empty list.')
        return random.choice(parameter_iterable)

    # Carry the explanation in the exception itself instead of printing it
    raise TypeError(
        f'Input not a tuple or list (got {type(parameter_iterable).__name__}).'
    )

In [4]:
# General and learning-task parameters, one "general" dict per booster variant.
# NOTE(review): none of these three dicts is referenced by the other cells
# visible in this notebook -- confirm whether they are still needed.
learning_task_parameters = {'seed': 3}

# The DART dict pins the booster to 'dart' (it was previously drawn at random
# between 'gbtree' and 'dart', contradicting the name of the dict).
dart_general_parameters = {'booster': 'dart', 'nthread': 2}

gbtree_general_parameters = {'booster': 'gbtree', 'nthread': 2}

def randomize_hparams():
    """Draw a random XGBoost hyper-parameter configuration.

    Reads the notebook globals ``xgb_mode`` ('dart' or 'gbtree') and
    ``on_colab`` (selects the GPU tree method).

    Returns
    -------
    dict
        Parameter names mapped to sampled values. It contains the tree
        parameters shared by both modes, ``booster`` (set to ``xgb_mode``),
        ``tree_method`` and, in 'dart' mode, the DART dropout parameters.

    Raises
    ------
    TypeError
        If ``xgb_mode`` is neither 'dart' nor 'gbtree'.
    """
    if xgb_mode not in ('dart', 'gbtree'):
        raise TypeError(
            f'Global variable "xgb_mode" must be "dart" or "gbtree", got {xgb_mode!r}.'
        )

    # The XGBoost docs describe 'gradient_based' sampling as supported only by
    # the GPU hist tree method, so it is offered only when that method is used.
    sampling_methods = ['uniform', 'gradient_based'] if on_colab else ['uniform']

    # Parameters shared by both boosters
    booster_parameters_ = {
        # Without an explicit booster the estimator keeps its default and the
        # DART-only entries below would have no effect.
        'booster': xgb_mode,
        'learning_rate': random_element((0.1, 0.8)),
        'gamma': random_element((0, 5)),
        'max_depth': random_element(list(range(3, 12))),
        'min_child_weight': random_element((0, 3)),
        'max_delta_step': 0,
        'subsample': random_element((0.4, 0.8)),
        'sampling_method': random_element(sampling_methods),
        'lambda': random_element((0.3, 3)),
        'alpha': random_element((0, 2)),
        'grow_policy': random_element(['depthwise', 'lossguide']),
        'max_leaves': random_element([1, 2, 3, 4]),
    }

    # Dropout-related parameters that only exist for the DART booster
    if xgb_mode == 'dart':
        booster_parameters_.update({
            'sample_type': random_element(['uniform', 'weighted']),
            'normalize_type': random_element(['tree', 'forest']),
            'rate_drop': random_element((0, 1)),
            'one_drop': random_element([0, 1]),
        })

    booster_parameters_['tree_method'] = 'gpu_hist' if on_colab else 'auto'
    return booster_parameters_

# Initial draw; the x-data cell updates this dict on Colab and the training
# loop replaces it with a fresh draw on every iteration.
booster_parameters = randomize_hparams()

In [5]:
# Load x data: feature vectors stored as a JSON list whose entries are merged
# into a single dict (presumably one {filename: vector} mapping per entry).
if not on_colab:
    data_dir = r'F:\temp\thesisdata\micro_dataset_1\micro_dataset1_resnet18_output_identity.json'
else:
    download_file('https://objectstorage.eu-frankfurt-1.oraclecloud.com/n/frwwzrj6ghal/b/thesis/o/micro_dataset1_resnet18_output_identity.json')
    data_dir = r'micro_dataset1_resnet18_output_identity.json'
    booster_parameters['tree_method'] = 'gpu_hist'

with open(data_dir) as f:
    data_dict_list = json.load(f)

# Fold every list entry into one flat dict
data_dict = {}
for element in data_dict_list:
    data_dict.update(element)

# One row per dict key, one column per vector component
df_x = pd.DataFrame.from_dict(data_dict, orient='index')
df_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
3865991_3865991_691412_2935874-DSMUXGTJ-7.jpg,0.402192,0.215186,1.006052,2.535887,0.223182,0.966906,0.067344,3.101986,1.115319,0.726254,...,0.732725,0.914586,0.61877,0.791526,2.018329,0.108899,0.651192,0.192771,2.346089,1.501905
7980766_7980766_669333_7048178-XOYQRJZQ-7.jpg,1.07395,0.707743,0.106056,0.551383,0.68053,1.220285,1.024527,0.30502,1.0397,0.217051,...,0.649347,1.637149,2.630768,2.322523,0.047876,1.300324,3.735312,1.352288,0.054118,3.584239
3749936_3749936_314728_2819820-JDANXKLD-7.jpg,2.806773,0.006849,0.898165,0.802126,0.967394,0.287235,0.244238,1.446031,6.778771,0.235296,...,1.267828,0.326778,0.115728,0.466623,0.193548,1.720899,1.446586,2.53537,0.864782,0.062465
5610715_5610715_91068_4680525-LMQNOWJA-7.jpg,0.039423,0.658119,1.192224,2.684522,2.460881,0.046748,0.357242,3.366874,0.91903,0.776935,...,0.823428,0.713102,0.531813,1.427407,0.424931,2.481088,0.868538,2.333207,0.845097,1.062181
6771765_6771765_786228_5841405-PSPFNCAV-7.jpg,0.292377,0.011645,0.000842,1.337585,0.382337,0.305897,0.060697,0.592383,1.28745,0.122081,...,0.067053,0.803128,0.23425,2.015079,1.405711,0.291771,0.038078,0.586244,0.069435,0.06638


In [None]:
# Load y data
if not on_colab:
    data_dir = r'F:\temp\thesisdata\SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv'
else:
    download_file('https://objectstorage.eu-frankfurt-1.oraclecloud.com/n/frwwzrj6ghal/b/thesis/o/SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv')
    data_dir = 'SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv'

# Read the table, bin both targets into quintiles (integer labels 0-4) and
# keep only the bin indices; the raw PRICE / LIKES_VIEWS_RATIO columns go.
df_y = (
    pd.read_csv(data_dir, sep='\t')
    .set_index('FILENAME')
    .assign(
        PRICE_BIN_IDX=lambda d: pd.qcut(d['PRICE'], q=5, labels=[0, 1, 2, 3, 4]),
        LIKES_VIEWS_RATIO_BIN_IDX=lambda d: pd.qcut(d['LIKES_VIEWS_RATIO'], q=5, labels=[0, 1, 2, 3, 4]),
    )
    .astype({'PRICE_BIN_IDX': int, 'LIKES_VIEWS_RATIO_BIN_IDX': int})
    .drop(['PRICE', 'LIKES_VIEWS_RATIO'], axis=1)
)

df_y.head()

In [1]:
# Combine labels and features on their shared index (left join: every row of
# df_y is kept and the columns of df_x are appended to it).
df = df_y.join(df_x, how='left')
df.head()

NameError: name 'df_y' is not defined

In [None]:
# Features: every column except the two label columns
X = df.drop(['PRICE_BIN_IDX', 'LIKES_VIEWS_RATIO_BIN_IDX'], axis=1).values
# Targets: one-hot matrix for the multi-output model, plain bin index for UMAP
y = pd.get_dummies(df['LIKES_VIEWS_RATIO_BIN_IDX'].values).to_numpy()
y_idx = df['LIKES_VIEWS_RATIO_BIN_IDX'].values

# Positional split boundaries. The three slices must not overlap: a training
# slice that ran up to 13000 would contain every validation row.
train_end, val_end = 12000, 13000

X_train, X_val, X_test = X[:train_end], X[train_end:val_end], X[val_end:]
y_train, y_val, y_test = y[:train_end], y[train_end:val_end], y[val_end:]
y_idx_train, y_idx_val, y_idx_test = y_idx[:train_end], y_idx[train_end:val_end], y_idx[val_end:]

# Every row lands in exactly one split
assert len(X_train) + len(X_val) + len(X_test) == len(X)

In [None]:
# Reduce dimensionality: fit UMAP on the training rows (the training labels
# are passed to fit) and project the feature matrices to 50 components.
reducer = umap.UMAP(n_neighbors=30, min_dist=0.0, n_components=50)

# Fit the estimator configured above instead of building a second, identical
# one; fit() returns the estimator, so `mapper` and `reducer` are one object.
mapper = reducer.fit(X_train, y=y_idx_train)

X_train_embedding = mapper.transform(X_train)
X_test_embedding = mapper.transform(X_test)

In [None]:
# Collects one record per training run. It lives in its own cell, so
# re-running the training cell appends to it instead of starting over.
results = []

In [None]:
# Training: random search, one freshly sampled configuration per iteration
# NOTE(review): every configuration is scored on the test split, so the best
# score found here is not an unbiased estimate -- consider a validation split.
no_iterations = 100
for i in range(no_iterations):
    # Sample this run's hyper-parameters
    booster_parameters = randomize_hparams()

    # Binary base learner configured with the sampled values
    xgb_estimator = xgb.XGBClassifier(objective='binary:logistic',
                                      use_label_encoder=False)
    xgb_estimator.set_params(**booster_parameters)

    # Wrap the base learner so it is fitted against the one-hot target matrix
    multilabel_model = MultiOutputClassifier(xgb_estimator, n_jobs=1)
    multilabel_model.fit(X_train_embedding, y_train)

    # Score on the held-out rows, expressed in percent
    y_test_pred = multilabel_model.predict(X_test_embedding)
    accuracy = 100 * accuracy_score(y_test, y_test_pred)

    r_ = dict(run=i, accuracy=accuracy, hparams=booster_parameters)

    print(r_)
    results.append(r_)