In [2]:
try:
    import ujson as json
    import umap
except ModuleNotFoundError:
    ! pip install ujson -qU
    ! pip install umap-learn -qU
    import ujson as json
    import umap

import requests
import pandas as pd
import random

from sklearn import svm
from sklearn.metrics import accuracy_score

# True when the notebook runs on Google Colab: the repr of the active IPython
# shell contains 'google.colab' there. Later cells use this flag to choose
# between downloading the data and reading it from a local path.
on_colab = 'google.colab' in str(get_ipython())



In [3]:
# Helper function to download files
def download_file(url, timeout=60):
    """Stream the resource at `url` into a file in the current directory.

    The local file name is whatever follows the last '/' in `url`.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled server
        would block the notebook forever; pass ``None`` to restore the
        previous wait-indefinitely behaviour.

    Returns
    -------
    str
        Name of the file that was written.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            # Write in chunks so the whole payload never has to sit in memory
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

In [4]:
# Load x data
# The JSON file holds a list of mappings; they are merged into one dict and
# turned into a DataFrame with one row per key.
if on_colab:
    # download_file returns the name of the file it wrote (last URL segment)
    data_dir = download_file('https://objectstorage.eu-frankfurt-1.oraclecloud.com/n/frwwzrj6ghal/b/thesis/o/micro_dataset1_resnet18_output_identity.json')
else:
    data_dir = r'F:\temp\thesisdata\micro_dataset_1\micro_dataset1_resnet18_output_identity.json'

with open(data_dir) as f:
    data_dict_list = json.load(f)

# Merge the list of mappings; on duplicate keys the later entry wins
data_dict = {}
for element in data_dict_list:
    data_dict.update(element)

# To peek at the first two entries of the dict:
# dict(itertools.islice(data_dict.items(), 2))
df_x = pd.DataFrame.from_dict(data_dict, orient='index')
df_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
3865991_3865991_691412_2935874-DSMUXGTJ-7.jpg,0.402192,0.215186,1.006052,2.535887,0.223182,0.966906,0.067344,3.101986,1.115319,0.726254,...,0.732725,0.914586,0.61877,0.791526,2.018329,0.108899,0.651192,0.192771,2.346089,1.501905
7980766_7980766_669333_7048178-XOYQRJZQ-7.jpg,1.07395,0.707743,0.106056,0.551383,0.68053,1.220285,1.024527,0.30502,1.0397,0.217051,...,0.649347,1.637149,2.630768,2.322523,0.047876,1.300324,3.735312,1.352288,0.054118,3.584239
3749936_3749936_314728_2819820-JDANXKLD-7.jpg,2.806773,0.006849,0.898165,0.802126,0.967394,0.287235,0.244238,1.446031,6.778771,0.235296,...,1.267828,0.326778,0.115728,0.466623,0.193548,1.720899,1.446586,2.53537,0.864782,0.062465
5610715_5610715_91068_4680525-LMQNOWJA-7.jpg,0.039423,0.658119,1.192224,2.684522,2.460881,0.046748,0.357242,3.366874,0.91903,0.776935,...,0.823428,0.713102,0.531813,1.427407,0.424931,2.481088,0.868538,2.333207,0.845097,1.062181
6771765_6771765_786228_5841405-PSPFNCAV-7.jpg,0.292377,0.011645,0.000842,1.337585,0.382337,0.305897,0.060697,0.592383,1.28745,0.122081,...,0.067053,0.803128,0.23425,2.015079,1.405711,0.291771,0.038078,0.586244,0.069435,0.06638


In [5]:
# Load y data
if on_colab:
    # download_file returns the name of the file it wrote (last URL segment)
    data_dir = download_file('https://objectstorage.eu-frankfurt-1.oraclecloud.com/n/frwwzrj6ghal/b/thesis/o/SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv')
else:
    data_dir = r'F:\temp\thesisdata\SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv'

# Integer labels for the five quantile bins, shared by both targets.
# (Omitting `labels` in pd.qcut would yield the bin intervals instead.)
bin_labels = [0, 1, 2, 3, 4]

# Index by file name, bin both targets into quintiles, cast the categorical
# bin labels to int and discard the raw columns.
df_y = (
    pd.read_csv(data_dir, sep='\t')
    .set_index('FILENAME')
    .assign(
        PRICE_BIN_IDX=lambda d: pd.qcut(d['PRICE'], q=5, labels=bin_labels),
        LIKES_VIEWS_RATIO_BIN_IDX=lambda d: pd.qcut(d['LIKES_VIEWS_RATIO'], q=5, labels=bin_labels),
    )
    .astype({'PRICE_BIN_IDX': int, 'LIKES_VIEWS_RATIO_BIN_IDX': int})
    .drop(columns=['PRICE', 'LIKES_VIEWS_RATIO'])
)

df_y.head()

Unnamed: 0_level_0,PRICE_BIN_IDX,LIKES_VIEWS_RATIO_BIN_IDX
FILENAME,Unnamed: 1_level_1,Unnamed: 2_level_1
481029_481029_349215_257967-7.jpg,2,1
511001_511001_388280_276511-7.jpg,3,0
517326_517326_376595_282597-CCCWLPRX-7.jpg,4,0
524310_524310_342634_286117-LEMCITZY-7.jpg,4,0
5045187_5045187_7198_4115009-DDZJITSS-7.jpg,2,1


In [6]:
# Join x and y into a single dataframe.
# Index-on-index left join: every row of df_y is kept and the columns of
# df_x are attached where the index labels match.
# NOTE(review): rows of df_y with no match in df_x would get NaN features -
# confirm that both frames cover the same file names.
df = df_y.join(df_x, how='left')
df.head()

Unnamed: 0,PRICE_BIN_IDX,LIKES_VIEWS_RATIO_BIN_IDX,0,1,2,3,4,5,6,7,...,502,503,504,505,506,507,508,509,510,511
1008695_1008695_16575_492565-WPTALJUX-7.jpg,4,0,0.506733,1.394197,0.142876,1.595724,0.447865,2.721004,0.549245,0.088606,...,0.514084,1.684672,2.690634,1.076642,1.014139,0.403866,1.054885,1.213333,0.08846,0.78616
1020928_1020928_20375_496298-MVOEZUTF-7.jpg,1,0,0.833585,1.465594,0.606634,0.643496,0.979372,1.374905,1.059826,0.382815,...,0.505261,1.688515,0.417936,0.20068,1.674028,1.199314,0.406038,1.510623,0.472602,1.073225
1051436_1051436_17127_506738-INLFTOGF-7.jpg,1,1,1.898964,0.537831,0.355688,1.91466,0.678449,0.122319,0.920843,0.896121,...,0.21323,0.716053,0.299384,2.024731,0.045484,1.757237,0.77597,2.071618,0.271196,1.722807
1055377_1055377_18467_508857-HYTIVNMU-7.jpg,3,0,0.959911,1.014624,0.928045,0.345727,0.310256,0.644479,0.349452,1.512597,...,0.680928,2.470096,2.451149,3.417884,1.716235,0.149804,0.0,1.423536,0.022786,0.289719
1057504_1057504_19082_509430-EKIORJVM-7.jpg,4,1,1.535582,0.951705,0.470121,2.631895,0.672225,0.481367,0.571308,0.184078,...,0.659331,0.722163,1.269953,0.223978,1.508935,0.835212,2.030264,1.891201,0.333001,2.909141


In [7]:
# Feature matrix: every column except the two binned targets.
X = df.drop(['PRICE_BIN_IDX', 'LIKES_VIEWS_RATIO_BIN_IDX'], axis=1).values
# Target: the price bin index.
y = df['PRICE_BIN_IDX'].values

# Positional split boundaries: rows [0, train_end) train,
# [train_end, val_end) validation, [val_end, ...) test.
train_end, val_end = 13000, 14000

# The test slice starts at val_end. It previously started at train_end, which
# put every validation sample into the test set as well.
# NOTE(review): the split follows the row order of df (no shuffling), and the
# test set is empty if df has no more than val_end rows - confirm both.
X_train, X_val, X_test = X[:train_end], X[train_end:val_end], X[val_end:]
y_train, y_val, y_test = y[:train_end], y[train_end:val_end], y[val_end:]

In [10]:
# Support vector classifier with scikit-learn's default hyper-parameters
model = svm.SVC()

# Fit on the training split; left as the last expression so the notebook
# displays the fitted estimator.
model.fit(X=X_train, y=y_train)