In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [2]:
# Load the precomputed feature matrix from disk. Its shape, displayed further
# down in this notebook, is (234842, 1024).
# NOTE(review): the file name suggests GoogLeNet pool5 activations, presumably
# one row per line of ../data/train.list -- confirm against the extraction step.
gnet_all = np.load('../data/models/features/googlenet/train_pool5_7x7_s1.npy')

In [3]:
!head -n 4 ../data/train.list

train_photos_256x256_noise/262212.jpg
train_photos_256x256_noise/333665.jpg
train_photos_256x256_noise/421372.jpg
train_photos_256x256_noise/394322.jpg


In [4]:
# Parse the integer photo id out of every path in train.list (directory and
# extension are stripped). The list keeps file order; positions in it are
# later used as row indices into `gnet_all`.
with open('../data/train.list', 'r') as list_file:
    all_pids = [
        int(os.path.splitext(os.path.split(entry)[1])[0])
        for entry in list_file
    ]

In [5]:
len(all_pids)

234842

In [6]:
# Read the training table; its `business_id` and `labels` columns are used
# below to build the per-business label vectors.
train_df = pd.read_csv('../data/raw/train.csv')

In [7]:
# Read the photo-to-business table; its `photo_id` and `business_id` columns
# are used below to group photos by business.
train_photo_to_biz = pd.read_csv('../data/raw/train_photo_to_biz_ids.csv')

In [8]:
# photo_id -> business_id lookup. If a photo_id occurs on several rows, the
# last row wins (plain dict semantics).
train_photo_id_to_biz_id = {
    photo_id: business_id
    for photo_id, business_id in zip(
        train_photo_to_biz.photo_id, train_photo_to_biz.business_id
    )
}

In [9]:
from collections import defaultdict

In [10]:
# Drop every row that has a missing value in any column; `train_df` itself is
# left untouched so the raw table stays available.
train_df_cleaned = train_df.dropna(axis=0, how='any')

In [11]:
# business_id -> raw `labels` string, taken from the rows that survived dropna.
biz_id_to_labels_str = {
    business_id: labels_field
    for business_id, labels_field in zip(
        train_df_cleaned.business_id, train_df_cleaned['labels']
    )
}

In [12]:
# Turn each whitespace-separated label string into a list of integer labels.
biz_id_to_labels = {
    biz_id: list(map(int, labels_str.split()))
    for biz_id, labels_str in biz_id_to_labels_str.items()
}

In [13]:
def OHE(labels, size=9):
    """Encode a collection of integer labels as a multi-hot vector.

    Parameters
    ----------
    labels : iterable of int
        Label indices to switch on. Each must be a valid index into a vector
        of length ``size``.
    size : int, default 9
        Length of the returned vector (number of distinct labels).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(size,)`` holding 1.0 at every index listed in
        ``labels`` and 0.0 elsewhere.

    Raises
    ------
    IndexError
        If a label falls outside the valid index range for ``size``.
    """
    # Honour `size`; the length used to be hard-coded to 9, which silently
    # ignored the argument.
    ohe = np.zeros((size,))
    for l in labels:
        ohe[l] = 1
    return ohe

In [14]:
# business_id -> multi-hot label vector (default length of OHE).
biz_id_to_ohe_labels = {
    biz_id: OHE(labels) for biz_id, labels in biz_id_to_labels.items()
}

In [15]:
# Group photo ids by business, keeping only businesses that have a label
# vector (i.e. whose row survived the dropna above).
bid_to_pids = defaultdict(list)
for photo_id, business_id in train_photo_id_to_biz_id.items():
    if business_id not in biz_id_to_ohe_labels:
        continue
    bid_to_pids[business_id].append(photo_id)

In [16]:
gnet_all.shape

(234842, 1024)

In [17]:
# Build one "bag" per labelled business: a 2-D array with one 1024-d feature
# row for each of the business's photos that has an entry in `gnet_all`.
# Bags are appended in the iteration order of `bid_to_pids`.
X = []
pid_to_index = {pid: index for index, pid in enumerate(all_pids)}
for bid, pids in bid_to_pids.items():
    # Keep only the photos for which features were actually extracted.
    pids_s = [pid for pid in pids if pid in pid_to_index]
    # Size the bag by the photos that were found (`pids_s`), not by all photos
    # of the business (`pids`); otherwise every missing photo leaves an
    # all-zero row that would be treated as a real instance.
    x_pids = np.zeros((len(pids_s), 1024))
    for i, pid in enumerate(pids_s):
        x_pids[i] = gnet_all[pid_to_index[pid]]
    X.append(x_pids)

In [18]:
len(X)

1996

In [19]:
# Label matrix: one 9-wide multi-hot row per business, in the iteration order
# of `bid_to_pids`.
y = np.zeros((len(bid_to_pids), 9))
for row, business_id in enumerate(bid_to_pids):
    y[row] = biz_id_to_ohe_labels[business_id]

In [20]:
y.shape

(1996, 9)

In [21]:
# Reproducible 70/30 split over bag indices.
np.random.seed(42)
# Derive the number of bags from the data instead of hard-coding 1996, so the
# split stays valid if the set of labelled businesses changes.
alls = np.random.permutation(np.arange(len(X)))
train_ratio = 0.7
n_train = int(train_ratio * alls.shape[0])
# The first n_train shuffled indices form the training split, the rest the
# test split.
train_index = alls[:n_train]
test_index = alls[n_train:]

In [22]:
import misvm

In [23]:
# Column 0 of the label matrix: the 0/1 indicator for label 0, one entry per
# row of `y`. Basic slicing, so this is a view onto `y`, not a copy.
y_att0 = y[:, 0]

In [24]:
# Split the label-0 targets with the same index arrays used for the bags.
y_att0_train = y_att0[train_index]
y_att0_test = y_att0[test_index]

In [25]:
# Pick the bags for each split, keeping the order of the index arrays so the
# bags stay aligned with y_att0_train / y_att0_test.
X_train = []
for bag_idx in train_index:
    X_train.append(X[bag_idx])

X_test = []
for bag_idx in test_index:
    X_test.append(X[bag_idx])

In [26]:
y_att0_train.shape

(1397,)

In [27]:
from sklearn.preprocessing import StandardScaler

In [None]:
# NOTE(review): this scaler is created but never fitted or applied anywhere in
# the visible part of the notebook -- either use it or remove the cell.
scaler = StandardScaler()

In [None]:
# Multiple-instance classifier from the `misvm` package, configured with a
# linear kernel and C=1.0.
# NOTE(review): `max_iters=50` presumably caps the optimisation iterations --
# confirm against the misvm documentation.
clf = misvm.MISVM(kernel='linear', C=1.0, max_iters=50)

In [28]:
# Recode the training targets from {0, 1} to {-1, +1}.
# Testing `> 0` instead of `== 0` gives the same result on 0/1 input and keeps
# the cell idempotent: re-running it on labels that are already -1/+1 leaves
# them unchanged, whereas the `== 0` form would turn every label into +1.
y_att0_train = np.where(y_att0_train > 0, 1, -1)

In [29]:
# Recode the test targets from {0, 1} to {-1, +1}.
# Testing `> 0` instead of `== 0` gives the same result on 0/1 input and keeps
# the cell idempotent: re-running it on labels that are already -1/+1 leaves
# them unchanged, whereas the `== 0` form would turn every label into +1.
y_att0_test = np.where(y_att0_test > 0, 1, -1)

In [None]:
# Fit on the training split. The held-out bags (X_test / y_att0_test) must
# stay unseen during fitting so they can serve for evaluation.
f = clf.fit(X_train, y_att0_train)

In [53]:
import pickle

In [28]:
# Persist the full label matrix `y` so it can be reloaded without rebuilding.
with open('../data/tmp_y.pkl', 'wb') as pkl_out:
    pickle.dump(y, pkl_out)

In [29]:
# Persist the full list of bags `X` (one 2-D array per business).
with open('../data/tmp_X.pkl', 'wb') as pkl_out:
    pickle.dump(X, pkl_out)

In [30]:
# Total number of instance rows across all training bags.
n_photos = sum(bag.shape[0] for bag in X_train)

In [31]:
# Preallocate the stacked training matrix: one row per training instance,
# 1024 feature columns.
X_train_a = np.zeros(shape=(n_photos, 1024), dtype=float)

In [32]:
# Copy every training bag into its contiguous row range of X_train_a, in the
# order of X_train.
curr = 0  # first free row in X_train_a
for arr in X_train:
    next_i = curr + arr.shape[0]  # one past the last row of this bag
    X_train_a[curr:next_i] = arr
    curr = next_i  # the next bag starts right after this one
# `curr` and `next_i` remain defined at module level after the loop.

In [33]:
from sklearn.decomposition import PCA

In [34]:
# PCA model configured to keep 128 components (down from the 1024 feature
# columns of the stacked matrices).
pca = PCA(n_components=128)

In [35]:
# Fit the PCA on the stacked training instances only and project them.
# Rows of X_train_128 are in the same order as rows of X_train_a.
X_train_128 = pca.fit_transform(X_train_a)

In [36]:
# Total number of instance rows across all test bags.
n_photos_test = sum(bag.shape[0] for bag in X_test)

In [37]:
# Stack all test bags into one (n_photos_test, 1024) matrix, in the order of
# X_test, so that the fitted PCA can be applied in a single call.
X_test_a = np.zeros((n_photos_test, 1024))
curr = 0  # first free row in X_test_a
for arr in X_test:
    next_i = curr + arr.shape[0]  # one past the last row of this bag
    X_test_a[curr:next_i] = arr
    curr = next_i  # the next bag starts right after this one
# `curr` and `next_i` remain defined at module level after the loop.

In [38]:
# Project the stacked test instances with the PCA that was fitted on the
# training instances (transform only, no refit).
X_test_128 = pca.transform(X_test_a)

In [39]:
# Split the PCA-reduced training matrix back into per-business bags, using the
# same order and row counts as X_train (which is how X_train_128 was stacked).
bags_train_128 = []
# Start from row 0: without this reset the offset is whatever value an earlier
# cell left in `curr`, and the slices point at the wrong rows.
curr = 0
for arr in X_train:
    next_i = curr + arr.shape[0]
    bags_train_128.append(X_train_128[curr:next_i])
    # Advance to the next bag; without this every slice starts at the same row.
    curr = next_i
# Every reduced row must have been assigned to exactly one bag.
assert curr == X_train_128.shape[0]

In [40]:
len(bags_train_128)

1397

In [44]:
X_test_128.shape

(69840, 128)

In [50]:
# Split the PCA-reduced test matrix back into per-business bags, using the
# same order and row counts as X_test (which is how X_test_128 was stacked).
bags_test_128 = []
curr = 0
for arr in X_test:
    next_i = curr + arr.shape[0]
    bags_test_128.append(X_test_128[curr:next_i])
    # Advance to the next bag; without this every slice starts at row 0 and
    # the bags overlap instead of partitioning the matrix.
    curr = next_i
# Every reduced row must have been assigned to exactly one bag.
assert curr == X_test_128.shape[0]

In [54]:
# Persist the list of PCA-reduced training bags.
with open('../data/tmp_X_train_128.pkl', 'wb') as pkl_out:
    pickle.dump(bags_train_128, pkl_out)

In [55]:
# Persist the list of PCA-reduced test bags.
with open('../data/tmp_X_test_128.pkl', 'wb') as pkl_out:
    pickle.dump(bags_test_128, pkl_out)

In [59]:
# Persist the training targets for label 0 (in whatever encoding
# y_att0_train currently holds).
with open('../data/tmp_y_train.pkl', 'wb') as pkl_out:
    pickle.dump(y_att0_train, pkl_out)

In [60]:
# Persist the test targets for label 0 (in whatever encoding y_att0_test
# currently holds).
with open('../data/tmp_y_test.pkl', 'wb') as pkl_out:
    pickle.dump(y_att0_test, pkl_out)