Main idea:
* keep only businesses with a given attribute A
* find complementary businesses b1 and b2, i.e. b1 and b2 such that hamming(b1, b2) is maximal. b1 and b2 should have only one attribute in common (A).
* find the closest pair of photos (p1, p2), with p1 taken from b1 and p2 taken from b2: as the two businesses have only one attribute in common, we assume that p1 and p2 both show this attribute.
* find other photos similar to p1 and p2: all those images are assumed to have the attribute A.

-> that gives the positive training set.

* For the negative training set, just use all images of businesses that don't have the attribute A.

In [43]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict

In [2]:
gnet_train = np.load('../data/models/features/googlenet/train_pool5_7x7_s1.npy')

In [3]:
gnet_train.shape

(234842, 1024)

In [4]:
# Read the photo ids listed in train.list: each line is a path whose file name
# (without extension) is the integer photo id.
with open('../data/train.list', 'r') as list_file:
    train_pids = [
        int(os.path.splitext(os.path.basename(entry))[0])
        for entry in list_file
    ]

In [5]:
len(train_pids)

234842

In [48]:
# Map every photo id to its position in train_pids; the helper functions below
# use that position as the row index into gnet_train.
train_pid_to_index = dict(zip(train_pids, range(len(train_pids))))

In [6]:
train_df_bid_labels = pd.read_csv('../data/raw/train.csv')

In [7]:
train_df_pid_bid = pd.read_csv('../data/raw/train_photo_to_biz_ids.csv')

In [8]:
train_pid_to_bid = dict(zip(train_df_pid_bid.photo_id, train_df_pid_bid.business_id))

In [44]:
# Invert the photo -> business mapping: group the photo ids by business id.
train_bid_to_pids = defaultdict(list)
for photo_id in train_pid_to_bid:
    train_bid_to_pids[train_pid_to_bid[photo_id]].append(photo_id)

In [9]:
train_df_bid_labels = train_df_bid_labels.dropna()

In [10]:
bid_to_labels_str = dict(zip(train_df_bid_labels.business_id, train_df_bid_labels['labels']))

In [11]:
# Parse the whitespace-separated label string of every business into a list of
# integer label ids.
bid_to_labels = {
    business_id: [int(token) for token in raw_labels.split()]
    for business_id, raw_labels in bid_to_labels_str.items()
}

In [12]:
def binarize(labels, size=9):
    """Return a multi-hot vector of length `size` for the given label indices.

    Parameters
    ----------
    labels : iterable of int
        Indices to switch on; each must be a valid index into a vector of
        length `size`.
    size : int, optional
        Length of the returned vector (defaults to 9).

    Returns
    -------
    numpy.ndarray
        Float array of shape (size,) holding 1 at every index listed in
        `labels` and 0 everywhere else.
    """
    # Honour `size` instead of a hard-coded 9 so the parameter has an effect.
    ohe = np.zeros((size,))
    for l in labels:
        ohe[l] = 1
    return ohe

In [13]:
# Convert the label list of every business into its multi-hot vector.
bid_to_bin_labels = {
    business_id: binarize(label_ids)
    for business_id, label_ids in bid_to_labels.items()
}

In [16]:
# Keep only the businesses whose multi-hot label vector has attribute 3 set.
bid_att3_to_bin_labels = dict(
    (business_id, bin_labels)
    for business_id, bin_labels in bid_to_bin_labels.items()
    if bin_labels[3] == 1
)

In [17]:
len(bid_att3_to_bin_labels)

1003

In [22]:
distance_bid_bid = np.zeros((len(bid_att3_to_bin_labels), len(bid_att3_to_bin_labels)))

In [23]:
# Fill distance_bid_bid with the pairwise Hamming distance (number of differing
# entries) between the multi-hot label vectors of the attribute-3 businesses.
# Rows/columns follow the iteration order of bid_att3_to_bin_labels.
# One vectorised row per business replaces the n * n Python-level iterations.
label_matrix = np.array(list(bid_att3_to_bin_labels.values()))
for i, labels_i in enumerate(label_matrix):
    # Broadcasting: compare business i against every business at once.
    distance_bid_bid[i] = np.abs(label_matrix - labels_i).sum(axis=1)

In [39]:
np.count_nonzero(distance_bid_bid == 8)/2

19151

In [29]:
# Map a row/column index of distance_bid_bid back to its business id; iterating
# the dict yields the keys in the same order as .items().
index_to_bid = dict(enumerate(bid_att3_to_bin_labels))

In [30]:
index_to_bid[0]

2048

In [41]:
index_to_bid[168]

2414

In [47]:
train_bid_to_pids[2414]

[33603,
 88519,
 183760,
 199281,
 221362,
 261927,
 262361,
 300266,
 334291,
 339738,
 340489,
 340559,
 350462,
 459820]

In [49]:
from scipy.spatial.distance import euclidean

In [60]:
def get_closest(pids1, pids2, train_pid_to_index, gnet_train):
    """Find the closest pair of distinct photos, one taken from each list.

    Parameters
    ----------
    pids1, pids2 : iterable of int
        Photo ids of the two groups to compare. Ids missing from
        `train_pid_to_index` are ignored, and a photo is never paired with
        itself.
    train_pid_to_index : dict
        Maps a photo id to its row index in `gnet_train`.
    gnet_train : numpy.ndarray
        Feature matrix, one row per photo.

    Returns
    -------
    tuple
        `(p1, p2, distance)` for the pair with the smallest Euclidean distance
        between feature rows (first such pair on ties). When no valid pair
        exists, the placeholder `(0, 0, 10000000)` is returned.
    """
    best_pair = None
    # Start from infinity so that arbitrarily distant pairs are still reported
    # (a finite start value would silently act as a distance cap).
    best_distance = float('inf')
    # Filter the second list once instead of re-checking it for every pair.
    known_pids2 = [p2 for p2 in pids2 if p2 in train_pid_to_index]
    for p1 in pids1:
        if p1 not in train_pid_to_index:
            continue
        # The feature row of p1 is invariant over the inner loop.
        features1 = gnet_train[train_pid_to_index[p1]]
        for p2 in known_pids2:
            if p1 == p2:
                continue
            d = euclidean(features1, gnet_train[train_pid_to_index[p2]])
            if d < best_distance:
                best_distance = d
                best_pair = (p1, p2)
    if best_pair is None:
        # Historical "nothing found" placeholder, kept for existing callers.
        return 0, 0, 10000000
    return best_pair[0], best_pair[1], best_distance

In [61]:
get_closest(train_bid_to_pids[2414], train_bid_to_pids[2048], train_pid_to_index, gnet_train)

(339738, 95344, 22.757676456475686)

In [62]:
get_closest(train_bid_to_pids[2048], train_bid_to_pids[2048], train_pid_to_index, gnet_train)

(72491, 255870, 10.933053092260215)

In [69]:
def get_close(p1, p2, pids, train_pid_to_index, gnet_train, d_max):
    """Return the photos of `pids` that lie close to both reference photos.

    Parameters
    ----------
    p1, p2 : int
        Reference photo ids; both must be keys of `train_pid_to_index`
        (otherwise a KeyError is raised as soon as a candidate is evaluated).
    pids : iterable of int
        Candidate photo ids. Ids missing from `train_pid_to_index` are skipped.
    train_pid_to_index : dict
        Maps a photo id to its row index in `gnet_train`.
    gnet_train : numpy.ndarray
        Feature matrix, one row per photo.
    d_max : float
        Exclusive upper bound on the Euclidean distance to each reference.

    Returns
    -------
    list of int
        Candidates, in input order, whose distance to `p1` AND to `p2` is
        strictly below `d_max`.
    """
    candidates = [p for p in pids if p in train_pid_to_index]
    if not candidates:
        # Nothing to evaluate: return before touching the reference photos, so
        # unknown p1/p2 only raise when a candidate would actually be compared.
        return []
    # The reference feature rows do not depend on the candidate: fetch them once.
    features_p1 = gnet_train[train_pid_to_index[p1]]
    features_p2 = gnet_train[train_pid_to_index[p2]]
    close = []
    for p in candidates:
        features = gnet_train[train_pid_to_index[p]]
        if (euclidean(features, features_p1) < d_max
                and euclidean(features, features_p2) < d_max):
            close.append(p)
    return close

In [70]:
get_close(95344, 339738, train_bid_to_pids[2414] + train_bid_to_pids[2048], train_pid_to_index, gnet_train, 22.75*1.5)

[183760,
 261927,
 300266,
 339738,
 350462,
 22560,
 69893,
 76395,
 88548,
 95344,
 114380,
 207253,
 320679]

In [32]:
bid_att3_to_bin_labels[2048]

array([ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.])

In [42]:
bid_att3_to_bin_labels[2414]

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.])

In [35]:
distance_bid_bid[0][1]

1.0

In [37]:
bid_att3_to_bin_labels[2048] - bid_att3_to_bin_labels[2049]

array([ 0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.])

In [36]:
np.sum(np.abs(bid_att3_to_bin_labels[2048] - bid_att3_to_bin_labels[2049]))

1.0

In [40]:
np.where(distance_bid_bid == 8)

(array([   0,    0,    0, ...,  997, 1002, 1002]),
 array([168, 270, 763, ..., 763, 421, 892]))

In [None]:
# NOTE(review): unfinished cell (no execution count, "In [None]") -- this loop
# header has no body, so running the cell as-is raises a SyntaxError.
# Complete it or delete it.
for i, (bid_i, labels_i) in enumerate(bid_att3_to_bin_labels.items()):

In [14]:
# Build the per-photo label matrix: every photo inherits the multi-hot label
# vector of its business. Photos whose business has no entry in
# bid_to_bin_labels keep an all-zero row and their row index goes to `missings`.
y_train = np.zeros((len(train_pids), 9))
missings = []
for row, photo_id in enumerate(train_pids):
    business_labels = bid_to_bin_labels.get(train_pid_to_bid[photo_id])
    if business_labels is None:
        missings.append(row)
    else:
        y_train[row] = business_labels

In [105]:
# Per-cluster label frequencies: sum the label rows of the samples assigned to
# each cluster, then divide by the cluster size (printed along the way).
scores_labels = np.zeros((n_clusters, 9))
n_samples = X_train_s.shape[0]
for sample in range(n_samples):
    cluster_id = clusters[sample]
    scores_labels[cluster_id] = scores_labels[cluster_id] + y_train[sample]
for cluster_id in range(n_clusters):
    members = np.count_nonzero(clusters == cluster_id)
    print("Cluster {c}: {n}".format(c=cluster_id, n=members))
    scores_labels[cluster_id] /= members

Cluster 0: 2435
Cluster 1: 3292
Cluster 2: 2324
Cluster 3: 895
Cluster 4: 1733
Cluster 5: 3360
Cluster 6: 2
Cluster 7: 1714
Cluster 8: 2754
Cluster 9: 2952
Cluster 10: 410
Cluster 11: 2090
Cluster 12: 1051
Cluster 13: 1700
Cluster 14: 1734
Cluster 15: 2075
Cluster 16: 5
Cluster 17: 2
Cluster 18: 12
Cluster 19: 1
Cluster 20: 3553
Cluster 21: 2213
Cluster 22: 24
Cluster 23: 2
Cluster 24: 415
Cluster 25: 1057
Cluster 26: 1965
Cluster 27: 2489
Cluster 28: 2302
Cluster 29: 3
Cluster 30: 15
Cluster 31: 1
Cluster 32: 1667
Cluster 33: 3912
Cluster 34: 2152
Cluster 35: 2103
Cluster 36: 4
Cluster 37: 4054
Cluster 38: 2640
Cluster 39: 764
Cluster 40: 5
Cluster 41: 328
Cluster 42: 2
Cluster 43: 3
Cluster 44: 2621
Cluster 45: 2
Cluster 46: 1
Cluster 47: 3930
Cluster 48: 1579
Cluster 49: 270
Cluster 50: 830
Cluster 51: 1782
Cluster 52: 3909
Cluster 53: 9
Cluster 54: 5
Cluster 55: 2729
Cluster 56: 3679
Cluster 57: 19
Cluster 58: 1
Cluster 59: 1083
Cluster 60: 4
Cluster 61: 2
Cluster 62: 364
Cluster 6

In [106]:
y_per_instances = y_train.copy()

In [107]:
# Relabel: entries equal to 1 become -1, every other entry becomes 0.
# np.where builds a new array, which replaces the previous y_per_instances.
# NOTE(review): -1 presumably means "business has the attribute, photo not yet
# decided" -- confirm.
y_per_instances = np.where(y_per_instances==1, -1, 0)

In [108]:
threshold_up = 0.8

In [109]:
threshold_down = 0.1

In [117]:
# Derive per-photo labels from the per-business labels and the cluster scores:
# for every attribute the photo's business has, mark the photo 1 when its
# cluster's score for that attribute exceeds threshold_up, and -1 otherwise.
# NOTE(review): the score cell iterates X_train_s while this one iterates
# X_train -- confirm both have the same number of rows.
for i in range(X_train.shape[0]):
    labels = y_train[i]
    c = clusters[i]
    for a in range(9):
        if labels[a] != 0:
            if scores_labels[c][a] > threshold_up:
                y_per_instances[i][a] = 1
            else:
                # Write to the photo's own row `i`; indexing with the cluster
                # id `c` here would overwrite the labels of an unrelated photo.
                y_per_instances[i][a] = -1

In [130]:
np.count_nonzero(y_per_instances[:, 8] == 1)

9775