In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.cluster import MiniBatchKMeans

In [2]:
# Load the saved training feature array from the googlenet feature directory.
# Presumably pool5 activations with one row per photo listed in train.list
# (the row count shown below matches the number of parsed photo ids) — TODO confirm.
gnet_train = np.load('../data/models/features/googlenet/train_pool5_7x7_s1.npy')

In [3]:
# Sanity check: display the (rows, columns) shape of the loaded feature array.
gnet_train.shape

(234842, 1024)

In [4]:
# Parse the photo ids listed in train.list (one path per line). The id is the
# file's base name with its extension removed, converted to an integer, and
# the ids are kept in file order.
train_pids = []
with open('../data/train.list', 'r') as lfile:
    for line in lfile:
        path = line.strip()
        if not path:
            # A blank line (e.g. a trailing empty line at the end of the file)
            # carries no photo id; int('') would raise ValueError.
            continue
        # basename/splitext instead of tuple-unpacking into `_`, which would
        # shadow IPython's "last output" variable in the notebook namespace.
        pid_str = os.path.splitext(os.path.basename(path))[0]
        train_pids.append(int(pid_str))

In [5]:
# Sanity check: number of photo ids parsed from train.list.
len(train_pids)

234842

In [6]:
# Business-level table; the cells below read its `business_id` and `labels` columns.
train_df_bid_labels = pd.read_csv('../data/raw/train.csv')

In [7]:
# Photo-to-business table; the next cell reads its `photo_id` and `business_id` columns.
train_df_pid_bid = pd.read_csv('../data/raw/train_photo_to_biz_ids.csv')

In [8]:
# Lookup table photo_id -> business_id, built row by row from the mapping
# frame (if a photo id appears more than once, the last row wins).
train_pid_to_bid = {
    photo_id: business_id
    for photo_id, business_id in zip(train_df_pid_bid.photo_id,
                                     train_df_pid_bid.business_id)
}

In [9]:
# Drop every row that contains a missing value in any column (dropna defaults).
# Presumably these are businesses whose `labels` cell is empty — TODO confirm.
# NOTE(review): this rebinds the name of the frame loaded from train.csv, so the
# unfiltered frame is no longer reachable without re-reading the file.
train_df_bid_labels = train_df_bid_labels.dropna()

In [10]:
# Lookup table business_id -> raw label string, taken from the filtered
# business-level frame.
bid_to_labels_str = {
    business_id: raw_labels
    for business_id, raw_labels in zip(train_df_bid_labels['business_id'],
                                       train_df_bid_labels['labels'])
}

In [11]:
# Parse each whitespace-separated label string into a list of integer label
# ids, keyed by business id.
bid_to_labels = {
    business_id: list(map(int, raw_labels.split()))
    for business_id, raw_labels in bid_to_labels_str.items()
}

In [12]:
def binarize(labels, size=9):
    """Encode a collection of label indices as a multi-hot vector.

    Parameters
    ----------
    labels : iterable of int
        Indices to switch on. Each must be a valid index into a vector of
        length ``size`` (an out-of-range index raises ``IndexError``).
    size : int, optional
        Length of the returned vector. Defaults to 9.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(size,)`` holding 1.0 at every index in
        ``labels`` and 0.0 elsewhere.
    """
    # Honour the `size` argument; the length must not be hard-coded.
    ohe = np.zeros((size,))
    for l in labels:
        ohe[l] = 1
    return ohe

In [13]:
# Multi-hot label vector for every business that has parsed labels.
bid_to_bin_labels = {
    business_id: binarize(label_ids)
    for business_id, label_ids in bid_to_labels.items()
}

In [14]:
# Build the target matrix, one row per photo in train_pids order. Each photo
# takes the multi-hot label vector of its business. Photos whose business has
# no entry in bid_to_bin_labels keep an all-zero row and have their row index
# recorded in `missings`.
y_train = np.zeros((len(train_pids), 9))
missings = []
for row, photo_id in enumerate(train_pids):
    business_id = train_pid_to_bid[photo_id]
    if business_id not in bid_to_bin_labels:
        missings.append(row)
        continue
    y_train[row] = bid_to_bin_labels[business_id]

In [15]:
# Number of photos whose business has no binarized labels.
len(missings)

297

In [16]:
# Row indices that do have labels: every feature row minus those recorded in
# `missings`. Sorted explicitly so the row order of the filtered arrays is
# deterministic and does not depend on set iteration order.
presents = sorted(set(range(gnet_train.shape[0])) - set(missings))

In [17]:
# Number of rows kept (total rows minus the missing ones).
len(presents)

234545

In [18]:
# Keep only the labelled rows, using the same index list for features and
# targets so the two arrays stay row-aligned.
# NOTE(review): y_train is rebound to its filtered version here, so this cell
# is not idempotent — re-running it without first re-running the cell that
# builds y_train would index the already-filtered array.
X_train, y_train = gnet_train[presents], y_train[presents]

In [19]:
# Sanity check: shape of the filtered feature matrix.
X_train.shape

(234545, 1024)

In [20]:
# Sanity check: shape of the filtered target matrix (row count should match X_train).
y_train.shape

(234545, 9)

In [21]:
# Persist the filtered feature matrix next to the source features.
np.save('../data/models/features/googlenet/X_train.npy', X_train)

In [22]:
# Persist the matching target matrix (same row order as X_train.npy).
np.save('../data/models/features/googlenet/y_train.npy', y_train)

In [99]:
# Number of k-means clusters; passed to MiniBatchKMeans and used to size scores_labels.
n_clusters = 250

In [100]:
# Mini-batch k-means estimator; random_state is fixed so the clustering is repeatable.
kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=256, random_state=42)

In [101]:
from sklearn.preprocessing import StandardScaler

In [102]:
# Feature scaler with default settings (per-feature standardization).
scaler = StandardScaler()

In [103]:
# Fit the scaler on the training features and keep the scaled copy for clustering.
X_train_s = scaler.fit_transform(X_train)

In [104]:
# Fit k-means on the scaled features; clusters[i] is the cluster id assigned to row i.
clusters = kmeans.fit_predict(X_train_s)

In [105]:
# Per-cluster label frequencies: scores_labels[k, j] is the fraction of the
# instances assigned to cluster k whose label j is set in y_train.
cluster_sizes = np.bincount(clusters, minlength=n_clusters)
scores_labels = np.zeros((n_clusters, y_train.shape[1]))
# Unbuffered scatter-add: every row of y_train is accumulated into the row of
# its cluster, so repeated cluster ids are all counted.
np.add.at(scores_labels, clusters, y_train)
for i, size in enumerate(cluster_sizes):
    print("Cluster {c}: {n}".format(c=i, n=size))
# Normalise only non-empty clusters: an empty cluster keeps an all-zero row
# instead of producing 0/0 = NaN (and a RuntimeWarning).
non_empty = cluster_sizes > 0
scores_labels[non_empty] /= cluster_sizes[non_empty, np.newaxis]

Cluster 0: 2435
Cluster 1: 3292
Cluster 2: 2324
Cluster 3: 895
Cluster 4: 1733
Cluster 5: 3360
Cluster 6: 2
Cluster 7: 1714
Cluster 8: 2754
Cluster 9: 2952
Cluster 10: 410
Cluster 11: 2090
Cluster 12: 1051
Cluster 13: 1700
Cluster 14: 1734
Cluster 15: 2075
Cluster 16: 5
Cluster 17: 2
Cluster 18: 12
Cluster 19: 1
Cluster 20: 3553
Cluster 21: 2213
Cluster 22: 24
Cluster 23: 2
Cluster 24: 415
Cluster 25: 1057
Cluster 26: 1965
Cluster 27: 2489
Cluster 28: 2302
Cluster 29: 3
Cluster 30: 15
Cluster 31: 1
Cluster 32: 1667
Cluster 33: 3912
Cluster 34: 2152
Cluster 35: 2103
Cluster 36: 4
Cluster 37: 4054
Cluster 38: 2640
Cluster 39: 764
Cluster 40: 5
Cluster 41: 328
Cluster 42: 2
Cluster 43: 3
Cluster 44: 2621
Cluster 45: 2
Cluster 46: 1
Cluster 47: 3930
Cluster 48: 1579
Cluster 49: 270
Cluster 50: 830
Cluster 51: 1782
Cluster 52: 3909
Cluster 53: 9
Cluster 54: 5
Cluster 55: 2729
Cluster 56: 3679
Cluster 57: 19
Cluster 58: 1
Cluster 59: 1083
Cluster 60: 4
Cluster 61: 2
Cluster 62: 364
Cluster 6

In [106]:
# Start the per-instance labels from an independent copy of the business-level targets.
y_per_instances = y_train.copy()

In [107]:
# Initialise the per-instance labels: -1 wherever the business-level label is
# set, 0 elsewhere. The refinement loop further down promotes some of the -1
# entries to 1 based on the cluster statistics.
# Derived from y_train (not from y_per_instances itself) so that re-running
# this cell is idempotent; a self-referential update would turn every -1 into
# 0 on the second run.
y_per_instances = np.where(y_train == 1, -1, 0)

In [108]:
# A cluster's label frequency must exceed this for a business-level positive
# to be assigned to the instance as 1 (see the refinement loop below).
threshold_up = 0.8

In [109]:
# NOTE(review): not referenced by any cell visible in this notebook section;
# presumably intended as a lower cut-off on cluster label frequency — TODO confirm or remove.
threshold_down = 0.1

In [117]:
# Refine the per-instance labels with the cluster statistics.
# For every (instance, label) pair whose business-level label is set:
#   *  1 if the label's frequency in the instance's cluster exceeds threshold_up,
#   * -1 otherwise.
# Pairs whose business-level label is 0 are left untouched.
# Rows of y_per_instances are always addressed by instance; the cluster id is
# only used to look up the matching row of scores_labels. Writing by cluster
# id would overwrite the labels of unrelated instances in the first
# n_clusters rows.
business_positive = y_train != 0
cluster_confident = scores_labels[clusters] > threshold_up
y_per_instances[business_positive & cluster_confident] = 1
y_per_instances[business_positive & ~cluster_confident] = -1

In [130]:
# Number of instances whose per-instance label at column index 8 ended up as 1.
np.count_nonzero(y_per_instances[:, 8] == 1)

9775