In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
%matplotlib inline

In [13]:
# Load the precomputed GoogLeNet feature matrix for the training photos.
# Rows are later looked up by a photo's position in ../data/train.list
# (see pid_to_index), so the file is assumed to be in that same order -- TODO confirm.
gnet_all = np.load('../data/models/features/googlenet/train_pool5_7x7_s1.npy')

In [3]:
# Peek at the first entries of the training photo list to see the line format.
!head -n 4 ../data/train.list

train_photos_256x256_noise/262212.jpg
train_photos_256x256_noise/333665.jpg
train_photos_256x256_noise/421372.jpg
train_photos_256x256_noise/394322.jpg


In [4]:
# Photo ids in the order they appear in train.list. The position of an id in
# this list is later used as its row index into gnet_all.
all_pids = []
with open('../data/train.list', 'r') as lfile:
    for line in lfile:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on int('').
            continue
        # "some_dir/262212.jpg" -> "262212.jpg" -> "262212"
        filename = os.path.basename(line)
        pid_str = os.path.splitext(filename)[0]
        all_pids.append(int(pid_str))

In [5]:
# Number of photo ids parsed from train.list.
len(all_pids)

234842

In [6]:
# Peek at the validation list: each line is "<photo file name> <number>".
!head -n 4 ../data/val_caffe.txt

262147.jpg 0
131078.jpg 0
10.jpg 0
393227.jpg 0


In [7]:
# Photo ids listed in val_caffe.txt. Only the first whitespace-separated field
# (the file name) is used; the remaining fields are ignored.
val_pids = []
with open('../data/val_caffe.txt', 'r') as lfile:
    for line in lfile:
        fields = line.split()
        if not fields:
            # Skip blank lines rather than raising IndexError on fields[0].
            continue
        # "262147.jpg" -> "262147"
        pid_str = os.path.splitext(fields[0])[0]
        val_pids.append(int(pid_str))

In [10]:
# Map each photo id to its position in all_pids (if an id repeats, the last
# position wins).
pid_to_index = dict((pid, row) for row, pid in enumerate(all_pids))

In [16]:
# Per-business table; the business_id and labels columns are used further down.
train_df = pd.read_csv('../data/raw/train.csv')

In [17]:
# Photo -> business assignment table (photo_id and business_id columns are used below).
train_photo_to_biz = pd.read_csv('../data/raw/train_photo_to_biz_ids.csv')

In [18]:
# photo_id -> business_id lookup. If a photo_id occurs on several rows, only
# the last row's business_id is kept.
train_photo_id_to_biz_id = dict(zip(train_photo_to_biz.photo_id, train_photo_to_biz.business_id))

In [22]:
# Split businesses by where their photos fall: a business goes to val_bids if
# any of its photos is in the validation list, and to train_bids if any of its
# photos is not. A business with photos on both sides ends up in both sets.
sval_pids = set(val_pids)
val_bids = set(
    bid for pid, bid in train_photo_id_to_biz_id.items() if pid in sval_pids
)
train_bids = set(
    bid for pid, bid in train_photo_id_to_biz_id.items() if pid not in sval_pids
)

In [24]:
from collections import defaultdict

In [25]:
# Invert the photo -> business mapping: business_id -> list of all its photo ids.
bid_to_pids = defaultdict(list)
for photo_id, business_id in train_photo_id_to_biz_id.items():
    bid_to_pids[business_id] += [photo_id]

In [26]:
# Sanity check: the row count should match len(all_pids), since rows are
# addressed through pid_to_index.
gnet_all.shape

(234842, 1024)

In [30]:
# One row per training business. The width is taken from the feature matrix
# rather than hard-coded, so it stays correct if another feature file is loaded.
X_train = np.zeros((len(train_bids), gnet_all.shape[1]))

In [31]:
# Business-level feature = mean of the photo-level features of all photos
# attached to that business. Row i corresponds to the i-th business yielded by
# iterating train_bids; the label matrix must be built with the same iteration.
for i, bid in enumerate(train_bids):
    pids = bid_to_pids[bid]
    feature_sum = np.zeros_like(X_train[i])
    for pid in pids:
        feature_sum += gnet_all[pid_to_index[pid]]
    # Assign instead of accumulating into X_train in place, so re-running this
    # cell gives the same result even if X_train was not reset to zeros first.
    X_train[i] = feature_sum / len(pids)

In [33]:
# One row per validation business. The width is taken from the feature matrix
# rather than hard-coded, so it stays correct if another feature file is loaded.
X_val = np.zeros((len(val_bids), gnet_all.shape[1]))

In [34]:
# Business-level feature = mean of the photo-level features of all photos
# attached to that business. Row i corresponds to the i-th business yielded by
# iterating val_bids; the label matrix must be built with the same iteration.
for i, bid in enumerate(val_bids):
    pids = bid_to_pids[bid]
    feature_sum = np.zeros_like(X_val[i])
    for pid in pids:
        feature_sum += gnet_all[pid_to_index[pid]]
    # Assign instead of accumulating into X_val in place, so re-running this
    # cell gives the same result even if X_val was not reset to zeros first.
    X_val[i] = feature_sum / len(pids)

In [35]:
# Eyeball the averaged validation features.
X_val

array([[ 0.95267509,  2.03563903,  0.15004862, ...,  0.49960505,
         0.28688758,  0.3049866 ],
       [ 1.18332484,  1.24298283,  0.27974792, ...,  0.6903441 ,
         0.22187185,  0.55064937],
       [ 0.81671498,  0.67376542,  0.54582791, ...,  0.77234137,
         0.15175865,  0.53816518],
       ..., 
       [ 0.95106818,  1.26992639,  0.14772708, ...,  0.61699631,
         0.38141354,  0.32525882],
       [ 0.84371509,  2.06388409,  0.35485263, ...,  0.40849259,
         0.80597208,  0.68059153],
       [ 0.95197373,  1.4948377 ,  0.45678396, ...,  0.71561163,
         0.41972236,  0.73768763]])

In [37]:
# Drop rows with any missing value. Businesses removed here get no entry in the
# label dictionaries below and therefore keep an all-zero label row.
train_df_cleaned = train_df.dropna()

In [38]:
# business_id -> raw label string (whitespace-separated integers, parsed in the next step).
biz_id_to_labels_str = dict(zip(train_df_cleaned.business_id, train_df_cleaned['labels']))

In [39]:
# business_id -> list of integer labels parsed from the whitespace-separated string.
biz_id_to_labels = {
    biz_id: [int(token) for token in labels_str.split()]
    for biz_id, labels_str in biz_id_to_labels_str.items()
}

In [43]:
def OHE(labels, size=9):
    """Encode a collection of integer labels as a multi-hot vector.

    Parameters
    ----------
    labels : iterable of int
        Label indices to switch on. Each value is used directly as an index
        into the result, so it must be valid for an array of length ``size``.
    size : int, optional
        Length of the returned vector (default 9).

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``size`` holding 1 at every index listed in
        ``labels`` and 0 elsewhere.

    Raises
    ------
    IndexError
        If a label is out of bounds for an array of length ``size``.
    """
    # Honour the ``size`` argument; the length used to be fixed at 9.
    ohe = np.zeros((size,))
    for l in labels:
        ohe[l] = 1
    return ohe

In [44]:
# business_id -> multi-hot label vector (OHE with its default length).
biz_id_to_ohe_labels = dict(
    (biz_id, OHE(labels)) for biz_id, labels in biz_id_to_labels.items()
)

In [46]:
# Label matrix aligned with X_train: same iteration over train_bids, one
# multi-hot row per business. Businesses without a label entry stay all-zero.
y_train = np.zeros((len(train_bids), 9))
for row, bid in enumerate(train_bids):
    encoded = biz_id_to_ohe_labels.get(bid)
    if encoded is not None:
        y_train[row] = encoded

In [48]:
# Label matrix aligned with X_val: same iteration over val_bids, one multi-hot
# row per business. Businesses without a label entry stay all-zero.
y_val = np.zeros((len(val_bids), 9))
for row, bid in enumerate(val_bids):
    encoded = biz_id_to_ohe_labels.get(bid)
    if encoded is not None:
        y_val[row] = encoded

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
# (number of training businesses, feature width)
X_train.shape

(1400, 1024)

In [54]:
from sklearn.preprocessing import StandardScaler

In [55]:
# Feature standardizer; it is fitted on the training matrix only (next cell).
scaler = StandardScaler()

In [56]:
# Learn the scaling statistics from the training businesses and apply them.
X_train_s = scaler.fit_transform(X_train)

In [57]:
# Apply the already-fitted training statistics to validation (no refit).
X_val_s = scaler.transform(X_val)

In [60]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm

In [61]:
# One binary linear SVM per label column, with SVC's default C.
# NOTE(review): `clf` is rebound by every model cell that follows, so which
# model a later `clf` refers to depends on cell execution order.
clf = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))

In [68]:
# C sweep, step 1 of 4 (C=0.1): one-vs-rest linear SVM fitted on the
# standardized training features and scored on the validation businesses.
clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=0.1, probability=True))
clf.fit(X_train_s, y_train)
y_pred_val = clf.predict(X_val_s)
# Single-argument print(...) runs on both Python 2 and 3; the two spaces after
# the colon keep the output identical to the former Python 2 print statement.
print("F1 score:  %s" % (f1_score(y_val, y_pred_val, average='micro'),))
print("Individual Class F1 score:  %s" % (f1_score(y_val, y_pred_val, average=None),))

F1 score:  0.780628979356
Individual Class F1 score:  [ 0.58653846  0.79029463  0.82101806  0.608       0.71597633  0.85170068
  0.9005102   0.6626506   0.87614081]


In [69]:
# C sweep, step 2 of 4 (C=0.01): one-vs-rest linear SVM fitted on the
# standardized training features and scored on the validation businesses.
clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=0.01, probability=True))
clf.fit(X_train_s, y_train)
y_pred_val = clf.predict(X_val_s)
# Single-argument print(...) runs on both Python 2 and 3; the two spaces after
# the colon keep the output identical to the former Python 2 print statement.
print("F1 score:  %s" % (f1_score(y_val, y_pred_val, average='micro'),))
print("Individual Class F1 score:  %s" % (f1_score(y_val, y_pred_val, average=None),))

F1 score:  0.811207729469
Individual Class F1 score:  [ 0.65508685  0.80743243  0.86522463  0.65506329  0.76646707  0.8714479
  0.91342535  0.72        0.87765957]


In [70]:
# C sweep, step 3 of 4 (C=0.001): one-vs-rest linear SVM fitted on the
# standardized training features and scored on the validation businesses.
# In the recorded run this setting has the highest micro-averaged F1 of the
# four C values tried.
clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=0.001, probability=True))
clf.fit(X_train_s, y_train)
y_pred_val = clf.predict(X_val_s)
# Single-argument print(...) runs on both Python 2 and 3; the two spaces after
# the colon keep the output identical to the former Python 2 print statement.
print("F1 score:  %s" % (f1_score(y_val, y_pred_val, average='micro'),))
print("Individual Class F1 score:  %s" % (f1_score(y_val, y_pred_val, average=None),))

F1 score:  0.823967103975
Individual Class F1 score:  [ 0.71611253  0.82393162  0.8836425   0.6369637   0.78881988  0.87242798
  0.92288557  0.72222222  0.88977424]


In [71]:
# C sweep, step 4 of 4 (C=0.0001): one-vs-rest linear SVM fitted on the
# standardized training features and scored on the validation businesses.
clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=0.0001, probability=True))
clf.fit(X_train_s, y_train)
y_pred_val = clf.predict(X_val_s)
# Single-argument print(...) runs on both Python 2 and 3; the two spaces after
# the colon keep the output identical to the former Python 2 print statement.
print("F1 score:  %s" % (f1_score(y_val, y_pred_val, average='micro'),))
print("Individual Class F1 score:  %s" % (f1_score(y_val, y_pred_val, average=None),))

F1 score:  0.801365736092
Individual Class F1 score:  [ 0.63247863  0.83130435  0.88624788  0.54611212  0.70103093  0.87213997
  0.90754258  0.67346939  0.8804205 ]


In [63]:
# Predict validation labels with the current `clf`.
# NOTE(review): this cell's execution count is lower than that of the C-sweep
# cells placed above it; on a top-to-bottom run `clf` is whichever model was
# assigned last above, and this cell must come after that model has been fitted.
y_pred_val = clf.predict(X_val_s)

In [65]:
# Expect one row per validation business and one column per label.
y_pred_val.shape

(600, 9)

In [66]:
from sklearn.metrics import f1_score

In [67]:
# Report micro-averaged and per-label F1 for the current validation predictions.
# The two spaces after each colon match what the comma-separated print
# statement used to emit.
print("F1 score:  %s" % (f1_score(y_val, y_pred_val, average='micro'),))
print("Individual Class F1 score:  %s" % (f1_score(y_val, y_pred_val, average=None),))

F1 score:  0.776665383134
Individual Class F1 score:  [ 0.5915493   0.77796902  0.80788177  0.61066236  0.71597633  0.85170068
  0.9005102   0.65256798  0.8690013 ]


In [50]:
# Baseline: random forest with library defaults. This rebinds `clf`, replacing
# whichever SVM it held before.
# NOTE(review): no random_state is passed (the recorded repr shows
# random_state=None), so results vary from run to run -- consider fixing a seed.
clf = RandomForestClassifier()

In [51]:
# Fit on the raw (unstandardized) business features and the multi-hot labels.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [52]:
# Score on the raw validation features.
# NOTE(review): with a multi-label target this is presumably exact-match
# (all labels correct) accuracy, so it is not comparable to the F1 figures
# reported for the SVMs -- confirm against the scikit-learn docs.
clf.score(X_val, y_val)

0.13166666666666665

In [47]:
# Eyeball the multi-hot training label matrix.
y_train

array([[ 1.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  1.,  0.],
       [ 1.,  1.,  0., ...,  1.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  1.,  0.]])