In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [2]:
gnet_all = np.load('../data/models/features/googlenet/train_pool5_7x7_s1.npy')

In [3]:
!head -n 4 ../data/train.list

train_photos_256x256_noise/262212.jpg
train_photos_256x256_noise/333665.jpg
train_photos_256x256_noise/421372.jpg
train_photos_256x256_noise/394322.jpg


In [4]:
all_pids = []
with open('../data/train.list', 'r') as lfile:
    for line in lfile:
        _, filename = os.path.split(line)
        pid_str, _ = os.path.splitext(filename)
        all_pids.append(int(pid_str))

In [5]:
len(all_pids)

234842

In [6]:
!head -n 4 ../data/val_caffe.txt

262147.jpg 0
131078.jpg 0
10.jpg 0
393227.jpg 0


In [7]:
val_pids = []
with open('../data/val_caffe.txt', 'r') as lfile:
    for line in lfile:
        pid_str, _ = os.path.splitext(line.split()[0])
        val_pids.append(int(pid_str))

In [8]:
pid_to_index = {pid: i for i, pid in enumerate(all_pids)}

In [9]:
train_df = pd.read_csv('../data/raw/train.csv')

In [10]:
train_photo_to_biz = pd.read_csv('../data/raw/train_photo_to_biz_ids.csv')

In [11]:
train_photo_id_to_biz_id = dict(zip(train_photo_to_biz.photo_id, train_photo_to_biz.business_id))

In [12]:
val_bids = set()
train_bids = set()
sval_pids = set(val_pids) 
for pid, bid in train_photo_id_to_biz_id.items():
    if pid in sval_pids:
        val_bids.add(bid)
    else:
        train_bids.add(bid)

In [13]:
from collections import defaultdict

In [14]:
bid_to_pids = defaultdict(list)
for pid, bid in train_photo_id_to_biz_id.items():
    bid_to_pids[bid].append(pid)

In [15]:
gnet_all.shape

(234842, 1024)

In [16]:
X_train = np.zeros((len(train_bids), 1024))

In [17]:
for i, bid in enumerate(train_bids):
    for pid in bid_to_pids[bid]:
        index = pid_to_index[pid]
        X_train[i] += gnet_all[index]
    X_train[i] /= len(bid_to_pids[bid])

In [18]:
X_val = np.zeros((len(val_bids), 1024))

In [19]:
for i, bid in enumerate(val_bids):
    for pid in bid_to_pids[bid]:
        index = pid_to_index[pid]
        X_val[i] += gnet_all[index]
    X_val[i] /= len(bid_to_pids[bid])

In [20]:
X_val

array([[ 0.95267509,  2.03563903,  0.15004862, ...,  0.49960505,
         0.28688758,  0.3049866 ],
       [ 1.18332484,  1.24298283,  0.27974792, ...,  0.6903441 ,
         0.22187185,  0.55064937],
       [ 0.81671498,  0.67376542,  0.54582791, ...,  0.77234137,
         0.15175865,  0.53816518],
       ..., 
       [ 0.95106818,  1.26992639,  0.14772708, ...,  0.61699631,
         0.38141354,  0.32525882],
       [ 0.84371509,  2.06388409,  0.35485263, ...,  0.40849259,
         0.80597208,  0.68059153],
       [ 0.95197373,  1.4948377 ,  0.45678396, ...,  0.71561163,
         0.41972236,  0.73768763]])

In [21]:
train_df_cleaned = train_df.dropna()

In [22]:
biz_id_to_labels_str = dict(zip(train_df_cleaned.business_id, train_df_cleaned['labels']))

In [23]:
biz_id_to_labels = dict()
for biz_id, labels_str in biz_id_to_labels_str.items():
    biz_id_to_labels[biz_id] = [int(l) for l in labels_str.split()]

In [24]:
def OHE(labels, size=9):
    ohe = np.zeros((9,))
    for l in labels:
        ohe[l] = 1
    return ohe

In [25]:
biz_id_to_ohe_labels = dict()
for biz_id, labels in biz_id_to_labels.items():
    biz_id_to_ohe_labels[biz_id] = OHE(labels)

In [26]:
y_train = np.zeros((len(train_bids), 9))
for i, bid in enumerate(train_bids):
    if bid in biz_id_to_ohe_labels:
        y_train[i] = biz_id_to_ohe_labels[bid]

In [27]:
y_val = np.zeros((len(val_bids), 9))
for i, bid in enumerate(val_bids):
    if bid in biz_id_to_ohe_labels:
        y_val[i] = biz_id_to_ohe_labels[bid]

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
X_train.shape

(1400, 1024)

In [30]:
from sklearn.preprocessing import StandardScaler

In [31]:
scaler = StandardScaler()

In [32]:
X = np.vstack([X_train, X_val])

In [33]:
y = np.vstack([y_train, y_val])

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [35]:
from sklearn.cross_validation import KFold

In [45]:
best_Cs = np.zeros((9, ))
best_f1s = np.zeros((9, ))
best_thresholds = np.zeros((9, ))

Cs = np.logspace(-5, 1, num=20)
thresholds = np.linspace(0, 1.0, 100)

for att in range(9):
    for C in Cs:
        kfold = KFold(n = X.shape[0], n_folds = 5, shuffle=True)
        
        f1s = [[] for i in range(100)]
        for train_index, test_index in kfold:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index, att], y[test_index, att]
            
            scaler = StandardScaler()
            X_train_s = scaler.fit_transform(X_train)
            
            clf = LogisticRegression(C=C)
            clf.fit(X_train_s, y_train)
            
            X_test_s = scaler.transform(X_test)
            y_probas_test = clf.predict_proba(X_test_s)
            
            for i, threshold in enumerate(thresholds):
                y_pred_test = np.where(y_probas_test[:, 1] > threshold, 1, 0)
                f1s[i].append(f1_score(y_test, y_pred_test))
        
        for i, threshold in enumerate(thresholds):
            f1_mean = np.mean(f1s[i])
            if f1_mean > best_f1s[att]:
                best_f1s[att] = f1_mean
                best_Cs[att] = C
                best_thresholds[att] = threshold
                
    print("Best for Attribute: {a}, C: {C}, T: {t}, F1-score: {f1}".format(a=att, C=best_Cs[att],\
            t=best_thresholds[att], f1=best_f1s[att]))

Best for Attribute: 0, C: 0.00695192796178, T: 0.454545454545, F1-score: 0.72296570164
Best for Attribute: 1, C: 0.00335981828628, T: 0.444444444444, F1-score: 0.842264307441
Best for Attribute: 2, C: 0.00335981828628, T: 0.383838383838, F1-score: 0.878912664886
Best for Attribute: 3, C: 0.00162377673919, T: 0.353535353535, F1-score: 0.715042052067
Best for Attribute: 4, C: 0.0143844988829, T: 0.565656565657, F1-score: 0.797506275009
Best for Attribute: 5, C: 0.00335981828628, T: 0.393939393939, F1-score: 0.894731625094
Best for Attribute: 6, C: 0.00695192796178, T: 0.383838383838, F1-score: 0.933335663406
Best for Attribute: 7, C: 0.00162377673919, T: 0.515151515152, F1-score: 0.77463203847
Best for Attribute: 8, C: 0.00162377673919, T: 0.373737373737, F1-score: 0.895950993423


In [46]:
best_Cs

array([ 0.00695193,  0.00335982,  0.00335982,  0.00162378,  0.0143845 ,
        0.00335982,  0.00695193,  0.00162378,  0.00162378])

In [47]:
best_thresholds

array([ 0.45454545,  0.44444444,  0.38383838,  0.35353535,  0.56565657,
        0.39393939,  0.38383838,  0.51515152,  0.37373737])

In [48]:
best_f1s

array([ 0.7229657 ,  0.84226431,  0.87891266,  0.71504205,  0.79750628,
        0.89473163,  0.93333566,  0.77463204,  0.89595099])

In [103]:
best_Cs

array([ 0.00335982,  0.00335982,  0.00335982,  0.00335982,  0.0143845 ,
        0.0143845 ,  0.00695193,  0.00695193,  0.00335982])

In [None]:
best_f1s

In [104]:
best_f1s

array([ 0.7219946 ,  0.83872189,  0.87628017,  0.67585163,  0.79954874,
        0.88640654,  0.92450377,  0.77203958,  0.88446557])

In [53]:
kfold = KFold(n = X.shape[0], n_folds = 5, shuffle=True)

f1s = []
f1s_per_class = []
for train_index, test_index in kfold:
    y_pred_test = np.zeros((len(test_index), 9))
    y_test = y[test_index]
    for att in range(9):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index, att]
            
        scaler = StandardScaler()
        X_train_s = scaler.fit_transform(X_train)
            
        clf = LogisticRegression(C=best_Cs[att])
        clf.fit(X_train_s, y_train)
            
        X_test_s = scaler.transform(X_test)
        y_att_probas_test = clf.predict_proba(X_test_s)
        y_att_pred_test = np.where(y_att_probas_test[:, 1] > best_thresholds[att], 1, 0)
        y_pred_test[:, att] = y_att_pred_test
        
    f1s.append(f1_score(y_test, y_pred_test, 'micro'))
    f1s_per_class.append(f1_score(y_test, y_pred_test, average=None))
print("Mean F1: {f1}".format(f1=np.mean(f1s)))

Mean F1: 0.841043820468


In [54]:
np.mean(np.array(f1s_per_class), axis=0)

array([ 0.71264766,  0.84480466,  0.86996249,  0.70162426,  0.7988557 ,
        0.8903407 ,  0.93257989,  0.77235747,  0.89410482])

# Train on full dataset

In [55]:
scaler = StandardScaler()

In [56]:
X_s = scaler.fit_transform(X)

In [57]:
clfs = []
for att in range(9):
    clf = LogisticRegression(C=best_Cs[att])
    clfs.append(clf.fit(X_s, y[:, att]))

# Submission 002

In [58]:
gnet_test = np.load('../data/models/features/googlenet/test_pool5_7x7_s1.npy')

In [59]:
gnet_test.shape

(237152, 1024)

In [60]:
!head -n 3 ../data/test.list

test_photos_256x256_noise/459084.jpg
test_photos_256x256_noise/67847.jpg
test_photos_256x256_noise/215999.jpg


In [61]:
test_pid_to_index = dict()
with open('../data/test.list', 'r') as lfile:
    for i, line in enumerate(lfile):
        _, filename = os.path.split(line)
        pid_str, _ = os.path.splitext(filename)
        test_pid_to_index[int(pid_str)] = i

In [62]:
len(test_pid_to_index)

237152

In [63]:
test_photo_to_biz = pd.read_csv('../data/raw/test_photo_to_biz.csv')

In [64]:
def add_to_dict(x, bids_to_pids):
    bids_to_pids[x.business_id].append(x.photo_id)

In [65]:
test_bids_to_pids = defaultdict(list)

In [66]:
_ = test_photo_to_biz.apply(lambda x: add_to_dict(x, test_bids_to_pids), axis=1)

In [67]:
X_test = np.zeros((len(test_bids_to_pids), 1024))
for index, (bid, pids) in enumerate(test_bids_to_pids.items()):
    for pid in pids:
        X_test[index] += gnet_test[test_pid_to_index[pid]]
    X_test[index] /= len(pids)

In [68]:
X_test.shape

(10000, 1024)

In [69]:
X_test_s = scaler.transform(X_test)

In [70]:
y_test = np.zeros((X_test.shape[0], 9))

In [77]:
for att in range(9):
    y_probas_att = clfs[att].predict_proba(X_test_s)
    y_pred_att = np.where(y_probas_att[:, 1] > best_thresholds[att], 1, 0)
    y_test[:, att] = y_pred_att

In [78]:
y_test

array([[ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  1.,  0.],
       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
       ..., 
       [ 1.,  1.,  1., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  1.,  0.],
       [ 0.,  1.,  1., ...,  1.,  1.,  0.]])

In [79]:
def inverse_ohe(ohe):
    inv = []
    for i, x in enumerate(ohe):
        if x == 1:
            inv.append(i) 
    return inv

In [80]:
test_bid_to_labels = dict()
for index, (bid, _) in enumerate(test_bids_to_pids.items()):
    test_bid_to_labels[bid] = inverse_ohe(y_test[index])

In [81]:
test_bid_to_labels

{'af0iy': [3, 5, 6, 8],
 's4n0d': [1, 2, 3, 4, 5, 6, 7],
 's1e3l': [1, 2, 3, 4, 5, 6, 7, 8],
 'yinms': [1, 2, 3, 5, 6, 7, 8],
 '3i26e': [1, 2, 3, 5, 6, 8],
 'rl851': [0, 3, 8],
 '0zrov': [1, 2, 3, 5, 6, 7],
 'ean2c': [1, 2, 3, 5, 6, 7],
 'toisr': [1, 2, 3, 5, 6, 7, 8],
 '74ecy': [1, 2, 3, 5, 6, 8],
 '80dw9': [1, 2, 3, 4, 5, 6, 7],
 'aitij': [1, 2, 3, 5, 6, 7],
 's0m7q': [1, 2, 3, 4, 5, 6, 7],
 '7mcjz': [0, 3, 8],
 'm3s1l': [0, 3, 5, 6, 8],
 'l8fvq': [1, 2, 3, 5, 6, 7],
 'z70r9': [1, 2, 3, 5, 6, 8],
 'ugfqb': [1, 2, 3, 4, 5, 6, 8],
 'tea06': [1, 2, 6, 8],
 'imbxw': [1, 2, 3, 5, 6, 8],
 'zpqo0': [1, 2, 3, 5, 6, 8],
 'os9pk': [1, 2, 3, 5, 6, 8],
 'o7d98': [1, 2, 3, 4, 5, 6, 7],
 '3acri': [1, 2, 3, 4, 5, 6, 7],
 'd7awj': [0, 3, 8],
 '0pthg': [5],
 '8796k': [1, 2, 3, 5, 6, 7, 8],
 'wedp4': [1, 2, 3, 5, 6, 7],
 'l851z': [1, 2, 4, 5, 6, 7],
 'dq7xa': [0, 3, 8],
 'iayad': [1, 2, 3, 5, 6, 8],
 'pgr7c': [1, 2, 3, 5, 6, 8],
 'lqswi': [0, 3, 8],
 '68zwr': [0, 8],
 'nju8j': [1, 2, 3, 5, 6, 8],
 '1f

In [84]:
with open('../data/submission_003.csv', 'w') as sfile:
    sfile.write('business_id,labels\n')
    for bid, labels in test_bid_to_labels.items():
        labels_str = [str(l) for l in labels]
        line = '{bid},{labels}\n'.format(bid=bid, labels=' '.join(labels_str))
        sfile.write(line)

In [85]:
!head ../data/submission_003.csv

business_id,labels
af0iy,3 5 6 8
s4n0d,1 2 3 4 5 6 7
s1e3l,1 2 3 4 5 6 7 8
yinms,1 2 3 5 6 7 8
3i26e,1 2 3 5 6 8
rl851,0 3 8
0zrov,1 2 3 5 6 7
ean2c,1 2 3 5 6 7
toisr,1 2 3 5 6 7 8


In [86]:
!wc -l ../data/submission_003.csv

   10001 ../data/submission_003.csv
