In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [2]:
gnet_all = np.load('../data/models/features/caffenet/train_fc7.npy')

In [3]:
!head -n 4 ../data/train.list

train_photos_256x256_noise/262212.jpg
train_photos_256x256_noise/333665.jpg
train_photos_256x256_noise/421372.jpg
train_photos_256x256_noise/394322.jpg


In [4]:
all_pids = []
with open('../data/train.list', 'r') as lfile:
    for line in lfile:
        _, filename = os.path.split(line)
        pid_str, _ = os.path.splitext(filename)
        all_pids.append(int(pid_str))

In [5]:
len(all_pids)

234842

In [6]:
!head -n 4 ../data/val_caffe.txt

262147.jpg 0
131078.jpg 0
10.jpg 0
393227.jpg 0


In [7]:
val_pids = []
with open('../data/val_caffe.txt', 'r') as lfile:
    for line in lfile:
        pid_str, _ = os.path.splitext(line.split()[0])
        val_pids.append(int(pid_str))

In [8]:
pid_to_index = {pid: i for i, pid in enumerate(all_pids)}

In [9]:
train_df = pd.read_csv('../data/raw/train.csv')

In [10]:
train_photo_to_biz = pd.read_csv('../data/raw/train_photo_to_biz_ids.csv')

In [11]:
train_photo_id_to_biz_id = dict(zip(train_photo_to_biz.photo_id, train_photo_to_biz.business_id))

In [12]:
val_bids = set()
train_bids = set()
sval_pids = set(val_pids) 
for pid, bid in train_photo_id_to_biz_id.items():
    if pid in sval_pids:
        val_bids.add(bid)
    else:
        train_bids.add(bid)

In [13]:
from collections import defaultdict

In [14]:
bid_to_pids = defaultdict(list)
for pid, bid in train_photo_id_to_biz_id.items():
    bid_to_pids[bid].append(pid)

In [15]:
gnet_all.shape

(234842, 4096)

In [16]:
X_train = np.zeros((len(train_bids), 4096))

In [17]:
for i, bid in enumerate(train_bids):
    for pid in bid_to_pids[bid]:
        index = pid_to_index[pid]
        X_train[i] += gnet_all[index]
    X_train[i] /= len(bid_to_pids[bid])

In [18]:
X_val = np.zeros((len(val_bids), 4096))

In [19]:
for i, bid in enumerate(val_bids):
    for pid in bid_to_pids[bid]:
        index = pid_to_index[pid]
        X_val[i] += gnet_all[index]
    X_val[i] /= len(bid_to_pids[bid])

In [20]:
X_val

array([[ 0.26927457,  0.4880292 ,  0.16971176, ...,  0.02643406,
         0.29719473,  0.41695317],
       [ 0.26258598,  0.1271495 ,  0.32735901, ...,  0.08679506,
         0.09604481,  0.11191516],
       [ 0.44470557,  0.53207566,  0.29584659, ...,  0.01935389,
         0.24350455,  0.71762448],
       ..., 
       [ 0.12974235,  0.04550099,  0.19271593, ...,  0.        ,
         0.15098281,  0.17894909],
       [ 0.34340412,  0.10046179,  0.36065254, ...,  0.09029186,
         0.08884564,  0.43177602],
       [ 0.2914922 ,  0.23738647,  0.23275057, ...,  0.18892629,
         0.12190211,  0.7013955 ]])

In [21]:
train_df_cleaned = train_df.dropna()

In [22]:
biz_id_to_labels_str = dict(zip(train_df_cleaned.business_id, train_df_cleaned['labels']))

In [23]:
biz_id_to_labels = dict()
for biz_id, labels_str in biz_id_to_labels_str.items():
    biz_id_to_labels[biz_id] = [int(l) for l in labels_str.split()]

In [24]:
def OHE(labels, size=9):
    ohe = np.zeros((9,))
    for l in labels:
        ohe[l] = 1
    return ohe

In [25]:
biz_id_to_ohe_labels = dict()
for biz_id, labels in biz_id_to_labels.items():
    biz_id_to_ohe_labels[biz_id] = OHE(labels)

In [26]:
y_train = np.zeros((len(train_bids), 9))
for i, bid in enumerate(train_bids):
    if bid in biz_id_to_ohe_labels:
        y_train[i] = biz_id_to_ohe_labels[bid]

In [27]:
y_val = np.zeros((len(val_bids), 9))
for i, bid in enumerate(val_bids):
    if bid in biz_id_to_ohe_labels:
        y_val[i] = biz_id_to_ohe_labels[bid]

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
X_train.shape

(1400, 4096)

In [30]:
from sklearn.preprocessing import StandardScaler

In [31]:
scaler = StandardScaler()

In [32]:
X = np.vstack([X_train, X_val])

In [33]:
y = np.vstack([y_train, y_val])

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [35]:
from sklearn.cross_validation import KFold

In [36]:
best_Cs = np.zeros((9, ))
best_f1s = np.zeros((9, ))
best_thresholds = np.zeros((9, ))

Cs = np.logspace(-5, 1, num=15)
thresholds = np.linspace(0, 1.0, 50)

for att in range(9):
    for C in Cs:
        kfold = KFold(n = X.shape[0], n_folds = 5, shuffle=True)
        
        f1s = [[] for i in range(100)]
        for train_index, test_index in kfold:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index, att], y[test_index, att]
            
            scaler = StandardScaler()
            X_train_s = scaler.fit_transform(X_train)
            
            clf = LogisticRegression(C=C)
            clf.fit(X_train_s, y_train)
            
            X_test_s = scaler.transform(X_test)
            y_probas_test = clf.predict_proba(X_test_s)
            
            for i, threshold in enumerate(thresholds):
                y_pred_test = np.where(y_probas_test[:, 1] > threshold, 1, 0)
                f1s[i].append(f1_score(y_test, y_pred_test))
        
        for i, threshold in enumerate(thresholds):
            f1_mean = np.mean(f1s[i])
            if f1_mean > best_f1s[att]:
                best_f1s[att] = f1_mean
                best_Cs[att] = C
                best_thresholds[att] = threshold
                
    print("Best for Attribute: {a}, C: {C}, T: {t}, F1-score: {f1}".format(a=att, C=best_Cs[att],\
            t=best_thresholds[att], f1=best_f1s[att]))

Best for Attribute: 0, C: 0.00138949549437, T: 0.571428571429, F1-score: 0.717293777028
Best for Attribute: 1, C: 0.01, T: 0.34693877551, F1-score: 0.842553089684
Best for Attribute: 2, C: 0.00138949549437, T: 0.408163265306, F1-score: 0.881934979038
Best for Attribute: 3, C: 0.00138949549437, T: 0.30612244898, F1-score: 0.701959318207
Best for Attribute: 4, C: 0.00372759372031, T: 0.571428571429, F1-score: 0.802491284712
Best for Attribute: 5, C: 0.000517947467923, T: 0.285714285714, F1-score: 0.889229418462
Best for Attribute: 6, C: 0.000517947467923, T: 0.265306122449, F1-score: 0.930753656275
Best for Attribute: 7, C: 0.000517947467923, T: 0.551020408163, F1-score: 0.769101167385
Best for Attribute: 8, C: 0.01, T: 0.428571428571, F1-score: 0.887185178992


  'precision', 'predicted', average, warn_for)


In [37]:
best_Cs

array([ 0.0013895 ,  0.01      ,  0.0013895 ,  0.0013895 ,  0.00372759,
        0.00051795,  0.00051795,  0.00051795,  0.01      ])

In [38]:
best_thresholds

array([ 0.57142857,  0.34693878,  0.40816327,  0.30612245,  0.57142857,
        0.28571429,  0.26530612,  0.55102041,  0.42857143])

In [39]:
best_f1s

array([ 0.71729378,  0.84255309,  0.88193498,  0.70195932,  0.80249128,
        0.88922942,  0.93075366,  0.76910117,  0.88718518])

In [40]:
best_Cs

array([ 0.0013895 ,  0.01      ,  0.0013895 ,  0.0013895 ,  0.00372759,
        0.00051795,  0.00051795,  0.00051795,  0.01      ])

In [42]:
kfold = KFold(n = X.shape[0], n_folds = 5, shuffle=True)

f1s = []
f1s_per_class = []
for train_index, test_index in kfold:
    y_pred_test = np.zeros((len(test_index), 9))
    y_test = y[test_index]
    for att in range(9):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index, att]
            
        scaler = StandardScaler()
        X_train_s = scaler.fit_transform(X_train)
            
        clf = LogisticRegression(C=best_Cs[att])
        clf.fit(X_train_s, y_train)
            
        X_test_s = scaler.transform(X_test)
        y_att_probas_test = clf.predict_proba(X_test_s)
        y_att_pred_test = np.where(y_att_probas_test[:, 1] > best_thresholds[att], 1, 0)
        y_pred_test[:, att] = y_att_pred_test
        
    f1s.append(f1_score(y_test, y_pred_test, 'micro'))
    f1s_per_class.append(f1_score(y_test, y_pred_test, average=None))
print("Mean F1: {f1}".format(f1=np.mean(f1s)))

Mean F1: 0.836721425307


In [47]:
np.mean(np.array(f1s_per_class), axis=0)

array([ 0.7033947 ,  0.83991083,  0.88231244,  0.6921303 ,  0.79369311,
        0.88684395,  0.92920953,  0.77316433,  0.88017681])

# Train on full dataset

In [44]:
scaler = StandardScaler()

In [45]:
X_s = scaler.fit_transform(X)

In [46]:
clfs = []
for att in range(9):
    clf = LogisticRegression(C=best_Cs[att])
    clfs.append(clf.fit(X_s, y[:, att]))

# Submission 004

In [48]:
gnet_test = np.load('../data/models/features/caffenet/test_fc7.npy')

In [49]:
gnet_test.shape

(237152, 4096)

In [50]:
!head -n 3 ../data/test.list

test_photos_256x256_noise/459084.jpg
test_photos_256x256_noise/67847.jpg
test_photos_256x256_noise/215999.jpg


In [51]:
test_pid_to_index = dict()
with open('../data/test.list', 'r') as lfile:
    for i, line in enumerate(lfile):
        _, filename = os.path.split(line)
        pid_str, _ = os.path.splitext(filename)
        test_pid_to_index[int(pid_str)] = i

In [52]:
len(test_pid_to_index)

237152

In [53]:
test_photo_to_biz = pd.read_csv('../data/raw/test_photo_to_biz.csv')

In [54]:
def add_to_dict(x, bids_to_pids):
    bids_to_pids[x.business_id].append(x.photo_id)

In [55]:
test_bids_to_pids = defaultdict(list)

In [56]:
_ = test_photo_to_biz.apply(lambda x: add_to_dict(x, test_bids_to_pids), axis=1)

In [57]:
X_test = np.zeros((len(test_bids_to_pids), 4096))
for index, (bid, pids) in enumerate(test_bids_to_pids.items()):
    for pid in pids:
        X_test[index] += gnet_test[test_pid_to_index[pid]]
    X_test[index] /= len(pids)

In [58]:
X_test.shape

(10000, 4096)

In [59]:
X_test_s = scaler.transform(X_test)

In [60]:
y_test = np.zeros((X_test.shape[0], 9))

In [61]:
for att in range(9):
    y_probas_att = clfs[att].predict_proba(X_test_s)
    y_pred_att = np.where(y_probas_att[:, 1] > best_thresholds[att], 1, 0)
    y_test[:, att] = y_pred_att

In [62]:
y_test

array([[ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
       [ 0.,  0.,  1., ...,  1.,  1.,  1.],
       ..., 
       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
       [ 0.,  1.,  1., ...,  1.,  1.,  0.],
       [ 0.,  1.,  0., ...,  1.,  0.,  0.]])

In [63]:
def inverse_ohe(ohe):
    inv = []
    for i, x in enumerate(ohe):
        if x == 1:
            inv.append(i) 
    return inv

In [64]:
test_bid_to_labels = dict()
for index, (bid, _) in enumerate(test_bids_to_pids.items()):
    test_bid_to_labels[bid] = inverse_ohe(y_test[index])

In [65]:
test_bid_to_labels

{'af0iy': [3, 5, 6, 8],
 's4n0d': [1, 2, 3, 4, 5, 6, 7, 8],
 's1e3l': [2, 3, 4, 5, 6, 7, 8],
 'yinms': [1, 2, 3, 5, 6, 7, 8],
 '3i26e': [1, 2, 3, 4, 5, 6, 7, 8],
 'rl851': [0, 3, 8],
 '0zrov': [1, 2, 3, 4, 5, 6, 7],
 'ean2c': [1, 2, 3, 5, 6, 7],
 'toisr': [1, 3, 4, 5, 6],
 '74ecy': [3, 5, 6, 8],
 '80dw9': [1, 2, 3, 5, 6, 7],
 'aitij': [1, 2, 3, 5, 6, 7],
 's0m7q': [1, 2, 3, 4, 5, 6, 7, 8],
 '7mcjz': [0, 3, 8],
 'm3s1l': [2, 5, 6, 8],
 'l8fvq': [1, 2, 3, 5, 6],
 'z70r9': [1, 2, 3, 5, 6],
 'ugfqb': [1, 2, 4, 5, 6, 7],
 'tea06': [1, 3, 5, 6, 8],
 'imbxw': [1, 2, 3, 5, 6, 7, 8],
 'zpqo0': [1, 2, 3, 4, 5, 6, 7, 8],
 'os9pk': [1, 2, 3, 4, 5, 6, 7, 8],
 'o7d98': [1, 2, 4, 5, 6, 7],
 '3acri': [1, 2, 3, 4, 5, 6, 7],
 'd7awj': [0, 3, 8],
 '0pthg': [1, 2, 4, 5, 6, 7],
 '8796k': [1, 2, 3, 5, 6, 8],
 'wedp4': [1, 2, 3, 4, 5, 6],
 'l851z': [1, 2, 3, 4, 5, 6, 7],
 'dq7xa': [0, 3, 8],
 'iayad': [1, 2, 3, 5, 6, 7],
 'pgr7c': [2, 3, 5, 6, 8],
 'lqswi': [0, 3, 8],
 '68zwr': [0, 3, 5, 6, 8],
 'nju8j': [1,

In [66]:
with open('../data/submission_004.csv', 'w') as sfile:
    sfile.write('business_id,labels\n')
    for bid, labels in test_bid_to_labels.items():
        labels_str = [str(l) for l in labels]
        line = '{bid},{labels}\n'.format(bid=bid, labels=' '.join(labels_str))
        sfile.write(line)

In [67]:
!head ../data/submission_004.csv

business_id,labels
af0iy,3 5 6 8
s4n0d,1 2 3 4 5 6 7 8
s1e3l,2 3 4 5 6 7 8
yinms,1 2 3 5 6 7 8
3i26e,1 2 3 4 5 6 7 8
rl851,0 3 8
0zrov,1 2 3 4 5 6 7
ean2c,1 2 3 5 6 7
toisr,1 3 4 5 6


In [69]:
!wc -l ../data/submission_004.csv

   10001 ../data/submission_004.csv
