In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [2]:
train_df = pd.read_csv('../data/raw/train.csv')

In [3]:
train_photo_to_biz = pd.read_csv('../data/raw/train_photo_to_biz_ids.csv')

In [82]:
!wc -l ../data/raw/test_photo_to_biz.csv

 1190226 ../data/raw/test_photo_to_biz.csv


In [4]:
test_photo_to_biz = pd.read_csv('../data/raw/test_photo_to_biz.csv')

In [5]:
attribute_id_to_label = {
    0: 'good_for_lunch',
    1: 'good_for_dinner',
    2: 'takes_reservations',
    3: 'outdoor_seating',
    4: 'restaurant_is_expensive',
    5: 'has_alcohol',
    6: 'has_table_service',
    7: 'ambience_is_classy',
    8: 'good_for_kids'}

In [6]:
train_photo_id_to_biz_id = dict(zip(train_photo_to_biz.photo_id, train_photo_to_biz.business_id))

In [7]:
train_df_cleaned = train_df.dropna()

In [8]:
biz_id_to_labels_str = dict(zip(train_df_cleaned.business_id, train_df_cleaned['labels']))

In [9]:
biz_id_to_labels = dict()
for biz_id, labels_str in biz_id_to_labels_str.items():
    biz_id_to_labels[biz_id] = [int(l) for l in labels_str.split()]

# Load validation photos ids

We want to predict label for biz id, so I will also split the traning set on buisiness id.

In [10]:
!head -n 2 ../data/models/multitask/val_caffe.txt

262147.jpg 0
131078.jpg 0


In [11]:
val_photos_ids = []
with open('../data/models/multitask/val_caffe.txt', 'r') as lfile:
    for line in lfile:
        filename = line.split()[0]
        id_str, _ = os.path.splitext(filename)
        val_photos_ids.append(int(id_str))

In [12]:
len(val_photos_ids)

71640

In [13]:
pid_to_index = {pid: index for index, pid in enumerate(val_photos_ids)}

## Get the set of associated buisiness ids

In [14]:
val_biz_ids = set()
val_pids = set(val_photos_ids) 
for pid, bid in train_photo_id_to_biz_id.items():
    if pid in val_pids:
        val_biz_ids.add(bid)

# Load attribute prediction on validation photos

In [15]:
y_probas_val = np.load('../data/models/multitask/val_caffe_prob_pos_yelp_attributes.npy')

In [16]:
y_probas_val.shape

(71640, 9)

In [17]:
def threshold(y_probas, threshold=0.5):
    return (y_probas > threshold).astype(int)

In [18]:
y_val = threshold(y_probas_val, 0.5)

In [19]:
y_val[0]

array([0, 0, 0, 0, 0, 0, 1, 0, 1])

# Aggregate prediction per business

In [20]:
from collections import defaultdict

In [21]:
biz_id_to_pids = defaultdict(list)

In [22]:
for pid, bid in train_photo_id_to_biz_id.items():
    if pid in val_pids:
        biz_id_to_pids[bid].append(pid)

## Mean probabilty

In [23]:
len(biz_id_to_pids)

600

In [24]:
biz_id_to_index = {bid: index for index, bid in enumerate(biz_id_to_pids.keys())}

In [25]:
y_probas_val_biz = np.zeros((len(biz_id_to_pids), 9))

In [26]:
for bid, pids in biz_id_to_pids.items():
    index = biz_id_to_index[bid]
    for pid in pids:
        y_probas_val_biz[index] += y_probas_val[pid_to_index[pid]]
    y_probas_val_biz[index] /= len(pids)

In [27]:
y_probas_val_biz.shape

(600, 9)

In [28]:
y_val_biz = threshold(y_probas_val_biz, 0.5)

In [29]:
y_val_biz.shape

(600, 9)

## Majority vote

In [30]:
y_val_biz_majority = np.zeros((len(biz_id_to_pids), 9))
for bid, pids in biz_id_to_pids.items():
    index = biz_id_to_index[bid]
    for pid in pids:
        y_val_biz_majority[index] += y_val[pid_to_index[pid]]
    y_val_biz_majority[index] = threshold(y_val_biz_majority[index], len(pids)/2.0)

In [31]:
y_val_biz_majority

array([[ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  0.,  0., ...,  1.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  0.,  0.],
       [ 0.,  1.,  1., ...,  1.,  0.,  1.]])

# Load groundtruth attributes

In [32]:
def OHE(labels, size=9):
    ohe = np.zeros((9))
    for l in labels:
        ohe[l] = 1
    return ohe

In [33]:
y_biz = np.zeros((len(biz_id_to_index), 9))

In [34]:
for bid, index in biz_id_to_index.items():
    labels = biz_id_to_labels[bid]
    y_biz[index] = OHE(labels)

In [35]:
y_biz.shape

(600, 9)

In [36]:
from sklearn.metrics import f1_score

print "F1 score: ", f1_score(y_biz, y_val_biz, average='micro')
print "Individual Class F1 score: ", f1_score(y_biz, y_val_biz, average=None)

F1 score:  0.760438714643
Individual Class F1 score:  [ 0.23293173  0.81116585  0.85585586  0.58519793  0.65454545  0.82798165
  0.82266527  0.50220264  0.90196078]


In [37]:
y_photos = np.zeros((len(val_photos_ids), 9))
for pid, index in pid_to_index.items():
    bid = train_photo_id_to_biz_id[pid]
    labels = biz_id_to_labels[bid]
    y_photos[index] = OHE(labels)

In [38]:
y_photos

array([[ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  0.,  0.],
       [ 1.,  0.,  1., ...,  1.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.]])

In [39]:
len(val_biz_ids)

600

In [40]:
from sklearn.metrics import f1_score

print "F1 score: ", f1_score(y_biz, y_val_biz_majority, average='micro')
print "Individual Class F1 score: ", f1_score(y_biz, y_val_biz_majority, average=None)

F1 score:  0.757285351066
Individual Class F1 score:  [ 0.26190476  0.81407035  0.85235921  0.5428051   0.63736264  0.83138173
  0.82627119  0.49781659  0.90237467]


In [41]:
for t in np.arange(0.4, 0.5, 0.01):
    y_val_biz = threshold(y_probas_val_biz, t)
    print("{t} - F1 score: {f1}".format(t=t, f1=f1_score(y_biz, y_val_biz, average='micro')))

0.4 - F1 score: 0.767171314741
0.41 - F1 score: 0.769032258065
0.42 - F1 score: 0.770537318308
0.43 - F1 score: 0.770173985087
0.44 - F1 score: 0.772283888328
0.45 - F1 score: 0.77144324602
0.46 - F1 score: 0.771837875611
0.47 - F1 score: 0.772305496074
0.48 - F1 score: 0.771679473106
0.49 - F1 score: 0.767991004498


In [42]:
from sklearn.metrics import accuracy_score

In [43]:
for att in range(9):
    acc = accuracy_score(y_photos[:, att], y_val[:, att])
    print("Accuracy for attribue {a}: {acc}".format(a=att, acc=acc))

Accuracy for attribue 0: 0.761069235064
Accuracy for attribue 1: 0.735482970408
Accuracy for attribue 2: 0.763609715243
Accuracy for attribue 3: 0.541834170854
Accuracy for attribue 4: 0.749413735343
Accuracy for attribue 5: 0.773310999442
Accuracy for attribue 6: 0.768174204355
Accuracy for attribue 7: 0.738581797878
Accuracy for attribue 8: 0.741973757677


In [44]:
accuracy_score(y_biz[:, 0], y_val_biz[:, 0])

0.68166666666666664

In [45]:
from sklearn.metrics import recall_score, precision_score

In [46]:
recall_score(y_biz[:, 0], y_val_biz[:, 0])

0.14084507042253522

In [47]:
precision_score(y_biz[:, 0], y_val_biz[:, 0])

0.78947368421052633

In [48]:
f1_score(y_biz[:, 0], y_val_biz[:, 0])

0.2390438247011952

In [49]:
thresholds = np.zeros((9, ))
for att in range(9):
    best = 0
    f1max = 0
    for t in np.arange(0.0, 1.0, 0.01):
        y_val_biz = threshold(y_probas_val_biz, t)
        f1 = f1_score(y_biz[:, att], y_val_biz[:, att])
        if f1 > f1max:
            best = t
            f1max = f1
    thresholds[att]=best

  'precision', 'predicted', average, warn_for)


In [50]:
thresholds

array([ 0.29,  0.49,  0.59,  0.47,  0.39,  0.63,  0.67,  0.39,  0.51])

In [51]:
def sep_thresholds(y_probas, thresholds):
    y = np.zeros(y_probas.shape)
    for i, t in enumerate(thresholds):
        y[:, i] = (y_probas[:, i] > t).astype(int)
    return y

In [52]:
y_val_biz = sep_thresholds(y_probas_val_biz, thresholds)


In [53]:
print "F1 score: ", f1_score(y_biz, y_val_biz, average='micro')

F1 score:  0.831697564522


In [54]:
def optimize_thresholds(y_probas, y_gt):
    thresholds = np.zeros((9, ))
    for att in range(9):
        best = 0
        f1max = 0
        for t in np.arange(0.0, 1.0, 0.01):
            y_pred = threshold(y_probas, t)
            f1 = f1_score(y_gt[:, att], y_pred[:, att])
            if f1 > f1max:
                best = t
                f1max = f1
        thresholds[att]=best
    return thresholds

In [55]:
from sklearn.cross_validation import KFold

In [56]:
kf = KFold(n=len(biz_id_to_index), n_folds=4, shuffle=True)

In [57]:
thresholds = optimize_thresholds(y_probas_val_biz, y_biz)

In [58]:
y_pred_biz = sep_thresholds(y_probas_val_biz, thresholds)
print "F1 score: ", f1_score(y_biz, y_pred_biz, average='micro')

F1 score:  0.831697564522


In [59]:
thresholds

array([ 0.29,  0.49,  0.59,  0.47,  0.39,  0.63,  0.67,  0.39,  0.51])

In [60]:
all_thresholds = []
for train_index, test_index in kf:
    y_probas_biz_train, y_biz_train = y_probas_val_biz[train_index], y_biz[train_index]
    y_probas_biz_test, y_biz_test = y_probas_val_biz[test_index], y_biz[test_index]
    thresholds = optimize_thresholds(y_probas_biz_train, y_biz_train)
    print thresholds
    all_thresholds.append(thresholds)
    y_pred_biz_test = sep_thresholds(y_probas_biz_test, thresholds)
    print "F1 score: ", f1_score(y_biz_test, y_pred_biz_test, average='micro')

[ 0.35  0.49  0.59  0.47  0.39  0.63  0.68  0.38  0.52]
F1 score:  0.806748466258
[ 0.29  0.49  0.56  0.47  0.41  0.63  0.67  0.39  0.51]
F1 score:  0.839031339031
[ 0.37  0.59  0.6   0.47  0.41  0.63  0.66  0.38  0.51]
F1 score:  0.822388059701
[ 0.29  0.48  0.59  0.47  0.39  0.63  0.67  0.38  0.51]
F1 score:  0.835148874365


In [61]:
thresholds = np.zeros((9,))
for t in all_thresholds:
    print t
    thresholds += t
thresholds /= len(all_thresholds)

[ 0.35  0.49  0.59  0.47  0.39  0.63  0.68  0.38  0.52]
[ 0.29  0.49  0.56  0.47  0.41  0.63  0.67  0.39  0.51]
[ 0.37  0.59  0.6   0.47  0.41  0.63  0.66  0.38  0.51]
[ 0.29  0.48  0.59  0.47  0.39  0.63  0.67  0.38  0.51]


In [62]:
thresholds

array([ 0.325 ,  0.5125,  0.585 ,  0.47  ,  0.4   ,  0.63  ,  0.67  ,
        0.3825,  0.5125])

In [63]:
y_pred_biz = sep_thresholds(y_probas_val_biz, thresholds)

In [64]:
print "F1 score: ", f1_score(y_biz, y_pred_biz, average='micro')

F1 score:  0.832440421208


# Submission on test set

In [66]:
!head ../data/raw/sample_submission.csv

business_id,labels
003sg,1 2 3
00er5,1 2 3
00kad,1 2 3
00mc6,1 2 3
00q7x,1 2 3
00v0t,1 2 3
00y7p,1 2 3
019fg,1 2 3
019r1,1 2 3


In [67]:
sample_sub = pd.read_csv('../data/raw/sample_submission.csv')

In [69]:
test_bids = list(sample_sub.business_id)

In [111]:
test_bids_to_pids = defaultdict(list)

In [112]:
def add_to_dict(x, bids_to_pids):
    bids_to_pids[x.business_id].append(x.photo_id)

In [113]:
_ = test_photo_to_biz.apply(lambda x: add_to_dict(x, test_bids_to_pids), axis=1)

In [114]:
len(test_bids_to_pids)

10000

# Load probabilities on test photo

In [93]:
y_probas_test = np.load('../data/models/multitask/test_prob_pos_yelp_attributes.npy')

In [94]:
y_probas_test.shape

(237152, 9)

In [95]:
test_pid = []

In [96]:
!head '../data/test.list'

test_photos_256x256_noise/459084.jpg
test_photos_256x256_noise/67847.jpg
test_photos_256x256_noise/215999.jpg
test_photos_256x256_noise/199937.jpg
test_photos_256x256_noise/236594.jpg
test_photos_256x256_noise/460385.jpg
test_photos_256x256_noise/308577.jpg
test_photos_256x256_noise/42129.jpg
test_photos_256x256_noise/54088.jpg
test_photos_256x256_noise/36929.jpg


In [97]:
with open('../data/test.list', 'r') as lfile:
    for line in lfile:
        _, filename = os.path.split(line)
        pid_str, _ = os.path.splitext(filename)
        test_pid.append(int(pid_str))

In [98]:
len(test_pid)

237152

In [99]:
test_pid_to_index = {pid: index for index, pid in enumerate(test_pid)}

In [116]:
y_probas_test_biz = np.zeros((len(test_bids_to_pids), 9))

In [117]:
y_probas_test_biz.shape

(10000, 9)

In [118]:
for index, (bid, pids) in enumerate(test_bids_to_pids.items()):
    for pid in pids:
        y_probas_test_biz[index] += y_probas_test[test_pid_to_index[pid]]
    y_probas_test_biz[index] /= len(pids)

In [119]:
y_probas_test_biz

array([[ 0.27286593,  0.49451526,  0.55472141, ...,  0.74021233,
         0.26634882,  0.59438358],
       [ 0.25330923,  0.5848083 ,  0.63217205, ...,  0.77378   ,
         0.36047557,  0.53270324],
       [ 0.25494368,  0.56922001,  0.62791499, ...,  0.76320746,
         0.34509635,  0.53208653],
       ..., 
       [ 0.27307902,  0.53852862,  0.58825422, ...,  0.74084695,
         0.30938106,  0.57695302],
       [ 0.24667973,  0.59442921,  0.64337854, ...,  0.78233279,
         0.3547338 ,  0.52669552],
       [ 0.22825989,  0.61193457,  0.6566023 , ...,  0.79118389,
         0.36879275,  0.50927054]])

In [120]:
thresholds

array([ 0.325 ,  0.5125,  0.585 ,  0.47  ,  0.4   ,  0.63  ,  0.67  ,
        0.3825,  0.5125])

In [121]:
y_pred_test_biz = sep_thresholds(y_probas_test_biz, thresholds)

In [122]:
y_pred_test_biz

array([[ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  0.,  1.],
       ..., 
       [ 0.,  1.,  1., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  0.,  0.]])

In [123]:
def inverse_ohe(ohe):
    inv = []
    for i, x in enumerate(ohe):
        if x == 1:
            inv.append(i) 
    return inv

In [124]:
inverse_ohe(np.array([0, 0, 1, 1, 0]))

[2, 3]

In [125]:
test_bid_to_labels = dict()

In [126]:
for index, (bid, _) in enumerate(test_bids_to_pids.items()):
    test_bid_to_labels[bid] = inverse_ohe(y_pred_test_biz[index])

In [128]:
len(test_bid_to_labels)

10000

In [134]:
!head -n 4 ../data/raw/sample_submission.csv

business_id,labels
003sg,1 2 3
00er5,1 2 3
00kad,1 2 3


In [139]:
' '.join([1, 2])

TypeError: sequence item 0: expected string, int found

In [140]:
with open('../data/submission_001.csv', 'w') as sfile:
    sfile.write('business_id,labels\n')
    for bid, labels in test_bid_to_labels.items():
        labels_str = [str(l) for l in labels]
        line = '{bid},{labels}\n'.format(bid=bid, labels=' '.join(labels_str))
        sfile.write(line)

In [141]:
!head ../data/submission_001.csv

business_id,labels
af0iy,3 5 6 8
s4n0d,1 2 3 5 6 8
s1e3l,1 2 3 5 6 8
yinms,1 3 5 6 8
3i26e,1 2 3 5 6 8
rl851,0 3 8
0zrov,1 2 3 5 6
ean2c,1 2 3 5 6 8
toisr,1 2 3 5 6 8


In [142]:
!wc -l ../data/submission_001.csv

   10001 ../data/submission_001.csv


In [143]:
!wc -l ../data/raw/sample_submission.csv

   10001 ../data/raw/sample_submission.csv
