In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [2]:
train_df = pd.read_csv('../data/raw/train.csv')

In [3]:
train_photo_to_biz = pd.read_csv('../data/raw/train_photo_to_biz_ids.csv')

In [4]:
test_photo_to_biz = pd.read_csv('../data/raw/test_photo_to_biz.csv')

In [5]:
attribute_id_to_label = {
    0: 'good_for_lunch',
    1: 'good_for_dinner',
    2: 'takes_reservations',
    3: 'outdoor_seating',
    4: 'restaurant_is_expensive',
    5: 'has_alcohol',
    6: 'has_table_service',
    7: 'ambience_is_classy',
    8: 'good_for_kids'}

In [6]:
train_photo_id_to_biz_id = dict(zip(train_photo_to_biz.photo_id, train_photo_to_biz.business_id))

In [7]:
train_df_cleaned = train_df.dropna()

In [8]:
biz_id_to_labels_str = dict(zip(train_df_cleaned.business_id, train_df_cleaned['labels']))

In [9]:
biz_id_to_labels = dict()
for biz_id, labels_str in biz_id_to_labels_str.items():
    biz_id_to_labels[biz_id] = [int(l) for l in labels_str.split()]

# Load validation photos ids

We want to predict label for biz id, so I will also split the traning set on buisiness id.

In [24]:
!head -n 2 ../data/models/multitask/val_caffe.txt

262147.jpg 0
131078.jpg 0


In [35]:
val_photos_ids = []
with open('../data/models/multitask/val_caffe.txt', 'r') as lfile:
    for line in lfile:
        filename = line.split()[0]
        id_str, _ = os.path.splitext(filename)
        val_photos_ids.append(int(id_str))

In [36]:
len(val_photos_ids)

71640

In [66]:
pid_to_index = {pid: index for index, pid in enumerate(val_photos_ids)}

## Get the set of associated buisiness ids

In [55]:
val_biz_ids = set()
val_pids = set(val_photos_ids) 
for pid, bid in train_photo_id_to_biz_id.items():
    if pid in val_pids:
        val_biz_ids.add(bid)

# Load attribute prediction on validation photos

In [37]:
y_probas_val = np.load('../data/models/multitask/val_caffe_prob_pos_yelp_attributes.npy')

In [38]:
y_probas_val.shape

(71640, 9)

In [99]:
def threshold(y_probas, threshold=0.5):
    return (y_probas > threshold).astype(int)

In [133]:
y_val = threshold(y_probas_val, 0.5)

In [131]:
y_val[0]

array([0, 0, 0, 0, 0, 0, 1, 0, 1])

# Aggregate prediction per business

In [50]:
from collections import defaultdict

In [56]:
biz_id_to_pids = defaultdict(list)

In [57]:
for pid, bid in train_photo_id_to_biz_id.items():
    if pid in val_pids:
        biz_id_to_pids[bid].append(pid)

## Mean probabilty

In [61]:
len(biz_id_to_pids)

600

In [102]:
biz_id_to_index = {bid: index for index, bid in enumerate(biz_id_to_pids.keys())}

In [103]:
y_probas_val_biz = np.zeros((len(biz_id_to_pids), 9))

In [104]:
for bid, pids in biz_id_to_pids.items():
    index = biz_id_to_index[bid]
    for pid in pids:
        y_probas_val_biz[index] += y_probas_val[pid_to_index[pid]]
    y_probas_val_biz[index] /= len(pids)

In [105]:
y_probas_val_biz.shape

(600, 9)

In [128]:
y_val_biz = threshold(y_probas_val_biz, 0.5)

In [107]:
y_val_biz.shape

(600, 9)

## Majority vote

In [115]:
y_val_biz_majority = np.zeros((len(biz_id_to_pids), 9))
for bid, pids in biz_id_to_pids.items():
    index = biz_id_to_index[bid]
    for pid in pids:
        y_val_biz_majority[index] += y_val[pid_to_index[pid]]
    y_val_biz_majority[index] = threshold(y_val_biz_majority[index], len(pids)/2.0)

In [116]:
y_val_biz_majority

array([[ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  0.,  0., ...,  1.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  0.,  0.],
       [ 0.,  1.,  1., ...,  1.,  0.,  1.]])

# Load groundtruth attributes

In [108]:
def OHE(labels, size=9):
    ohe = np.zeros((9))
    for l in labels:
        ohe[l] = 1
    return ohe

In [109]:
y_biz = np.zeros((len(biz_id_to_index), 9))

In [110]:
for bid, index in biz_id_to_index.items():
    labels = biz_id_to_labels[bid]
    y_biz[index] = OHE(labels)

In [111]:
y_biz.shape

(600, 9)

In [113]:
from sklearn.metrics import f1_score

print "F1 score: ", f1_score(y_biz, y_val_biz, average='micro')
print "Individual Class F1 score: ", f1_score(y_biz, y_val_biz, average=None)

F1 score:  0.760438714643
Individual Class F1 score:  [ 0.23293173  0.81116585  0.85585586  0.58519793  0.65454545  0.82798165
  0.82266527  0.50220264  0.90196078]


In [119]:
y_photos = np.zeros((len(val_photos_ids), 9))
for pid, index in pid_to_index.items():
    bid = train_photo_id_to_biz_id[pid]
    labels = biz_id_to_labels[bid]
    y_photos[index] = OHE(labels)

In [49]:
len(val_biz_ids)

600

In [117]:
from sklearn.metrics import f1_score

print "F1 score: ", f1_score(y_biz, y_val_biz_majority, average='micro')
print "Individual Class F1 score: ", f1_score(y_biz, y_val_biz_majority, average=None)

F1 score:  0.757285351066
Individual Class F1 score:  [ 0.26190476  0.81407035  0.85235921  0.5428051   0.63736264  0.83138173
  0.82627119  0.49781659  0.90237467]


In [139]:
for t in np.arange(0.4, 0.5, 0.01):
    y_val_biz = threshold(y_probas_val_biz, t)
    print("{t} - F1 score: {f1}".format(t=t, f1=f1_score(y_biz, y_val_biz, average='micro')))

0.4 - F1 score: 0.767171314741
0.41 - F1 score: 0.769032258065
0.42 - F1 score: 0.770537318308
0.43 - F1 score: 0.770173985087
0.44 - F1 score: 0.772283888328
0.45 - F1 score: 0.77144324602
0.46 - F1 score: 0.771837875611
0.47 - F1 score: 0.772305496074
0.48 - F1 score: 0.771679473106
0.49 - F1 score: 0.767991004498


In [121]:
from sklearn.metrics import accuracy_score

In [132]:
for att in range(9):
    acc = accuracy_score(y_photos[:, att], y_val[:, att])
    print("Accuracy for attribue {a}: {acc}".format(a=att, acc=acc))

Accuracy for attribue 0: 0.7126186488
Accuracy for attribue 1: 0.648897264098
Accuracy for attribue 2: 0.691219988833
Accuracy for attribue 3: 0.514977666108
Accuracy for attribue 4: 0.729020100503
Accuracy for attribue 5: 0.737576772753
Accuracy for attribue 6: 0.730709101061
Accuracy for attribue 7: 0.69860413177
Accuracy for attribue 8: 0.695616973758


In [155]:
accuracy_score(y_biz[:, 0], y_val_biz[:, 0])

0.72333333333333338

In [141]:
from sklearn.metrics import recall_score, precision_score

In [143]:
recall_score(y_biz[:, 0], y_val_biz[:, 0])

0.14084507042253522

In [144]:
precision_score(y_biz[:, 0], y_val_biz[:, 0])

0.78947368421052633

In [145]:
f1_score(y_biz[:, 0], y_val_biz[:, 0])

0.2390438247011952

In [149]:
thresholds = np.zeros((9, ))
for att in range(9):
    best = 0
    f1max = 0
    for t in np.arange(0.0, 1.0, 0.01):
        y_val_biz = threshold(y_probas_val_biz, t)
        f1 = f1_score(y_biz[:, att], y_val_biz[:, att])
        if f1 > f1max:
            best = t
            f1max = f1
    thresholds[att]=best

In [150]:
thresholds

array([ 0.29,  0.49,  0.59,  0.47,  0.39,  0.63,  0.67,  0.39,  0.51])

In [152]:
def sep_thresholds(y_probas, thresholds):
    y = np.zeros(y_probas.shape)
    for i, t in enumerate(thresholds):
        y[:, i] = (y_probas[:, i] > t).astype(int)
    return y

In [153]:
y_val_biz = sep_thresholds(y_probas_val_biz, thresholds)


In [154]:
print "F1 score: ", f1_score(y_biz, y_val_biz, average='micro')

F1 score:  0.831697564522
