In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [2]:
train_df = pd.read_csv('../data/raw/train.csv')

In [3]:
train_photo_to_biz = pd.read_csv('../data/raw/train_photo_to_biz_ids.csv')

In [4]:
test_photo_to_biz = pd.read_csv('../data/raw/test_photo_to_biz.csv')

In [5]:
attribute_id_to_label = {
    0: 'good_for_lunch',
    1: 'good_for_dinner',
    2: 'takes_reservations',
    3: 'outdoor_seating',
    4: 'restaurant_is_expensive',
    5: 'has_alcohol',
    6: 'has_table_service',
    7: 'ambience_is_classy',
    8: 'good_for_kids'}

In [6]:
train_photo_id_to_biz_id = dict(zip(train_photo_to_biz.photo_id, train_photo_to_biz.business_id))

In [7]:
train_df_cleaned = train_df.dropna()

In [8]:
biz_id_to_labels_str = dict(zip(train_df_cleaned.business_id, train_df_cleaned['labels']))

In [9]:
biz_id_to_labels = dict()
for biz_id, labels_str in biz_id_to_labels_str.items():
    biz_id_to_labels[biz_id] = [int(l) for l in labels_str.split()]

# Split in train/val

We want to predict label for biz id, so I will also split the traning set on buisiness id.

In [10]:
seed = 42

In [11]:
np.random.seed(42)

In [12]:
train_biz_id = set(train_photo_to_biz.business_id)

In [13]:
len(train_biz_id)

2000

In [14]:
train_ratio = 0.7

In [15]:
train_biz_id_permuted = np.random.permutation(list(train_biz_id))

In [16]:
n_train_biz_id = int(len(train_biz_id) * train_ratio)

In [17]:
train_biz_id_cv = set(train_biz_id_permuted[:n_train_biz_id])
val_biz_id_cv = set(train_biz_id_permuted[n_train_biz_id:])

In [18]:
train_photos_ids_cv = {photo_id for photo_id, biz_id in train_photo_id_to_biz_id.items() if biz_id in train_biz_id_cv}
val_photos_ids_cv = {photo_id for photo_id, biz_id in train_photo_id_to_biz_id.items() if biz_id in val_biz_id_cv}

# Create dataset

X_train is (N, 8), y_train contains the label 3

In [125]:
def create_dataset(biz_ids, att=3):
    Xs = []
    ys = []
    for biz_id in biz_ids:
        if biz_id in biz_id_to_labels:
            labels = biz_id_to_labels[biz_id]
            ohe = []
            y = []
            for i in range(9):
                if i != att:
                    if i in labels:
                        ohe.append(1)
                    else:
                        ohe.append(0)
            if att in labels:
                y.append(1)
            else:
                y.append(0)
            Xs.append(ohe)
            ys.append(y)
    return np.array(Xs), np.array(ys).ravel()

In [126]:
X_train, y_train = create_dataset(train_biz_id_cv)
X_val, y_val = create_dataset(val_biz_id_cv)

In [127]:
from sklearn.ensemble import RandomForestClassifier

In [128]:
clf = RandomForestClassifier(n_estimators=100)

In [129]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [130]:
clf.score(X_val, y_val)

0.57666666666666666

# Try with correlated attribute

In [131]:
def predict_att(att):
    X_train, y_train = create_dataset(train_biz_id_cv, att)
    X_val, y_val = create_dataset(val_biz_id_cv, att)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    score = clf.score(X_val, y_val)
    print("Attribute {a}: {s}".format(a=att, s=score))

In [135]:
for i in range(9):
    predict_att(i)

Attribute 0: 0.776666666667
Attribute 1: 0.806666666667
Attribute 2: 0.885
Attribute 3: 0.568333333333
Attribute 4: 0.878333333333
Attribute 5: 0.846666666667
Attribute 6: 0.871666666667
Attribute 7: 0.805
Attribute 8: 0.821666666667
