In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [2]:
train_df = pd.read_csv('../data/raw/train.csv')

In [3]:
train_photo_to_biz = pd.read_csv('../data/raw/train_photo_to_biz_ids.csv')

In [4]:
test_photo_to_biz = pd.read_csv('../data/raw/test_photo_to_biz.csv')

In [5]:
attribute_id_to_label = {
    0: 'good_for_lunch',
    1: 'good_for_dinner',
    2: 'takes_reservations',
    3: 'outdoor_seating',
    4: 'restaurant_is_expensive',
    5: 'has_alcohol',
    6: 'has_table_service',
    7: 'ambience_is_classy',
    8: 'good_for_kids'}

In [6]:
train_photo_id_to_biz_id = dict(zip(train_photo_to_biz.photo_id, train_photo_to_biz.business_id))

In [7]:
train_df_cleaned = train_df.dropna()

In [8]:
biz_id_to_labels_str = dict(zip(train_df_cleaned.business_id, train_df_cleaned['labels']))

In [9]:
biz_id_to_labels = dict()
for biz_id, labels_str in biz_id_to_labels_str.items():
    biz_id_to_labels[biz_id] = [int(l) for l in labels_str.split()]

# Split in train/val

We want to predict label for biz id, so I will also split the traning set on buisiness id.

In [10]:
seed = 42

In [11]:
np.random.seed(42)

In [12]:
train_biz_id = set(train_photo_to_biz.business_id)

In [13]:
len(train_biz_id)

2000

In [14]:
train_ratio = 0.7

In [15]:
train_biz_id_permuted = np.random.permutation(list(train_biz_id))

In [16]:
n_train_biz_id = int(len(train_biz_id) * train_ratio)

In [17]:
train_biz_id_cv = set(train_biz_id_permuted[:n_train_biz_id])
val_biz_id_cv = set(train_biz_id_permuted[n_train_biz_id:])

In [18]:
train_photos_ids_cv = {photo_id for photo_id, biz_id in train_photo_id_to_biz_id.items() if biz_id in train_biz_id_cv}
val_photos_ids_cv = {photo_id for photo_id, biz_id in train_photo_id_to_biz_id.items() if biz_id in val_biz_id_cv}

# Create dataset

X_train is (N, 8), y_train contains the label 3

In [22]:
def create_dataset(photos_ids):
    Xs = []
    ys = []
    for pid in photos_ids:
        biz_id = train_photo_id_to_biz_id[pid]
        if biz_id in biz_id_to_labels:
            labels = biz_id_to_labels[biz_id]
            ohe = []
            y = []
            for i in range(9):
                if i != 3:
                    if i in labels:
                        ohe.append(1)
                    else:
                        ohe.append(0)
            if 3 in labels:
                y.append(1)
            else:
                y.append(0)
            Xs.append(ohe)
            ys.append(y)
    return np.array(Xs), np.array(ys)

In [29]:
X_train, y_train = create_dataset(train_photos_ids_cv)

In [30]:
X_val, y_val = create_dataset(val_photos_ids_cv)

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
clf = RandomForestClassifier(n_estimators=200)

In [52]:
y_train = y_train.ravel()

In [53]:
y_train.shape

(162905,)

In [54]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [55]:
y_val = y_val.ravel()

In [56]:
clf.score(X_val, y_val)

0.54378838637632609