# Yelp and Simple MLP

### Read in CSV

In [1]:
import pandas as pd
import numpy as np
import keras
import tensorflow as tf

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.utils import np_utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
session_conf = tf.ConfigProto(
      intra_op_parallelism_threads=1,
      inter_op_parallelism_threads=1)
sess = tf.Session(config=session_conf)

In [3]:
df = pd.read_csv('HOPEFUL.csv')

In [4]:
df = df.drop(df.columns[df.columns.str.contains('unnamed', case = False)],axis = 1)
df = df.drop(['business_id', 'date', 'user_id', 'user_cool', 'user_funny', 'user_userful', 'business_city', 'business_zip'], axis=1)
df.head(4)

Unnamed: 0,stars,name_of_month,day_of_week,user_average_stars,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,business_catgrylv2,business_catgrylv1,business_lat,business_long,lv1Cat_Av,lv2Cat_Av,zipCodeBusiness_Av,zipCodeUser_Av,zipCodeUser_WeightedAv,zipCodeUserCount
0,5,May,Saturday,4.67,6,1,1953,4.0,Bars,Breakfast & Brunch,45.516373,-73.577537,3.739232,3.731733,3.980257,3.779459,3.720424,2162
1,5,May,Saturday,4.67,6,1,84,4.0,Bars,Food,45.523333,-73.594859,3.719693,3.731733,4.031579,3.875368,3.711489,95
2,5,May,Saturday,4.67,6,1,50,4.5,Bars,Breakfast & Brunch,45.472902,-73.588321,3.739232,3.731733,4.280899,3.843488,3.927304,86
3,5,May,Saturday,4.67,6,1,70,4.0,Bars,Breakfast & Brunch,45.522144,-73.607076,3.739232,3.731733,3.943038,3.837975,3.851323,79


In [5]:
# Label Encode Categorical Data
# name_of_month
df['name_of_month'] = df['name_of_month'].astype('category')
df['name_of_month'] = df['name_of_month'].cat.codes

# day_of_week
df['day_of_week'] = df['day_of_week'].astype('category')
df['day_of_week'] = df['day_of_week'].cat.codes

# business_catgrylv2
df['business_catgrylv2'] = df['business_catgrylv2'].astype('category')
df['business_catgrylv2'] = df['business_catgrylv2'].cat.codes

# business_catgrylv1
df['business_catgrylv1'] = df['business_catgrylv1'].astype('category')
df['business_catgrylv1'] = df['business_catgrylv1'].cat.codes

# Check Categorization
df.head(3)

Unnamed: 0,stars,name_of_month,day_of_week,user_average_stars,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,business_catgrylv2,business_catgrylv1,business_lat,business_long,lv1Cat_Av,lv2Cat_Av,zipCodeBusiness_Av,zipCodeUser_Av,zipCodeUser_WeightedAv,zipCodeUserCount
0,5,8,2,4.67,6,1,1953,4.0,1,9,45.516373,-73.577537,3.739232,3.731733,3.980257,3.779459,3.720424,2162
1,5,8,2,4.67,6,1,84,4.0,1,22,45.523333,-73.594859,3.719693,3.731733,4.031579,3.875368,3.711489,95
2,5,8,2,4.67,6,1,50,4.5,1,9,45.472902,-73.588321,3.739232,3.731733,4.280899,3.843488,3.927304,86


In [6]:
# Normalize
df = (df - df.mean()) / (df.max() - df.min())
df.head(3)

Unnamed: 0,stars,name_of_month,day_of_week,user_average_stars,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,business_catgrylv2,business_catgrylv1,business_lat,business_long,lv1Cat_Av,lv2Cat_Av,zipCodeBusiness_Av,zipCodeUser_Av,zipCodeUser_WeightedAv,zipCodeUserCount
0,0.317834,0.223576,-0.166741,0.233088,-0.009673,-0.007536,0.217769,0.067103,-0.261601,-0.301905,0.065042,0.105853,0.00418,0.000537,0.062301,0.01198,-0.006591,-0.152566
1,0.317834,0.223576,-0.166741,0.233088,-0.009673,-0.007536,-0.03624,0.067103,-0.261601,-0.051905,0.065097,0.105786,-0.006062,0.000537,0.075132,0.035957,-0.008825,-0.161757
2,0.317834,0.223576,-0.166741,0.233088,-0.009673,-0.007536,-0.040861,0.192103,-0.261601,-0.301905,0.064697,0.105812,0.00418,0.000537,0.137462,0.027987,0.045128,-0.161797


### Split into Train and Test Data

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
y = df.pop('stars')
X = df

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X.index,y,test_size=0.20)

In [10]:
print(X_train.shape)
print(X_test.shape)

(4202787,)
(1050697,)


In [11]:
print(y_train.shape)
print(y_test.shape)

(4202787,)
(1050697,)


In [12]:
# We need to get y sets to one-hot-encoded values
def create_one_hot_encoded_array(array):
    uniques, ids = np.unique(array, return_inverse=True)
    return np_utils.to_categorical(ids, len(uniques))

y_train = create_one_hot_encoded_array(y_train)
y_test = create_one_hot_encoded_array(y_test)

print(y_train[:2])

[[0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]]


In [13]:
print(X_train[:2])

Int64Index([2791561, 3129547], dtype='int64')


In [14]:
X_train = X.iloc[X_train]

In [15]:
print(X_train[:2])

         name_of_month  day_of_week  user_average_stars  user_num_reviews  \
2791561      -0.503697     0.166593           -0.059412         -0.006913   
3129547       0.405394    -0.000074            0.253088         -0.009841   

         user_num_friends  business_num_reviews  business_average_stars  \
2791561         -0.006269             -0.047113               -0.307897   
3129547         -0.007603             -0.041813               -0.057897   

         business_catgrylv2  business_catgrylv1  business_lat  business_long  \
2791561           -0.328268            0.467326      0.068113       0.103260   
3129547           -0.261601            0.371172     -0.017267       0.077285   

         lv1Cat_Av  lv2Cat_Av  zipCodeBusiness_Av  zipCodeUser_Av  \
2791561   0.088873  -0.259515           -0.076994        0.011758   
3129547  -0.024896   0.000537           -0.067337       -0.040280   

         zipCodeUser_WeightedAv  zipCodeUserCount  
2791561                0.013796         -

In [16]:
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [17]:
print(X_train.shape)

(4202787, 17)


In [18]:
print(y_train.shape)

(4202787, 5)


In [19]:
X_test = X.iloc[X_test]

In [20]:
print(X_test.shape)

(1050697, 17)


In [21]:
print(y_test.shape)

(1050697, 5)


### Define Network

In [42]:
model = Sequential()

In [43]:
model.add(Dense(100, input_shape=(17,)))

In [44]:
model.add(Activation('relu'))

In [45]:
model.add(Dropout(0.05))

In [46]:
model.add(Dense(100))

In [47]:
model.add(Activation('relu'))

In [48]:
model.add(Dropout(0.05))

In [49]:
model.add(Dense(50))

In [50]:
model.add(Activation('relu'))

In [51]:
model.add(Dropout(0.05))

In [52]:
model.add(Dense(5))

In [53]:
model.add(Activation('softmax'))

### Compile Network

In [54]:
from keras import optimizers
model.compile(loss='categorical_crossentropy', optimizer='adagrad', metrics=['accuracy'])

### Fit Network

In [55]:
history = model.fit(X_train, y_train, 
          batch_size=50, epochs=1, verbose=1,
          validation_data=(X_test, y_test)
         )

Train on 4202787 samples, validate on 1050697 samples
Epoch 1/1


In [56]:
np.unique(model.predict_classes(X_train))

array([0, 1, 2, 3, 4])

In [57]:
import matplotlib.pyplot as plt
pd.Series(history.history['loss']).plot(logy=True)
plt.xlabel("Epoch")
plt.ylabel("Train Error")
plt.savefig('Yelp_Simple_MLP_V1')
plt.show()

<Figure size 640x480 with 1 Axes>

In [58]:
from sklearn.metrics import classification_report

Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_test)
print(classification_report(Y_test, y_pred))

Y_train = np.argmax(y_train, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_train)
print(classification_report(Y_train, y_pred))

             precision    recall  f1-score   support

          0       0.55      0.64      0.59    145748
          1       0.23      0.01      0.02     87737
          2       0.36      0.09      0.15    123024
          3       0.39      0.47      0.42    243770
          4       0.65      0.79      0.72    450418

avg / total       0.51      0.55      0.51   1050697

             precision    recall  f1-score   support

          0       0.55      0.64      0.59    582784
          1       0.23      0.01      0.02    349865
          2       0.36      0.09      0.15    491871
          3       0.39      0.47      0.42    978455
          4       0.65      0.79      0.72   1799812

avg / total       0.51      0.55      0.51   4202787

