# Yelp and Simple MLP

### Read in CSV

In [37]:
import pandas as pd
import numpy as np
import keras
import tensorflow as tf

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.utils import np_utils

In [38]:
session_conf = tf.ConfigProto(
      intra_op_parallelism_threads=1,
      inter_op_parallelism_threads=1)
sess = tf.Session(config=session_conf)

In [39]:
df = pd.read_csv('HOPEFUL.csv')

In [40]:
df = df.drop(df.columns[df.columns.str.contains('unnamed', case = False)],axis = 1)
df = df.drop(['business_id', 'date', 'user_id', 'user_cool', 'user_funny', 'user_userful', 'business_city', 'business_zip'], axis=1)
df = df[df["stars"] != 4]
df = df[df["stars"] != 3]
df = df[df["stars"] != 1]
df.to_csv(path_or_buf='HOPEFUL_CROPPED.csv')
df = pd.read_csv('HOPEFUL_CROPPED.csv')
df.head(4)

Unnamed: 0.1,Unnamed: 0,stars,name_of_month,day_of_week,user_average_stars,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,business_catgrylv2,business_catgrylv1,business_lat,business_long,lv1Cat_Av,lv2Cat_Av,zipCodeBusiness_Av,zipCodeUser_Av,zipCodeUser_WeightedAv,zipCodeUserCount
0,0,5,May,Saturday,4.67,6,1,1953,4.0,Bars,Breakfast & Brunch,45.516373,-73.577537,3.739232,3.731733,3.980257,3.779459,3.720424,2162
1,1,5,May,Saturday,4.67,6,1,84,4.0,Bars,Food,45.523333,-73.594859,3.719693,3.731733,4.031579,3.875368,3.711489,95
2,2,5,May,Saturday,4.67,6,1,50,4.5,Bars,Breakfast & Brunch,45.472902,-73.588321,3.739232,3.731733,4.280899,3.843488,3.927304,86
3,3,5,May,Saturday,4.67,6,1,70,4.0,Bars,Breakfast & Brunch,45.522144,-73.607076,3.739232,3.731733,3.943038,3.837975,3.851323,79


In [41]:
# Label Encode Categorical Data
# name_of_month
df['name_of_month'] = df['name_of_month'].astype('category')
df['name_of_month'] = df['name_of_month'].cat.codes

# day_of_week
df['day_of_week'] = df['day_of_week'].astype('category')
df['day_of_week'] = df['day_of_week'].cat.codes

# business_catgrylv2
df['business_catgrylv2'] = df['business_catgrylv2'].astype('category')
df['business_catgrylv2'] = df['business_catgrylv2'].cat.codes

# business_catgrylv1
df['business_catgrylv1'] = df['business_catgrylv1'].astype('category')
df['business_catgrylv1'] = df['business_catgrylv1'].cat.codes

# Check Categorization
df.head(3)

Unnamed: 0.1,Unnamed: 0,stars,name_of_month,day_of_week,user_average_stars,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,business_catgrylv2,business_catgrylv1,business_lat,business_long,lv1Cat_Av,lv2Cat_Av,zipCodeBusiness_Av,zipCodeUser_Av,zipCodeUser_WeightedAv,zipCodeUserCount
0,0,5,8,2,4.67,6,1,1953,4.0,1,9,45.516373,-73.577537,3.739232,3.731733,3.980257,3.779459,3.720424,2162
1,1,5,8,2,4.67,6,1,84,4.0,1,22,45.523333,-73.594859,3.719693,3.731733,4.031579,3.875368,3.711489,95
2,2,5,8,2,4.67,6,1,50,4.5,1,9,45.472902,-73.588321,3.739232,3.731733,4.280899,3.843488,3.927304,86


In [42]:
# Normalize
df = (df - df.mean()) / (df.max() - df.min())
df.head(3)

Unnamed: 0.1,Unnamed: 0,stars,name_of_month,day_of_week,user_average_stars,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,business_catgrylv2,business_catgrylv1,business_lat,business_long,lv1Cat_Av,lv2Cat_Av,zipCodeBusiness_Av,zipCodeUser_Av,zipCodeUser_WeightedAv,zipCodeUserCount
0,-0.498921,0.162809,0.221879,-0.167188,0.173974,-0.006411,-0.005967,0.219832,0.008122,-0.262671,-0.295774,0.06814,0.112131,-0.004203,-0.012993,0.055033,0.008784,-0.008426,-0.146443
1,-0.498921,0.162809,0.221879,-0.167188,0.173974,-0.006411,-0.005967,-0.034177,0.008122,-0.262671,-0.045774,0.068195,0.112064,-0.014446,-0.012993,0.067864,0.033804,-0.01066,-0.155633
2,-0.498921,0.162809,0.221879,-0.167188,0.173974,-0.006411,-0.005967,-0.038798,0.133122,-0.262671,-0.295774,0.067795,0.112089,-0.004203,-0.012993,0.130194,0.025488,0.043294,-0.155673


In [43]:
df = df.drop(df.columns[df.columns.str.contains('unnamed', case = False)],axis = 1)
df.head(3)

Unnamed: 0,stars,name_of_month,day_of_week,user_average_stars,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,business_catgrylv2,business_catgrylv1,business_lat,business_long,lv1Cat_Av,lv2Cat_Av,zipCodeBusiness_Av,zipCodeUser_Av,zipCodeUser_WeightedAv,zipCodeUserCount
0,0.162809,0.221879,-0.167188,0.173974,-0.006411,-0.005967,0.219832,0.008122,-0.262671,-0.295774,0.06814,0.112131,-0.004203,-0.012993,0.055033,0.008784,-0.008426,-0.146443
1,0.162809,0.221879,-0.167188,0.173974,-0.006411,-0.005967,-0.034177,0.008122,-0.262671,-0.045774,0.068195,0.112064,-0.014446,-0.012993,0.067864,0.033804,-0.01066,-0.155633
2,0.162809,0.221879,-0.167188,0.173974,-0.006411,-0.005967,-0.038798,0.133122,-0.262671,-0.295774,0.067795,0.112089,-0.004203,-0.012993,0.130194,0.025488,0.043294,-0.155673


### Split into Train and Test Data

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
y = df.pop('stars')
X = df

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X.index,y,test_size=0.20)

In [47]:
print(X_train.shape)
print(X_test.shape)

(2150265,)
(537567,)


In [48]:
print(y_train.shape)
print(y_test.shape)

(2150265,)
(537567,)


In [49]:
# We need to get y sets to one-hot-encoded values
def create_one_hot_encoded_array(array):
    uniques, ids = np.unique(array, return_inverse=True)
    return np_utils.to_categorical(ids, len(uniques))

y_train = create_one_hot_encoded_array(y_train)
y_test = create_one_hot_encoded_array(y_test)

print(y_train[:2])

[[0. 1.]
 [0. 1.]]


In [50]:
print(X_train[:2])

Int64Index([2605680, 1721423], dtype='int64')


In [51]:
X_train = X.iloc[X_train]

In [52]:
print(X_train[:2])

         name_of_month  day_of_week  user_average_stars  user_num_reviews  \
2605680      -0.505393     0.499478            0.257519         -0.006076   
1721423      -0.141757    -0.000522            0.257519         -0.006829   

         user_num_friends  business_num_reviews  business_average_stars  \
2605680         -0.006034              0.075364                0.008122   
1721423         -0.003899             -0.044778                0.258122   

         business_catgrylv2  business_catgrylv1  business_lat  business_long  \
2605680           -0.262671            0.165765     -0.006422       -0.04932   
1721423            0.070663           -0.007312     -0.025430       -0.03810   

         lv1Cat_Av  lv2Cat_Av  zipCodeBusiness_Av  zipCodeUser_Av  \
2605680  -0.009254  -0.012993            0.001380        0.005165   
1721423   0.243122   0.382080           -0.027471        0.000454   

         zipCodeUser_WeightedAv  zipCodeUserCount  
2605680               -0.000165          

In [53]:
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [54]:
print(X_train.shape)

(2150265, 17)


In [55]:
print(y_train.shape)

(2150265, 2)


In [56]:
X_test = X.iloc[X_test]

In [57]:
print(X_test.shape)

(537567, 17)


In [58]:
print(y_test.shape)

(537567, 2)


### Define Network

In [59]:
model = Sequential()

In [60]:
model.add(Dense(100, input_shape=(17,)))

In [61]:
model.add(Activation('relu'))

In [62]:
model.add(Dropout(0.05))

In [63]:
model.add(Dense(100))

In [64]:
model.add(Activation('relu'))

In [65]:
model.add(Dropout(0.05))

In [66]:
model.add(Dense(50))

In [67]:
model.add(Activation('relu'))

In [68]:
model.add(Dropout(0.05))

In [69]:
model.add(Dense(2))

In [70]:
model.add(Activation('softmax'))

### Compile Network

In [71]:
from keras import optimizers
model.compile(loss='categorical_crossentropy', optimizer='adagrad', metrics=['accuracy'])

### Fit Network

In [72]:
history = model.fit(X_train, y_train, 
          batch_size=50, epochs=1, verbose=1,
          validation_data=(X_test, y_test)
         )

Train on 2150265 samples, validate on 537567 samples
Epoch 1/1


In [73]:
np.unique(model.predict_classes(X_train))

array([0, 1])

In [74]:
import matplotlib.pyplot as plt
pd.Series(history.history['loss']).plot(logy=True)
plt.xlabel("Epoch")
plt.ylabel("Train Error")
plt.savefig('Yelp_Simple_MLP_V1')
plt.show()

<Figure size 640x480 with 1 Axes>

In [75]:
from sklearn.metrics import classification_report

Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_test)
print(classification_report(Y_test, y_pred))

Y_train = np.argmax(y_train, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_train)
print(classification_report(Y_train, y_pred))

             precision    recall  f1-score   support

          0       0.67      0.42      0.52     87503
          1       0.90      0.96      0.93    450064

avg / total       0.86      0.87      0.86    537567

             precision    recall  f1-score   support

          0       0.67      0.42      0.52    350099
          1       0.90      0.96      0.93   1800166

avg / total       0.86      0.87      0.86   2150265

