# Wide and Deep Learner

In [59]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pandas as pd
import numpy as np

from keras.models import Sequential, Model, K
from keras.layers import Dense, Activation, BatchNormalization, Input, Dropout, Embedding, merge, Merge, Flatten
from keras.optimizers import Adam
from keras.utils import np_utils

In [60]:
from keras import backend as K
K.set_session(K.tf.Session(config=K.tf.ConfigProto(intra_op_parallelism_threads=20, inter_op_parallelism_threads=20))) 

In [61]:
COLUMNS = ["stars", "name_of_month", "day_of_week", "user_average_stars", "user_cool", "user_funny",
           "user_userful", "user_num_reviews", "user_num_friends", "business_city", "business_num_reviews",
           "business_average_stars", "business_catgry"]
LABEL_COLUMN = "label"
CATEGORICAL_COLUMNS = ["name_of_month", "day_of_week", "business_catgry"]
CONTINUOUS_COLUMNS = ["user_average_stars", "user_cool", "user_funny", "user_userful",
                      "user_num_reviews", "user_num_friends", "business_num_reviews",
                      "business_average_stars"]

In [62]:
df_train = pd.read_csv('binary2.csv')
df_test = pd.read_csv('binary2.csv')

# remove NaN elements
df_train = df_train.dropna(how='any', axis=0)
df_test = df_test.dropna(how='any', axis=0)

df_train[LABEL_COLUMN] = (
  df_train["stars"]).astype(int)
df_test[LABEL_COLUMN] = (
  df_test["stars"]).astype(int)

df_train.head()

Unnamed: 0.1,Unnamed: 0,stars,name_of_month,day_of_week,user_average_stars,user_cool,user_funny,user_userful,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,business_catgry,label
0,0,-0.660753,0.22354,-0.333416,0.098172,-0.00104,-0.000746,-0.001261,-0.006663,-0.007403,-0.013611,-0.182665,-0.245179,0
1,1,0.339247,0.496267,-0.166749,-0.226828,-0.00104,-0.000746,-0.001257,-0.008252,-0.00667,-0.043918,-0.057665,-0.182679,0
2,2,-0.660753,-0.231006,-0.500082,-0.124328,-0.001017,-0.000534,-0.001101,0.001702,-0.00707,0.000251,0.067335,-0.245179,0
3,3,0.339247,-0.140097,-0.333416,0.028172,-0.000136,2.1e-05,-0.000185,0.041354,0.019539,0.006911,0.192335,-0.245179,0
4,4,0.339247,0.041722,-0.166749,-0.184328,-0.00104,-0.000746,-0.001261,-0.009926,-0.005669,-0.045141,-0.182665,-0.245179,0


In [63]:
df_train = df_train.drop(df_train.columns[df_train.columns.str.contains('unnamed', case = False)],axis = 1)
df_train.head(3)

Unnamed: 0,stars,name_of_month,day_of_week,user_average_stars,user_cool,user_funny,user_userful,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,business_catgry,label
0,-0.660753,0.22354,-0.333416,0.098172,-0.00104,-0.000746,-0.001261,-0.006663,-0.007403,-0.013611,-0.182665,-0.245179,0
1,0.339247,0.496267,-0.166749,-0.226828,-0.00104,-0.000746,-0.001257,-0.008252,-0.00667,-0.043918,-0.057665,-0.182679,0
2,-0.660753,-0.231006,-0.500082,-0.124328,-0.001017,-0.000534,-0.001101,0.001702,-0.00707,0.000251,0.067335,-0.245179,0


In [69]:
df_train = df_train.truncate(before=0, after=4209333)
df_test = df_test.truncate(before=4209334, after=5261667)
print('complete')

complete


In [70]:
df_train = df_train.drop(df_train.columns[df_train.columns.str.contains('unnamed', case = False)],axis = 1)
print(len(df_train))
df_train.head(3)

4209334


Unnamed: 0,stars,name_of_month,day_of_week,user_average_stars,user_cool,user_funny,user_userful,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,business_catgry,label
0,-0.660753,0.22354,-0.333416,0.098172,-0.00104,-0.000746,-0.001261,-0.006663,-0.007403,-0.013611,-0.182665,-0.245179,0
1,0.339247,0.496267,-0.166749,-0.226828,-0.00104,-0.000746,-0.001257,-0.008252,-0.00667,-0.043918,-0.057665,-0.182679,0
2,-0.660753,-0.231006,-0.500082,-0.124328,-0.001017,-0.000534,-0.001101,0.001702,-0.00707,0.000251,0.067335,-0.245179,0


In [71]:
df_test = df_test.drop(df_test.columns[df_test.columns.str.contains('unnamed', case = False)],axis = 1)
print(len(df_test))
df_test.head(3)

1052334


Unnamed: 0,stars,name_of_month,day_of_week,user_average_stars,user_cool,user_funny,user_userful,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,business_catgry,label
4209334,0.339247,0.041722,0.499918,-0.241828,-0.001031,-0.000722,-0.001217,-0.009507,-0.00747,-0.046908,0.317335,0.442321,0
4209335,0.339247,-0.412824,0.499918,0.163172,-0.00104,-0.000741,-0.001221,-0.007249,0.023007,-0.030328,0.067335,-0.245179,0
4209336,0.339247,-0.231006,-0.500082,0.065672,-0.00104,-0.000746,-0.001261,-0.010009,-0.007337,-0.044734,0.067335,0.129821,0


In [72]:
def convert_dataframe(dataframe, normalize=False):
    df_cat_one_hot = pd.get_dummies(dataframe[CATEGORICAL_COLUMNS])
    df_continous_cols = dataframe[CONTINUOUS_COLUMNS]
    df_one_hot = pd.concat([df_continous_cols, df_cat_one_hot], axis=1)
    print("shape: %s" % (df_one_hot.shape,))
    return df_one_hot

X_train = convert_dataframe(df_train, normalize=True)
X_test = convert_dataframe(df_test, normalize=True)

shape: (4209334, 11)
shape: (1052334, 11)


In [73]:
X_test.head(2)

Unnamed: 0,user_average_stars,user_cool,user_funny,user_userful,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,name_of_month,day_of_week,business_catgry
4209334,-0.241828,-0.001031,-0.000722,-0.001217,-0.009507,-0.00747,-0.046908,0.317335,0.041722,0.499918,0.442321
4209335,0.163172,-0.00104,-0.000741,-0.001221,-0.007249,0.023007,-0.030328,0.067335,-0.412824,0.499918,-0.245179


In [74]:
X_train.head(2)

Unnamed: 0,user_average_stars,user_cool,user_funny,user_userful,user_num_reviews,user_num_friends,business_num_reviews,business_average_stars,name_of_month,day_of_week,business_catgry
0,0.098172,-0.00104,-0.000746,-0.001261,-0.006663,-0.007403,-0.013611,-0.182665,0.22354,-0.333416,-0.245179
1,-0.226828,-0.00104,-0.000746,-0.001257,-0.008252,-0.00667,-0.043918,-0.057665,0.496267,-0.166749,-0.182679


In [76]:
y_train, y_test = df_train[LABEL_COLUMN].values, df_test[LABEL_COLUMN].values
y_train.shape, y_test.shape

((4209334,), (1052334,))

In [77]:
def fully_connected(dense_size, input):
    x = BatchNormalization()(input)
    x = Dense(dense_size, activation='relu')(x)
    return Dropout(0.5)(x)

In [None]:
input = Input(shape=(X_train.shape[1],))
hidden1 = fully_connected(50, input)
hidden2 = fully_connected(25, hidden1)
output = Dense(1, activation='sigmoid')(hidden2)

simple_model = Model(input, output)

optimizer = Adam(lr=1e-3)
simple_model.compile(loss='binary_crossentropy', 
                     optimizer=optimizer, 
                     metrics=['accuracy'])
simple_model.fit(X_train.values, y_train,
                epochs=5, batch_size=64,
                validation_data=(X_test.values, y_test))

  


Train on 4209334 samples, validate on 1052334 samples
Epoch 1/10
Epoch 2/10
 723840/4209334 [====>.........................] - ETA: 2:29 - loss: 1.0017e-07 - acc: 1.0000

In [None]:
K.set_value(embeddings_model.optimizer.lr, 1e-4)
simple_model.fit(X_train.values, y_train,
                epochs=5, batch_size=64,
                validation_data=(X_test.values, y_test))

In [None]:
K.set_value(embeddings_model.optimizer.lr, 1e-5)
simple_model.fit(X_train.values, y_train,
                epochs=5, batch_size=64,
                validation_data=(X_test.values, y_test))