In [2]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten,Conv2D,Conv1D,MaxPooling1D,  Dropout
from keras.layers import concatenate
from keras.layers import Bidirectional, GlobalMaxPool1D,BatchNormalization
from keras.models import Model, Sequential
from keras.layers import Convolution1D,GlobalMaxPooling1D
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.utils import to_categorical
import numpy as np
import sys
import os

In [3]:
df = pd.read_csv("C:/Users/Harvey/Desktop/Yelp_data_set/restuarant_review_5_label_unbalanced.csv")
train = df.sample(frac = 0.8,random_state = 200)
test = df.drop(train.index)
train.loc[:,'stars'] -= 1
print(np.unique(train['stars']))

[0 1 2 3 4]


In [4]:
max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train['Processed_Reviews'])
list_tokenized_train = tokenizer.texts_to_sequences(train['Processed_Reviews'])


maxlen = 130
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
y = to_categorical(train['stars'])
#####################
# Test data
tokenizer.fit_on_texts(test['Processed_Reviews'])
list_tokenized_test = tokenizer.texts_to_sequences(test['Processed_Reviews'])


maxlen = 130
X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)
test.loc[:,'stars'] -=1
y_test = test['stars']
print(np.unique(test['stars']))

[0 1 2 3 4]


In [6]:
#####################################################################
# Using pretrained glove vector
#####################################################################
GLOVE_DIR = "C:/Users/Harvey/Desktop/Yelp_data_set/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding = 'utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

Total 400000 word vectors in Glove 6B 100d.


In [7]:
word_index = tokenizer.word_index
embed_size = 100
embedding_matrix = np.random.random((len(word_index) + 1, embed_size))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            embed_size,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)

In [8]:
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters = 50, kernel_size = 2, padding='same',activation = "relu"))
model.add(GlobalMaxPool1D())
#model.add(Flatten())
model.add(Dense(5, activation="softmax"))
model.summary()
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 130, 100)          27500700  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 130, 50)           10050     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 255       
Total params: 27,511,005
Trainable params: 10,305
Non-trainable params: 27,500,700
_________________________________________________________________


In [9]:
batch_size = 100
epochs = 3
model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Train on 377894 samples, validate on 94474 samples
Epoch 1/3
 15700/377894 [>.............................] - ETA: 2:02 - loss: 1.4169 - acc: 0.3989

KeyboardInterrupt: 

In [10]:
#########################################
# Yoon kim architure
embed_size = 100
embedding_layer = Embedding(len(word_index) + 1,
                            embed_size,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)

In [11]:
#############################################
# Original yoon kim 
#############################################
conv_filters = 128
sequence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Specify each convolution layer and their kernel siz i.e. n-grams 
conv1_1 = Conv1D(filters=conv_filters, kernel_size=3)(embedded_sequences)
btch1_1 = BatchNormalization()(conv1_1)
actv1_1 = Activation('relu')(btch1_1)
drp1_1  = Dropout(0.2)(actv1_1)
glmp1_1 = GlobalMaxPooling1D()(drp1_1)

conv1_2 = Conv1D(filters=conv_filters, kernel_size=4)(embedded_sequences)
btch1_2 = BatchNormalization()(conv1_2)
actv1_2 = Activation('relu')(btch1_2)
drp1_2  = Dropout(0.2)(actv1_2)
glmp1_2 = GlobalMaxPooling1D()(drp1_2)

conv1_3 = Conv1D(filters=conv_filters, kernel_size=5)(embedded_sequences)
btch1_3 = BatchNormalization()(conv1_3)
actv1_3 = Activation('relu')(btch1_3)
drp1_3  = Dropout(0.2)(actv1_3)
glmp1_3 = GlobalMaxPooling1D()(drp1_3)

conv1_4 = Conv1D(filters=conv_filters, kernel_size=6)(embedded_sequences)
btch1_4 = BatchNormalization()(conv1_4)
actv1_4 = Activation('relu')(btch1_4)
drp1_4  = Dropout(0.2)(actv1_4)
glmp1_4 = GlobalMaxPooling1D()(drp1_4)

# Gather all convolution layers
cnct = concatenate([glmp1_1, glmp1_2, glmp1_3, glmp1_4], axis=1)
drp1 = Dropout(0.2)(cnct)

dns1  = Dense(32, activation='relu')(drp1)
btch1 = BatchNormalization()(dns1)
drp2  = Dropout(0.2)(btch1)

out = Dense(5, activation='softmax')(drp2)

In [12]:
model = Model(inputs=sequence_input, outputs=out)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 130)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 130, 100)     27500700    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 128, 128)     38528       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 127, 128)     51328       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_4 (

In [13]:
batch_size = 100
epochs = 3
model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Train on 377894 samples, validate on 94474 samples
Epoch 1/3
  4300/377894 [..............................] - ETA: 33:10 - loss: 1.7366 - acc: 0.2814

KeyboardInterrupt: 

In [None]:
prediction = model.predict(X_test)
print(y_pred.shape)
y_pred = np.argmax(prediction,axis = 1)
y_test = np.array(y_test)
print(y_pred[:10])
print(y_test.shape)
print(y_pred.shape)
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix
print('accuracy :{0}'.format(accuracy_score(y_pred, y_test)))