In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import json

import tensorflow as tf

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [31]:
# reads in the json file, only to the max entries and returns them as json_array, if max entries is set to 0 then it reads the full thing
def read_partial_json_file(filename, max_entries=0, encoding='utf-8'):
    json_array = []
    with open(filename, 'r', encoding=encoding) as file:
        if max_entries == 0:
            for line in file:
                json_array.append(json.loads(line))
        else:
            for _ in range(max_entries):
                line = file.readline()
                if not line:
                    break
                json_array.append(json.loads(line))
    return json_array


def add_missing_keys(json_array):
    for obj in json_array:
        for key in ['stars', 'useful', 'funny', 'cool', 'text']:
            if key not in obj:
                obj[key] = 0
                if key == 'stars':
                    obj[key] = 3
                print("Key {} not found in json".format(key))
    return json_array


# removes specified keys from json array
def remove_keys(json_array, keys_to_remove):
    for obj in json_array:
        for key in keys_to_remove:
            obj.pop(key, None)
    return json_array


def ConvertJSONFileToDataFrame(filename, max_entries=1000, encoding='utf-8'):
    #load in the json array
    json_array = read_partial_json_file(filename, max_entries, encoding)
    #add in the missing keys, will set to 0 for now but a heuristic for this will have to be made.
    json_array = add_missing_keys(json_array)
    df = pd.DataFrame(json_array)
    ColumnsToRemove = ['business_id', 'user_id', 'date', 'review_id']
    df = df.drop(columns=ColumnsToRemove)
    return df

In [32]:
from nltk.corpus import stopwords
from nltk import SnowballStemmer, word_tokenize

filename = 'yelp_academic_dataset_review.json'
dataset = ConvertJSONFileToDataFrame(filename, 1000)

stem = SnowballStemmer("english")
stopWords = stopwords.words('english')

def stemText(text):
    return " ".join([i for i in word_tokenize(text) if not i in stopWords])

#Data preprocessing: convert text to lowercase
X = dataset['text'].map(lambda x: stemText(x.lower()))
#convert star count to categories starting from 0
translation = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4 }
labels = ['1', '2', '3', '4', '5']
y = dataset['stars'].copy()
y.replace(translation, inplace=True)
y = to_categorical(y,5)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=117)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=312)

In [33]:
vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(X_train)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_tok = tokenizer.texts_to_sequences(X_train)

max_length = max([len(x) for x in X_train_tok])
vocab_size = len(tokenizer.word_index)+1 #add 1 to account for unknown word
print("Vocabulary size: {}".format(vocab_size))
print("Max length of sentence: {}".format(max_length))
X_train_tok = pad_sequences(X_train_tok, max_length ,padding='post')

X_train_tok

Vocabulary size: 7270
Max length of sentence: 480


array([[  24,  721, 1375, ...,    0,    0,    0],
       [ 946,   67,  139, ...,    0,    0,    0],
       [  24,    5,   14, ...,    0,    0,    0],
       ...,
       [ 273,  149,  325, ...,    0,    0,    0],
       [7260, 1376, 1916, ...,    0,    0,    0],
       [  38,  612,  950, ...,    0,    0,    0]])

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Dropout
from tensorflow.keras.layers import Bidirectional,Embedding,Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

embedding_vector_length=32
num_classes = 5
model = Sequential()
model.add(Embedding(vocab_size,embedding_vector_length, input_shape=(max_length, )))
model.add(Bidirectional(LSTM(250,return_sequences=True)))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64,activation='relu'))
model.add(Dense(32,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16,activation='relu'))
model.add(Dense(num_classes,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
callbacks = [EarlyStopping(monitor='val_accuracy', patience=3), ModelCheckpoint('../model/model.keras', save_best_only=True, save_weights_only=False)]
model.summary()

  super().__init__(**kwargs)


In [35]:
history = model.fit(X_train_tok, y_train, validation_split=0.1, epochs=15, batch_size=32, verbose=1, callbacks=callbacks)

Epoch 1/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 2s/step - accuracy: 0.3851 - loss: 1.5409 - val_accuracy: 0.5000 - val_loss: 1.4304
Epoch 2/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 2s/step - accuracy: 0.3631 - loss: 1.5323 - val_accuracy: 0.5000 - val_loss: 1.4255
Epoch 3/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2s/step - accuracy: 0.4338 - loss: 1.4446 - val_accuracy: 0.5000 - val_loss: 1.4592
Epoch 4/15
[1m19/23[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m9s[0m 2s/step - accuracy: 0.4287 - loss: 1.4574 

KeyboardInterrupt: 

In [None]:
X_test_token = tokenizer.texts_to_sequences(X_test)
X_test_token = pad_sequences(X_test_token, max_length ,padding='post')
pred = model.predict(X_test_token)
pred = to_categorical(pred,5)

In [None]:
X_valid_token = tokenizer.texts_to_sequences(X_valid)
X_valid_token = pad_sequences(X_valid_token, max_length ,padding='post')

model.evaluate(X_test_token, y_test)

In [None]:
model.save("model_bad2.keras")

In [None]:
m2 = tf.keras.models.load_model('model_bad.h5')