In [84]:
import pandas as pd
import numpy as np

In [85]:
data = pd.read_csv("C:/Users/micha/yelp.csv")

In [86]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   business_id  10000 non-null  object
 1   date         10000 non-null  object
 2   review_id    10000 non-null  object
 3   stars        10000 non-null  int64 
 4   text         10000 non-null  object
 5   type         10000 non-null  object
 6   user_id      10000 non-null  object
 7   cool         10000 non-null  int64 
 8   useful       10000 non-null  int64 
 9   funny        10000 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 781.4+ KB


In [None]:
#Rule for NN when it comes to text data
#1. ANN/CNN understands only padded sequence(NUMBERS) (You need to represent text data into numbers)
#2. Input Size(Number of Tokens) must be FIXED
#3. Labels must be discrete numerical/binarized

In [88]:
data['stars'] = data['stars'].map({1:0, 2:0, 3:1, 4:1, 5:1})

In [89]:
data.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,1,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,1,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,1,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,1,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,1,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [90]:
#Seperate data as features and label
features = data.text.values
label = data.stars.values

In [91]:
label.shape

(10000,)

In [92]:
#Train test split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(features,
                                                 label,
                                                 test_size=0.2,
                                                 random_state=1)

In [93]:
#Start with Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer

#Decide the Vocabulary Word Frequency Size. Used only when Regularization is required

vocabFreqWordSize = None

#Convert the sentences into sequence of words

tokenizer = Tokenizer(num_words=vocabFreqWordSize, oov_token="<DontKnow>", split=" ")

#Fit tokenizer with training set

tokenizer.fit_on_texts(X_train)

In [94]:
tokenizer.word_index

{'<DontKnow>': 1,
 'the': 2,
 'and': 3,
 'a': 4,
 'i': 5,
 'to': 6,
 'of': 7,
 'was': 8,
 'is': 9,
 'it': 10,
 'for': 11,
 'in': 12,
 'that': 13,
 'my': 14,
 'with': 15,
 'but': 16,
 'you': 17,
 'this': 18,
 'they': 19,
 'on': 20,
 'we': 21,
 'have': 22,
 'not': 23,
 'had': 24,
 'are': 25,
 'good': 26,
 'so': 27,
 'place': 28,
 'at': 29,
 'food': 30,
 'were': 31,
 'as': 32,
 'be': 33,
 'there': 34,
 'great': 35,
 'like': 36,
 'if': 37,
 'all': 38,
 'me': 39,
 'just': 40,
 'out': 41,
 'very': 42,
 'here': 43,
 'one': 44,
 'their': 45,
 'or': 46,
 'get': 47,
 "it's": 48,
 'from': 49,
 'up': 50,
 'when': 51,
 'go': 52,
 'time': 53,
 'really': 54,
 'our': 55,
 'some': 56,
 'about': 57,
 'service': 58,
 'would': 59,
 'what': 60,
 'an': 61,
 'your': 62,
 'can': 63,
 'been': 64,
 'which': 65,
 'back': 66,
 'more': 67,
 'only': 68,
 'also': 69,
 'no': 70,
 'will': 71,
 "don't": 72,
 'by': 73,
 'too': 74,
 'love': 75,
 'has': 76,
 'little': 77,
 'nice': 78,
 "i'm": 79,
 'well': 80,
 'other': 81

In [95]:
#Lets create Sequence object

seqTrain = tokenizer.texts_to_sequences(X_train)

In [96]:
seqTest = tokenizer.texts_to_sequences(X_test)

In [97]:
#Lets pad the sequence data

from tensorflow.keras.preprocessing.sequence import pad_sequences
train_data = pad_sequences(seqTrain)
T = train_data.shape[1]
T

953

In [98]:
testData = pad_sequences(seqTest,maxlen=T)

In [99]:
train_data.shape

(8000, 953)

In [100]:
testData.shape

(2000, 953)

In [101]:
len(tokenizer.word_index)

27838

In [102]:
#Modelling Phase

import tensorflow as tf

vocabSize=len(tokenizer.word_index)
maxlen=T
embeddingDimension = 20 #Hyperparameter any value between 10 to inf (Natural no)

In [103]:
model = tf.keras.Sequential()
#Embedding converts sequence data into a dense vector. Embedding is responsible to preserve the semantic meaning of the sentence

model.add(tf.keras.layers.Embedding(vocabSize + 1, embeddingDimension, input_length=maxlen))

model.add(tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation="relu"))

model.add(tf.keras.layers.GlobalAveragePooling1D())

model.add(tf.keras.layers.Dense(24, activation="relu"))
model.add(tf.keras.layers.Dense(12, activation="relu"))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))



In [104]:
#Custom Callback for early stopping

class MyThresholdCallBack(tf.keras.callbacks.Callback):
    def __init__(self,cl):
        super(MyThresholdCallBack, self).__init__()
        self.cl = cl

    def on_epoch_end(self, epoch, logs=None):
        test_score = logs["val_accuracy"]
        train_score = logs["accuracy"]

        if test_score > train_score and test_score > self.cl:
        #if test_score > self.cl:
            self.model.stop_training = True

In [105]:
myAccuracyMonitor = MyThresholdCallBack(cl=0.9)

In [106]:
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=['accuracy'])

In [107]:
model.fit(train_data,y_train, epochs=10, validation_data=(testData,y_test), callbacks=[myAccuracyMonitor])

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.8317 - loss: 0.4816 - val_accuracy: 0.8335 - val_loss: 0.4343
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.8376 - loss: 0.4239 - val_accuracy: 0.8840 - val_loss: 0.2980
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.8975 - loss: 0.2390 - val_accuracy: 0.8945 - val_loss: 0.2560
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9414 - loss: 0.1544 - val_accuracy: 0.8990 - val_loss: 0.2689
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.9687 - loss: 0.0898 - val_accuracy: 0.8980 - val_loss: 0.3055
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.9769 - loss: 0.0938 - val_accuracy: 0.8975 - val_loss: 0.3154
Epoch 7/10
[1m250/250

<keras.src.callbacks.history.History at 0x1b099590a70>