In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Luxury_Beauty/luxury_beauty_cLeaned.csv')

df.head()

Unnamed: 0,review,vote
0,this handcream has a beautiful fragrance it do...,1
1,wonderful hand lotion for seriously dry skin s...,1
2,best hand cream around silky thick soaks in a...,1
3,thanks five stars,1
4,great hand lotion soaks right in and leaves s...,1


In [3]:
from sklearn.model_selection import train_test_split

X = df['review'].astype(str)
y = df['vote']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=30)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=.5, random_state=30)

print(f'x train shape: {x_train.shape}')
print(f'x test shape: {x_test.shape}')
print(f'x val shape: {x_val.shape}')

x train shape: (27422,)
x test shape: (3428,)
x val shape: (3428,)


## Data Tokenization

now i will tokenize the data using keras tokenizer

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer(oov_token='UNK')
#fit the tokenizer on all the data to avoid oov token
tok.fit_on_texts(X)
train_sequence = tok.texts_to_sequences(x_train)
test_sequence = tok.texts_to_sequences(x_test)
val_sequence = tok.texts_to_sequences(x_val)

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = max(len(str(x).split()) for x in X)
word_length = len(tok.word_index) + 1

train_padded = pad_sequences(train_sequence, maxlen=max_len)
test_padded = pad_sequences(test_sequence, maxlen=max_len)
val_padded = pad_sequences(val_sequence, maxlen=max_len)

In [6]:
word_length

49736

In [7]:
import tensorflow as tf

def create_dataset(texts, labels, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    dataset = dataset.shuffle(len(texts))
    dataset = dataset.batch(batch_size, drop_remainder=True).repeat()
    return dataset

batch_size = 64

train_labels = np.array(y_train)
test_labels = np.array(y_test)
val_labels = np.array(y_val)

train_dataset = create_dataset(train_padded, train_labels, batch_size)
test_dataset = create_dataset(test_padded, test_labels, batch_size)
val_dataset = create_dataset(val_padded, val_labels, batch_size)

### Fine Tune a Pretrained Glove Word Embedding

*   first we will pad the sequences to the maximum review length 
*   more importantly i will use mast=True in The LSTM layer to avoid the padded zeros
*   dense layer with 16 units and tanh as activation
*   output layer with binary_crossentropy as loss and sigmoid as activation



In [8]:
#download and unzip glove
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d glove

--2023-05-02 10:08:53--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-05-02 10:08:54--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-05-02 10:08:54--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [9]:
embedding_index = {} 
with open('glove/glove.6B.100d.txt', encoding='utf8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1], dtype='float')
    embedding_index[word] = coefs

In [10]:
len(tok.word_index) + 1

49736

In [11]:
#create embedding matrix
embedding_size = 100
embedding_matrix = np.zeros((word_length, embedding_size))
for word, i in tok.word_index.items():
  # if i > 10000:
  #       break
  coefs = embedding_index.get(word)
  if coefs is not None:
    embedding_matrix[i] = coefs

In [12]:
from pandas._libs.writers import word_len
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Input, Embedding, Dropout, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Constant

input_layer = Input(shape=(max_len,))

x = Embedding(word_length, 
              embedding_size,
              embeddings_initializer=Constant(embedding_matrix),
              input_length=max_len,
              trainable=True)(input_layer)
x = Dropout(.2)(x)
x = Masking(mask_value=0.0)(x)
x = Bidirectional(LSTM(50, return_sequences=True))(x)
x = Bidirectional(LSTM(16))(x)
x = Dense(16, activation='tanh')(x)

output_layer = Dense(1, activation='sigmoid')(x)

model = Model(input_layer, output_layer)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2882)]            0         
                                                                 
 embedding (Embedding)       (None, 2882, 100)         4973600   
                                                                 
 dropout (Dropout)           (None, 2882, 100)         0         
                                                                 
 masking (Masking)           (None, 2882, 100)         0         
                                                                 
 bidirectional (Bidirectiona  (None, 2882, 100)        60400     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 32)               14976     
 nal)                                                        

In [13]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=1)
tensorboard_callback = TensorBoard(log_dir='./logs2')
epochs = 10

model.fit(train_dataset,
          steps_per_epoch=428,
          validation_data=val_dataset,
          validation_steps=53,
          epochs=epochs,
          batch_size=batch_size,
          callbacks=[early_stopping_callback, tensorboard_callback])

Epoch 1/10
 61/428 [===>..........................] - ETA: 1:32:06 - loss: 0.5006 - binary_accuracy: 0.8064

KeyboardInterrupt: ignored