In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Luxury_Beauty/luxury_beauty_cLeaned.csv')

df.head()

Unnamed: 0,review,vote
0,this handcream has a beautiful fragrance it do...,1
1,wonderful hand lotion for seriously dry skin s...,1
2,best hand cream around silky thick soaks in a...,1
3,thanks five stars,1
4,great hand lotion soaks right in and leaves s...,1


In [3]:
df.shape

(34278, 2)

lets devide the dataset into train, dev and test set

In [4]:
from sklearn.model_selection import train_test_split

X = df['review'].astype(str)
y = df['vote']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=30)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=.5, random_state=30)

print(f'x train shape: {x_train.shape}')
print(f'x test shape: {x_test.shape}')
print(f'x val shape: {x_val.shape}')

x train shape: (27422,)
x test shape: (3428,)
x val shape: (3428,)


In [5]:
y_train.shape

(27422,)

In [6]:
print(f'train count values: \n{y_train.value_counts()}')
print(f'test count values: \n{y_test.value_counts()}')
print(f'val count values: \n{y_val.value_counts()}')

train count values: 
1    22240
0     5182
Name: vote, dtype: int64
test count values: 
1    2779
0     649
Name: vote, dtype: int64
val count values: 
1    2784
0     644
Name: vote, dtype: int64


## Data Tokenization

now i will tokenize the data using keras tokenizer

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer(oov_token='UNK')
#fit the tokenizer on all the data to avoid oov token
tok.fit_on_texts(X)
train_sequence = tok.texts_to_sequences(x_train)
test_sequence = tok.texts_to_sequences(x_test)
val_sequence = tok.texts_to_sequences(x_val)

## model selection

in this part i will try 3 diffrent algorithms 

1.   Bidirectional lstm model
2.   Fine tune a pretrained glove embeddings
3.   BERT algorithm



### Bidirectional LSTM

*   first we will pad the sequences to the maximum review length 
*   more importantly i will use mast=True in The LSTM layer to avoid the padded zeros
*   dense layer with 16 units and tanh as activation
*   output layer with binary_crossentropy as loss and sigmoid as activation



In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = max(len(str(x).split()) for x in X)
word_length = len(tok.word_index) + 1

train_padded = pad_sequences(train_sequence, maxlen=max_len)
test_padded = pad_sequences(test_sequence, maxlen=max_len)
val_padded = pad_sequences(val_sequence, maxlen=max_len)

In [9]:
word_length

49736

In [10]:
import tensorflow as tf

def create_dataset(texts, labels, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    dataset = dataset.shuffle(len(texts))
    dataset = dataset.batch(batch_size, drop_remainder=True).repeat()
    return dataset

batch_size = 64

train_labels = np.array(y_train)
test_labels = np.array(y_test)
val_labels = np.array(y_val)

train_dataset = create_dataset(train_padded, train_labels, batch_size)
test_dataset = create_dataset(test_padded, test_labels, batch_size)
val_dataset = create_dataset(val_padded, val_labels, batch_size)


In [11]:
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Input, Embedding, Dropout, Masking
from tensorflow.keras.models import Model

input_layer = Input(shape=(max_len,))

x = Embedding(word_length, 50)(input_layer)
x = Dropout(.2)(x)
x = Masking(mask_value=0.0)(x)
x = Bidirectional(LSTM(50, return_sequences=True))(x)
x = Bidirectional(LSTM(16))(x)
x = Dense(16, activation='tanh')(x)

output_layer = Dense(1, activation='sigmoid')(x)

model = Model(input_layer, output_layer)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2882)]            0         
                                                                 
 embedding (Embedding)       (None, 2882, 50)          2486800   
                                                                 
 dropout (Dropout)           (None, 2882, 50)          0         
                                                                 
 masking (Masking)           (None, 2882, 50)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 2882, 100)        40400     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 32)               14976     
 nal)                                                        

In [12]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)
tensorboard_callback = TensorBoard(log_dir='./logs')
num_epochs = 10

history = model.fit(train_dataset, 
                    epochs=num_epochs, 
                    batch_size=batch_size,
                    steps_per_epoch=428,
                    validation_data=val_dataset, 
                    validation_steps=53,
                    callbacks=[early_stopping_callback, tensorboard_callback]
                    )


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [16]:
model.evaluate(test_dataset, steps=54)



[0.30969423055648804, 0.9100115895271301]

## coculosion

as we can see we got 91% accuracy on the setiment analysis model