In [250]:
import pandas as pd
import numpy as np
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import matplotlib.pyplot as plt
%matplotlib inline


In [251]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Read Dataset serta cek isinya

In [252]:
yelp_reviews=pd.read_csv("/content/drive/MyDrive/Deep Learning/yelp_review_polarity_csv/train.csv", names = ["label", 'text'])
yelp_reviews.head(2)

Unnamed: 0,label,text
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...


In [253]:
#let's check out a sample review
yelp_reviews.text[5]

'Wing sauce is like water. Pretty much a lot of butter and some hot sauce (franks red hot maybe).  The whole wings are good size and crispy, but for $1 a wing the sauce could be better. The hot and extra hot are about the same flavor/heat.  The fish sandwich is good and is a large portion, sides are decent.'

In [254]:
##checking for nulls
yelp_reviews.isnull().any()

label    False
text     False
dtype: bool

In [255]:
yelp_reviews.label.unique()

array([1, 2])

In [256]:
yelp_reviews.describe()

Unnamed: 0,label
count,560000.0
mean,1.5
std,0.5
min,1.0
25%,1.0
50%,1.5
75%,2.0
max,2.0


In [257]:
reviews = yelp_reviews[:10000]

In [258]:
reviews['label'].value_counts()

1    5293
2    4707
Name: label, dtype: int64

In [259]:
texts = reviews["text"].values
labels = reviews["label"].values

In [260]:
reviews["label"].unique()

array([1, 2])


#Konversi teks menjadi representasi numerical


In [261]:
MAX_NUM_WORDS=1000 # Jumlah kata unik yang ingin digunakan
MAX_SEQUENCE_LENGTH=100 # maksimum jumlah review dalam kalimat


tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 30298 unique tokens.


In [262]:
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


Shape of data tensor: (10000, 100)
Shape of label tensor: (10000, 3)


In [263]:
# menghilangkan column tambahan yang diakibatkan karena function to_categorical yang harus dimulai dari angka 0
labels = labels[:,1:]
labels.shape

(10000, 2)

<a id='split'></a>

###  split the data into a training set and a validation set

In [264]:
VALIDATION_SPLIT=0.2

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

<a id='embedding_layer'></a>

### Preparing the Embedding layer

GloVe (Global Vectors for Word Representation) is a model for learning word embeddings. The "6B" in "glove.6B.100d" refers to the dataset that the model was trained on, which contains 6 billion tokens (words or word pieces). The "100d" refers to the dimensionality of the word vectors, meaning each word is represented by a 100-dimensional vector. These vectors capture the meaning and context of the words in a numerical form that can be used as input to machine learning models.

In [265]:
GLOVE_DIR='/home/ashish/test/glove.6B/'

import os
embeddings_index = {}
f = open('/content/drive/MyDrive/Deep Learning/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


<a id='embedding_matrix'></a>


### compute  embedding matrix
At this point we can leverage our embedding_index dictionary and our word_index to compute our embedding matrix


In [266]:
EMBEDDING_DIM = 100 # 

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # jika kata tidak ditemukan dalam embedding index maka akan dikembalikan 0
        embedding_matrix[i] = embedding_vector

### Define Embedding Layer 

In [267]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)


### Training model

In [None]:
pip install keras-tuner

In [269]:
from keras.layers import Bidirectional, GlobalMaxPool1D,Conv1D
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

from keras.models import Model


inp = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedded_sequences = embedding_layer(inp)
x = Bidirectional(LSTM(100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(100, activation="relu")(x)
x = Dropout(0.01)(x)
x = Dense(2, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [270]:
# Hyperparameter Tuning Test
# def build_model(hp):
#     inp = Input(shape=(MAX_SEQUENCE_LENGTH,))
#     x = embedded_sequences = embedding_layer(inp)
#     x = Bidirectional(LSTM(100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
#     x = GlobalMaxPool1D()(x)
#     x = Dense(100, activation="relu")(x)
#     x = Dropout(0.01)(x)
#     x = Dense(2, activation="sigmoid")(x)
#     model = Model(inputs=inp, outputs=x)
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  
#     return model

# tuner = RandomSearch(build_model,
#                      max_trials=5,
#                      executions_per_trial=3)

# tuner.search(x_train, y_train, epochs=5, validation_data=(x_val, y_val))


In [271]:
# DC-Bidirectional LSTM
# from keras.layers import Bidirectional, Dense, LSTM
# from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
# from keras.models import Sequential
# # Define the number of hidden units in the LSTM layers
# hidden_units = 50

# # Create a sequential model
# model = Sequential()

# # Add a bidirectional LSTM layer with dense connections
# inp = Input(shape=(MAX_SEQUENCE_LENGTH,))
# x = embedded_sequences = embedding_layer(inp)
# model.add(Bidirectional(LSTM(hidden_units, return_sequences=True, kernel_initializer='glorot_uniform'),input_shape=(50,), merge_mode='concat'))
# model.add(Dense(hidden_units, activation='relu'))
# model.add(Dense(50, activation="relu"))
# # Add additional LSTM and dense layers as needed
# model.add(LSTM(hidden_units, return_sequences=True))
# model.add(Dense(hidden_units, activation='relu'))
# model.add(Dropout(0,1))

# # Add a final dense layer for the output
# model.add(Dense(2, activation='softmax'))

# # Compile the model
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [272]:
model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=5, batch_size=64);

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [273]:
model.evaluate(x_train, y_train)



[0.26099473237991333, 0.8951249718666077]

In [274]:
test = pd.read_csv("/content/drive/MyDrive/Deep Learning/yelp_review_polarity_csv/test.csv", names = ["label", 'text'])

In [275]:
test.label.unique()

array([2, 1])

In [276]:
test.describe()

Unnamed: 0,label
count,38000.0
mean,1.5
std,0.500007
min,1.0
25%,1.0
50%,1.5
75%,2.0
max,2.0


In [277]:
test = test[:10000]

In [278]:
test_texts = test["text"].values
test_labels = test["label"].values

In [279]:
test['label'].value_counts()

1    5367
2    4633
Name: label, dtype: int64

In [280]:
MAX_NUM_WORDS=1000 # how many unique words to use (i.e num rows in embedding vector)
MAX_SEQUENCE_LENGTH=100 # max number of words in a review to use

sequences = tokenizer.texts_to_sequences(test_texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

test_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 30298 unique tokens.


In [281]:
test_labels = to_categorical(np.asarray(test_labels))
print('Shape of data tensor:', test_data.shape)
print('Shape of label tensor:', test_labels.shape)

Shape of data tensor: (10000, 100)
Shape of label tensor: (10000, 3)


In [282]:
test_labels = test_labels[:,1:]

In [283]:
pred =  model.predict(test_data)
pred



array([[0.86218655, 0.15692313],
       [0.9721109 , 0.02475358],
       [0.26698536, 0.7376033 ],
       ...,
       [0.08582633, 0.9072478 ],
       [0.01594464, 0.98400074],
       [0.6215094 , 0.3690671 ]], dtype=float32)

In [284]:
y_pred_bool = np.argmax(pred, axis=1)

In [285]:
test_labels

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [286]:
y_pred_bool = to_categorical(np.asarray(y_pred_bool))
y_pred_bool

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [287]:
from sklearn.metrics import classification_report

print(classification_report(test_labels, y_pred_bool))

              precision    recall  f1-score   support

           0       0.85      0.90      0.87      5367
           1       0.87      0.82      0.85      4633

   micro avg       0.86      0.86      0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000
 samples avg       0.86      0.86      0.86     10000



In [288]:
model.evaluate(test_data,test_labels)



[0.3171703517436981, 0.8619999885559082]