In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from keras.layers import Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input, MaxPooling1D

from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score

from tensorflow.keras.optimizers import Adam

# Hyperparameters shared by the preprocessing, model and training cells.
MAX_SEQUENCE_LENGTH = 100  # `maxlen` for pad_sequences and the model's input length
MAX_VOCAB_SIZE = 10000  # `num_words` cap passed to the Tokenizer
EMBEDDING_DIM = 100  # output dimension of the Embedding layer
VALIDATION_SPLIT = 0.1  # `validation_split` fraction passed to model.fit
BATCH_SIZE = 128  # `batch_size` passed to model.fit
EPOCHS = 5  # `epochs` passed to model.fit

In [4]:
cd drive/MyDrive/

/content/drive/MyDrive


In [7]:
# Load the training split.
train = pd.read_csv('train_clean_removed_emoticons.csv')

# Model inputs: review text, with missing reviews replaced by a placeholder
# string so the tokenizer always receives text.
sentences = train["Review Text"].fillna("DUMMY_VALUE").values

# Labels: the raw ratings and their one-hot encoding.
# to_categorical allocates columns 0..max(y); the reports further down show
# ratings 1-5, so column 0 is never the true class.
y = train["Rating"].values
targets = to_categorical(y)

# Distinct rating values present in the training data.
possible_labels = list(set(train["Rating"]))

In [10]:
# Learn the vocabulary from the training text only; `num_words` limits which
# word indices are emitted when texts are converted to sequences.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
word2idx = tokenizer.word_index

# Encode every review as a list of word indices.
sequences = tokenizer.texts_to_sequences(sentences)

# Sorted sequence lengths (handy for inspecting the length distribution).
s = sorted(map(len, sequences))

# Pad/truncate every sequence to the fixed length the model expects.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Number of rows the embedding matrix needs (+1 because word indices start at 1).
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)

In [11]:
# Embedding lookup: one EMBEDDING_DIM-sized vector per vocabulary index,
# initialised by Keras and learned during training (no pretrained weights given).
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
)

In [13]:
# 1-D convolutional text classifier over the embedded token sequence.
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)

# Two convolution stages, then a global max over the time axis so the dense
# head receives a fixed-size vector.
x = Conv1D(256, 3, activation="relu")(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation="relu")(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation="relu")(x)

# Size the softmax head from the one-hot targets themselves. The previous
# `len(possible_labels) + 1` only matches to_categorical's width when the
# ratings are exactly 1..K with none missing; otherwise fit() fails with a
# shape mismatch.
num_classes = targets.shape[1]
output = Dense(num_classes, activation="softmax")(x)

model = Model(input_, output)
model.compile(
    loss="categorical_crossentropy",
    # NOTE(review): 0.01 is 10x Adam's default learning rate (0.001); consider
    # lowering it if training proves unstable.
    optimizer=Adam(learning_rate=0.01),
    metrics=["accuracy"],
)

In [14]:
# Train the classifier. Keras takes the validation samples from the end of
# the arrays (the last VALIDATION_SPLIT fraction), before any shuffling.
# `r` is the History object returned by fit.
r = model.fit(
    x=data,
    y=targets,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
# Predict on the training inputs. `data` still holds the padded training
# sequences here (including the part held out by validation_split), so the
# scores computed from `y_pred` in the next cells are training-set scores.
p = model.predict(data)

# Predicted class per sample = index of the highest softmax probability.
# Vectorised over all rows instead of a per-row Python loop.
y_pred = np.argmax(p, axis=1)

In [16]:
# Confusion matrix on the training data (rows: true rating, columns: predicted).
print(confusion_matrix(y_true=y, y_pred=y_pred))

[[ 415   12  195   18   17]
 [ 239   48  841   86   23]
 [  75   11 1410  695   70]
 [   4    0  210 2848  858]
 [   0    1   47  551 9439]]


In [17]:
# Per-class precision / recall / F1 on the training data.
print(classification_report(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.57      0.63      0.60       657
           2       0.67      0.04      0.07      1237
           3       0.52      0.62      0.57      2261
           4       0.68      0.73      0.70      3920
           5       0.91      0.94      0.92     10038

    accuracy                           0.78     18113
   macro avg       0.67      0.59      0.57     18113
weighted avg       0.78      0.78      0.76     18113



In [18]:
# Micro-averaged F1 on the training data.
print(f1_score(y_true=y, y_pred=y_pred, average="micro"))

0.7817589576547231


**Test** — evaluate the trained model on the held-out file `validation_clean_removed_emoticons.csv`

In [22]:
# Load the held-out split and preprocess it exactly like the training text.
test = pd.read_csv('validation_clean_removed_emoticons.csv')

# Use the same missing-value placeholder as training. The previous
# `str(x).lower()` turned missing reviews into the token "nan", which the
# model never saw as a stand-in for an empty review. Explicit lowercasing is
# unnecessary because the Tokenizer lowercases by default.
test_text = test["Review Text"].fillna("DUMMY_VALUE").astype(str).values

# Reuse the tokenizer fitted on the training text (no re-fitting on test).
# NOTE: this overwrites the training `sequences` and `data`; re-run the
# preprocessing cell before evaluating on the training set again.
sequences = tokenizer.texts_to_sequences(test_text)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

y_test = test["Rating"].values

In [23]:
# Predict on the held-out inputs (`data` was replaced by the padded held-out
# sequences in the previous cell).
p = model.predict(data)

# Predicted class per sample = index of the highest softmax probability.
# Vectorised over all rows instead of a per-row Python loop.
y_pred = np.argmax(p, axis=1)

In [24]:
# Confusion matrix on the held-out data (rows: true rating, columns: predicted).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[  27    2   29   14   10]
 [  19    1   79   44   14]
 [  18    2  101  122   39]
 [   1    0   55  196  243]
 [   3    0   29  201 1013]]


In [25]:
# Per-class precision / recall / F1 on the held-out data.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.40      0.33      0.36        82
           2       0.20      0.01      0.01       157
           3       0.34      0.36      0.35       282
           4       0.34      0.40      0.37       495
           5       0.77      0.81      0.79      1246

    accuracy                           0.59      2262
   macro avg       0.41      0.38      0.38      2262
weighted avg       0.57      0.59      0.57      2262



In [26]:
# Micro-averaged F1 on the held-out data.
print(f1_score(y_true=y_test, y_pred=y_pred, average="micro"))

0.5915119363395226
