In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

In [None]:
# Read the raw reviews table; the CSV is decoded as latin-1.
reviews_csv = 'data/reviews.csv'
data = pd.read_csv(reviews_csv, encoding='latin-1')

# Report the (rows, columns) shape, then preview the first rows as the cell output.
print(data.shape)
data.head()

In [None]:
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.utils import pad_sequences


# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the vocabulary and padding length below are derived from this split, so a
# previously saved model presumably relies on it staying unchanged — confirm before editing.
X_train, X_test, y_train, y_test = train_test_split(
    data['Review'],
    data['Rating'],
    test_size=0.2,
    random_state=42,
)

# Encode the ratings as integer class ids; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Build the vocabulary from the training texts only, then turn each review
# into a list of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Bring every sequence to the length of the longest training sequence.
max_seq_length = max(map(len, X_train_sequences))
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_seq_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_seq_length)

## The convolutional neural network (CNN)

You can either <b>train</b> the model yourself by running the following cells (this will take some time!)

In [None]:
from gensim.models import KeyedVectors
from keras.layers import Embedding


# Load the pre-trained word vectors saved in gensim's native format.
# Source of the GoogleNews embedding: https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300
word2vec_model = KeyedVectors.load('data/word2vec_model')

# One row per tokenizer index, plus one extra row for index 0, which the
# tokenizer's word_index never assigns to a word.
vocab_size = len(tokenizer.word_index) + 1

# Take the vector width from the loaded embedding instead of hard-coding 300,
# so the matrix and the layer always agree with the vectors actually loaded.
embedding_dim = word2vec_model.vector_size

# Copy the pre-trained vector of every known word into its row; words missing
# from the embedding keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

# Frozen embedding layer for the CNN, initialised with the pre-trained vectors.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_seq_length,
    trainable=False
)

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization, LSTM, MaxPooling2D, Flatten
from keras.regularizers import l2


# Size the output layer from the label encoder rather than a hard-coded 5, so the
# softmax always has exactly one unit per encoded rating class.
num_classes = len(label_encoder.classes_)

# Text CNN: frozen pre-trained embeddings -> one 1D convolution over 3-token
# windows -> max over time -> small dense classifier. Dropout and L2 penalties
# regularise the trainable layers.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    Conv1D(64, 3, padding='valid', activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# Sparse categorical cross-entropy matches the integer class ids produced by the label encoder.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
from keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight


import os

# Weight each class inversely to its frequency in the training labels.
# np.unique returns the encoded ids in sorted order (0..k-1), so enumerating
# the weights yields the {class_id: weight} mapping Keras expects.
class_weight_values = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weight_values))

# Make sure the output directory exists before the (slow) training starts, so the
# final save cannot fail on a missing folder.
os.makedirs('models', exist_ok=True)

# Train model.
# Early stopping (with restore_best_weights) picks the final weights based on the
# validation loss, so validation must not use the test set: otherwise the evaluation
# on X_test_padded is biased. validation_split holds out the last 10% of the
# training samples instead.
model.fit(
    X_train_padded,
    y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
    class_weight=class_weights,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)]
)

# Save model
model.save('models/cnn.h5')

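or alternatively <b>load</b> the pre-trained model. Note that this replaces any `model` trained above, so skip the next cell if you trained the model yourself.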

In [None]:
from keras.models import load_model


# Swap in the network stored on disk; this rebinds `model`.
# NOTE(review): the stored network presumably expects inputs prepared with the
# tokenizer and padding length built earlier in this notebook — confirm.
pretrained_model_path = 'models/cnn_balanced.h5'
model = load_model(pretrained_model_path)

## Evaluating the CNN

In [None]:
from sklearn.metrics import classification_report, confusion_matrix


# Predict class probabilities and take the most likely class id for each review
y_pred_prob = model.predict(X_test_padded)
y_pred = np.argmax(y_pred_prob, axis=1)

# Recover readable class names from the label encoder: encoded id i corresponds
# to label_encoder.classes_[i]. Using them avoids reporting the bare ids 0..k-1.
class_names = [str(label) for label in label_encoder.classes_]
class_ids = list(range(len(class_names)))

# Calculate confusion matrix; passing the labels fixes its shape so that it always
# lines up with the tick labels, even if a class is absent from the test set.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Print classification report and plot the confusion matrix
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
# Title currently disabled; uncomment to label the figure.
# ax.set_title('Confusion Matrix - Convolutional Neural Network')
plt.show()