In [None]:
import torch

# Pick the compute device once: fall back to the CPU unless CUDA is usable.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is present: select it and report how many GPUs exist and which
    # one (index 0) will be used.
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# upload your file to your Google Drive folder and load it
try:
    from google.colab import drive
    drive.mount('/content/drive/')
    %cd 'drive/My Drive/Colab Notebooks'
except ImportError as e:
    pass

In [3]:
# Seed every random number generator the notebook relies on so that the
# generated results are the same on every fresh run.
import numpy as np
import random
import torch
import tensorflow as tf

RANDOM_SEED = 42
random.seed(RANDOM_SEED)                 # Python's built-in RNG
np.random.seed(RANDOM_SEED)              # NumPy global RNG
torch.manual_seed(RANDOM_SEED)           # PyTorch default generator
torch.cuda.manual_seed_all(RANDOM_SEED)  # PyTorch generators on all GPUs
# The models in this notebook are built with tensorflow.keras, so the
# TensorFlow RNG (weight initialisation, shuffling) has to be seeded as well.
tf.random.set_seed(RANDOM_SEED)

In [None]:
# read the dataset into this jupyter file
import pandas as pd

# 'data.csv' is a relative path, so it is resolved against the current
# working directory. Later cells read its 'Sentence' and 'Sentiment' columns.
df = pd.read_csv('data.csv')
# Preview the first rows as the cell output.
df.head()

# Visualization of Data

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Tokenize every raw sentence.
# Iterating the column directly avoids one label-based df.loc lookup per row
# and does not require the frame to carry a 0..n-1 index.
tokenized_sentence = [word_tokenize(sentence) for sentence in df['Sentence']]

df['tokenized sentence'] = tokenized_sentence

In [None]:
import matplotlib.pyplot as plt

# Count the number of tokens in each sentence.
# Building the column in one step keeps it an integer column; filling it cell
# by cell through df.loc enlarges the frame with NaN first and ends up storing
# floats (e.g. 81.0 instead of 81).
df['length of sentence'] = df['tokenized sentence'].map(len)

# view the distribution of the sentence length
plt.hist(df['length of sentence'])
plt.title('The histogram of sentence length after tokenizing')
plt.xlabel('Length of sentence')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Largest value in the 'length of sentence' column, i.e. the token count of
# the longest sentence.
# NOTE(review): the padding cell hard-codes max_length = 81 rather than
# reusing this value; confirm the two are meant to agree.
max_words = df['length of sentence'].max()
print(f"The maximum length of the sentence is {max_words}")

In [None]:
# Check the class balance: one bar per value of the 'Sentiment' column.
import seaborn as sns
ax = sns.countplot(data = df, x = 'Sentiment')
ax.set_title('Distribution of the sentiment attributes')
plt.show()

# Preprocessing on Data

In [None]:
# Encode the labels and normalise the raw text: lower-case it, drop digits
# and replace punctuation with spaces.
import string
from sklearn import preprocessing

# transform label variables into ordinal variable
labelEncoder = preprocessing.LabelEncoder()
df['Sentiment'] = labelEncoder.fit_transform(df['Sentiment'])

df['Sentence'] = df['Sentence'].str.lower()
# regex=True must be explicit: with the pandas 2.x default (regex=False) the
# pattern is searched literally and no digit is ever removed. The raw string
# avoids the invalid-escape warning for '\d'.
df['Sentence'] = df['Sentence'].str.replace(r'\d+', '', regex=True)
# Every punctuation mark is a literal character, never a regex metacharacter
# (several of them, such as '.', '*' or '(', are special in a regex).
for char in string.punctuation:
    df['Sentence'] = df['Sentence'].str.replace(char, ' ', regex=False)

In [10]:
# Tokenize the cleaned sentences again (lower-cased, digits and punctuation
# removed).
# Iterating the column directly avoids one label-based df.loc lookup per row
# and does not require the frame to carry a 0..n-1 index.
tokenized_sentence = [word_tokenize(sentence) for sentence in df['Sentence']]

df['tokenized sentence'] = tokenized_sentence

In [None]:
# removing stopwords from the given sentences
from nltk.corpus import stopwords
nltk.download('stopwords')

# A set gives constant-time membership tests inside the filter below.
stop_words = set(stopwords.words('english'))

# Keep, per sentence, only the tokens that are not English stop words.
# The column is iterated directly, so no 0..n-1 index is assumed and the
# `tokenized_sentence` list from the tokenization cell is not rebound to a
# single row's tokens.
without_stopwords = [
    [token for token in tokens if token not in stop_words]
    for tokens in df['tokenized sentence']
]

df['tokenized sentence'] = without_stopwords

In [None]:
nltk.download('averaged_perceptron_tagger')

# Replace each token list with nltk.pos_tag's output for that list, so every
# entry of the column becomes a list of (token, tag) pairs.
df['tokenized sentence'] = [
    nltk.pos_tag(sentence_tokens)
    for sentence_tokens in df['tokenized sentence']
]

In [13]:
from nltk.corpus import wordnet

# define the pos tagging that refer to the wordnet for lemmatization
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet constant.

    The tag is matched on its first letter: 'J' -> wordnet.ADJ,
    'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.
    Any other tag yields None.
    """
    # (tag prefix, name of the wordnet attribute); the attribute is looked up
    # only for the prefix that actually matches.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_table:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')

# doing lemmatization on those given tokens within each sentence
lemmatizer = WordNetLemmatizer()


def _lemmatize_tagged(tagged_tokens):
    """Return a new list of (lemma, tag) pairs for one sentence.

    `tagged_tokens` is a sequence of (token, tag) pairs. The tag is mapped
    through get_wordnet_pos; when no WordNet category matches, the lemmatizer
    is called without a `pos` argument.
    """
    lemmatized = []
    for word, pos in tagged_tokens:
        wordnet_pos = get_wordnet_pos(pos)
        if wordnet_pos is not None:
            lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
        else:
            lemma = lemmatizer.lemmatize(word)
        lemmatized.append((lemma, pos))
    return lemmatized


# One pass over the column instead of four df.loc lookups per token; this
# also removes the dependency on a 0..n-1 index.
df['tokenized sentence'] = [
    _lemmatize_tagged(tagged) for tagged in df['tokenized sentence']
]

In [16]:
# Remove the POS tag from every token, keeping only the word itself.
# The word is taken straight from the (word, tag) pair instead of slicing the
# pair's string representation, which depends on how the word is quoted.
# Entries that are already plain strings are left untouched, so re-running
# the cell does not mangle the tokens.
df['tokenized sentence'] = [
    [entry[0] if isinstance(entry, tuple) else entry for entry in tagged]
    for tagged in df['tokenized sentence']
]

In [None]:
# detokenizer the sentence back to sentence
!pip install keras_preprocessing
from nltk.tokenize.treebank import TreebankWordDetokenizer

for index in range (len(df)):
  df.loc[index, 'Sentence'] = TreebankWordDetokenizer().detokenize(df.loc[index, 'tokenized sentence'])

In [None]:
# count the vocabolary size of the tokenized sentence
%%time
from collections import Counter
cnt = Counter()
for text in df["tokenized sentence"].values:
    for word in text:
        cnt[word] += 1

vocab_size = len(cnt)

In [19]:
import numpy as np

# Build the model inputs and targets as NumPy arrays:
#   X - the processed sentences (taken from the frame once the label and
#       helper columns are dropped),
#   y - the encoded sentiment labels.
X = np.array(
    df.drop(columns = ['Sentiment', 'length of sentence', 'tokenized sentence'])['Sentence']
      .values
      .tolist()
)
y = np.array(df['Sentiment'])

In [20]:
# Tokenize the sentences with the Keras Tokenizer, limited to the vocabulary
# size counted earlier (stop words were already removed from the sentences).
from keras.preprocessing.text import Tokenizer

# num_words is tied to vocab_size instead of a hard-coded 8664: every
# Embedding layer in this notebook is created with input_dim = vocab_size, so
# the tokenizer must never emit an index outside that range, whatever the
# dataset.
tokenizer = Tokenizer(num_words = vocab_size, lower = True)
tokenizer.fit_on_texts(X)

# compute the index for each token
X = tokenizer.texts_to_sequences(X)

In [21]:
from keras_preprocessing.sequence import pad_sequences

# adding pad into each sentence after tokenizing
# padding='post' appends the padding after the tokens; maxlen fixes the
# length of every sequence to max_length.
# NOTE(review): 81 is hard-coded; presumably it is the longest tokenized
# sentence measured earlier. Confirm, since it is also the input_length of
# the models below.
max_length = 81
X = pad_sequences(X, padding = 'post', maxlen = max_length)

In [22]:
from sklearn.model_selection import train_test_split

# Split the data into train / validation / test sets:
# 20% is held out for testing, then 10% of the remainder for validation.
# random_state pins both shuffles, so re-running this cell yields the same
# partition instead of depending on how much of NumPy's global random state
# has already been consumed.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = RANDOM_SEED)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size = 0.1, random_state = RANDOM_SEED)


# Hyperparameter optimization (RNN)

In [None]:
!pip install Optuna

import optuna
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, LSTM, GRU, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Define the objective function for Optuna to maximize
def objective(trial):
    """Train a SimpleRNN classifier with trial-suggested hyperparameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the embedding size, number of recurrent units, learning rate
        and number of epochs for this run.

    Returns
    -------
    float
        Accuracy on (x_valid, y_valid). The study is created with
        direction='maximize', so higher is better.

    NOTE(review): the single sigmoid output with binary_crossentropy assumes
    the encoded 'Sentiment' label has exactly two classes. Confirm against
    the dataset.
    """
    # Define the search space for hyperparameters
    embd_len = trial.suggest_categorical('embd_len', [32, 64, 128])
    rnn_units = trial.suggest_categorical('rnn_units', [64, 128, 256])
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log = True)
    epochs = trial.suggest_int('epochs', 5, 25)

    # Create the RNN model with the suggested hyperparameters.
    # input_length must equal the padded sequence length (max_length), not
    # max_words, which was measured on the raw text before preprocessing.
    model = Sequential(name = "Simple_RNN")
    model.add(Embedding(vocab_size, embd_len, input_length = max_length))
    model.add(SimpleRNN(rnn_units, activation = 'tanh'))
    model.add(Dense(1, activation = 'sigmoid'))

    # Compile the model
    model.compile(loss = 'binary_crossentropy', optimizer = Adam(learning_rate), metrics = ['accuracy'])

    # Train the model
    model.fit(x_train, y_train, batch_size=64, epochs = epochs, verbose = 1)

    # Evaluate the model on the validation set.
    # predict() returns sigmoid probabilities of shape (n, 1); accuracy_score
    # needs hard class labels, so threshold at 0.5 and flatten first.
    y_prob = model.predict(x_valid)
    y_pred = (y_prob > 0.5).astype(int).ravel()
    accuracy = accuracy_score(y_valid, y_pred)

    return accuracy

# Run the Optuna search: 25 trials, keeping the trial with the highest
# value returned by objective().
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 25)

# Report the winning hyperparameters and the accuracy they reached
best_params, best_accuracy = study.best_params, study.best_value
print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)


# Training model (Recurrent Neural Network)

In [None]:
# Fix every word's embedding size to 128.
# The LSTM and Bi-LSTM training cells below read this variable as well.
embd_len = 128

# vocab_size is the number of unique words within the given sentences
# embd_len is the dimension of the word embedding for each token
# input_length is the (padded) sentence length fed into the model

# Creating a RNN model
RNN_model = Sequential(name="Simple_RNN")
RNN_model.add(Embedding(vocab_size,
                        embd_len,
                        input_length=max_length))

# For a stacked RNN (more than one recurrent layer) the lower layers would
# need return_sequences=True; with a single layer only the last state is kept.
RNN_model.add(SimpleRNN(64,
                        activation='tanh',
                        return_sequences=False))

# Dense is the last layer, mapping the recurrent state to the sentiment score
RNN_model.add(Dense(1, activation = 'sigmoid'))

# Printing model summary.
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
RNN_model.summary()

# Compiling model with the Adam optimizer.
# NOTE(review): the learning rate is presumably copied from the Optuna search
# above; confirm.
RNN_model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(learning_rate = 0.006185314591866337),
    metrics=['accuracy']
)

# Training the model
history = RNN_model.fit(x_train, y_train,
                        batch_size = 64,
                        epochs = 4,
                        verbose = 1,
                        validation_data = (x_valid, y_valid))

# Printing model score on test data: evaluate() reports the loss followed by
# the compiled metrics (accuracy).
print()
print("Simple_RNN Score---> ", RNN_model.evaluate(x_test, y_test, verbose = 1))


# Hyperparameter Optimization (LSTM)

In [None]:
# Define the objective function for Optuna to maximize
def objective(trial):
    """Train an LSTM classifier with trial-suggested hyperparameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the embedding size, number of LSTM units, learning rate and
        number of epochs for this run.

    Returns
    -------
    float
        Accuracy on (x_valid, y_valid). The study is created with
        direction='maximize', so higher is better.

    NOTE(review): the single sigmoid output with binary_crossentropy assumes
    the encoded 'Sentiment' label has exactly two classes. Confirm against
    the dataset.
    """
    # Define the search space for hyperparameters
    embd_len = trial.suggest_categorical('embd_len', [32, 64, 128])
    rnn_units = trial.suggest_categorical('lstm_units', [64, 128, 256])
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log = True)
    epochs = trial.suggest_int('epochs', 5, 25)

    # Create the LSTM model with the suggested hyperparameters
    model = Sequential(name = "LSTM_Model")
    model.add(Embedding(vocab_size, embd_len, input_length = max_length))
    model.add(LSTM(rnn_units, activation = 'tanh'))
    model.add(Dense(1, activation = 'sigmoid'))

    # Compile the model
    model.compile(loss = 'binary_crossentropy', optimizer = Adam(learning_rate), metrics = ['accuracy'])

    # Train the model
    model.fit(x_train, y_train, batch_size = 64, epochs = epochs, verbose = 1)

    # Evaluate the model on the validation set.
    # predict() returns sigmoid probabilities of shape (n, 1); accuracy_score
    # needs hard class labels, so threshold at 0.5 and flatten first.
    y_prob = model.predict(x_valid)
    y_pred = (y_prob > 0.5).astype(int).ravel()
    accuracy = accuracy_score(y_valid, y_pred)

    return accuracy

# Run the Optuna search for the LSTM: 25 trials, keeping the trial with the
# highest value returned by objective().
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 25)

# Report the winning hyperparameters and the accuracy they reached
best_params, best_accuracy = study.best_params, study.best_value
print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)


# Model Training (Long-Short Term Memory)

In [None]:
# Defining LSTM model: embedding -> one LSTM layer -> sigmoid output.
# embd_len is the embedding size set in the Simple RNN training cell.
lstm_model = Sequential(name = "LSTM_Model")
lstm_model.add(Embedding(vocab_size,
                         embd_len,
                         input_length = max_length))
# NOTE(review): the LSTM hyperparameter search builds its models with
# activation='tanh', while this final model uses 'relu'. Confirm that the
# difference is intentional.
lstm_model.add(LSTM(128,
                    activation = 'relu',
                    return_sequences = False))
lstm_model.add(Dense(1, activation = 'sigmoid'))

# Printing Model Summary.
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
lstm_model.summary()

# Compiling the model (Adam with its default settings)
lstm_model.compile(
    loss = "binary_crossentropy",
    optimizer = 'adam',
    metrics = ['accuracy']
)

# Training the model
history3 = lstm_model.fit(x_train, y_train,
                          batch_size = 64,
                          epochs = 5,
                          verbose = 2,
                          validation_data = (x_valid, y_valid))

# Displaying the model score on test data: evaluate() reports the loss
# followed by the compiled metrics (accuracy).
print()
print("LSTM model Score---> ", lstm_model.evaluate(x_test, y_test, verbose = 1))


# Hyperparameter Optimization (Bi-LSTM)

In [None]:
# Define the objective function for Optuna to maximize
def objective(trial):
    """Train a bidirectional LSTM classifier with trial-suggested hyperparameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the embedding size, number of LSTM units per direction,
        learning rate and number of epochs for this run.

    Returns
    -------
    float
        Accuracy on (x_valid, y_valid). The study is created with
        direction='maximize', so higher is better.

    NOTE(review): the single sigmoid output with binary_crossentropy assumes
    the encoded 'Sentiment' label has exactly two classes. Confirm against
    the dataset.
    """
    # Define the search space for hyperparameters
    embd_len = trial.suggest_categorical('embd_len', [32, 64, 128])
    rnn_units = trial.suggest_categorical('bi_lstm_units', [64, 128, 256])
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log = True)
    epochs = trial.suggest_int('epochs', 5, 25)

    # Create the bidirectional LSTM model with the suggested hyperparameters
    model = Sequential(name = "Bidirectional_LSTM")
    model.add(Embedding(vocab_size, embd_len, input_length = max_length))
    model.add(Bidirectional(LSTM(rnn_units, activation = 'tanh')))
    model.add(Dense(1, activation = 'sigmoid'))

    # Compile the model
    model.compile(loss = 'binary_crossentropy', optimizer = Adam(learning_rate), metrics = ['accuracy'])

    # Train the model
    model.fit(x_train, y_train, batch_size = 64, epochs = epochs, verbose=1)

    # Evaluate the model on the validation set.
    # predict() returns sigmoid probabilities of shape (n, 1); accuracy_score
    # needs hard class labels, so threshold at 0.5 and flatten first.
    y_prob = model.predict(x_valid)
    y_pred = (y_prob > 0.5).astype(int).ravel()
    accuracy = accuracy_score(y_valid, y_pred)

    return accuracy

# Run the Optuna search for the Bi-LSTM: 25 trials, keeping the trial with
# the highest value returned by objective().
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 25)

# Report the winning hyperparameters and the accuracy they reached
best_params, best_accuracy = study.best_params, study.best_value
print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)


# Train Model (Bi-LSTM)

In [None]:
# Defining Bidirectional LSTM model: embedding -> Bi-LSTM -> sigmoid output.
# embd_len is the embedding size set in the Simple RNN training cell.
bi_lstm_model = Sequential(name = "Bidirectional_LSTM")
bi_lstm_model.add(Embedding(vocab_size,
                            embd_len,
                            input_length = max_length))

bi_lstm_model.add(Bidirectional(LSTM(128,
                                     activation = 'tanh',
                                     return_sequences = False)))
bi_lstm_model.add(Dense(1, activation = 'sigmoid'))

# Printing model summary.
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
bi_lstm_model.summary()

# Compiling the model (Adam with its default settings)
bi_lstm_model.compile(
    loss = "binary_crossentropy",
    optimizer = 'adam',
    metrics = ['accuracy']
)

# Training the model
history4 = bi_lstm_model.fit(x_train, y_train,
                             batch_size = 64,
                             epochs = 5,
                             verbose = 2,
                             validation_data = (x_valid, y_valid))

# Printing model score on test data: evaluate() reports the loss followed by
# the compiled metrics (accuracy).
print()
print("Bidirectional LSTM model Score---> ",
      bi_lstm_model.evaluate(x_test, y_test, verbose = 1))


# Testing on customizing activation function

In [None]:
# Disabled experiment: the whole cell is one triple-quoted string literal, so
# none of the code inside it is executed. The text describes a Bi-LSTM whose
# output layer would use a custom activation, tf.square(tf.sin(x)), in place
# of the sigmoid.
# NOTE(review): a bare string is an expression, so it is presumably echoed as
# this cell's output; consider deleting the cell or moving the experiment to
# real code.
"""
import tensorflow as tf
def custom_activation(x):
    # Define your custom activation function logic
    return tf.square(tf.sin(x))

# Defining Bidirectional LSTM model
bi_lstm_model = Sequential(name = "Bidirectional_LSTM")
bi_lstm_model.add(Embedding(vocab_size,
							embd_len,
							input_length = max_length))

bi_lstm_model.add(Bidirectional(LSTM(128,
									activation = 'tanh',
									return_sequences = False)))
bi_lstm_model.add(Dense(1, activation = custom_activation))

# Printing model summary
print(bi_lstm_model.summary())

# Compiling model summary
bi_lstm_model.compile(
loss = "binary_crossentropy",
optimizer = 'adam',
metrics = ['accuracy']
)

# Training the model
history4 = bi_lstm_model.fit(x_train, y_train,
							batch_size = 64,
							epochs = 5,
							verbose = 2,
							validation_data = (x_valid, y_valid))

# Printing model score on test data
print()
print("Bidirectional LSTM model Score---> ",
	bi_lstm_model.evaluate(x_test, y_test, verbose = 1))
"""