# 2024 CITS4012 Project

# Readme

notes for marker

# 1. Dataset Processing

## Import the libraries

In [2]:
import re
import json
import nltk
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## Loading the datasets

In [28]:
# Load the training data
with open('train.json', 'r') as file:
    data = json.load(file)
    train_data = pd.DataFrame(data['data'], columns=data['columns'])

# Load the test data
with open('test.json', 'r') as file:
    data = json.load(file)
    test_data = pd.DataFrame(data['data'], columns=data['columns'])
    
# Load the validation data
with open('val.json', 'r') as file:
    data = json.load(file)
    val_data = pd.DataFrame(data['data'], columns=data['columns'])

# Get the x and y lists for training, test and validation data
training_x = train_data['sentence'].tolist()
training_y = [(train_data['aspect'][i], train_data['polarity'][i]) for i in range(len(train_data))]
test_x = test_data['sentence'].tolist()
test_y = [(test_data['aspect'][i], test_data['polarity'][i]) for i in range(len(test_data))]
val_x = val_data['sentence'].tolist()
val_y = [(val_data['aspect'][i], val_data['polarity'][i]) for i in range(len(val_data))]

print("Training data size\t", train_data.shape)
print("Test data size\t\t", test_data.shape)
print("Validation data size\t", val_data.shape)


Training data size	 (7090, 3)
Test data size		 (901, 3)
Validation data size	 (888, 3)


## Data Preprocessing

In [4]:
# Punctuation Removal
# maybe keep emoticons !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# handle contractions (i've -> i have)
def remove_punctuation_re(x):
    x = re.sub(r'[^\w\s]','',x)
    return x

nltk.download('punkt')

# Stopwords Removal
nltk.download('stopwords')
from nltk.corpus import stopwords as sw
from nltk.tokenize import word_tokenize
stopwords = sw.words('english')

# Stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# Lemmatisation
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# POS Tagging
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag

# English Contractions Dictionary
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have",
                    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did",
                    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have",
                    "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us",
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have",
                    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will",
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
                    "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\allis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\allis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\allis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\allis\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
# Function to preprocess the data
def preprocess_data(sentence_list):
    output_list = []
    for sentence in sentence_list:
        sentence = sentence.lower()                     # Case folding
        for word, new_word in contraction_dict.items(): # Deal with contractions
            sentence = sentence.replace(word, new_word)
        sentence = remove_punctuation_re(sentence)      # Remove punctuation
        tokens = word_tokenize(sentence)                # Tokenise
        output_list.append(tokens)
    return output_list

# Preprocess the data and get the tokenised sentence lists
train_x_token = preprocess_data(training_x)
test_x_token = preprocess_data(test_x)
val_x_token = preprocess_data(val_x)

In [37]:
# Word vocabulary to index dictionary {word: index}
word_to_idx = {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3}
for sentence in train_x_token:
    for word in sentence:
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)
word_list = list(word_to_idx.keys())
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
vocab_size = len(word_to_idx)

# Aspect vocabulary to index dictionary {aspect: index}
aspect_to_idx = {
    "food": 0,
    "service": 1,
    "staff": 2,
    "price": 3,
    "ambience": 4,
    "menu": 5,
    "place": 6,
    "miscellaneous": 7
}
idx_to_aspect = {idx: aspect for aspect, idx in aspect_to_idx.items()}
num_aspects = len(aspect_to_idx)

# Polarity vocabulary to index dictionary {polarity: index}
polarity_to_idx = {
    'positive': 0,
    'neutral': 1,
    'negative': 2
}
idx_to_polarity = {idx: polarity for polarity, idx in polarity_to_idx.items()}
num_polarities = len(polarity_to_idx)

# Aspect polarity vocabulary to index dictionary {(aspect_idx, polarity_idx): index}
aspects_polarity = [(aspect, polarity) for aspect in aspect_to_idx for polarity in polarity_to_idx]
aspect_polarity_to_idx = {aspect_polarity: idx for idx, aspect_polarity in enumerate(aspects_polarity)}

In [26]:
# Add paddings
longest_sentence = max(train_x_token, key=len)
max_seq_length = len(longest_sentence)

def add_paddings(sentence, max_seq_length):
    return sentence + ['<PAD>'] * (max_seq_length - len(sentence))

train_x_padded = [add_paddings(sentence, max_seq_length) for sentence in train_x_token]

In [46]:
# Token index lists for training data
train_x_idx = []
for sentence in train_x_token:
    sentence_idx = [word_to_idx[word] for word in sentence]
    train_x_idx.append(sentence_idx)

train_y_idx = []
for aspect, polarity in training_y:
    aspect_idx = aspect_to_idx[aspect]
    polarity_idx = polarity_to_idx[polarity]
    train_y_idx.append((aspect_idx, polarity_idx))
    
# One-hot encoding for training data
train_x_onehot = []
for sentence in train_x_idx:
    sentence_onehot = np.zeros(len(word_to_idx))
    for idx in sentence:
        sentence_onehot[idx] = 1
    train_x_onehot.append(sentence_onehot)

train_y_onehot = []
for pair_idx in train_y_idx:
    target_onehot = np.zeros(len(aspect_polarity_to_idx))
    pair = (idx_to_aspect[pair_idx[0]], idx_to_polarity[pair_idx[1]])
    target_onehot[aspect_polarity_to_idx[pair]] = 1
    train_y_onehot.append(target_onehot)

# for aspect, polarity in train_y_idx:
#     aspect_onehot = np.zeros(num_aspects)
#     aspect_onehot[aspect] = 1
#     polarity_onehot = np.zeros(num_polarities)
#     polarity_onehot[polarity] = 1
#     train_y_onehot.append((aspect_onehot, polarity_onehot))

7090


In [9]:
# for i in range(len(train_data)):
    
#     sentence = train_data.loc[i, 'sentence']
    
#     # Lowercase
#     sentence = sentence.lower()
#     # print("1", sentence)
    
#     # Tokenise
#     tokens = word_tokenize(sentence)
#     # print("2", tokens)
    
#     # Remove punctuation
#     re_tokens = [remove_punctuation_re(word) for word in tokens]
#     # print("3", re_tokens)
    
#     # Remove stopwords
#     sw_tokens = [word for word in re_tokens if word.lower() not in stopwords and word != '']
#     # print("4", sw_tokens)
    
#     # Stemming
#     stem_tokens = [stemmer.stem(word) for word in sw_tokens]
#     # print("5", stem_tokens)
    
#     # Lemmatisation
#     lemma_tokens = [lemmatizer.lemmatize(word) for word in stem_tokens]
#     # print("6", lemma_tokens)
    
#     # POS Tagging
#     pos_tokens = pos_tag(lemma_tokens)
#     # print("7", pos_tokens)
    
#     # Reconstruct sentence
#     sentence = " ".join(lemma_tokens)

# 2. Model Implementation

## Model Architecture

In [48]:
# Encoder
class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size)
        
    def forward(self, input, encoder_hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, encoder_hidden)
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size)

# Decoder
class Decoder(nn.Module):
    def __init__(self, output_size, embed_size, hidden_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
    def forward(self, input):
        embedded = self.embedding(input)
        output, hidden = self.gru(embedded, self.hidden_size)
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size)

## Model Training

In [49]:
num_epochs = 100
display_interval = 10
learning_rate = 0.01
hidden_size = 50
embed_size = 50

x_size = vocab_size
y_size = num_polarities*num_aspects

encoder = Encoder(x_size, embed_size, hidden_size)
decoder = Decoder(y_size, embed_size, hidden_size)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

for epoch in range(num_epochs):
    
    x_tensor = torch.tensor(train_x_onehot, dtype=torch.float32)
    y_tensor = torch.tensor(train_y_onehot, dtype=torch.float32)
    x_length = x_tensor.size(0)
    y_length = y_tensor.size(0)
    
    encoder_hidden = encoder.init_hidden()
    
    loss = 0
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    # Feed the input data to the encoder
    encoder_hiddens = torch.zeros(x_length, encoder.hidden_size)
    for i in range(len(train_data)):
        encoder_output, encoder_hidden = encoder(x_tensor[i], encoder_hidden)
        encoder_hiddens[i] = encoder_output

    decoder_input = torch.tensor([[0.0]])  # Assuming the start token is represented by 0.0
    decoder_hidden = encoder_hidden

    # Feed the target data to the decoder with teacher forcing
    for i in range(len(train_data)):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, y_tensor[i])
        decoder_input = y_tensor[i]
    
    # for i in range(len(train_data)):
    #     encoder_output, encoder_hidden = encoder(x_tensor[i], encoder_hidden)
    #     decoder_input = torch.tensor([[0.0]])  # Assuming the start token is represented by 0.0
    #     decoder_hidden = encoder_hidden
        
    #     for i in range(len(train_data)):
    #         decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    #         loss += criterion(decoder_output, y_tensor[i])
    #         decoder_input = y_tensor[i]
    
    # Backpropagation
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    loss = loss.item() / len(train_data)
    if (epoch+1) % display_interval == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')   

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

# 3. Testing and Evaluation