# 2024 CITS4012 Project

# Readme

notes for marker

# 1. Dataset Processing

## Import the libraries

In [2]:
import re
import json
import nltk
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## Loading the datasets

In [1]:
# Mount Google Drive
# The dataset files opened by the loading cell live under
# /content/drive/MyDrive/data, so the drive must be mounted before that cell runs.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def _load_split(path):
    """Read one dataset split and return it as a DataFrame.

    The file must contain a JSON object with a "columns" list and a "data"
    list of rows; these become the frame's column names and rows.
    """
    # JSON text is UTF-8, so do not depend on the platform's default encoding
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return pd.DataFrame(data['data'], columns=data['columns'])


def _aspect_polarity_pairs(frame):
    """Return [(aspect, polarity), ...] for every row of frame, in row order."""
    return list(zip(frame['aspect'], frame['polarity']))


# Load the training, test and validation data
train_data = _load_split('/content/drive/MyDrive/data/train.json')
test_data = _load_split('/content/drive/MyDrive/data/test.json')
val_data = _load_split('/content/drive/MyDrive/data/val.json')

# Get the x and y lists for training, test and validation data
training_x = train_data['sentence'].tolist()
training_y = _aspect_polarity_pairs(train_data)
test_x = test_data['sentence'].tolist()
test_y = _aspect_polarity_pairs(test_data)
val_x = val_data['sentence'].tolist()
val_y = _aspect_polarity_pairs(val_data)

# Set number of polarities and aspects
num_polarities = 3
num_aspects = 8

print("Training data size\t", train_data.shape)
print("Test data size\t\t", test_data.shape)
print("Validation data size\t", val_data.shape)


Training data size	 (7090, 3)
Test data size		 (901, 3)
Validation data size	 (888, 3)


## Data Preprocessing

In [4]:
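# Punctuation Removal
# TODO: consider keeping emoticons; the regex below strips them along with all other punctuation
# NOTE: expand contractions (i've -> i have) before calling this, otherwise only the apostrophe is dropped (i've -> ive)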
def remove_punctuation_re(x):
    """Return x without any character that is neither a word character nor whitespace.

    Letters, digits and underscores are word characters for the regex, so
    they are kept; every other non-space symbol is deleted without a
    replacement character.
    """
    return re.sub(r'[^\w\s]', '', x)

# Tokeniser data (the cell output lists it as tokenizers/punkt.zip)
# NOTE(review): presumably required by word_tokenize below - confirm against the NLTK docs
nltk.download('punkt')

# Stopwords Removal
nltk.download('stopwords')
from nltk.corpus import stopwords as sw
from nltk.tokenize import word_tokenize
# English stopwords; the exploratory cleaning loop drops tokens found in this collection
stopwords = sw.words('english')

# Stemming
from nltk.stem import PorterStemmer
# Shared stemmer instance, used by the exploratory cleaning loop
stemmer = PorterStemmer()

# Lemmatisation
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# Shared lemmatiser instance, applied after stemming in the exploratory cleaning loop
lemmatizer = WordNetLemmatizer()

# POS Tagging
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag

# English Contractions Dictionary
# Maps a contraction to its expanded form. preprocess_data looks these up in
# sentences that have already been lower-cased, so the capitalised "I'd",
# "I'll", "I'm", ... entries are redundant with their lower-case twins.
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have",
                    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did",
                    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have",
                    "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                    "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us",
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
                    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                    "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have",
                    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will",
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
                    "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [5]:
# Function to preprocess the data
def preprocess_data(sentence_list):
    """Lower-case, expand contractions, strip punctuation and tokenise sentences.

    Args:
        sentence_list: Iterable of raw sentence strings.

    Returns:
        A list holding one list of token strings per input sentence, in
        input order.
    """
    # Sentences are case-folded before the lookup, so key the table on
    # lower-case contractions as well.
    expansions = {word.lower(): new_word.lower()
                  for word, new_word in contraction_dict.items()}

    # One pass with the longest alternatives first: replacing key by key
    # would expand "mightn't" inside "mightn't've" and leave "might not've"
    # behind. The lookarounds keep a contraction from matching inside a
    # longer word.
    contraction_re = None
    if expansions:
        longest_first = sorted(expansions, key=len, reverse=True)
        contraction_re = re.compile(
            r"(?<!\w)(?:" + "|".join(re.escape(word) for word in longest_first) + r")(?!\w)"
        )

    output_list = []
    for sentence in sentence_list:
        sentence = sentence.lower()                     # Case folding
        if contraction_re is not None:                  # Deal with contractions
            sentence = contraction_re.sub(lambda match: expansions[match.group(0)], sentence)
        sentence = remove_punctuation_re(sentence)      # Remove punctuation
        output_list.append(word_tokenize(sentence))     # Tokenise
    return output_list

# Run the same preprocessing over the training, test and validation
# sentences (in that order) to obtain their tokenised sentence lists
train_x_token, test_x_token, val_x_token = (
    preprocess_data(split) for split in (training_x, test_x, val_x)
)

In [None]:
# Word vocabulary to index dictionary {word: index}
# dict.fromkeys keeps each training token once, in order of first appearance,
# so every word receives the next free index the first time it is seen.
word_to_idx = {
    word: idx
    for idx, word in enumerate(
        dict.fromkeys(word for sentence in train_x_token for word in sentence)
    )
}
word_list = list(word_to_idx)

# Aspect vocabulary to index dictionary {aspect: index}
aspect_to_idx = {
    aspect: idx
    for idx, aspect in enumerate(
        ("food", "service", "staff", "price", "ambience", "menu", "place", "miscellaneous")
    )
}

# Polarity vocabulary to index dictionary {polarity: index}
polarity_to_idx = {
    polarity: idx
    for idx, polarity in enumerate(('positive', 'neutral', 'negative'))
}

In [None]:
def _indicator_vector(size, hot_positions):
    """Return a float vector of `size` zeros with ones at every index in `hot_positions`."""
    vector = np.zeros(size)
    # Explicit integer dtype so an empty position list is still a valid index
    vector[np.asarray(hot_positions, dtype=np.intp)] = 1
    return vector


# Token index lists for training data
train_x_idx = [[word_to_idx[word] for word in sentence] for sentence in train_x_token]

train_y_idx = [
    (aspect_to_idx[aspect], polarity_to_idx[polarity])
    for aspect, polarity in training_y
]

# One-hot encoding for training data
# Each sentence becomes one vocabulary-sized vector marking every word it contains
train_x_onehot = [
    _indicator_vector(len(word_to_idx), sentence) for sentence in train_x_idx
]

# Each label becomes an (aspect one-hot, polarity one-hot) pair
train_y_onehot = [
    (_indicator_vector(num_aspects, [aspect]), _indicator_vector(num_polarities, [polarity]))
    for aspect, polarity in train_y_idx
]

(array([1., 0., 0., 0., 0., 0., 0., 0.]), array([1., 0., 0.]))
(array([0., 0., 0., 0., 0., 0., 1., 0.]), array([0., 1., 0.]))
(array([0., 0., 1., 0., 0., 0., 0., 0.]), array([1., 0., 0.]))
(array([0., 0., 0., 0., 0., 0., 0., 1.]), array([0., 1., 0.]))
(array([0., 0., 0., 0., 0., 0., 0., 1.]), array([0., 1., 0.]))


In [None]:
# Exploratory cleaning pipeline over the training sentences:
# lowercase -> tokenise -> strip punctuation -> drop stopwords -> stem ->
# lemmatise -> POS-tag -> rebuild the sentence.
# The per-sentence results are collected so the work is not thrown away.
stopword_set = set(stopwords)   # constant-time membership instead of scanning per token
train_clean_sentences = []      # cleaned sentence strings, one per training row
train_pos_tags = []             # pos_tag output for each cleaned sentence

# Iterate the column directly rather than looking every row up with .loc
for sentence in train_data['sentence']:

    # Lowercase
    sentence = sentence.lower()

    # Tokenise
    tokens = word_tokenize(sentence)

    # Remove punctuation
    re_tokens = [remove_punctuation_re(word) for word in tokens]

    # Remove stopwords (tokens are already lower-case) and tokens emptied by
    # the punctuation step
    sw_tokens = [word for word in re_tokens if word and word not in stopword_set]

    # Stemming
    stem_tokens = [stemmer.stem(word) for word in sw_tokens]

    # Lemmatisation
    lemma_tokens = [lemmatizer.lemmatize(word) for word in stem_tokens]

    # POS Tagging
    pos_tokens = pos_tag(lemma_tokens)
    train_pos_tags.append(pos_tokens)

    # Reconstruct sentence
    sentence = " ".join(lemma_tokens)
    train_clean_sentences.append(sentence)

KeyboardInterrupt: 

## Pretrained word embeddings

# 2. Model Implementation

## Model 1

In [None]:
# Assuming you have already preprocessed the data and have access to the following:
# - word embeddings (word2vec / GloVe)
# - aspect embeddings
# - tokenized input sentences
# - aspect terms for each sentence

def _embed_tokens(tokens, embeddings, dim=300):
    """Stack the vectors of `tokens` into a (len(tokens), dim) float32 tensor.

    Tokens missing from `embeddings` are represented by an all-zero vector.
    """
    if not tokens:
        return torch.zeros((0, dim), dtype=torch.float32)
    unknown = np.zeros(dim)
    vectors = np.array([embeddings.get(token, unknown) for token in tokens])
    return torch.tensor(vectors, dtype=torch.float32)


aspect_terms = train_data['aspect'].tolist()
word_embeddings = {}  # token -> 300-d vector; still empty, so every token currently maps to zeros

# Sentences differ in length, so they cannot be packed into one rectangular
# tensor; keep one (num_tokens, 300) tensor per sentence instead.
sentence_embeddings = [_embed_tokens(sentence.split(), word_embeddings) for sentence in train_data['sentence']]
aspect_embeddings = [_embed_tokens(aspect.split(), word_embeddings) for aspect in aspect_terms]
# Class index of every training row, using the polarity vocabulary built earlier
polarity_labels = torch.tensor([polarity_to_idx[polarity] for polarity in train_data['polarity']])

# Example data (replace with your actual data)
input_data = np.random.randint(0, 100, (10, 20))  # Example input data (batch_size=10, seq_len=20)
aspect_terms = np.random.randint(0, 10, (10,))  # Example aspect terms (batch_size=10)

# Define the model
class Model1(nn.Module):
    """LSTM polarity classifier that sees the aspect embedding next to every token.

    Args:
        embedding_dim: Size of the word and aspect embedding vectors.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between stacked LSTM layers. It is not applied when
            num_layers is 1, because there is no layer boundary to drop at.
        vocab_size: Number of rows in the word embedding table (ignored when
            pretrained_embeddings is given).
        num_aspects: Number of rows in the aspect embedding table.
        num_classes: Number of output logits.
        pretrained_embeddings: Optional (vocab, embedding_dim) matrix of word
            vectors used to initialise the word embedding table.
        freeze_embeddings: Whether pretrained word vectors stay fixed during
            training. Only used together with pretrained_embeddings.

    Raises:
        ValueError: If pretrained_embeddings is not a 2-D matrix whose second
            dimension equals embedding_dim.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers, dropout,
                 vocab_size=100, num_aspects=10, num_classes=3,
                 pretrained_embeddings=None, freeze_embeddings=True):
        super().__init__()
        if pretrained_embeddings is None:
            # Randomly initialised tables must stay trainable; wrapping random
            # values in from_pretrained would freeze them.
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        else:
            weights = torch.as_tensor(pretrained_embeddings, dtype=torch.float32)
            if weights.dim() != 2 or weights.size(1) != embedding_dim:
                raise ValueError(
                    f"pretrained_embeddings must have shape (vocab, {embedding_dim}), "
                    f"got {tuple(weights.shape)}"
                )
            self.embedding = nn.Embedding.from_pretrained(weights, freeze=freeze_embeddings)
        self.aspect_embedding = nn.Embedding(num_aspects, embedding_dim)
        # Word and aspect vectors are concatenated, hence the doubled input size
        self.rnn = nn.LSTM(
            embedding_dim * 2,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, aspect):
        """Return (batch, num_classes) logits.

        Args:
            x: Integer tensor of token indices, shape (batch, seq_len).
            aspect: Integer tensor of aspect indices, shape (batch,).
        """
        embedded = self.embedding(x)
        # Broadcast the single aspect vector across every time step
        aspect_embedded = self.aspect_embedding(aspect).unsqueeze(1).expand(-1, x.size(1), -1)
        combined_embedded = torch.cat((embedded, aspect_embedded), dim=2)
        rnn_out, _ = self.rnn(combined_embedded)
        # Classify from the output at the last time step
        # NOTE(review): with padded batches the last step may be padding - confirm how inputs are batched
        logits = self.fc(rnn_out[:, -1, :])
        return logits

# Setting Hyperparameters
embedding_dim = 300  # Assuming 300-dimensional word embeddings
hidden_dim = 128
num_layers = 1
dropout = 0.2
learning_rate = 0.001
num_epochs = 50

# Initialize the model
model = Model1(embedding_dim, hidden_dim, num_layers, dropout)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Convert data to tensors
# Model1 looks its vectors up from integer indices, so the NumPy example
# arrays are converted to long tensors before being fed to the model.
input_tensor = torch.as_tensor(input_data, dtype=torch.long)
aspect_tensor = torch.as_tensor(aspect_terms, dtype=torch.long)
# Polarity class index for every training row
polarity = torch.LongTensor([polarity_to_idx[label] for label in train_data['polarity']])
# Example target labels for the example batch: drawn once, over every
# polarity class, so the loss is measured against the same targets each epoch
example_targets = torch.randint(0, num_polarities, (input_tensor.size(0),))

# Training loop
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(input_tensor, aspect_tensor)
    loss = criterion(outputs, example_targets)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 9:
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")


## Model 2

## Model 3

# 3. Testing and Evaluation