In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import string
import math
import nltk
from nltk.corpus import words
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec


import torch
import torch.nn as nn

In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [36]:
# !pip install pyenchant
# !apt-get install -y libenchant-2-2
import enchant
from enchant.tokenize import get_tokenizer
from enchant.tokenize import basic_tokenize

In [37]:
# !pip install tensorflow
from tensorflow.keras.utils import pad_sequences

In [38]:
# from google.colab import drive
# drive.mount('/content/drive')
dataset = './dataset/ielts-writing-essays.csv'

"""
  NOTE WORD2VEC

  1. remove punctuation
  2. apply word2vec

  vector_size:
      Defines the length of the vector representations for each word.
      window: The size of the context window around the target word.
      min_count: Ignores words with a total frequency lower than this threshold.
      workers: Number of CPU cores to use for training.

  note: bisa juga ambil yang pre-trained
  """

'\n  NOTE WORD2VEC\n\n  1. remove punctuation\n  2. apply word2vec\n\n  vector_size:\n      Defines the length of the vector representations for each word.\n      window: The size of the context window around the target word.\n      min_count: Ignores words with a total frequency lower than this threshold.\n      workers: Number of CPU cores to use for training.\n\n  note: bisa juga ambil yang pre-trained\n  '

In [39]:
# Load dataset & Prepare Dataframes
df = pd.read_csv(dataset)
print(df.head())
sentences = pd.DataFrame(df['Essay'])
scores = pd.DataFrame(df['Overall'])

# Define functions
def tokenize_and_filter_sentence(sentence):
    tokenizer = enchant.tokenize.get_tokenizer("en_GB")
    tokens = list(tokenizer(sentence))
    tokens = [token[0] for token in tokens]
    tokens = [token.lower() for token in tokens]
    return tokens

def convert_to_sequence(sentence, word_index):
    tokens = tokenize_and_filter_sentence(sentence)
    return [word_index.get(token, word_index['<OOV>']) for token in tokens]

def alternative_word2vec(sen, model):
    return [model.wv[word] for word in sen if word in model.wv]

def calculate_average_length(sentences):
    total_len = 0
    for sentence in sentences:
        total_len += len(sentence)
    average_len = int(round(total_len / len(sentences), 0))
    return average_len

def pad_word2vec(sentence_vectors, maxlen, vector_size):
    sentence_length = len(sentence_vectors)
    if sentence_length > maxlen:
        return np.array(sentence_vectors[:maxlen])
    else:
        padding = [np.zeros(vector_size) for _ in range(maxlen - sentence_length)]
        return np.array(sentence_vectors + padding)

                                               Essay  Overall
0  Between 1995 and 2010, a study was conducted r...      5.5
1  Poverty represents a worldwide crisis. It is t...      6.5
2  The left chart shows the population change hap...      5.0
3  Human beings are facing many challenges nowada...      5.5
4  Information about the thousands of visits from...      7.0


In [40]:
# Preprocess Data
tokenized_sentences = sentences['Essay'].apply(tokenize_and_filter_sentence)
all_sentences = sentences['Essay']
all_tokens = [token for sentence in all_sentences for token in tokenize_and_filter_sentence(sentence)]
unique_tokens = sorted(set(all_tokens))
word_index = {word: idx + 1 for idx, word in enumerate(unique_tokens)}
word_index['<OOV>'] = len(word_index) + 1
sequences = [convert_to_sequence(sentence, word_index) for sentence in all_sentences]

vector_size = 100
maxlen = calculate_average_length(tokenized_sentences)
padding_type = 'pre'
truncating_type = 'post'
padded_sequences = pad_sequences(
    sequences,
    truncating = truncating_type,
    padding = padding_type,
    maxlen = maxlen,
    dtype='int32'
)

print("Padded Sequences:", padded_sequences[:5]) #--> use this for training the first LSTM for Label Encoded Data

sentences_WV = sentences.copy()
tokenized_sentences_list = tokenized_sentences.tolist()
model = Word2Vec(tokenized_sentences_list, vector_size=100, window=5, min_count=1, workers=4)
sentences_WV['Essay'] = sentences_WV['Essay'].apply(lambda x: alternative_word2vec(x, model))

vector_size = 100
maxlen = calculate_average_length(tokenized_sentences)
sentences_WV['Padded_Essay'] = sentences_WV['Essay'].apply(lambda x: pad_word2vec(x, maxlen, vector_size))

word2vec_data = np.array(sentences_WV['Padded_Essay'].tolist())
print("Word2Vec Data Shape:", word2vec_data.shape) #--> use this for training the second LSTM for Word Vectorized Data

trainX_labelEncoded = torch.tensor(padded_sequences[:, :, None], dtype=torch.float32)
trainY_labelEncoded = torch.tensor(scores.values, dtype=torch.float32)[:, None]

trainX_word2Vec = torch.tensor(word2vec_data, dtype=torch.float32)
trainY_word2Vec = torch.tensor(scores.values, dtype=torch.float32)[:, None]

Padded Sequences: [[    0     0     0 ...   716   560 12498]
 [ 9296 10281     1 ...  9495 12254  5637]
 [    0     0     0 ... 12260  8800 13625]
 [ 5994  1172   714 ... 11948  2242  4847]
 [    0     0     0 ...  8365 12260  8918]]
Word2Vec Data Shape: (1435, 253, 100)


In [55]:
class LSTMModelWord2Vec(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_layers, dropout_rate):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_dim, 
            num_layers=num_layers, 
            dropout=dropout_rate, 
            batch_first=True
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, 1)  # Output is a single value (regression)
        
        self.init_weights()

    def forward(self, src, hidden):
        output, hidden = self.lstm(src, hidden)
        output = self.dropout(output[:, -1, :])  # Use the last output
        prediction = self.fc(output)
        return prediction, hidden

    def init_weights(self):
        init_range = 1 / math.sqrt(self.hidden_dim)
        self.fc.weight.data.uniform_(-init_range, init_range)
        self.fc.bias.data.zero_()

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        cell = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return hidden, cell


In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [46]:
# PARAMS
vocab_size = len(unique_tokens)
embedding_dim = 256             # 400 in the paper
hidden_dim = 256                # 1150 in the paper
num_layers = 3                   # 3 in the paper
dropout_rate = 0.2                            
lr = 1e-3
input_layer = 1

In [56]:
# Data Loader
batch_size = 32
dataset = torch.utils.data.TensorDataset(trainX_word2Vec, trainY_word2Vec)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model Initialization
embedding_dim = vector_size  # Word2Vec vector size
hidden_dim = 256
num_layers = 3
dropout_rate = 0.2
lr = 1e-3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_wv = LSTMModelWord2Vec(embedding_dim, hidden_dim, num_layers, dropout_rate).to(device)
optimizer = torch.optim.Adam(model_wv.parameters(), lr=lr)
criterion = nn.MSELoss()  # Regression loss

# Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    model_wv.train()
    epoch_loss = 0
    hidden, cell = model_wv.init_hidden(batch_size, device)

    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        hidden, cell = hidden.detach(), cell.detach()  # Detach hidden states
        optimizer.zero_grad()
        predictions, (hidden, cell) = model_wv(inputs, (hidden, cell))
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(data_loader):.4f}")


  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: Expected hidden[0] size (3, 27, 256), got [3, 32, 256]