In [5]:
import pandas as pd
import math

import torch

from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

from string import punctuation

from tqdm import tqdm

import enchant
from enchant.tokenize import get_tokenizer
from enchant.tokenize import basic_tokenize

import pickle


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [7]:
df = pd.read_csv('./dataset/ielts-writing-essays.csv')
df.head(5)

label_encoder = LabelEncoder()

In [8]:
essay = df['Essay'].to_list()

essays_token = []
all_token = []

for i in essay:
    token = []
    words = word_tokenize(i)
    for word in words:
        if word not in punctuation and word.isalpha():
            word = word.lower()
            all_token.append(word)
        
            token.append(word)
    essays_token.append(token)
all_vocab = list(set(all_token))
all_vocab = ['<unk>'] + all_vocab
label_encoder.fit(all_vocab)

with open("label_encoder.pkl", "wb") as file:
    pickle.dump(label_encoder, file)

print(f"Len Essays Token :{len(essays_token)}")
print(f"Len All Token :{len(all_token)}")
print(f"Len All Vocab :{len(all_vocab)}")
SEQ_LEN = 50

Len Essays Token :1435
Len All Token :359245
Len All Vocab :13350


In [9]:
def calculate_average_length(sentences):
    total_len = 0
    for sentence in sentences:
        total_len += len(sentence)
    average_len = int(round(total_len / len(sentences), 0))
    return average_len

In [10]:
def get_data(dataset, batch_size):
    """Convert tokenized dataset to tensor batches."""
    data = [torch.LongTensor(label_encoder.transform(tokens)) for tokens in dataset]
    data = pad_sequence(data, batch_first=True, padding_value=0)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, -1) 
    return data

In [11]:
train_len = round(len(essays_token) * 0.7)
test_val_len = round(len(essays_token) * 0.15)

train_data = essays_token[:train_len]
test_data = essays_token[train_len: train_len + test_val_len]
val_data = essays_token[train_len + test_val_len:]
# 0.7 0.15

batch_size = 16
print(len(train_data))
print(len(test_data))

train_data = get_data(train_data,batch_size)
test_data = get_data(test_data,batch_size)
val_data = get_data(val_data,batch_size)

1004
215


In [12]:
print(train_data)

tensor([[ 1210,   553,     1,  ...,     0,     0,     0],
        [  103, 12133, 11974,  ...,     0,     0,     0],
        [12133,  7822,  5780,  ...,     0,     0,     0],
        ...,
        [12016, 12411,  5410,  ...,     0,     0,     0],
        [ 6625, 10862, 10804,  ...,     0,     0,     0],
        [12046,  5407, 10804,  ...,     0,     0,     0]])


In [13]:

class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
        
        self.init_weights()

    def forward(self, src, hidden):
        embedding = self.dropout(self.embedding(src))
        output, hidden = self.lstm(embedding, hidden)          
        output = self.dropout(output) 
        prediction = self.fc(output)
        return prediction, hidden
    
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hidden_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.embedding_dim,
                    self.hidden_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hidden_dim, 
                    self.hidden_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        cell = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return hidden, cell
    
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

In [14]:
VOCAB_SIZE = len(all_vocab)
EMBED_DIM = 256
HIDDEN_DIM = 128       
NUM_LAYERS = 3   
DROPOUT_RATE = 0.75
lr = 1e-3
EPOCHS = 50
clip = 0.25


In [15]:

model = LSTM(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT_RATE).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 5,601,574 trainable parameters


In [16]:
def get_batch(data, seq_len, idx):
    src = data[:, idx:idx + seq_len]
    tgt = data[:, idx + 1:idx + seq_len + 1]
    return src, tgt

def train_epoch(model, data, optimizer, criterion, batch_size, seq_len, clip,device):
    model.train()
    epoch_loss = 0
    num_batches = data.shape[1] // seq_len
    hidden = model.init_hidden(batch_size,device)

    for idx in tqdm(range(0, num_batches * seq_len, seq_len), desc='Training', leave=False):
        src, tgt = get_batch(data, seq_len, idx)
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)
        predictions, hidden = model(src, hidden)

        predictions = predictions.reshape(-1, VOCAB_SIZE)
        tgt = tgt.reshape(-1)
        loss = criterion(predictions, tgt)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / num_batches

def evaluate_epoch(model, data, criterion, batch_size, seq_len,device):
    model.eval()
    epoch_loss = 0
    num_batches = data.shape[1] // seq_len
    hidden = model.init_hidden(batch_size,device)

    with torch.no_grad():
        for idx in range(0, num_batches * seq_len, seq_len):
            src, tgt = get_batch(data, seq_len, idx)
            src, tgt = src.to(device), tgt.to(device)

            hidden = model.detach_hidden(hidden)
            predictions, hidden = model(src, hidden)

            predictions = predictions.reshape(-1, VOCAB_SIZE)
            tgt = tgt.reshape(-1)
            loss = criterion(predictions, tgt)

            epoch_loss += loss.item()

    return epoch_loss / num_batches

In [17]:
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_data, optimizer, criterion, batch_size, SEQ_LEN, clip,device)
    valid_loss = evaluate_epoch(model, val_data, criterion, batch_size, SEQ_LEN,device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    print(f"Epoch {epoch+1}:")
    print(f"Train Loss: {train_loss:.4f} | Train Perplexity: {math.exp(train_loss):.3f}")
    print(f"Valid Loss: {valid_loss:.4f} | Validation Perplexity: {math.exp(valid_loss):.3f}")

model.load_state_dict(torch.load('best-val-lstm_lm.pt'))
test_loss = evaluate_epoch(model, test_data, criterion, batch_size, SEQ_LEN,device)
print(f"Test Loss: {test_loss:.4f} | Test Perplexity: {math.exp(test_loss):.3f}")

torch.save(model, "entire_model.pth")



                                                            

Epoch 1:
Train Loss: 3.7256 | Train Perplexity: 41.498
Valid Loss: 3.7468 | Validation Perplexity: 42.384


                                                            

Epoch 2:
Train Loss: 2.8289 | Train Perplexity: 16.927
Valid Loss: 3.6988 | Validation Perplexity: 40.400


                                                            

Epoch 3:
Train Loss: 2.7831 | Train Perplexity: 16.169
Valid Loss: 3.6737 | Validation Perplexity: 39.396


                                                            

Epoch 4:
Train Loss: 2.7547 | Train Perplexity: 15.716
Valid Loss: 3.6574 | Validation Perplexity: 38.761


                                                            

Epoch 5:
Train Loss: 2.7281 | Train Perplexity: 15.304
Valid Loss: 3.6435 | Validation Perplexity: 38.224


                                                            

Epoch 6:
Train Loss: 2.7063 | Train Perplexity: 14.974
Valid Loss: 3.6333 | Validation Perplexity: 37.838


                                                            

Epoch 7:
Train Loss: 2.6870 | Train Perplexity: 14.687
Valid Loss: 3.6228 | Validation Perplexity: 37.443


                                                            

Epoch 8:
Train Loss: 2.6672 | Train Perplexity: 14.399
Valid Loss: 3.6167 | Validation Perplexity: 37.216


                                                            

Epoch 9:
Train Loss: 2.6517 | Train Perplexity: 14.177
Valid Loss: 3.6103 | Validation Perplexity: 36.977


                                                            

Epoch 10:
Train Loss: 2.6356 | Train Perplexity: 13.951
Valid Loss: 3.6056 | Validation Perplexity: 36.803


                                                            

Epoch 11:
Train Loss: 2.6194 | Train Perplexity: 13.728
Valid Loss: 3.6030 | Validation Perplexity: 36.709


                                                            

Epoch 12:
Train Loss: 2.6049 | Train Perplexity: 13.530
Valid Loss: 3.5949 | Validation Perplexity: 36.413


                                                            

Epoch 13:
Train Loss: 2.5929 | Train Perplexity: 13.368
Valid Loss: 3.5863 | Validation Perplexity: 36.102


                                                            

Epoch 14:
Train Loss: 2.5786 | Train Perplexity: 13.179
Valid Loss: 3.5780 | Validation Perplexity: 35.801


                                                            

Epoch 15:
Train Loss: 2.5666 | Train Perplexity: 13.022
Valid Loss: 3.5732 | Validation Perplexity: 35.631


                                                            

Epoch 16:
Train Loss: 2.5559 | Train Perplexity: 12.883
Valid Loss: 3.5706 | Validation Perplexity: 35.538


                                                            

Epoch 17:
Train Loss: 2.5442 | Train Perplexity: 12.733
Valid Loss: 3.5637 | Validation Perplexity: 35.293


                                                            

Epoch 18:
Train Loss: 2.5324 | Train Perplexity: 12.584
Valid Loss: 3.5560 | Validation Perplexity: 35.023


                                                            

Epoch 19:
Train Loss: 2.5248 | Train Perplexity: 12.488
Valid Loss: 3.5580 | Validation Perplexity: 35.092


                                                            

Epoch 20:
Train Loss: 2.5113 | Train Perplexity: 12.321
Valid Loss: 3.5570 | Validation Perplexity: 35.057


                                                            

Epoch 21:
Train Loss: 2.5053 | Train Perplexity: 12.248
Valid Loss: 3.5545 | Validation Perplexity: 34.969


                                                            

Epoch 22:
Train Loss: 2.5019 | Train Perplexity: 12.206
Valid Loss: 3.5572 | Validation Perplexity: 35.065


                                                            

Epoch 23:
Train Loss: 2.4997 | Train Perplexity: 12.179
Valid Loss: 3.5491 | Validation Perplexity: 34.784


                                                            

Epoch 24:
Train Loss: 2.4982 | Train Perplexity: 12.160
Valid Loss: 3.5502 | Validation Perplexity: 34.819


                                                            

Epoch 25:
Train Loss: 2.4990 | Train Perplexity: 12.171
Valid Loss: 3.5444 | Validation Perplexity: 34.618


                                                            

Epoch 26:
Train Loss: 2.4976 | Train Perplexity: 12.153
Valid Loss: 3.5439 | Validation Perplexity: 34.601


                                                            

Epoch 27:
Train Loss: 2.4961 | Train Perplexity: 12.135
Valid Loss: 3.5432 | Validation Perplexity: 34.577


                                                            

Epoch 28:
Train Loss: 2.4963 | Train Perplexity: 12.138
Valid Loss: 3.5440 | Validation Perplexity: 34.606


                                                            

Epoch 29:
Train Loss: 2.4958 | Train Perplexity: 12.132
Valid Loss: 3.5413 | Validation Perplexity: 34.513


                                                            

Epoch 30:
Train Loss: 2.4962 | Train Perplexity: 12.136
Valid Loss: 3.5414 | Validation Perplexity: 34.514


                                                            

Epoch 31:
Train Loss: 2.4952 | Train Perplexity: 12.125
Valid Loss: 3.5416 | Validation Perplexity: 34.522


                                                            

Epoch 32:
Train Loss: 2.4962 | Train Perplexity: 12.137
Valid Loss: 3.5418 | Validation Perplexity: 34.530


                                                            

Epoch 33:
Train Loss: 2.4957 | Train Perplexity: 12.130
Valid Loss: 3.5418 | Validation Perplexity: 34.530


                                                            

Epoch 34:
Train Loss: 2.4956 | Train Perplexity: 12.129
Valid Loss: 3.5420 | Validation Perplexity: 34.535


                                                            

Epoch 35:
Train Loss: 2.4948 | Train Perplexity: 12.120
Valid Loss: 3.5421 | Validation Perplexity: 34.538


                                                            

Epoch 36:
Train Loss: 2.4970 | Train Perplexity: 12.146
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 37:
Train Loss: 2.4959 | Train Perplexity: 12.132
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 38:
Train Loss: 2.4953 | Train Perplexity: 12.125
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 39:
Train Loss: 2.4955 | Train Perplexity: 12.128
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 40:
Train Loss: 2.4952 | Train Perplexity: 12.124
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 41:
Train Loss: 2.4953 | Train Perplexity: 12.125
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 42:
Train Loss: 2.4951 | Train Perplexity: 12.122
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 43:
Train Loss: 2.4957 | Train Perplexity: 12.130
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 44:
Train Loss: 2.4966 | Train Perplexity: 12.141
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 45:
Train Loss: 2.4962 | Train Perplexity: 12.136
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 46:
Train Loss: 2.4961 | Train Perplexity: 12.135
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 47:
Train Loss: 2.4956 | Train Perplexity: 12.128
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 48:
Train Loss: 2.4945 | Train Perplexity: 12.116
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 49:
Train Loss: 2.4946 | Train Perplexity: 12.117
Valid Loss: 3.5420 | Validation Perplexity: 34.537


                                                            

Epoch 50:
Train Loss: 2.4958 | Train Perplexity: 12.131
Valid Loss: 3.5420 | Validation Perplexity: 34.537


  model.load_state_dict(torch.load('best-val-lstm_lm.pt'))


Test Loss: 2.9507 | Test Perplexity: 19.120


In [18]:
def generate_text(model, prompt, seq_len, vocab, label_encoder, device, max_length=50):
    tokens = word_tokenize(prompt)
    token_indices = label_encoder.transform(tokens)
    
    input_tensor = torch.LongTensor(token_indices).unsqueeze(0).to(device)  
    
    hidden = model.init_hidden(1,device)
    
    # Generate text
    model.eval()
    generated_text = tokens.copy()  
    for _ in range(max_length):
        # Forward pass
        with torch.no_grad():
            predictions, hidden = model(input_tensor, hidden)
        predicted_idx = predictions[0, -1].argmax().item()
        if predicted_idx == 0:  
            continue
        predicted_word = label_encoder.inverse_transform([predicted_idx])[0]
        
        generated_text.append(predicted_word)
        
        input_tensor = torch.cat((input_tensor, torch.LongTensor([predicted_idx]).unsqueeze(0).to(device)), dim=1)
    
    generated_text_str = ' '.join(generated_text)
    return generated_text_str


In [None]:
prompt = "school"
generated_text = generate_text(model, prompt, SEQ_LEN, all_vocab, label_encoder, device, max_length=100)
print(f"Generated text: {generated_text}")


Generated text: school and the other hand the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of the number of


: 