## Getting the dataset

In [1]:
import pandas as pd
import glob
import re
import os 

# Discover prompt texts, essay texts and mark sheets on disk.
# NOTE: the patterns use Windows-style separators, so this cell only finds
# files when run on Windows from the directory that contains `prompts`.
prompts_paths = glob.glob(".\\prompts\\*\\*.txt")
essays_paths = glob.glob(".\\prompts\\*\\*\\*.txt")
marks_paths = glob.glob(".\\prompts\\*\\*\\*.csv")

df_columns = ['prompt', 'essay', 'mark1', 'mark2', 'mark3', 'mark4', 'mark5']
# One frame per prompt is collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration.
sub_frames = []

for prompt_path in prompts_paths:
    # The first run of digits in the path identifies the prompt (raw string:
    # "\d" in a normal string literal is an invalid escape sequence).
    prompt_number = re.search(r"(\d+)", prompt_path).group(0)
    # NOTE(review): essays below are read as utf-8 but the prompt file uses the
    # platform default encoding -- confirm which encoding the prompt files use.
    with open(prompt_path, 'r') as f:
        prompt = f.read()
    # Cut the prompt at the site boilerplate; when both markers are present the
    # second one wins, exactly as before.
    find1 = re.findall("(.*?)Como enviar sua redação", prompt)
    find2 = re.findall("(.*?)ObservaçõesSeu", prompt)
    if find1:
        prompt = find1[0]
    if find2:
        prompt = find2[0]

    essays = []
    mark_rows = []
    essay_dir_marker = "prompt" + str(prompt_number) + "\\"
    for essay_path in essays_paths:
        # Keep only the original (uncorrected) essays that belong to this prompt.
        if essay_dir_marker in essay_path and "original" in essay_path:
            with open(essay_path, "r", encoding='utf-8') as original:
                essays.append(original.read())
            # The mark sheet sits next to the essay with a different suffix.
            mark_path = essay_path.replace("_original.txt", "_mark.csv")
            # Transpose so the marks become columns 0..5, drop the "Topics"
            # label row and drop column 5, leaving columns 0..4.
            mark = pd.read_csv(mark_path).transpose().drop(index="Topics").drop(columns=[5])
            mark_rows.append(mark)

    if not essays:
        # Nothing found for this prompt: it contributes no rows.
        continue

    marks = pd.concat(mark_rows, ignore_index=True)
    sub_frames.append(pd.DataFrame({
        'prompt': [prompt] * len(essays),
        'essay': essays,
        'mark1': marks[0],
        'mark2': marks[1],
        'mark3': marks[2],
        'mark4': marks[3],
        'mark5': marks[4],
    }))

# Fall back to an empty, correctly-shaped frame when no files were found.
if sub_frames:
    df = pd.concat(sub_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=df_columns)
len(df)

456

In [2]:
from pynvml import *


def print_gpu_utilization():
    """Print how much memory is currently in use on GPU 0.

    Initialises NVML, queries device index 0 and prints the used byte count
    integer-divided by 1024**2.
    """
    nvmlInit()
    gpu0 = nvmlDeviceGetHandleByIndex(0)
    used_bytes = nvmlDeviceGetMemoryInfo(gpu0).used
    used_mb = used_bytes // 1024**2
    print(f"GPU memory occupied: {used_mb} MB.")


def print_summary(result):
    """Print runtime and throughput from a training result, then GPU memory use.

    Args:
        result: object exposing a ``metrics`` mapping with the keys
            ``'train_runtime'`` and ``'train_samples_per_second'``.
    """
    metrics = result.metrics
    print(f"Time: {metrics['train_runtime']:.2f}")
    print(f"Samples/second: {metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

# Preview the first rows of the assembled prompt/essay/marks frame.
df.head()

Unnamed: 0,prompt,essay,mark1,mark2,mark3,mark4,mark5
0,O número de pessoas desempregadas no mundo dev...,Qualificações para o mercado de trabalho\n\nAn...,80,120,120,80,80
1,O número de pessoas desempregadas no mundo dev...,Futuro da crise do desemprego\n\nA Qualificaçã...,80,80,80,80,80
2,O número de pessoas desempregadas no mundo dev...,O progresso da tecnologia\n\nNo decorrer dos s...,160,160,160,160,120
3,O número de pessoas desempregadas no mundo dev...,O advento tecnológico \n\nAs constantes mudanç...,160,120,120,120,120
4,O número de pessoas desempregadas no mundo dev...,"Transformações laborais\n\nA noção de emprego,...",120,120,120,120,120


In [3]:
from transformers import BertTokenizer, BertModel
# GPU memory is printed before and after loading so the two readings bracket
# the footprint of the encoder.
print_gpu_utilization()
tokenizer = BertTokenizer.from_pretrained('adalbertojunior/distilbert-portuguese-cased')
# The encoder is moved to "cuda" unconditionally, so this cell needs a GPU.
bertLM = BertModel.from_pretrained("adalbertojunior/distilbert-portuguese-cased", output_attentions=True).to("cuda") #, cache_dir="D:\Laure\Documents\GitHub\cache", local_files_only=False)
print_gpu_utilization()

GPU memory occupied: 894 MB.
GPU memory occupied: 1199 MB.


In [4]:
# Freeze the encoder: switch off gradient tracking for every parameter of the
# base model so that only layers added on top of it are trained.
for param in bertLM.base_model.parameters():
    param.requires_grad = False

# Display the configuration of the loaded encoder.
bertLM.config

BertConfig {
  "_name_or_path": "adalbertojunior/distilbert-portuguese-cased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_attentions": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 29794
}

In [5]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import pytorch_lightning as pl

class GradeNet(pl.LightningModule):
    """Linear grading head on top of the shared ``bertLM`` encoder.

    The encoder's pooled output is fed to a single linear layer that emits
    ``n_grades`` values per essay.

    Args:
        n_grades: number of outputs of the linear head.
        steps_per_epoch: optimizer steps in one epoch; used to size the LR
            schedule. When ``None`` (or 0) no scheduler is created.
        n_epochs: number of epochs the LR schedule should span.
        lr: peak learning rate handed to AdamW.
    """

    def __init__(self,n_grades=5,steps_per_epoch=None,n_epochs=3, lr=2e-5):
        super().__init__()
        self.bert=bertLM
        # Create the head on whatever device the encoder already lives on
        # instead of hard-coding "cuda"; Lightning relocates the whole module
        # when training starts anyway.
        self.classifier=nn.Linear(self.bert.config.hidden_size,n_grades).to(self.bert.device)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        # NOTE(review): BCEWithLogitsLoss is documented for targets in [0, 1],
        # while the marks displayed in this notebook range up to 160. Either
        # rescale the targets or switch to a regression loss -- left unchanged
        # here because it alters what `val_loss` means for saved checkpoints.
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self,input_ids, attn_mask):
        """Return the raw head outputs for a batch of token ids and masks."""
        output = self.bert(input_ids=input_ids,attention_mask=attn_mask, return_dict=True)
        output = self.classifier(output.pooler_output)
        return output

    def _shared_step(self, batch, stage):
        """Run one forward pass, log ``<stage>_loss`` and return (loss, outputs, labels)."""
        labels = batch['scores']
        outputs = self(batch['input_ids'], batch['attention_mask'])
        loss = self.criterion(outputs,labels)
        self.log(f'{stage}_loss',loss , prog_bar=True,logger=True)
        return loss, outputs, labels

    def training_step(self,batch,batch_idx):
        """Compute and log the training loss; also returns predictions and labels."""
        loss, outputs, labels = self._shared_step(batch, 'train')
        return {"loss" :loss, "predictions":outputs, "labels": labels }

    def validation_step(self,batch,batch_idx):
        """Compute and log the validation loss."""
        loss, _, _ = self._shared_step(batch, 'val')
        return loss

    def test_step(self,batch,batch_idx):
        """Compute and log the test loss."""
        loss, _, _ = self._shared_step(batch, 'test')
        return loss

    def configure_optimizers(self):
        """Build AdamW plus, when the step count is known, a linear warmup/decay schedule.

        Returns:
            The optimizer alone when ``steps_per_epoch`` is unset, otherwise
            ``([optimizer], [scheduler_config])`` with the scheduler stepped
            after every optimizer step.
        """
        optimizer = AdamW(self.parameters() , lr=self.lr)
        if not self.steps_per_epoch:
            # Without a step count the schedule cannot be sized (the previous
            # code raised TypeError on the default `None`).
            return optimizer

        warmup_steps = self.steps_per_epoch//3
        # The scheduler's total already includes the warmup phase; subtracting
        # it would drive the LR to zero before training finishes.
        total_steps = self.steps_per_epoch * self.n_epochs

        scheduler = get_linear_schedule_with_warmup(optimizer,warmup_steps,total_steps)

        # The schedule is expressed in optimizer steps, so Lightning must step
        # it per batch rather than at its default per-epoch interval.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
    

In [6]:
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler
import torch

class QTagDataset (Dataset):
    """Map-style dataset pairing each essay with its row of scores.

    Args:
        text: essays, either a pandas Series or any positionally indexable
            sequence of strings.
        scores: pandas DataFrame with one row of numeric scores per essay.
        tokenizer: object providing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: every item is padded/truncated to this many tokens.
    """

    def __init__(self,text,scores, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.scores = scores
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item_idx):
        """Return ``input_ids``, ``attention_mask`` and ``scores`` for one essay.

        All tensors are returned on the CPU. Moving them to the GPU here would
        fail inside DataLoader worker processes and on machines without CUDA;
        the training code is responsible for device placement.
        """
        # Index text by position, the same way scores are indexed below, so the
        # two stay aligned even if the Series index was not reset.
        if hasattr(self.text, "iloc"):
            text = self.text.iloc[item_idx]
        else:
            text = self.text[item_idx]

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length= self.max_len,
            padding = 'max_length',
            return_token_type_ids= False,
            return_attention_mask= True,
            truncation=True,
            return_tensors = 'pt'
          )

        # encode_plus returns tensors of shape (1, max_len); drop the batch axis.
        input_ids = inputs['input_ids'].flatten()
        attn_mask = inputs['attention_mask'].flatten()

        # Go through a float32 NumPy array so object-dtype score rows convert too.
        score_row = self.scores.iloc[item_idx].to_numpy(dtype="float32")

        return {
          'input_ids': input_ids ,
          'attention_mask': attn_mask,
          'scores': torch.tensor(score_row, dtype=torch.float)
        }

In [7]:
class QTagDataModule (pl.LightningDataModule):
    """Lightning data module wrapping the train/validation/test essay splits.

    Args:
        x_tr, y_tr: training essays and their score rows.
        x_val, y_val: validation essays and their score rows.
        x_test, y_test: test essays and their score rows.
        tokenizer: tokenizer handed to every ``QTagDataset``.
        batch_size: batch size used by all three dataloaders.
        max_token_len: token length every essay is padded/truncated to.
        num_workers: DataLoader worker processes. Defaults to 0 (load in the
            main process): worker processes need to import the dataset class,
            which is unreliable for classes defined inside a notebook.
    """

    def __init__(self,x_tr,y_tr,x_val,y_val,x_test,y_test,tokenizer,batch_size=16,max_token_len=200,num_workers=0):
        super().__init__()
        self.tr_text = x_tr
        self.tr_label = y_tr
        self.val_text = x_val
        self.val_label = y_val
        self.test_text = x_test
        self.test_label = y_test
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.num_workers = num_workers

    def _make_dataset(self, text, scores):
        """Build a QTagDataset for one split with the shared tokenizer settings."""
        return QTagDataset(text=text, scores=scores, tokenizer=self.tokenizer, max_len=self.max_token_len)

    def setup(self, stage=None):
        """Create the three datasets; ``stage`` is accepted but not used."""
        self.train_dataset = self._make_dataset(self.tr_text, self.tr_label)
        self.val_dataset = self._make_dataset(self.val_text, self.val_label)
        self.test_dataset = self._make_dataset(self.test_text, self.test_label)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(self.train_dataset,batch_size= self.batch_size, shuffle = True , num_workers=self.num_workers)

    def val_dataloader(self):
        """Ordered loader over the validation split (honours ``batch_size``)."""
        return DataLoader (self.val_dataset,batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self):
        """Ordered loader over the test split (honours ``batch_size``)."""
        return DataLoader (self.test_dataset,batch_size=self.batch_size, num_workers=self.num_workers)

In [2]:
from sklearn.model_selection import train_test_split
# Inputs are the essay texts; targets are the five mark columns.
X = df['essay']
y = df[['mark1', 'mark2', 'mark3', 'mark4', 'mark5']]
# Fixed seed so both splits below are reproducible.
RANDOM_SEED = 1

# First split: hold out 10% of all rows as the test set.
x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.1, random_state=RANDOM_SEED,shuffle=True)
# Second split: carve 20% of the remaining rows off as the validation set.
x_tr,x_val,y_tr,y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=RANDOM_SEED,shuffle=True)

In [3]:
# Report how many essays ended up in each split.
print(f"Size of train: {x_tr.shape}")
print(f"Size of test:  {x_test.shape}")
print(f"Size of valid: {x_val.shape}")

Size of train: (328,)
Size of test:  (46,)
Size of valid: (82,)


In [10]:
# Values captured by the data module built below.
# NOTE(review): BATCH_SIZE and MAX_LEN are assigned again (16 / 300) in a later
# cell; that reassignment does not affect the object created here.
BATCH_SIZE = 8
MAX_LEN = 200
# Each split gets a fresh 0..n-1 index before it is handed to the data module.
QTdata_module = QTagDataModule(x_tr.reset_index(drop=True),y_tr.reset_index(drop=True),
                               x_val.reset_index(drop=True),y_val.reset_index(drop=True),
                               x_test.reset_index(drop=True),y_test.reset_index(drop=True),tokenizer,BATCH_SIZE,MAX_LEN)
# Build the train/val/test datasets right away.
QTdata_module.setup()

In [11]:
from pytorch_lightning.callbacks import ModelCheckpoint
# Keep the three checkpoints with the lowest validation loss.
# Saves a file like: input/QTag-epoch=02-val_loss=0.32.ckpt
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',# quantity to monitor (logged by the validation step)
    filename='QTag-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3, # keep the 3 best checkpoints
    mode='min', # lower val_loss is better
)

In [12]:
# Hyperparameters for training.
# NOTE(review): QTdata_module was constructed in an earlier cell with
# BATCH_SIZE = 8 and MAX_LEN = 200; rebinding the names here does not change
# that object, so the loaders still use the earlier values.
N_EPOCHS = 12
BATCH_SIZE = 16
MAX_LEN = 300
LR = 2e-05

In [None]:
# Size the LR schedule from the loader that will actually be used: the data
# module was built with its own batch size, which differs from the BATCH_SIZE
# constant in scope here. Also pass the epoch count and learning rate so the
# schedule spans the full run instead of GradeNet's defaults (3 epochs, 2e-5).
model = GradeNet(
    steps_per_epoch=len(QTdata_module.train_dataloader()),
    n_epochs=N_EPOCHS,
    lr=LR,
)
# Instantiate the Model Trainer (`gpus=` is deprecated in favour of
# `accelerator=` / `devices=`).
trainer = pl.Trainer(max_epochs = N_EPOCHS , accelerator = "gpu", devices = 1, callbacks=[checkpoint_callback])
# Train the Classifier Model
trainer.fit(model, QTdata_module)

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params
-------------------------------------------------
0 | bert       | BertModel         | 66.4 M
1 | classifier | Linear            | 3.8 K 
2 | criterion  | BCEWithLogitsLoss | 0     
-------------------------------------------------
3.8 K     Trainable params
66.4 M    Non-trainable params
66.4 M    Total params
265.599   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

In [2]:
def to_list(values):
    """Print ``values`` converted to a plain Python list.

    Nothing is returned; the list is only written to stdout.
    """
    as_list = values.tolist()
    print(as_list)
# Pack the five columns at positions 2..6 (mark1..mark5 in this frame) into a
# single list-valued "marks" column of ints, then display the frame.
df['marks'] = (
    df.iloc[:, 2:7]
    .to_numpy()
    .astype(int)
    .tolist()
)
df

Unnamed: 0,prompt,essay,mark1,mark2,mark3,mark4,mark5,marks
0,O número de pessoas desempregadas no mundo dev...,Qualificações para o mercado de trabalho\n\nAn...,80,120,120,80,80,"[80, 120, 120, 80, 80]"
1,O número de pessoas desempregadas no mundo dev...,Futuro da crise do desemprego\n\nA Qualificaçã...,80,80,80,80,80,"[80, 80, 80, 80, 80]"
2,O número de pessoas desempregadas no mundo dev...,O progresso da tecnologia\n\nNo decorrer dos s...,160,160,160,160,120,"[160, 160, 160, 160, 120]"
3,O número de pessoas desempregadas no mundo dev...,O advento tecnológico \n\nAs constantes mudanç...,160,120,120,120,120,"[160, 120, 120, 120, 120]"
4,O número de pessoas desempregadas no mundo dev...,"Transformações laborais\n\nA noção de emprego,...",120,120,120,120,120,"[120, 120, 120, 120, 120]"
...,...,...,...,...,...,...,...,...
451,Reportagem publicada pelo UOL Economia no mês ...,"Vencer na vida\n\nProlongada pela ONU em 1948,...",120,80,20,20,0,"[120, 80, 20, 20, 0]"
452,Reportagem publicada pelo UOL Economia no mês ...,Vencer ou ser?\n\nVencer é algo que desde cedo...,160,160,120,160,120,"[160, 160, 120, 160, 120]"
453,Reportagem publicada pelo UOL Economia no mês ...,“E essa é a vitória que vence o mundo: a nossa...,160,120,160,160,120,"[160, 120, 160, 160, 120]"
454,Reportagem publicada pelo UOL Economia no mês ...,O que é mais importante para vencer na vida?\n...,20,80,20,20,20,"[20, 80, 20, 20, 20]"


In [3]:
from transformers import BertTokenizer, BertModel, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

tokenizer = BertTokenizer.from_pretrained('adalbertojunior/distilbert-portuguese-cased')
# No encoder is loaded onto the GPU here: `CustomModel` loads its own copy and
# `model` is rebound to it before first use, so a second copy on the device
# would only consume memory on a card that already runs out of it.

# Convert the pandas frame into a Hugging Face dataset.
ds = Dataset.from_pandas(df)

# 80% train, then the remaining 20% is halved into validation and test.
# Both splits use the same fixed seed for reproducibility.
train_testvalid = ds.train_test_split(test_size=0.2,seed=15)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)

# Gather the three splits into a single DatasetDict.
data = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})


def tokenize(batch):
    """Tokenize the "essay" field of a batch, truncating to at most 512 tokens.

    Uses the notebook-level ``tokenizer`` and returns whatever it returns.
    """
    essays = batch["essay"]
    return tokenizer(essays, truncation=True, max_length=512)

# Tokenize every split in batches; the result gains the tokenizer's output
# columns alongside the original ones.
tokenized_dataset = data.map(tokenize, batched=True)
tokenized_dataset

Map:   0%|          | 0/364 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'essay', 'mark1', 'mark2', 'mark3', 'mark4', 'mark5', 'marks', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 364
    })
    test: Dataset({
        features: ['prompt', 'essay', 'mark1', 'mark2', 'mark3', 'mark4', 'mark5', 'marks', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 46
    })
    valid: Dataset({
        features: ['prompt', 'essay', 'mark1', 'mark2', 'mark3', 'mark4', 'mark5', 'marks', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 46
    })
})

In [4]:
from transformers import DataCollatorWithPadding

# Expose only the model inputs and the per-essay mark list, as torch tensors.
tokenized_dataset.set_format("torch",columns=["input_ids", "attention_mask", "marks"])
# Collator built from the tokenizer; presumably pads each batch to its longest
# sequence -- confirm against the transformers documentation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
import torch.nn as nn

class CustomModel(nn.Module):
    """BERT encoder followed by dropout and a linear head with ``num_labels`` outputs.

    The head reads the hidden state of the first token of each sequence.

    Args:
        num_labels: number of values predicted per essay.
    """

    def __init__(self,num_labels):
        super(CustomModel,self).__init__()
        self.num_labels = num_labels

        # Load the pretrained encoder. Attentions and all hidden states are
        # requested because forward() passes them through in its result.
        self.model = BertModel.from_pretrained("adalbertojunior/distilbert-portuguese-cased",output_attentions=True,output_hidden_states=True)
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder config instead of a hard-coded 768.
        self.classifier = nn.Linear(self.model.config.hidden_size,num_labels)

    def forward(self, input_ids=None, attention_mask=None,marks=None):
        """Encode a batch and predict ``num_labels`` values per sequence.

        Args:
            input_ids: token ids for the batch.
            attention_mask: mask matching ``input_ids``.
            marks: optional targets with one value per output; when given, a
                mean-squared-error loss against the predictions is returned.

        Returns:
            TokenClassifierOutput with ``loss`` (or None), ``logits`` and the
            encoder's ``hidden_states`` and ``attentions``.
        """
        # Imported here because no cell of the notebook imports this name.
        from transformers.modeling_outputs import TokenClassifierOutput

        # Extract outputs from the encoder body.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # outputs[0] is the last hidden state; apply dropout before the head.
        sequence_output = self.dropout(outputs[0])

        # Use the first token's vector as the sequence representation.
        logits = self.classifier(sequence_output[:,0,:])

        loss = None
        # The target argument is `marks` (the previous check referenced an
        # undefined name `labels`).
        if marks is not None:
            # Marks are numeric scores, one per output, so this is a regression:
            # cross-entropy over flattened targets neither matches the logits'
            # shape nor the task.
            loss_fct = nn.MSELoss()
            targets = marks.to(logits.dtype).reshape(logits.shape)
            loss = loss_fct(logits, targets)

        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

In [6]:
import torch 
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Five outputs, matching the five mark columns of the dataset.
model=CustomModel(num_labels=5).to(device)

In [7]:
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=32, collate_fn=data_collator
)
# Validation batches keep the dataset order; both loaders share the collator.
eval_dataloader = DataLoader(
    tokenized_dataset["valid"], batch_size=32, collate_fn=data_collator
)

In [8]:
from transformers import get_scheduler
from torch.optim import AdamW

# Optimize every parameter of the model (encoder and head).
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
# One scheduler step per training batch, across all epochs.
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warmup steps, sized to the whole run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

36


In [10]:
from evaluate import load
# Load the "f1" metric through the `evaluate` library.
metric = load("f1")

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [11]:
from tqdm.auto import tqdm

# One tick per training batch / per validation batch across all epochs.
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))


for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

    # ---- validation pass ----
    # Every validation batch is evaluated (previously the forward pass sat
    # outside the loop, so only the last batch was ever scored).
    model.eval()
    abs_error_sum = 0.0
    n_scores = 0
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        # Targets live under "marks" (the batch has no "labels" key). They are
        # five numeric scores per essay, so report the mean absolute error
        # instead of arg-maxing the outputs into a single class for F1.
        logits = outputs.logits
        targets = batch["marks"].to(logits.dtype).reshape(logits.shape)
        abs_error_sum += (logits - targets).abs().sum().item()
        n_scores += targets.numel()
        progress_bar_eval.update(1)

    # max(..., 1) guards against an empty validation loader.
    print({"epoch": epoch, "val_mae": abs_error_sum / max(n_scores, 1)})

  0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 384.00 MiB (GPU 0; 4.00 GiB total capacity; 2.21 GiB already allocated; 331.55 MiB free; 2.30 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF