# Notebook 2: Skill 4 analysis and solution
"Conhecimento dos mecanismos linguísticos necessários para a construção da argumentação"
- Prepositions
- Logical connection between periods and phrases

### Preprocessing

In [92]:
import pandas as pd

df_train = pd.read_csv("data/train.csv", index_col=0)
df_test = pd.read_csv("data/test.csv", index_col=0)
df_valid = pd.read_csv("data/validation.csv", index_col=0)

In [114]:
def clean_list_columns(dataset: pd.DataFrame):
    df = dataset[['essay', 'competence']].copy()
    df['essay'] = dataset['essay'].apply(lambda x: x.strip('][\'').split('\', \''))
    df['competence'] = dataset['competence'].apply(lambda x: x.strip('\'][\'').split(', ')).apply(lambda x: [int(y) for y in x])
    return df

In [119]:
df_train_clean = clean_list_columns(df_train)
df_test_clean = clean_list_columns(df_test)
df_valid_clean = clean_list_columns(df_valid)

In [120]:
lens = df_train_clean['essay'].apply(len)

print("------ TRAIN ---------")
print("média: ", sum(lens)/len(lens))
print("min:   ", min(lens))
print("max:   ", max(lens))

lens = df_test_clean['essay'].apply(len)
print("------ TEST ---------")
print("média: ", sum(lens)/len(lens))
print("min:   ", min(lens))
print("max:   ", max(lens))

lens = df_valid_clean['essay'].apply(len)
print("------ VALID ---------")
print("média: ", sum(lens)/len(lens))
print("min:   ", min(lens))
print("max:   ", max(lens))

------ TRAIN ---------
média:  4.033458411507192
min:    1
max:    34
------ TEST ---------
média:  3.978134110787172
min:    1
max:    11
------ VALID ---------
média:  4.107871720116618
min:    1
max:    30


### Architecture definition

Using BERTimbau-base + classification layer

In [145]:
from sentence_transformers import SentenceTransformer

example_sentences = ['Esta é uma frase em português.', 'Esta é outra frase em português.']

embedder = SentenceTransformer('juridics/bertimbau-base-portuguese-sts-scale')
embeddings = embedder.encode(example_sentences)
print(embeddings.shape)

(2, 768)


### Table example
Features: embeddings
Rows: essays
Target: score
Zero-padding

In [122]:
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [155]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

False
0


AssertionError: Torch not compiled with CUDA enabled

In [123]:
array_2d = np.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]])
zeros = np.zeros((5-len(array_2d), 5))
np.concatenate((array_2d, zeros))

array([[1., 2., 3., 4., 5.],
       [1., 2., 3., 4., 5.],
       [1., 2., 3., 4., 5.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

Dataloader: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

Quickstart: https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html

In [149]:

class EmbeddingsDataset(Dataset):
    def __init__(self, embedder: SentenceTransformer, dataset: pd.DataFrame):
        self.embedder = embedder
        self.dataset = dataset['essay']
        self.scores = dataset['competence']

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        essay_sentences = self.dataset.iloc[idx]
        embeddings_raw = self.embedder.encode(essay_sentences)
        zeros = np.zeros((34-len(essay_sentences), 768))
        embeddings = np.concatenate((embeddings_raw, zeros)) # zero-padding
        score_skill2 = torch.DoubleTensor(self.scores.iloc[idx][1])
        score_skill4 = torch.DoubleTensor(self.scores.iloc[idx][3])
        return embeddings, score_skill2, score_skill4

In [150]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

# Define model
class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(34 * 768, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        log_probs = F.log_softmax(logits, dim=1)
        return log_probs

model = Classifier().to(device)
print(model)

Using cpu device
Classifier(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=26112, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=1, bias=True)
  )
)


In [151]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [152]:
from torch.utils.data import DataLoader

training_data = EmbeddingsDataset(embedder, df_train_clean)
test_data = EmbeddingsDataset(embedder, df_test_clean)

train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

In [153]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [154]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------


RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [160] at entry 2