## Intro to Hugging Face

### Load the model

In [None]:
# Load a pretrained encoder (e.g. XLM-RoBERTa base) without its pooling layer
from transformers import AutoModel

encoder = AutoModel.from_pretrained(
    "xlm-roberta-base",
    add_pooling_layer=False,
)

In [None]:
# Names of all encoder parameters (the parameter tensors themselves are ignored)
[param_name for param_name, _param in encoder.named_parameters()]

### Load and preprocess the data

In [None]:
# Load the "en" configuration of the stsb_multi_mt dataset
from datasets import load_dataset

stsb = load_dataset("stsb_multi_mt", "en")

In [None]:
# Show the loaded dataset object as the cell output to inspect what was loaded
stsb

In [None]:
# Tokenizer for preprocessing; uses the same checkpoint name as the encoder
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "xlm-roberta-base",
)

In [None]:
# Tokenize the first training sentence pair as a single example, padded or
# truncated to exactly 32 tokens and returned as PyTorch tensors.
# Index the row first: stsb["train"][0] reads one row, whereas
# stsb["train"]["sentence1"][0] reads the whole column just to take one item.
first_row = stsb["train"][0]
train_examples = tokenizer(
    first_row["sentence1"],
    first_row["sentence2"],
    truncation=True,
    padding="max_length",
    max_length=32,
    return_tensors='pt',
)
print(train_examples)

In [None]:
# Token count of the encoded pair: second axis of the [1, seq_len] id tensor
train_examples["input_ids"].shape[1]

In [None]:
# Forward pass purely for inspection. torch.no_grad() disables autograd
# tracking, so no computation graph is built and the printed tensors carry
# no grad_fn.
import torch

with torch.no_grad():
    output = encoder(
        input_ids=train_examples["input_ids"],
        attention_mask=train_examples["attention_mask"],
    )
print(output)

In [None]:
# Per-token output representations (the first field of the model output)
output.last_hidden_state.shape

In [None]:
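# Pooled CLS token (left commented out: the encoder was loaded with
# add_pooling_layer=False, so there is no pooled output to index)
# output[1].shape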

In [None]:
def tokenize_function(examples):
    """Tokenize sentence pairs to a fixed length of 256 tokens.

    Reads the "sentence1" and "sentence2" entries of ``examples`` and encodes
    them as pairs, truncating longer inputs and padding shorter ones to
    ``max_length``.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

In [None]:
# Tokenize in batches and drop the raw text columns
tokenized_datasets = stsb.map(
    tokenize_function,
    batched=True,
    remove_columns=["sentence1", "sentence2"],
)

In [None]:
# Show the mapped dataset: sentence1/sentence2 were removed and the
# tokenizer outputs were added by tokenize_function
tokenized_datasets

In [None]:
# Make the train split return torch tensors for all of its columns
# (only this split is reformatted here)
tokenized_datasets["train"].set_format(type="pt")

In [None]:
# Shuffled mini-batches of 32 examples from the train split
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=32,
    shuffle=True,
)

In [None]:
# Pull a single batch from a fresh iterator over the loader for inspection
batch = next(iter(train_dataloader))

In [None]:
# With padding="max_length" and batch_size=32 this should be (32, 256)
batch["input_ids"].shape

# ==> Proceed in the same way (set_format, then DataLoader) for the validation and test splits

### Dynamic padding

In [None]:
# Collator that pads the examples when a batch is assembled
from transformers import DataCollatorWithPadding

collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [None]:
def tokenize_function(examples):
    """Tokenize sentence pairs without padding.

    Reads the "sentence1" and "sentence2" entries of ``examples`` and encodes
    them as pairs, truncating to at most 256 tokens; padding is disabled here.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        padding=False,
        max_length=256,
    )

In [None]:
# Re-tokenize with the current tokenize_function and drop the raw text columns
tokenized_datasets = stsb.map(
    tokenize_function,
    batched=True,
    remove_columns=["sentence1", "sentence2"],
)

In [None]:
# Shuffled batches of 32; the collator assembles (and pads) each batch
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=32,
    shuffle=True,
    collate_fn=collator,
)

In [None]:
# Pull a single collated batch from a fresh iterator over the loader
batch = next(iter(train_dataloader))

In [None]:
# First dim is the batch size (32); with the padding collator the second dim
# presumably equals the longest sequence in this batch (at most 256) - verify
batch["input_ids"].shape