## Import libraries

In [None]:
!pip3 install transformers

In [None]:
!pip install datasets

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, GPT2LMHeadModel
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import torch
from progressbar import progressbar

# Define parameters

In [11]:
# Classification tasks to evaluate. Each key is the dataset configuration name
# and each value lists the example fields that are fed to the tokenizer.
# (Candidates noted earlier: 'mrpc', 'sst2', 'cola', 'rte', 'qnli'.)
glue_classification = {
    "mrpc": ["sentence1", "sentence2"],
    "sst2": ["sentence"],
    "qnli": ["question", "sentence"],
}
superglue_classification = {
    "wic": ["sentence1", "sentence2"],
}

# Model selection.

# True -> load pretrained checkpoints through Hugging Face `transformers`.
hugging_face_model = True

# Which pipeline the notebook runs: 'encoder' or 'decoder'.
model_type = "decoder"

# Checkpoint names used by the encoder and decoder pipelines respectively.
encoder_model_name = "bert-base-uncased"
decoder_model_name = "gpt2"

# Encoder

## Functions

In [5]:
def dataset_to_cls(dataset_name, features, tokenizer, model, debag = False):
    """Embed every example of a dataset with the model's first-token hidden state.

    Args:
        dataset_name: Positional arguments forwarded to ``load_dataset``
            (e.g. ``['glue', 'mrpc']``).
        features: Names of the example fields to tokenize, in order (one
            field for single-sentence tasks, two for sentence pairs).
        tokenizer: Tokenizer exposing ``encode(..., return_tensors='pt')``.
        model: Torch module whose output exposes ``last_hidden_state``.
        debag: When true, only the first 20 examples of each split are loaded.

    Returns:
        ``(cls_embeddings, datasets)``: two dicts keyed by ``"train"``,
        ``"test"`` and ``"validation"``. ``cls_embeddings[split]`` is a list
        holding one ``last_hidden_state[:, 0, :]`` tensor per example and
        ``datasets[split]`` is the loaded split itself.
    """
    split_names = ("train", "test", "validation")
    subset = "[:20]" if debag else ""
    datasets = {split: load_dataset(*dataset_name, split=split + subset)
                for split in split_names}

    # Run on whatever device the model already lives on instead of relying on
    # a global `device` variable that only exists after another cell has run.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # One list of first-token embeddings per split
    cls_embeddings = {split: [] for split in split_names}

    for name, dataset in datasets.items():
        for example in progressbar(dataset):
            # Encode the input sentence (or sentence pair)
            texts = [example[feature] for feature in features]
            encoded_inputs = tokenizer.encode(*texts, truncation=True, return_tensors='pt')
            encoded_inputs = encoded_inputs.to(device)

            # Forward pass without building the autograd graph
            with torch.no_grad():
                outputs = model(encoded_inputs)

            # Keep only the hidden state at position 0 (the [CLS] token)
            cls_embeddings[name].append(outputs.last_hidden_state[:, 0, :])

    return cls_embeddings, datasets

In [6]:
def train_linear(embedings, data):
    """Fit a linear probe on the train embeddings and score it on validation.

    A ``LinearRegression`` model is fitted on the concatenated ``'train'``
    embeddings against the train labels; its outputs on the ``'validation'``
    embeddings are rounded to obtain hard class predictions.

    Args:
        embedings: Dict with ``'train'`` and ``'validation'`` entries, each a
            list of tensors that are concatenated along dim 0.
        data: Dict with ``'train'`` and ``'validation'`` entries, each
            indexable by ``"label"``.

    Returns:
        The raw (unrounded) regression outputs for the validation set.
    """
    train_features = torch.cat(embedings['train'], dim=0).cpu()
    validation_features = torch.cat(embedings['validation'], dim=0).cpu()
    train_labels = data["train"]["label"]
    validation_labels = data["validation"]["label"]

    classifier = LinearRegression()
    classifier.fit(train_features, train_labels)

    # Raw regression scores for the validation set
    predictions = classifier.predict(validation_features)

    # A regression output is unbounded, so plain rounding can yield labels such
    # as -1 or 2 that do not exist; clip to the label range seen in training.
    hard_predictions = predictions.round().clip(min(train_labels), max(train_labels))

    # Evaluate classification metrics
    accuracy = accuracy_score(validation_labels, hard_predictions)
    f1 = f1_score(validation_labels, hard_predictions, average='weighted')
    precision = precision_score(validation_labels, hard_predictions, average='weighted')
    recall = recall_score(validation_labels, hard_predictions, average='weighted', zero_division='warn')

    # ROC-AUC is undefined when only one class is present (possible on the
    # small debug subsets); report NaN instead of aborting the whole run.
    if len(set(validation_labels)) > 1:
        roc_auc = roc_auc_score(validation_labels, predictions)
    else:
        roc_auc = float("nan")

    # Print classification metrics
    print("Accuracy:", accuracy)
    print("F1-score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)
    print("ROC-AUC:", roc_auc)

    return predictions


## Process

In [10]:
if model_type == 'encoder':

    if hugging_face_model:
        tokenizer = AutoTokenizer.from_pretrained(encoder_model_name)
        model = AutoModel.from_pretrained(encoder_model_name)
    else:
        # TODO: support models that do not come from Hugging Face.
        # Fail loudly here; silently continuing would hit an undefined (or
        # stale, from an earlier run) `tokenizer`/`model` a few lines below.
        raise NotImplementedError("Only Hugging Face encoder models are supported")

    # Define the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Embed every GLUE dataset (debug mode: first 20 examples of each split)
    glue_cls_data = {}
    glue_data = {}
    for dataset_name, features in glue_classification.items():
        glue_cls_data[dataset_name], glue_data[dataset_name] = dataset_to_cls(
            ['glue', dataset_name], features, tokenizer, model, True)

    # Embed every SuperGLUE dataset (debug mode as well)
    superglue_cls_data = {}
    superglue_data = {}
    for dataset_name, features in superglue_classification.items():
        superglue_cls_data[dataset_name], superglue_data[dataset_name] = dataset_to_cls(
            ['super_glue', dataset_name], features, tokenizer, model, True)

    # Train and evaluate one linear probe per GLUE dataset
    for dataset_name in glue_classification:
        print(dataset_name)
        train_linear(glue_cls_data[dataset_name], glue_data[dataset_name])

    print()
    # Train and evaluate one linear probe per SuperGLUE dataset
    for dataset_name in superglue_classification:
        print(dataset_name)
        train_linear(superglue_cls_data[dataset_name], superglue_data[dataset_name])
else:
    print(f'Your model_type is {model_type}, not encoder')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100% (20 of 20) |########################| Elapsed Time: 0:00:04 Time:  0:00:04
100% (20 of 20) |########################| Elapsed Time: 0:00:05 Time:  0:00:05
100% (20 of 20) |############

mrpc
Accuracy: 0.6
F1-score: 0.6
Precision: 0.65
Recall: 0.6
ROC-AUC: 0.6666666666666667
sst2
Accuracy: 0.85
F1-score: 0.8465473145780052
Precision: 0.8846153846153847
Recall: 0.85
ROC-AUC: 0.91
qnli
Accuracy: 0.6
F1-score: 0.5736263736263736
Precision: 0.6
Recall: 0.6
ROC-AUC: 0.5656565656565656

wic
Accuracy: 0.55
F1-score: 0.5327365728900256
Precision: 0.5916666666666666
Recall: 0.55
ROC-AUC: 0.6363636363636364


# Decoder

## Functions

In [15]:
def dataset_to_translations(tokenizer, model, debag = False, max_new_tokens = 64):
    """Generate text for the English side of every WMT14 fr-en example.

    Args:
        tokenizer: Tokenizer exposing ``encode``/``decode``, ``pad_token``,
            ``eos_token`` and ``pad_token_id``.
        model: Torch module exposing ``generate``.
        debag: When true, only the first 5 examples of each split are loaded.
        max_new_tokens: Upper bound on the number of tokens generated per
            example, independent of the prompt length.

    Returns:
        ``(translations, datasets)``: two dicts keyed by ``"train"``,
        ``"test"`` and ``"validation"``. ``translations[split]`` holds one
        decoded string per example and ``datasets[split]`` the loaded split.
    """
    dataset_name = ["wmt14", "fr-en"]
    split_names = ("train", "test", "validation")

    # Load the dataset (only a 5-example prefix of each split in debug mode)
    subset = "[:5]" if debag else ""
    datasets = {split: load_dataset(*dataset_name, split=split + subset)
                for split in split_names}

    # One list of generated strings per split
    translations = {split: [] for split in split_names}

    # Reuse EOS as the padding token when none is set. Adding a brand-new
    # "[PAD]" token would give it an id the model has no embedding for.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Run on whatever device the model already lives on instead of relying on
    # a global `device` variable that only exists after another cell has run.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Decoder-only models return the prompt followed by the continuation;
    # encoder-decoder models return the decoder output only.
    echoes_prompt = not getattr(getattr(model, "config", None), "is_encoder_decoder", False)

    with torch.no_grad():
        for name, dataset in datasets.items():
            for example in dataset:
                # Encode the English source sentence
                input_ids = tokenizer.encode(example["translation"]["en"], truncation=True,
                                             max_length=512, return_tensors="pt").to(device)
                attention_mask = torch.ones_like(input_ids)  # single unpadded sequence

                # Bound the generated part explicitly: the default length limit
                # counts the prompt too, which breaks on long source sentences.
                generated = model.generate(input_ids,
                                           attention_mask=attention_mask,
                                           max_new_tokens=max_new_tokens,
                                           pad_token_id=tokenizer.pad_token_id)

                output_ids = generated[0]
                if echoes_prompt:
                    # Drop the echoed prompt so only newly generated text is kept
                    output_ids = output_ids[input_ids.shape[1]:]

                translations[name].append(tokenizer.decode(output_ids, skip_special_tokens=True))

    return translations, datasets

In [13]:
def get_scores(translations, data):
    """Score the validation translations with corpus BLEU and exact match.

    Args:
        translations: Dict whose ``'validation'`` entry is a list of
            generated strings, one per validation example.
        data: Dict whose ``'validation'`` entry yields examples with the
            reference text at ``example["translation"]["fr"]``.

    Returns:
        ``(bleu_score, exact_match)``. Both are ``0.0`` when there are no
        validation translations.
    """
    references = [example["translation"]['fr'] for example in data['validation']]
    hypotheses = translations['validation']

    if not hypotheses:
        # Nothing to score; also avoids dividing by zero below.
        bleu_score, exact_match = 0.0, 0.0
        print("BLEU score:", bleu_score)
        print("Exact Match (EM) score:", exact_match)
        return bleu_score, exact_match

    # corpus_bleu works on tokens: it expects, for every hypothesis, a list of
    # reference token lists, and every hypothesis as a token list. Passing raw
    # strings makes it treat single characters as references/tokens.
    list_of_references = [[reference.split()] for reference in references]
    tokenized_hypotheses = [hypothesis.split() for hypothesis in hypotheses]

    # Calculate BLEU score
    smoothie = SmoothingFunction().method3
    bleu_score = corpus_bleu(list_of_references, tokenized_hypotheses, smoothing_function=smoothie)
    print("BLEU score:", bleu_score)

    # Calculate exact match (EM) score: pair each validation hypothesis with
    # its own reference, ignoring surrounding whitespace such as a trailing "\n".
    matches = sum(1 for hypothesis, reference in zip(hypotheses, references)
                  if hypothesis.strip() == reference.strip())
    exact_match = matches / len(hypotheses)
    print("Exact Match (EM) score:", exact_match)
    return bleu_score, exact_match

## Process

In [41]:
# TODO: find out why there is a \n at the end

In [None]:
if model_type == 'decoder':
    if hugging_face_model:
        tokenizer = AutoTokenizer.from_pretrained(decoder_model_name)
        model = GPT2LMHeadModel.from_pretrained(decoder_model_name)
    else:
        # TODO: support models that do not come from Hugging Face.
        # Fail loudly here; silently continuing would hit an undefined (or
        # stale, from an earlier run) `tokenizer`/`model` a few lines below.
        raise NotImplementedError("Only Hugging Face decoder models are supported")

    # Define the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Generate for every split (debug mode: first 5 examples of each split)
    translations, datasets = dataset_to_translations(tokenizer, model, True)

    # Score the validation split against the French references
    bleu_score, exact_match = get_scores(translations, datasets)
else:
    print(f'Your model_type is {model_type}, not decoder')

Downloading and preparing dataset wmt14/fr-en to /root/.cache/huggingface/datasets/wmt14/fr-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4...


Downloading data files:   0%|          | 0/7 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/7 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/40836715 [00:00<?, ? examples/s]