In [89]:
# for loading in data and splitting into test and train
import pandas as pd
from datasets import load_dataset
import numpy as np
from datasets import load_metric
import datasets

from transformers import AutoTokenizer
from attack.model_def import ElectraClassifier

# for fine tuning in pytorch with transformers trainer api
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer

#from transformers import ElectraModel
import torch
#import torch.nn.functional as F
#import torch.nn as nn
#from torch.utils.data import Dataset, RandomSampler, DataLoader

# python engineer
#from torch.optim import lr_scheduler
#import time
#import os
#import copy

In [64]:
df = pd.read_csv('label8.csv', index_col=0)

In [65]:
df.shape

(900, 8)

In [66]:
df = df.loc[:, ['tweet_full_text', 'label']]

In [67]:
df.shape

(900, 2)

In [68]:
df.to_csv('traindata.csv', index=False)

In [69]:
dataset = load_dataset('csv', data_files = ['traindata.csv'])



Using custom data configuration default-b1f8101fa99dd650


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /Users/jeppefoldberg/.cache/huggingface/datasets/csv/default-b1f8101fa99dd650/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset csv downloaded and prepared to /Users/jeppefoldberg/.cache/huggingface/datasets/csv/default-b1f8101fa99dd650/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0. Subsequent calls will reuse this data.


In [70]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tweet_full_text', 'label'],
        num_rows: 900
    })
})

In [71]:
dataset['train'][0]

{'tweet_full_text': 'Godt at @regeringDK (efter moderat pres) også gør sig grønne tanker - men nu må vi se på de konkrete tiltag. Det bliver en spændende dag i morgen. #dkpol #dkgreen https://t.co/vqwvEo0htA',
 'label': 2}

In [72]:
dataset = dataset['train'].train_test_split()

In [74]:
tokenizer = AutoTokenizer.from_pretrained("Maltehb/-l-ctra-danish-electra-small-cased")

# tokenizing the datasets
def tokenize_function(examples):
    # pads or truncates the text so it fits with the maximum length the nn can take
    return tokenizer(examples['tweet_full_text'], max_length = 280, padding='max_length', truncation=True)

In [75]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [76]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'token_type_ids', 'tweet_full_text'],
        num_rows: 675
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'token_type_ids', 'tweet_full_text'],
        num_rows: 225
    })
})

In [77]:
# creating the model for finetuning
model = AutoModelForSequenceClassification.from_pretrained('Maltehb/-l-ctra-danish-electra-small-cased', num_labels=4)


Some weights of the model checkpoint at Maltehb/-l-ctra-danish-electra-small-cased were not used when initializing ElectraForSequenceClassification: ['generator.encoder.layer.3.attention.output.dense.weight', 'generator.encoder.layer.6.output.LayerNorm.bias', 'generator.embeddings.LayerNorm.bias', 'generator.encoder.layer.5.attention.output.LayerNorm.bias', 'generator.encoder.layer.7.attention.self.query.weight', 'generator.encoder.layer.0.attention.self.key.bias', 'generator.encoder.layer.5.output.LayerNorm.bias', 'generator.encoder.layer.7.attention.output.LayerNorm.bias', 'generator.encoder.layer.11.intermediate.dense.bias', 'generator.encoder.layer.4.attention.output.dense.bias', 'generator.encoder.layer.1.intermediate.dense.weight', 'generator.embeddings.token_type_embeddings.weight', 'generator.encoder.layer.4.output.LayerNorm.bias', 'generator.encoder.layer.5.attention.self.query.weight', 'generator.encoder.layer.5.attention.self.query.bias', 'generator.encoder.layer.0.attention

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at Maltehb/-l-ctra-danish-electra-small-cased and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(50))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(50))
full_train_dataset = tokenized_datasets['train']
full_eval_dataset = tokenized_datasets['test']

In [79]:
metric = load_metric("glue", 'cola')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1848.0, style=ProgressStyle(description…




In [80]:
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

In [91]:
training_args = TrainingArguments(
    output_dir='test_results',
    num_train_epochs=5,
    evaluation_strategy='epoch',      # computes metrics every epoch!
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,
    logging_steps=10,
    metric_for_best_model='accuracy'
)

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.387724,0.14
2,No log,1.387674,0.14
3,1.389100,1.387562,0.14


In [85]:
trainer.evaluate()

{'eval_loss': 1.387768268585205,
 'eval_accuracy': 0.14,
 'eval_runtime': 6.8318,
 'eval_samples_per_second': 7.319,
 'epoch': 5.0}

## Trying to use &tals algorithm

In [70]:
def load_model():
    model_checkpoint = 'Maltehb/-l-ctra-danish-electra-small-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

    model = ElectraClassifier(model_checkpoint,2)
    model_path = 'attack/pytorch_model.bin'
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

    model.eval()

    return(model, tokenizer)

def make_prediction(text):
    tokenized_text = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )
    input_ids = tokenized_text['input_ids']
    attention_masks = tokenized_text['attention_mask']
    logits = model(input_ids, attention_masks)
    
    logit,preds = torch.max(logits, dim=1)
    return(int(preds))

In [71]:
model, tokenizer = load_model()

Some weights of the model checkpoint at Maltehb/-l-ctra-danish-electra-small-cased were not used when initializing ElectraModel: ['generator.encoder.layer.1.intermediate.dense.weight', 'generator.encoder.layer.5.attention.output.LayerNorm.bias', 'generator.encoder.layer.4.attention.output.dense.bias', 'generator.encoder.layer.6.attention.self.key.weight', 'generator.encoder.layer.10.attention.output.LayerNorm.weight', 'generator.encoder.layer.10.intermediate.dense.weight', 'generator.encoder.layer.2.attention.self.key.weight', 'generator.encoder.layer.3.output.dense.bias', 'generator.encoder.layer.0.attention.output.LayerNorm.bias', 'discriminator_predictions.dense.weight', 'generator.encoder.layer.7.output.dense.bias', 'generator.encoder.layer.4.output.LayerNorm.bias', 'generator.encoder.layer.7.intermediate.dense.bias', 'generator.encoder.layer.6.attention.output.LayerNorm.bias', 'generator.encoder.layer.2.attention.self.value.weight', 'generator.encoder.layer.8.output.dense.weight',

In [76]:
make_prediction('Helt sikkert din torsk')

0

In [None]:
path_to_model = "models/Ælæctra_uncased_32k/pytorch_model.bin"
tokenizer = AutoTokenizer.from_pretrained("models/-l-ctra-danish-electra-small-uncased-ner-dane")
model = AutoModelForTokenClassification.from_pretrained("models/Ælæctra_uncased_32k/pytorch_model.bin")

## Trying to finetune Ælæctra in the same way that &TAL did it

In [2]:
class ElectraClassifier(nn.Module):
    
    def __init__(self, pretrained_model_name, num_labels=4):
        super(ElectraClassifier, self).__init__()
        self.num_labels = num_labels
        self.electra = ElectraModel.from_pretrained(pretrained_model_name)
        self.dense = nn.Linear(self.electra.config.hidden_size, self.electra.config.hidden_size)
        self.dropout = nn.Dropout(self.electra.config.hidden_dropout_prob)
        self.out_proj = nn.Linear(self.electra.config.hidden_size, self.num_labels)

    def classifier(self, sequence_output):
        x = sequence_output[:, 0, :]
        x = self.dropout(x)
        x = F.gelu(self.dense(x))
        x = self.dropout(x)
        x = F.gelu(self.dense(x))
        x = self.dropout(x)
        x = F.gelu(self.dense(x))
        x = self.dropout(x)
        logits = self.out_proj(x)
        return logits

    def forward(self, input_ids=None, attention_mask=None):
        discriminator_hidden_states = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = discriminator_hidden_states[0]
        logits = self.classifier(sequence_output)
        return logits

In [25]:
class custom_dataset(Dataset):
    def __init__(self, text, targets, tokenizer, max_len):
        self.text = text
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = self.text[item]
        target = self.targets[item]
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
        )
        return {
          'text': text,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'targets': torch.tensor(target, dtype=torch.long),
        }

def get_data_loader(path, tokenizer, max_len, batch_size):
    # data is stored with its context, in case we want to train a model using the context as well
    dataset = pd.read_csv(path, index_col = 0)
    dataset = remove_invalid_inputs(dataset, 'tweet_full_text')

    data = custom_dataset(
                    text= dataset.tweet_full_text.to_numpy(),  # used to be text
                    targets= dataset.label.to_numpy(),  # used to be target
                    tokenizer=tokenizer,
                    max_len=max_len
                    )

    sampler = RandomSampler(data)
    dataloader = DataLoader(data,batch_size=batch_size,sampler=sampler,pin_memory=True)
    return dataloader, data

def remove_invalid_inputs(dataset,text_column):
    'Simpel metode til at fjerne alle rækker fra en dataframe, baseret på om værdierne i en kolonne er af typen str'
    dataset['valid'] = dataset[text_column].apply(lambda x: isinstance(x, str))
    return dataset.loc[dataset.valid]


In [68]:
model = ElectraClassifier('Maltehb/-l-ctra-danish-electra-small-cased')



Some weights of the model checkpoint at Maltehb/-l-ctra-danish-electra-small-cased were not used when initializing ElectraModel: ['generator.encoder.layer.1.intermediate.dense.weight', 'generator.encoder.layer.5.attention.output.LayerNorm.bias', 'generator.encoder.layer.4.attention.output.dense.bias', 'generator.encoder.layer.6.attention.self.key.weight', 'generator.encoder.layer.10.attention.output.LayerNorm.weight', 'generator.encoder.layer.10.intermediate.dense.weight', 'generator.encoder.layer.2.attention.self.key.weight', 'generator.encoder.layer.3.output.dense.bias', 'generator.encoder.layer.0.attention.output.LayerNorm.bias', 'discriminator_predictions.dense.weight', 'generator.encoder.layer.7.output.dense.bias', 'generator.encoder.layer.4.output.LayerNorm.bias', 'generator.encoder.layer.7.intermediate.dense.bias', 'generator.encoder.layer.6.attention.output.LayerNorm.bias', 'generator.encoder.layer.2.attention.self.value.weight', 'generator.encoder.layer.8.output.dense.weight',

In [23]:
model_checkpoint = 'Maltehb/-l-ctra-danish-electra-small-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [51]:
dataloader, data = get_data_loader('label6.csv', tokenizer=tokenizer, max_len=280, batch_size=5)

In [54]:
dataiter = iter(dataloader)

In [56]:
data = dataiter.next()

In [58]:
data

{'text': ['Fremskrivning fra @Energistyr viser, at øget produktion af biogas gør, at vi er tættere på at nå klimamålet end forventet. Klimavenlig biogas leverer konkrete CO2-reduktioner nu og her og rummer et stort eksportpotentiale for DK. Det har jeg talt med @tv2fyn om #dkpol #dkgreen https://t.co/jqWOX3WaLM',
  'Små skridt i den rigtige retning med ny bilaftale, men slet ikke ambitiøst nok. Godt at flere får mulighed for elbil, og at de største og mest forurenende biler bliver dyrere. Men vi er ikke færdige med at finde CO2-reduktioner på transport #dkpol #dkgreen',
  '@tselsmark @okologidk Det danske klima er for koldt til soja, i stedet kan vi fodre dyr med protein fra græs, ærter, lupiner, hestebønner mm.',
  '»Man skal lede med luppen«: Dan Jørgensen har fremlagt sine grønne bedrifter et år efter klimavalget 🔐 \nhttps://t.co/ARxQno3ajg',
  'Havde håbet at 40% af landbrugsstøtten skulle være grøn. Nu bliver det kun 20 % - hvordan kan det være godt @MogensJensenS ? . @enhedsliste

In [63]:
# taken from python engineer
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [67]:
# freezing layers
model = model.to(device)

# can be given weights might be useful since we have uneven distribution of classes
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

step_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size = 7, gamma=0.1)

model = train_model(model, criterion, optimizer, step_lr_scheduler, num_epochs=1)

Epoch 0/0
----------


NameError: name 'dataloaders' is not defined