In [136]:
# standard library
import os

# data loading, train/test splitting and metrics
import numpy as np
import pandas as pd
import tqdm
import datasets
from datasets import load_dataset
from datasets import load_metric

# PyTorch: used by ElectraClassifier, custom_dataset and get_data_loader further down
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, RandomSampler, DataLoader

# tokenizers, models and the Trainer API for fine tuning
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForTokenClassification
from transformers import ElectraModel
from transformers import TrainingArguments
from transformers import Trainer
from transformers import EarlyStoppingCallback

# project-local helpers
from exam_utils import timeParser
from attack.model_def import ElectraClassifier

In [120]:
# Read the full tweet table from a zip-compressed CSV file.
df_all = pd.read_csv('lemma_all.csv', compression='zip')

In [126]:
# Pass every raw timestamp through timeParser and write the result back into the same column.
df_all.loc[:, 'tweet_created_at'] = df_all['tweet_created_at'].apply(timeParser)

In [128]:
# Keep only the rows whose tweet_created_at compares greater than '2019-06-05'.
# NOTE(review): the right-hand side is a string literal, so the meaning of `>` depends on
# what timeParser returns (string vs. date object) — confirm.
df_all = df_all.loc[df_all.tweet_created_at > '2019-06-05']

In [132]:
# Draw the 59,000 tweets that the fine-tuned model will label.
# The seed is fixed so that re-running the notebook selects the same rows.
df_sample = df_all.sample(59000, random_state=42)

In [134]:
# Persist the sample. The DataFrame index is written as well (pandas' default), which is
# why the reloaded dataset shows an 'Unnamed: 0' column.
df_sample.to_csv('sample_for_prediction.csv')

In [111]:
# Save the table under a new file name.
# NOTE(review): this cell's execution count (In [111]) is lower than that of the cell that
# loads df_all (In [120]), so the notebook was not run top to bottom — confirm which
# version of df_all this file actually contains.
df_all.to_csv('lemma_all_2.csv')

In [137]:
# Load the sampled CSV as a Hugging Face dataset; as the output below shows, all rows end
# up in a single 'train' split.
dataset = load_dataset('csv', data_files = ['sample_for_prediction.csv'])



Using custom data configuration default-3d0ec83b214b51ae


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /Users/jeppefoldberg/.cache/huggingface/datasets/csv/default-3d0ec83b214b51ae/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset csv downloaded and prepared to /Users/jeppefoldberg/.cache/huggingface/datasets/csv/default-3d0ec83b214b51ae/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0. Subsequent calls will reuse this data.


In [147]:
# Display the splits, column names and row counts of the loaded dataset.
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'user_screen_name', 'tweet_id', 'tweet_created_at', 'tweet_full_text', 'tweet_text_lemma', 'tweet_text_lemma_reduced'],
        num_rows: 59000
    })
})

In [10]:
# Inspect the first example of the train split.
# NOTE(review): the recorded output has 'label' and 'max_proba' keys that are absent from
# the feature list printed for sample_for_prediction.csv, so this cell appears to have been
# run against a different (labelled) file — confirm.
dataset['train'][0]

{'Unnamed: 0': 100193,
 'user_screen_name': 'JonBurgwald',
 'tweet_id': 1341060279845720064,
 'tweet_created_at': '2020-12-21',
 'tweet_full_text': '.@MaiVilladsen på Folketingets talerstol: "Vi har indgået mange grønne aftaler det sidste halve år. De fleste har været fremskridt, men fremskridtene har ikke været store nok". Nemlig rigtigt. Vi kommer ikke udenom at genåbne flere af dem. #dkgreen #dkpol',
 'tweet_text_lemma': 'folketing talerstol indgå grøn aftale sidste halv år fremskridt fremskridt stor rigtig udenom genåbne',
 'tweet_text_lemma_reduced': 'folketing talerstol indgå aftale år fremskridt fremskridt genåbne',
 'max_proba': 0.2537778234568019,
 'label': 1,
 'Unnamed: 0.1': None}

In [11]:
# Hold out a test split. The seed pins the shuffle so the split is the same on every run.
# NOTE(review): this rebinds `dataset`, so running the cell a second time splits the
# previous train split again.
dataset = dataset['train'].train_test_split(seed=42)

In [139]:
# Load the tokenizer that belongs to the pretrained cased Danish ELECTRA checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Maltehb/-l-ctra-danish-electra-small-cased")

# tokenizing the datasets
def tokenize_function(examples):
    """Tokenize the ``tweet_full_text`` entries of a batch of examples.

    Every text is truncated or padded to exactly 512 tokens. Uses the
    module-level ``tokenizer`` and returns whatever it returns.
    """
    tweets = examples['tweet_full_text']
    encoded = tokenizer(tweets, truncation=True, padding='max_length', max_length=512)
    return encoded

In [140]:
# Apply tokenize_function to the dataset in batches; the tokenizer outputs are added as new columns.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

HBox(children=(FloatProgress(value=0.0, max=59.0), HTML(value='')))




In [151]:
# Dataset that the fine-tuned model will label.
# NOTE(review): the recorded output of the prediction cells reports 59000 rows, i.e. the
# whole sample, so 'train' here seems to be the unsplit CSV rather than a training split — confirm.
sample_predict_dataset = tokenized_datasets['train']

In [36]:
# creating the model for finetuning
# Sequence-classification model built on the pretrained checkpoint with a 4-way output head.
# The classifier weights are newly initialised (see the warning printed below), so the
# model has to be fine-tuned before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained('Maltehb/-l-ctra-danish-electra-small-cased', num_labels=4)


Some weights of the model checkpoint at Maltehb/-l-ctra-danish-electra-small-cased were not used when initializing ElectraForSequenceClassification: ['generator.encoder.layer.4.intermediate.dense.weight', 'generator.encoder.layer.9.output.dense.bias', 'discriminator_predictions.dense.bias', 'generator.embeddings.token_type_embeddings.weight', 'generator.encoder.layer.0.attention.self.value.weight', 'generator.encoder.layer.1.attention.self.value.weight', 'generator.encoder.layer.6.output.dense.weight', 'generator.encoder.layer.2.output.LayerNorm.bias', 'generator.encoder.layer.3.attention.output.LayerNorm.bias', 'generator.encoder.layer.8.output.dense.bias', 'generator.encoder.layer.6.output.LayerNorm.bias', 'generator.encoder.layer.7.attention.self.key.weight', 'generator.encoder.layer.3.attention.output.dense.bias', 'generator.encoder.layer.4.attention.output.dense.weight', 'generator.encoder.layer.10.attention.self.key.bias', 'generator.encoder.layer.5.attention.self.key.bias', 'gen

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at Maltehb/-l-ctra-danish-electra-small-cased and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# Complete splits, handed to the Trainer for the real fine-tuning run.
full_train_dataset = tokenized_datasets['train']
full_eval_dataset = tokenized_datasets['test']

# Eight shuffled examples from each split (fixed seed) — presumably for quick trial runs.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(8))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(8))

### Training the classifier with our data
We do not freeze any of the pretrained layers, since freezing is generally not considered good practice when fine-tuning with 🤗 Transformers.

In [None]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='final_results',
    num_train_epochs=30,
    evaluation_strategy='epoch',      # evaluate and compute metrics after every epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.02,               # strength of weight decay; higher values regularise more
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,
    logging_steps=10,
    metric_for_best_model='accuracy'
)

# Accuracy metric used by compute_metrics.
metric = load_metric("accuracy")
# Early stopping with a patience of 5 evaluations.
cb = [EarlyStoppingCallback(early_stopping_patience=5)]

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of each
    example is the argmax over the last axis of the logits; the result of
    ``metric.compute`` is returned unchanged.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

# Wire the model, the training arguments, the full train/eval splits, the accuracy
# computation and the early-stopping callback into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
    compute_metrics=compute_metrics,
    callbacks = cb
)

In [50]:
# Run the fine-tuning.
# NOTE(review): the recorded table below ends at epoch 10 while the TrainOutput reports
# 'epoch': 16.0 — the saved output looks truncated or comes from different runs; confirm.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.3773,1.368871,0.349091
2,1.3665,1.362964,0.349091
3,1.3754,1.354841,0.349091
4,1.3484,1.3381,0.349091
5,1.3212,1.297163,0.429091
6,1.1882,1.219107,0.48
7,1.1068,1.1802,0.538182
8,0.9563,1.108454,0.545455
9,0.7942,1.095311,0.545455
10,0.5816,1.098373,0.549091


TrainOutput(global_step=832, training_loss=0.8217360883360155, metrics={'train_runtime': 15136.4916, 'train_samples_per_second': 0.103, 'total_flos': 0, 'epoch': 16.0})

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

## Predicting labels for the dataset

In [159]:
def get_labels(trainer, dataset):
    """Return the predicted class index for every example in ``dataset``.

    Calls ``trainer.predict`` once, prints a progress message, and then takes
    the argmax of each row of the returned ``predictions``.
    """
    prediction_output = trainer.predict(dataset)
    print('Done with the first part')
    return [np.argmax(row_scores) for row_scores in prediction_output.predictions]

In [162]:
# Predict a label for every row of the sample.
labels = get_labels(trainer, sample_predict_dataset)
# Audible "I am done" signal once prediction finishes — presumably the macOS `say` command,
# so this line only works on a machine that has it. The 0 shown below is os.system's return value.
os.system('say "jeg er færdig"')

Done with the first part


0

In [170]:
# Sanity check: number of predicted labels (the recorded output matches the 59000 sampled rows).
len(labels)

59000

In [171]:
# Attach the predicted labels to the dataset as a new 'label_pred' column.
sample_predicted = sample_predict_dataset.add_column('label_pred', labels)

In [172]:
# Write the labelled sample to disk.
# NOTE(review): the dataset still carries the tokenizer columns (input_ids, attention_mask,
# token_type_ids), so they are written too; the number printed below is presumably the
# size of the file in bytes — consider dropping those columns first.
sample_predicted.to_csv('full_59000_predicted.csv')

328448359

In [173]:
# Display the columns and row count of the labelled sample.
sample_predicted

Dataset({
    features: ['Unnamed: 0', 'attention_mask', 'input_ids', 'token_type_ids', 'tweet_created_at', 'tweet_full_text', 'tweet_id', 'tweet_text_lemma', 'tweet_text_lemma_reduced', 'user_screen_name', 'label_pred'],
    num_rows: 59000
})

In [106]:
# Save the whole model object (not just its state dict) with torch.save.
# NOTE(review): the first load_model below reads 'saving_models_attempt/nearly_done_full_model.pt',
# not this file — confirm which file name is intended.
torch.save(model, 'saving_models_attempt/full_model.pt')

In [32]:
def load_model():
    """Load the checkpoint's fast tokenizer and a whole pickled model from disk.

    Returns:
        tuple: ``(model, tokenizer)`` with the model switched to eval mode.
    """
    checkpoint_name = 'Maltehb/-l-ctra-danish-electra-small-cased'
    fast_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name, use_fast=True)

    # torch.load unpickles the file and can therefore execute arbitrary code:
    # only point it at files you trust.
    # NOTE(review): a different cell saves to 'saving_models_attempt/full_model.pt';
    # confirm that this file name is the intended one.
    restored_model = torch.load('saving_models_attempt/nearly_done_full_model.pt')

    # Alternative that was tried here: build ElectraClassifier(checkpoint, 4) and load the
    # state dict from 'nearly_done_full_model.pt' with map_location=torch.device('cpu').

    restored_model.eval()

    return (restored_model, fast_tokenizer)

def make_prediction(dataset):
    """Predict the class index for one tokenized example.

    Args:
        dataset: mapping with ``'input_ids'`` and ``'attention_mask'`` entries, for
            example one row of a tokenized dataset. Python lists, arrays and tensors
            are all accepted; an un-batched (1-D) sequence gets a leading batch
            dimension of size one.

    Returns:
        int: index of the largest logit.

    Uses the module-level ``model``.
    """
    # Rows of a tokenized dataset hold plain lists, while the model needs tensors.
    input_ids = torch.as_tensor(dataset['input_ids'])
    attention_masks = torch.as_tensor(dataset['attention_mask'])
    if input_ids.dim() == 1:
        input_ids = input_ids.unsqueeze(0)
    if attention_masks.dim() == 1:
        attention_masks = attention_masks.unsqueeze(0)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_masks)

    # Hugging Face classification models return an output object carrying `.logits`,
    # whereas a plain nn.Module such as ElectraClassifier returns the logits tensor itself.
    logits = getattr(outputs, 'logits', outputs)

    _, preds = torch.max(logits, dim=1)
    return int(preds)

In [29]:
# Rebind the notebook-level `model` and `tokenizer` to the objects returned by load_model.
model, tokenizer = load_model()

In [39]:
# Split a sample sentence into tokens with the loaded tokenizer.
# NOTE(review): the name `data` is reused for other objects in later cells.
data = tokenizer.tokenize('spolitik har ikke gjort noget som helst godt for klimaet')

## Trying to use &TAL's algorithm

In [70]:
def load_model():
    """Build a two-class ``ElectraClassifier`` and load its weights from disk.

    The weights are read from ``attack/pytorch_model.bin`` and mapped onto the CPU.
    NOTE(review): this redefines the ``load_model`` from an earlier cell.

    Returns:
        tuple: ``(model, tokenizer)`` with the model switched to eval mode.
    """
    checkpoint_name = 'Maltehb/-l-ctra-danish-electra-small-cased'
    weights_file = 'attack/pytorch_model.bin'

    fast_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name, use_fast=True)

    classifier = ElectraClassifier(checkpoint_name, 2)
    # torch.load unpickles the file: only use it on files you trust.
    state_dict = torch.load(weights_file, map_location=torch.device('cpu'))
    classifier.load_state_dict(state_dict)
    classifier.eval()

    return (classifier, fast_tokenizer)

def make_prediction(text):
    """Predict the class index for a single piece of raw text.

    The text is tokenized with the module-level ``tokenizer`` (truncated or padded
    to 512 tokens, returned as PyTorch tensors) and passed through the module-level
    ``model``, which is expected to return a logits tensor.

    Args:
        text: the text to classify.

    Returns:
        int: index of the largest logit.
    """
    tokenized_text = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )
    input_ids = tokenized_text['input_ids']
    attention_masks = tokenized_text['attention_mask']

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_ids, attention_masks)

    _, preds = torch.max(logits, dim=1)
    return int(preds)

In [71]:
# Rebind the notebook-level `model` and `tokenizer` to the two-class classifier and its tokenizer.
model, tokenizer = load_model()

Some weights of the model checkpoint at Maltehb/-l-ctra-danish-electra-small-cased were not used when initializing ElectraModel: ['generator.encoder.layer.1.intermediate.dense.weight', 'generator.encoder.layer.5.attention.output.LayerNorm.bias', 'generator.encoder.layer.4.attention.output.dense.bias', 'generator.encoder.layer.6.attention.self.key.weight', 'generator.encoder.layer.10.attention.output.LayerNorm.weight', 'generator.encoder.layer.10.intermediate.dense.weight', 'generator.encoder.layer.2.attention.self.key.weight', 'generator.encoder.layer.3.output.dense.bias', 'generator.encoder.layer.0.attention.output.LayerNorm.bias', 'discriminator_predictions.dense.weight', 'generator.encoder.layer.7.output.dense.bias', 'generator.encoder.layer.4.output.LayerNorm.bias', 'generator.encoder.layer.7.intermediate.dense.bias', 'generator.encoder.layer.6.attention.output.LayerNorm.bias', 'generator.encoder.layer.2.attention.self.value.weight', 'generator.encoder.layer.8.output.dense.weight',

In [76]:
# Smoke test: classify one short sentence (the recorded result is class 0).
make_prediction('Helt sikkert din torsk')

0

In [None]:
# Weights file of the local checkpoint.
path_to_model = "models/Ælæctra_uncased_32k/pytorch_model.bin"
tokenizer = AutoTokenizer.from_pretrained("models/-l-ctra-danish-electra-small-uncased-ner-dane")
# from_pretrained expects a model id or the *directory* that holds the weights together with
# config.json, not the path of the weights file itself, so pass the containing folder.
# NOTE(review): the tokenizer and the model come from different folders — confirm they match.
model = AutoModelForTokenClassification.from_pretrained(os.path.dirname(path_to_model))

## Trying to finetune Ælæctra in the same way that &TAL did it

In [2]:
class ElectraClassifier(nn.Module):
    """ELECTRA encoder followed by a small feed-forward classification head.

    The head takes the hidden state at sequence position 0, applies the *same*
    dense layer three times (GELU and dropout after each pass) and projects
    the result to ``num_labels`` logits.
    """

    def __init__(self, pretrained_model_name, num_labels=4):
        """Load the pretrained encoder and create the head layers.

        Args:
            pretrained_model_name: name or path given to ``ElectraModel.from_pretrained``.
            num_labels: size of the output layer.
        """
        super().__init__()
        self.num_labels = num_labels
        self.electra = ElectraModel.from_pretrained(pretrained_model_name)
        encoder_config = self.electra.config
        self.dense = nn.Linear(encoder_config.hidden_size, encoder_config.hidden_size)
        self.dropout = nn.Dropout(encoder_config.hidden_dropout_prob)
        self.out_proj = nn.Linear(encoder_config.hidden_size, self.num_labels)

    def classifier(self, sequence_output):
        """Turn the encoder's sequence output into logits.

        Only position 0 of the second axis of ``sequence_output`` is used.
        """
        features = self.dropout(sequence_output[:, 0, :])
        # Three passes through the one shared dense layer, each followed by GELU and dropout.
        for _ in range(3):
            features = self.dropout(F.gelu(self.dense(features)))
        return self.out_proj(features)

    def forward(self, input_ids=None, attention_mask=None):
        """Run the encoder and return the logits computed from its first output."""
        encoder_outputs = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(encoder_outputs[0])

In [25]:
class custom_dataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its target."""

    def __init__(self, text, targets, tokenizer, max_len):
        """Store the texts, their targets, the tokenizer and the padded length."""
        self.text, self.targets = text, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Return the raw text, its flattened token ids and attention mask, and the target.

        The text is truncated or padded to ``max_len``; the target is returned as
        a ``torch.long`` tensor.
        """
        raw_text = self.text[item]
        raw_target = self.targets[item]
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
        )
        # The tokenizer output is flattened so that each item is one-dimensional.
        token_ids = encoded['input_ids'].flatten()
        mask = encoded['attention_mask'].flatten()
        target_tensor = torch.tensor(raw_target, dtype=torch.long)
        return {
            'text': raw_text,
            'input_ids': token_ids,
            'attention_mask': mask,
            'targets': target_tensor,
        }

def get_data_loader(path, tokenizer, max_len, batch_size):
    """Build a randomly sampled DataLoader over the labelled tweets in a CSV file.

    The CSV is read with its first column as the index, rows whose
    ``tweet_full_text`` is not a string are removed, and the remaining texts
    and ``label`` values are wrapped in a ``custom_dataset``.

    Args:
        path: CSV file to read.
        tokenizer: tokenizer handed on to ``custom_dataset``.
        max_len: padded/truncated length handed on to ``custom_dataset``.
        batch_size: number of examples per batch.

    Returns:
        tuple: ``(dataloader, data)`` — the DataLoader and the underlying dataset.
    """
    frame = pd.read_csv(path, index_col=0)
    frame = remove_invalid_inputs(frame, 'tweet_full_text')

    tweet_texts = frame['tweet_full_text'].to_numpy()
    tweet_labels = frame['label'].to_numpy()
    data = custom_dataset(
        text=tweet_texts,
        targets=tweet_labels,
        tokenizer=tokenizer,
        max_len=max_len,
    )

    dataloader = DataLoader(
        data,
        batch_size=batch_size,
        sampler=RandomSampler(data),
        pin_memory=True,
    )
    return dataloader, data

def remove_invalid_inputs(dataset, text_column):
    """Keep only the rows of a DataFrame whose ``text_column`` value is a ``str``.

    Args:
        dataset: the DataFrame to filter. It is left untouched.
        text_column: name of the column that must contain strings.

    Returns:
        A new DataFrame holding the valid rows, including a boolean ``valid``
        column (always ``True`` in the result).
    """
    # Build the flag on a copy so that the caller's frame does not gain a 'valid' column.
    is_text = dataset[text_column].map(lambda value: isinstance(value, str)).astype(bool)
    flagged = dataset.assign(valid=is_text)
    return flagged.loc[flagged['valid']]


In [23]:
# Checkpoint name and its fast tokenizer for the fine-tuning experiment below.
# NOTE(review): this is the *uncased* checkpoint, whereas the other cells load the cased
# one — confirm that the switch is intended.
model_checkpoint = 'Maltehb/-l-ctra-danish-electra-small-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [51]:
# Build the DataLoader (batches of 5) and the underlying dataset from the labelled CSV.
# NOTE(review): max_len is a length in tokens; 280 looks like it was taken from the tweet
# character limit — confirm.
dataloader, data = get_data_loader('label6.csv', tokenizer=tokenizer, max_len=280, batch_size=5)

In [54]:
# Create an iterator over the DataLoader so that single batches can be pulled by hand.
dataiter = iter(dataloader)

In [56]:
# Pull one batch from the iterator. The builtin next() is used because the iterator's
# Python-2 style .next() method is not available in newer PyTorch releases.
# NOTE(review): this rebinds `data`, which previously held the dataset returned by get_data_loader.
data = next(dataiter)