# BERT trained on cleaned data
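Fine-tunes the KB Swedish BERT model (`KB/bert-base-swedish-cased`) for binary sequence classification on the cleaned dataset (`dataset_no_recipe.csv`).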

In [None]:
import torch
import torch.nn as nn

from transformers import BertTokenizer ,AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, BertConfig, TrainingArguments, Trainer

from datasets import load_dataset
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Hub checkpoint shared by the tokenizer and the classification model.
MODEL_NAME = 'KB/bert-base-swedish-cased'

# Load the sequence-classification model and its matching tokenizer
# from the same checkpoint so that vocabulary and weights agree.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
# (To force CPU execution, assign torch.device("cpu") unconditionally.)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
class BertModelWrapper(nn.Module):
    """Expose a sequence classifier as a single-score module.

    The wrapped model is called on token ids and the result is reduced to
    one value per input row: the softmax probability of class index 1.
    """

    def __init__(self, model):
        """Store the sequence-classification model to wrap.

        Args:
            model: Callable model that accepts ``input_ids`` (and optionally
                ``attention_mask``) and whose first output holds the class
                logits.
        """
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask=None):
        """Return the probability of class 1 for every row of ``input_ids``.

        Args:
            input_ids: Batch of token-id sequences.
            attention_mask: Optional mask forwarded unchanged to the model.

        Returns:
            Tensor with one column holding the class-1 probability per row.
        """
        # Run the complete classifier, not just ``self.model.bert``: the bare
        # encoder's second output is the pooled hidden state, so applying
        # softmax to it skips the classification head and does not yield
        # class probabilities.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # With no labels supplied, the first output element holds the logits.
        logits = outputs[0]
        return torch.softmax(logits, dim=1)[:, 1].unsqueeze(1)

In [None]:
def input_ref(model_wrapper, sentence, max_length=512):
    """Encode a sentence and build a same-length padding reference.

    Args:
        model_wrapper: Unused; kept so existing callers keep working.
        sentence: Text to encode with the module-level ``tokenizer``.
        max_length: Maximum number of token ids (special tokens included).
            Longer inputs are truncated, mirroring the ``max_length=512``
            truncation applied by the training tokenization in this file.

    Returns:
        Tuple ``(input_ids, ref_input_ids)`` of tensors on ``device``, each
        holding a single row. The reference keeps the leading CLS and
        trailing SEP ids and replaces every token in between with the
        padding id.
    """
    encoded = tokenizer.encode(
        sentence,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
    )
    input_ids = torch.tensor([encoded], device=device)

    ref_token_id = tokenizer.pad_token_id  # filler for every ordinary token
    sep_token_id = tokenizer.sep_token_id  # placed at the end of the sequence
    cls_token_id = tokenizer.cls_token_id  # placed at the start of the sequence

    # Same length as the real input: CLS + padding + SEP.
    ref_input_ids = [cls_token_id] + (len(encoded) - 2) * [ref_token_id] + [sep_token_id]
    ref_input_ids = torch.tensor([ref_input_ids], device=device)

    return input_ids, ref_input_ids

In [None]:
# Seed NumPy so the shuffle below (and therefore the split) is reproducible.
np.random.seed(0)
with pd.option_context('display.max_colwidth', None):
    df = pd.read_csv("../data/dataset_no_recipe.csv")
    df.columns = ['text', 'label']

    # Shuffle every row, then cut at the 80% mark:
    # the head becomes the training split, the tail the test split.
    shuffled_order = np.random.permutation(len(df))
    random = df.iloc[shuffled_order]
    split_at = round(len(df) * .8)
    train, test = random.iloc[:split_at], random.iloc[split_at:]
# Report the (rows, columns) shape of each split, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
# Preview the first 10 rows; the column-width limit is lifted for this
# display so long text values are not cut off.
with pd.option_context('display.max_colwidth', None):
    display(df.head(10))

In [None]:
# Relative frequency of each label in the training split.
train['label'].value_counts(normalize=True)

In [None]:
# Relative frequency of each label in the test split.
test['label'].value_counts(normalize=True)

In [None]:
# Write both splits to CSV without the index column.
# NOTE(review): the display option looks unnecessary for writing files —
# confirm and drop the context manager if so.
with pd.option_context('display.max_colwidth', None):
    for frame, destination in ((train, '../data/train2.csv'), (test, '../data/test2.csv')):
        frame.to_csv(destination, index=False)

In [None]:
# Reload the exported splits through the `datasets` CSV loader. Each result
# is later indexed with the 'train' key to reach its rows.
train_dataset, test_dataset = (
    load_dataset("csv", data_files=csv_path)
    for csv_path in ('../data/train2.csv', '../data/test2.csv')
)

In [None]:
# Sanity check: show the text of the first training example.
train_dataset['train']['text'][0]

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch to a fixed length of 512.

    Every sequence receives special tokens and is truncated or padded to
    exactly 512 token ids, so all rows come out the same length.
    """
    encoder_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
        'add_special_tokens': True,
    }
    return tokenizer(batch['text'], **encoder_options)

# Tokenize both splits in batches. The previous `batch_size=len(train_dataset)`
# measured the DatasetDict (its number of splits, i.e. 1) rather than the
# number of rows, so examples were tokenized one at a time. `tokenize` pads
# every sequence to a fixed max_length, so the result does not depend on the
# batch size and the library default can be used.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Return only the columns needed for training, as torch tensors.
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Show a summary of the tokenized training split.
train_dataset['train']

In [None]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted label).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and
        ``recall``; the last three use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth returned element (support) is not needed.
    precision, recall, f1 = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )[:3]
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

# Hyper-parameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Schedule and regularisation.
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    do_eval=True,
)

# Bind model, data and metrics together. Both datasets were loaded from a
# single CSV each, so their rows sit under the 'train' key.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset['train'],
    eval_dataset=test_dataset['train'],
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model with the configured Trainer.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset; compute_metrics supplies
# accuracy, F1, precision and recall.
trainer.evaluate()

In [None]:
# Persist the fine-tuned model under ../models/kb_bert.pt.
# NOTE(review): save_model presumably treats this path as an output
# directory rather than a single file — confirm before loading it elsewhere.
trainer.save_model("../models/kb_bert.pt")