# BERT trained on uncleaned data
The models in this notebook were initially trained on the uncleaned dataset.  
The following notebooks (02_BERT) use the cleaned dataset.

In [None]:
import torch
import torch.nn as nn
import tqdm.notebook as tq
import pandas as pd
import numpy as np

from torch.utils.data import DataLoader

from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments, AdamW, BertForSequenceClassification

from datasets import load_dataset

from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint.
checkpoint = 'KB/bert-base-swedish-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Load the raw dataset, shuffle its rows, and split it into train/test parts.
SPLIT_SEED = 42    # fixed seed so the shuffle (and therefore the split) is reproducible
TRAIN_SIZE = 8700  # number of shuffled rows assigned to the training partition

df = pd.read_csv("../data/dataset.csv")
df.columns = ['text', 'label']

# `random` is the fully shuffled frame; `train` and `test` are disjoint slices
# of it, so no row appears in both partitions.
random = df.iloc[np.random.default_rng(SPLIT_SEED).permutation(len(df))]
train = random.iloc[:TRAIN_SIZE]
test = random.iloc[TRAIN_SIZE:]

print(train.shape)
print(test.shape)

In [None]:
# Render the whole DataFrame with the column-width limit lifted so long text
# values are not truncated in the output.
with pd.option_context('display.max_colwidth', None):
    display(df)

In [None]:
# Relative frequency of each label value in the training partition.
train['label'].value_counts(normalize=True)

In [None]:
# Relative frequency of each label value in the test partition.
test['label'].value_counts(normalize=True)

In [None]:
# Write the train partition, the test partition, and the full shuffled frame
# to CSV without the index column. The `display.max_colwidth` option only
# affects how frames are rendered, so no option context is needed for writing.
for frame, csv_path in (
    (train, '../data/train.csv'),
    (test, '../data/test.csv'),
    (random, '../data/random.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Re-load the saved CSV partitions through the `datasets` CSV loader, one
# loader call per file, in train-then-test order.
train_dataset, test_dataset = (
    load_dataset("csv", data_files=csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [None]:
# Show the first `text` value of the loaded training data.
train_dataset['train']['text'][0]

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` field of ``batch`` with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 512 tokens, with special
    tokens included. Returns whatever the tokenizer call returns.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
        'add_special_tokens': True,
    }
    return tokenizer(batch['text'], **encode_options)

# Tokenize both datasets in batched mode and expose only the columns the model
# consumes as torch tensors.
#
# The batch size is taken from the 'train' split explicitly: calling len() on
# the DatasetDict itself returns its number of splits (1 here), which would
# tokenize one row per batch instead of the whole file in a single batch.
train_dataset = train_dataset.map(
    tokenize, batched=True, batch_size=len(train_dataset['train'])
)
test_dataset = test_dataset.map(
    tokenize, batched=True, batch_size=len(test_dataset['train'])
)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Collect the length of every tokenized training sequence, then print the
# longest length (0 if there are no rows) followed by the mean length.
allc = [len(ids) for ids in train_dataset['train']['input_ids']]
c = max(allc, default=0)
print(c)
print(np.mean(allc))

In [None]:
# Show the `label` column of the tokenized training data.
train_dataset['train']['label']

In [None]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation result.

    ``pred`` must expose ``label_ids`` (the reference labels) and
    ``predictions`` (per-class scores); the predicted class is the argmax
    over the last axis of ``predictions``. Precision, recall and F1 are
    computed with ``average='binary'``.

    Returns a dict with the keys ``accuracy``, ``f1``, ``precision`` and
    ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Index layout of the returned tuple: (precision, recall, f1, support).
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = scores[2]
    metrics['precision'] = scores[0]
    metrics['recall'] = scores[1]
    return metrics

# Hyperparameters for a single fine-tuning epoch.
# NOTE(review): with 8700 training rows and a per-device batch of 16, one epoch
# is roughly 544 optimizer steps on a single device, so 500 warmup steps would
# cover most of training - confirm this is intended.
training_args = TrainingArguments(
    # Output locations.
    output_dir='./results',
    logging_dir='./logs',
    # Optimization schedule.
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluation toggle.
    do_eval=True,
)

# Wire the model, the hyperparameters, the data and the metric function into a
# Trainer. Both datasets are read from their 'train' key, which is the key the
# earlier cells use to access the rows loaded from each CSV file.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset['train'],
    eval_dataset=test_dataset['train'],
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model on the training data using the configured Trainer.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset; the cell output is the result of
# this call.
trainer.evaluate()

In [None]:
# Persist the fine-tuned model.
# NOTE(review): Trainer.save_model takes an output directory, so "kb_bert.pt"
# presumably becomes a directory name rather than a single weights file -
# confirm that whatever loads the model later expects a directory.
trainer.save_model("../models/kb_bert.pt")

In [None]:
# Disabled alternative: a manual PyTorch training loop (AdamW, lr 5e-5,
# 3 epochs, batch size 16) kept inside a string literal so that it does not
# execute. The Trainer-based cells are what actually train the model.
"""
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset['train'], batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()"""