In [1]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m97.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [3]:
import os
import pandas as pd
import numpy as np
import random
import json
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification
from sklearn.metrics import classification_report, fbeta_score
from timeit import default_timer as timer
from scipy.optimize import basinhopping
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification

# Load dataset

In [4]:
df = pd.read_csv('/content/drive/MyDrive/data.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,_id,full_text_with_title,category,råvaremarked,aksjer,børs,valuta,other
0,0,62eb8f5b148b7abcd515bcf4,På bare noen uker ble et «lite» Hongkong-baser...,[{'name': 'børs'}],0,0,1,0,0
1,1,62eb8f92148b7abcd515c662,Usikkerhet preger aksjemarkedet: – Det er en ø...,[{'name': 'børs'}],0,0,1,0,0
2,2,62ebc71d148b7abcd51f3313,Bloomberg: Schibsted forbereder salg av Lendo ...,[{'name': 'børs'}],0,0,1,0,0
3,3,62ebcb6d148b7abcd51ff3cb,Wall Street åpner flatt Oppturen fra onsdag sm...,[{'name': 'børs'}],0,0,1,0,0
4,4,62f0b7b9148b7abcd50013ed,Oslo Børs stiger fra start På ukens første han...,"[{'name': 'aksjer'}, {'name': 'børs'}]",0,1,1,0,0


In [6]:
df['category'][0]

"[{'name': 'børs'}]"

# Process dataset


## take labels out of dict

In [7]:
def label_extract(row):
  cats = []
  if type(row['category']) is str:
    # print(type(row['category']))
    cat_list = eval(row['category'])
    for i in cat_list:
      cats.append(i['name'])
  else:
    cats.append('other')

  return cats

df['category'] = df.apply(label_extract, axis = 1)

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,_id,full_text_with_title,category,råvaremarked,aksjer,børs,valuta,other
0,0,62eb8f5b148b7abcd515bcf4,På bare noen uker ble et «lite» Hongkong-baser...,[børs],0,0,1,0,0
1,1,62eb8f92148b7abcd515c662,Usikkerhet preger aksjemarkedet: – Det er en ø...,[børs],0,0,1,0,0
2,2,62ebc71d148b7abcd51f3313,Bloomberg: Schibsted forbereder salg av Lendo ...,[børs],0,0,1,0,0
3,3,62ebcb6d148b7abcd51ff3cb,Wall Street åpner flatt Oppturen fra onsdag sm...,[børs],0,0,1,0,0
4,4,62f0b7b9148b7abcd50013ed,Oslo Børs stiger fra start På ukens første han...,"[aksjer, børs]",0,1,1,0,0


## Encode classes and split train-test dataset

In [9]:
# encode classes using multi label Binarizer
def get_classes_list():
    classes = ['råvaremarked', 'aksjer', 'børs', 'valuta', 'other']
    return classes, len(classes)
classes, num_classes = get_classes_list()
mlb = MultiLabelBinarizer(classes=classes)

In [10]:
df_test = df[:100]

In [11]:
train, test = train_test_split(df, random_state=66, test_size=0.25, shuffle=True)

In [12]:
X_train, y_train = train['full_text_with_title'], mlb.fit_transform(train['category'])
X_val, y_val  = test['full_text_with_title'], mlb.fit_transform(test['category'])

## load tokenizer and encode text data

In [13]:
tokenizer = BertTokenizer.from_pretrained('NbAiLab/nb-bert-base', do_lower_case=True)

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/746 [00:00<?, ?B/s]

In [14]:
encoded_data_train = tokenizer.batch_encode_plus(
    X_train, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='longest', 
    truncation=True,
    max_length=512, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    X_val, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='longest', 
    truncation=True,
    max_length=512, 
    return_tensors='pt'
)

In [15]:
encoded_data_train['input_ids'].shape

torch.Size([1400, 512])

## Build dataset and dataloader

In [16]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_train)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(y_val)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [17]:
batch_size = 3

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

# Build model

## Download model

In [26]:
model = BertForSequenceClassification.from_pretrained("NbAiLab/nb-bert-base",
                                                      num_labels=5,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at NbAiLab/nb-bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initi

In [20]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("NbAiLab/nb-bert-base", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=5,
                                                           )

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at NbAiLab/nb-bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initi

In [18]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=5,
                                                           )

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

## set hyperparameters 

In [19]:
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
epochs = 5
device = 'cuda'
criterion = torch.nn.MultiLabelSoftMarginLoss()
best_score = 0.

save_dir = '/content/drive/MyDrive/'

In [20]:
optimizer = torch.optim.AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)            
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [21]:
def result(dir_path, run_name, is_best, state):
    result_file = os.path.join(dir_path,
                               run_name + '-model_best.pth')
    if is_best:
        torch.save(state, result_file)
        print("Result saved to {}".format(result_file))

## Threshold searching function

In [22]:
def best_f2_score(true_labels, predictions):
    def f_neg(threshold):
        ## Scipy tries to minimize the function so we must get its inverse 
        return - fbeta_score(true_labels, predictions > threshold, beta=2, average='samples')

    # Initialization of best threshold search
    thr_0 = [0.20] * 5
    constraints = [(0., 1.)] * 5

    def bounds(**kwargs):
        x = kwargs["x_new"]
        tmax = bool(np.all(x <= 1))
        tmin = bool(np.all(x >= 0))
        return tmax and tmin

    # Search using L-BFGS-B, the epsilon step must be big otherwise there is no gradient
    minimizer_kwargs = {"method": "L-BFGS-B",
                        "bounds": constraints,
                        "options": {
                            "eps": 0.05
                        }
                        }

    # We combine L-BFGS-B with Basinhopping for stochastic search with random steps
    print("===> Searching optimal threshold for each label")
    # logger.info("===> Searching optimal threshold for each label")
    start_time = timer()

    opt_output = basinhopping(f_neg, thr_0,
                              stepsize=0.1,
                              minimizer_kwargs=minimizer_kwargs,
                              niter=10,
                              accept_test=bounds)

    end_time = timer()
    print("===> Optimal threshold for each label:\n{}".format(opt_output.x))
    print("Threshold found in: %s seconds" % (end_time - start_time))
    # logger.info("===> Optimal threshold for each label:\n{}".format(opt_output.x))
    # logger.info("Threshold found in: %s seconds" % (end_time - start_time))

    score = - opt_output.fun
    return score, opt_output.x

## Evalutation function

In [23]:
def evaluate(epoch, dataloader_val, model, loss_func, mlb):

    model.eval()
    loss_val_total = 0
    predictions, true_vals, val_his = [], [], []
    print("Starting Validation")

    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  # 'labels':         batch[2],
                 }
        labels = batch[2]
        with torch.no_grad():        
            outputs = model(**inputs)
            
        logits = outputs['logits']
        pred = torch.sigmoid(logits)
        predictions.append(pred.data.cpu().numpy())

        loss_val_total += loss_func(logits, labels).data

        label_ids = labels.cpu().numpy()
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    val_his.append(loss_val_avg)
    
    predictions = np.vstack(predictions)
    true_vals = np.vstack(true_vals)

    score, threshold = best_f2_score(true_vals, predictions)
    print("Corresponding tags\n{}".format(mlb.classes_))

    pred_labels = torch.as_tensor((predictions - threshold) > 0, dtype=torch.int32)
    report = classification_report(true_vals, pred_labels, target_names=mlb.classes_, output_dict=True)
    mAP = report['samples avg']['precision']
    # print(mAP)
    
    print("===> Validation - Avg. loss: {:.4f}\tAverage precision Score: {:.4f}".format(loss_val_avg, score))
            
    return loss_val_avg, score, threshold, mAP, report #, predictions, true_vals

## Train function

In [24]:
def train(epoch, dataloader_train, model, loss_func, scheduler, optimizer, loss_his=[]):
    model.train()
    model.to(device)

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':      batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                  # 'labels':         batch[2],
                }       
        labels = batch[2].to(device)
        outputs = model(**inputs)
        logits = outputs['logits']
        
        loss = loss_func(logits, labels)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{}'.format(loss.item()/len(batch))})
        loss_his.append(loss.data)
    return loss_his

# Training and save model

In [30]:
# train
val_his,mean_AP, training_his, reports = [], [], [], []

for epoch in tqdm(range(1, epochs+1)):
    # Train and validate
    train_his = train(epoch=epoch, dataloader_train=dataloader_train, \
                      model=model, loss_func=criterion, scheduler=scheduler, optimizer=optimizer)
    loss_val_avg, score, threshold, mAP, report = evaluate(epoch=epoch, \
                                                   dataloader_val=dataloader_validation, \
                                                   model=model, loss_func=criterion, \
                                                   mlb=mlb)
    
    training_his.append(train_his)
    val_his.append(loss_val_avg)
    mean_AP.append(mAP)
    reports.append(report)

    # Save
    is_best = score > best_score
    best_score = max(score, best_score)

    result(dir_path=save_dir, run_name='finetune1', is_best = is_best, state = {
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_score': best_score,
        'optimizer': optimizer.state_dict(),
        'threshold': threshold,
        'val_loss': loss_val_avg
    })

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/175 [00:00<?, ?it/s]

Starting Validation
===> Searching optimal threshold for each label
===> Optimal threshold for each label:
[0.28148101 0.25169905 0.19833826 0.23265807 0.09898173]
Threshold found in: 4.581831262999913 seconds
Corresponding tags
['råvaremarked' 'aksjer' 'børs' 'valuta' 'other']
===> Validation - Avg. loss: 0.2453	Average precision Score: 0.8263
Result saved to /content/drive/MyDrive/finetune1-model_best.pth


Epoch 2:   0%|          | 0/175 [00:00<?, ?it/s]

Starting Validation
===> Searching optimal threshold for each label
===> Optimal threshold for each label:
[0.2251212  0.16597977 0.2050984  0.2        0.18368512]
Threshold found in: 3.7132068780001646 seconds
Corresponding tags
['råvaremarked' 'aksjer' 'børs' 'valuta' 'other']
===> Validation - Avg. loss: 0.2456	Average precision Score: 0.8228


Epoch 3:   0%|          | 0/175 [00:00<?, ?it/s]

Starting Validation
===> Searching optimal threshold for each label
===> Optimal threshold for each label:
[0.1802122  0.15952792 0.08374175 0.2        0.14714107]
Threshold found in: 3.0043097410000428 seconds
Corresponding tags
['råvaremarked' 'aksjer' 'børs' 'valuta' 'other']
===> Validation - Avg. loss: 0.2544	Average precision Score: 0.8198


Epoch 4:   0%|          | 0/175 [00:00<?, ?it/s]

Starting Validation
===> Searching optimal threshold for each label
===> Optimal threshold for each label:
[0.2090151  0.14046607 0.31127624 0.0576846  0.19979241]
Threshold found in: 4.704706270000088 seconds
Corresponding tags
['råvaremarked' 'aksjer' 'børs' 'valuta' 'other']
===> Validation - Avg. loss: 0.2518	Average precision Score: 0.8225


Epoch 5:   0%|          | 0/175 [00:00<?, ?it/s]

Starting Validation
===> Searching optimal threshold for each label
===> Optimal threshold for each label:
[0.13979088 0.13818047 0.08150413 0.20814828 0.20011217]
Threshold found in: 5.2434675569998035 seconds
Corresponding tags
['råvaremarked' 'aksjer' 'børs' 'valuta' 'other']
===> Validation - Avg. loss: 0.2495	Average precision Score: 0.8316
Result saved to /content/drive/MyDrive/finetune1-model_best.pth


In [27]:
# train
val_his,mean_AP, training_his, reports = [], [], [], []

for epoch in tqdm(range(1, epochs+1)):
    # Train and validate
    train_his = train(epoch=epoch, dataloader_train=dataloader_train, \
                      model=model, loss_func=criterion, scheduler=scheduler, optimizer=optimizer)
    loss_val_avg, score, threshold, mAP, report = evaluate(epoch=epoch, \
                                                   dataloader_val=dataloader_validation, \
                                                   model=model, loss_func=criterion, \
                                                   mlb=mlb)
    
    training_his.append(train_his)
    val_his.append(loss_val_avg)
    mean_AP.append(mAP)
    reports.append(report)

    # Save
    is_best = score > best_score
    best_score = max(score, best_score)

    result(dir_path=save_dir, run_name='finetune', is_best = is_best, state = {
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_score': best_score,
        'optimizer': optimizer.state_dict(),
        'threshold': threshold,
        'val_loss': loss_val_avg
    })

  0%|          | 0/5 [00:00<?, ?it/s]

RuntimeError: ignored

In [25]:
# train
val_his,mean_AP, training_his, reports = [], [], [], []

for epoch in tqdm(range(1, epochs+1)):
    # Train and validate
    train_his = train(epoch=epoch, dataloader_train=dataloader_train, \
                      model=model, loss_func=criterion, scheduler=scheduler, optimizer=optimizer)
    loss_val_avg, score, threshold, mAP, report = evaluate(epoch=epoch, \
                                                   dataloader_val=dataloader_validation, \
                                                   model=model, loss_func=criterion, \
                                                   mlb=mlb)
    
    training_his.append(train_his)
    val_his.append(loss_val_avg)
    mean_AP.append(mAP)
    reports.append(report)

    # Save
    is_best = score > best_score
    best_score = max(score, best_score)

    result(dir_path=save_dir, run_name='multibert', is_best = is_best, state = {
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_score': best_score,
        'optimizer': optimizer.state_dict(),
        'threshold': threshold,
        'val_loss': loss_val_avg
    })

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/467 [00:00<?, ?it/s]

RuntimeError: ignored

In [None]:

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals, val_his, mAP = [], [], [], []
    print("Starting Validation")
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  # 'labels':         batch[2],
                 }
        
        labels = batch[2]
        with torch.no_grad():        
            outputs = model(**inputs)
            
        logits = outputs['logits']
        pred = torch.sigmoid(logits)
        loss = criterion(logits, labels)
        loss_val_total += loss.item()

        label_ids = labels.cpu().numpy()
        predictions.append(pred.data.cpu().numpy())
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    val_his.append(loss_val_avg)
    
    predictions = np.vstack(predictions)
    true_vals = np.vstack(true_vals)
    score, threshold = best_f2_score(true_vals, predictions)
    print("Corresponding tags\n{}".format('råvaremarked, aksjer, børs, valuta, other'))
    pred_labels = torch.as_tensor((predictions - threshold) > 0, dtype=torch.int32)
    report = classification_report(true_vals, pred_labels, target_names=mlb.classes_, output_dict=True)
    mAP = report['samples avg']['precision']
    print(mAP)
    
    print("===> Validation - Avg. loss: {:.4f}\tAverage precision Score: {:.4f}".format(loss_val_avg, score))
            
    return loss_val_avg, predictions, true_vals, score
    
for epoch in tqdm(range(1, epochs+1)):
    model.to(device)
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                  # 'labels':         batch[2],
                 }       
        labels = batch[2].to(device)
        outputs = model(**inputs)
        logits = outputs['logits']
        
        loss = criterion(logits, labels)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{}'.format(loss.item()/len(batch))})
         
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals, score = evaluate(dataloader_validation)
    # Save
    is_best = score > best_score
    if   is_best:  
      torch.save(model.state_dict(), f'/content/drive/MyDrive/finetuned_BERT.model')
      print('Model updated')
    # val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    # tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/175 [00:00<?, ?it/s]

OutOfMemoryError: ignored

# test on data from finansavisen.no

In [None]:
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

In [None]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,_id,article_tags,datetime,extracted_tags,full_text,title,topics,source,full_text_with_title
0,0,62daa47b79f22232e10b1497,"[""Teknologi"",""metaverse"",""Nyheter""]",2022-01-14T00:00:00.000Z,"[""milllioner"",""kjølvannet"",""tilfeldigvis"",""bol...",Siden Facebook i fjor kunngjorde et navneskift...,Eiendomsprisene er opp 500 prosent i «metaverset»,"[""Metaverse"",""Stykk"",""Boligpris"",""Teknologi"",""...",finansavisen.no,Eiendomsprisene prosent metavers Facebook kunn...
1,1,62daa4c779f22232e10b2376,"[""Nyheter"",""Makro""]",2022-07-20T06:10:05.000Z,"[""husholdningene"",""energiprisøkning"",""styrings...",Konsumprisindeksen er nå oppe på det høyeste n...,Høyeste britisk inflasjon på 40 år,"[""Lønnsøkning"",""Måned"",""Styringsrente"",""Hushol...",finansavisen.no,Høyeste britisk inflasjon konsumprisindeks opp...
2,2,62de880579f22232e1a89005,"[""breaking"",""federal reserve"",""jerome powell"",...",2022-06-15T18:00:56.000Z,"[""styringsrenten"",""rentehevingene"",""inflasjons...",Federal Reserve hever styringsrenten med 75 ba...,Fed med monsterheving,"[""Rentebeslutning"",""federal reserve"",""Styrings...",finansavisen.no,Fed monsterheving Federal Reserve heve styring...
3,3,62deb2e979f22232e1aff02b,"[""porsche"",""vw"",""Bil"",""Nyheter"",""Børs"",""børsno...",2022-07-25T00:00:00.000Z,"[""aksjemarkedet"",""påtroppende"",""elbiler"",""indu...",De siste årene har vært preget av store utford...,"Hva nå, Volkswagen?","[""Industrigigant"",""Børsnotering"",""Tesla"",""Kons...",finansavisen.no,hva Volkswagen år prege utfordring tysk Volksw...
4,4,62dfb08d79f22232e1d674e5,"[""WEST"",""Nyheter"",""Shipping"",""Børs""]",2022-06-14T06:47:40.000Z,"[""rekordhøyt"",""halvår"",""aksjen"",""rateoppgang"",...",Western Bulk håver inn penger på børsuroen. Nå...,Tidenes resultat for Western Bulk,"[""Aksje"",""Tørrlastrederi"",""Rateoppgang"",""Marke...",finansavisen.no,Tidene resultat Western Bulk Western Bulk håve...


In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408 entries, 0 to 1407
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            1408 non-null   int64 
 1   _id                   1408 non-null   object
 2   article_tags          1408 non-null   object
 3   datetime              1408 non-null   object
 4   extracted_tags        1209 non-null   object
 5   full_text             1408 non-null   object
 6   title                 1408 non-null   object
 7   topics                1408 non-null   object
 8   source                1408 non-null   object
 9   full_text_with_title  1408 non-null   object
dtypes: int64(1), object(9)
memory usage: 110.1+ KB


In [None]:
x = test_df['full_text_with_title']

In [None]:
encoded_data_x = tokenizer.batch_encode_plus(
    x, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='longest', 
    truncation=True,
    max_length=512, 
    return_tensors='pt'
)

In [None]:
input_ids_x = encoded_data_x['input_ids']
attention_masks_x = encoded_data_x['attention_mask']

dataset_x = TensorDataset(input_ids_x, attention_masks_x)

In [None]:
dataloader_x = DataLoader(dataset_x, 
                              sampler=RandomSampler(dataset_x), 
                              batch_size=25)

In [None]:
device = torch.device('cpu')
checkpoint = torch.load('/content/drive/MyDrive/finetune-model_best.pth', map_location=device)
model.load_state_dict(checkpoint['state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer'])
threshold = checkpoint['threshold']
threshold = torch.from_numpy(threshold)


In [None]:
model.eval()
model.to(device)

loss_val_total = 0
predictions, true_vals, val_his, mAP = [], [], [], []
print("Starting Validation")
for batch in dataloader_x:
    
    batch = tuple(b.to(device) for b in batch)
    
    inputs = {'input_ids':      batch[0],
              'attention_mask': batch[1],
              # 'labels':         batch[2],
              }
    
    # labels = batch[2]
    with torch.no_grad():        
        outputs = model(**inputs)
        
    logits = outputs['logits']
    pred = torch.sigmoid(logits)

    pred_labels = torch.as_tensor((pred.cpu() - threshold) > 0, dtype=torch.int32)

    predictions.append(pred_labels.data.cpu().numpy())
    # true_vals.append(label_ids)
    

# loss_val_avg = loss_val_total/len(dataloader_x) 
# val_his.append(loss_val_avg)

predictions = np.vstack(predictions)


Starting Validation


KeyboardInterrupt: ignored

In [None]:
p = predictions

In [None]:
p.shape

In [None]:
dataset = pd.DataFrame({'råvaremarked': p[:, 0], 
                        'aksjer': p[:, 1],
                        'børs': p[:, 2],
                        'valuta': p[:, 3],
                        'other': p[:, 4]}
                       )

In [None]:
dataset['article_tags'] = test_df['article_tags']
dataset['_id'] = test_df['_id']

In [None]:
dataset.head()

In [None]:
råvaremarked_keyword = ['stål', 'oljeprodusenter', 'oljelagertall', 'brentolje', 'oljebransjen', 'skiferolje', 'olje/energi', 'litium', 'norsk olje og gass', 
                        'drivstoff', 'oljelager', 'drivstoffpriser', 'biodrivstoff', 'oljeindustrien', 'oljemarkedet', 'oljepolitikk', 'aluminium', 'oljeselskaper', 
                        'oljelisenser', 'gull', 'olje', 'oljeprisutslagene', 'olje og gass', 'bensinprisen', 'olje & gass', 'oljeetterspørsel', 'oljedirektoratet', 
                        'brent', 'bensinpris', 'drivstoffprisene', 'oljeservice', 'amerikansk lettolje (wti)', 'oljeleting', 'wti-olje', 'oljeeksport', 'drivstoffappen', 
                        'energi og råvarer', 'oljeproduksjon', 'oljeskatt', 'oljekraftverk', 'stopp oljeletinga', 'oljefunn', 'norsk stål', 'oljefondets milliardkjøp', 
                        'stålproduksjon', '«oljeskatt» på oppdrett', 'oljearbeidere', 'bensin', 'bensinkrig', 'kull', 'oljeutvinning', 'råvarer', 'nikkel', 'brent-olje']
aksjer_keyword = ['aksjeanbefaling', 'dn aksjer', 'aksjelive', 'aksjemarkedet', 'aksjetipset', 'meme-aksje', 'aksjefond', 'it-aksjer', 'aksjeposisjoner', 
                  'aksje-nm 2022', 'aksjesparing', 'aksje-nm', 'aksjesparekonto', 'aksjepraten', 'kjøpsanbefaling', 'aksjeanalyse', 'lakseaksjer', 'anbefaling', 
                  'aksjehandel', 'aksjetips', 'oljeaksjer', 'aksje', 'teknologiaksjer', 'aksjer', 'investorenes aksjekjøp', 'flyaksjer', 'vekstaksjer', 'aksjeselskaper']
børs_keyword = ['asia-børsene', 'børs', 'børsdagen', 'børsen', 'børsene i asia', 'børsfall', 'børsintervju', 'børskommentar', 'børsmorgen', 'børsnotering', 'børsnoteringer', 
                'børsoppdatering', 'børsåpning', 'dagligvarebørsen', 'dette vil påvirke oslo børs', 'etterbørs', 'helsinki-børsen', 'hongkong-børsen', 'kryptovalutabørs', 
                'new york-børsen', 'nytt fra børsselskapene', 'oslo børs', 'oslo børs seafood index (obsfx)', 'seoul-børsen', 'shanghai-børsen', 'singapore-børsen', 'stockholmsbørsen', 
                'sydney-børsen', 'tokyo-børsen']
valuta_keyword = ['kryptoutvinning', 'krypto', 'kryptovalutabørs', 'valutamarkedet', 'kronekurs', 'dollar', 'kryptokunst', 'kryptoboomen', 'blokkjede', 'kryptovaluta', 'kronekursen', 
                  'norsk krone', 'kronen', 'lokal valuta', 'kryptovault', 'blokkjedeteknologi', 'nft', 'euro', 'krone', 'valuta', 'digital valuta']

In [None]:
def category_assignment(row, råvaremarked_keyword=råvaremarked_keyword, aksjer_keyword=aksjer_keyword, 
                        børs_keyword=børs_keyword, valuta_keyword=valuta_keyword):
    """input keyword list and return corresponding category"""
    if type(row['article_tags']) is str:
        article_tags = json.loads(row['article_tags'])
        article_tags = [item.lower() for item in article_tags]
        article_tags_set = set(article_tags)
        print(article_tags_set)
        category = []
        #check if article belongs to råvaremarked_keyword
        råvaremarked_keyword_set = set(råvaremarked_keyword)
        checker_råvaremarked = råvaremarked_keyword_set.intersection(article_tags_set)

        #check if article belongs to aksjer_keyword
        aksjer_keyword_set = set(aksjer_keyword)
        checker_aksjer = aksjer_keyword_set.intersection(article_tags_set)

        #check if article belongs to børs_keyword
        børs_keyword_set = set(børs_keyword)
        checker_børs = børs_keyword_set.intersection(article_tags_set)

        #check if article belongs to valuta_keyword
        valuta_keyword_set = set(valuta_keyword)
        checker_valuta = valuta_keyword_set.intersection(article_tags_set)
        if checker_råvaremarked:
            category.append('råvaremarked')

        if checker_aksjer:
            category.append('aksjer')

        if checker_børs:
            category.append('børs')

        if checker_valuta:
            category.append('valuta')
        
        if category:
            return category
        else:
            return ['other']

In [None]:
dataset['category'] = dataset.apply(category_assignment, axis = 1)

NameError: ignored

In [None]:
for index, row in dataset.iterrows():
  print(row['article_tags'])
  cats = []
  if row['råvaremarked']:
    cats.append('råvaremarked')
  if row['aksjer']:
    cats.append('aksjer')
  if row['børs']:
    cats.append('børs')
  if row['valuta']:
    cats.append('valuta')
  if row['other']:
    cats.append('other')
  print(cats)
  print(row['_id'])


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
["biontech-pfizer","biontech","joe biden","nvidia","fornybar energi","solenergi","energi","Nyheter","Markedskommentarer","Børsoppdatering"]
0 0 0 0 1
62f126e8a64edcf014abf87e

["dnb markets","fed","Nyheter","Børs","Børsoppdatering"]
0 0 0 0 1
62f126f0a64edcf014abf9d9

["pfizer","Nyheter","Helse"]
0 0 0 0 1
62f126f4a64edcf014abfa9b

["strømstøtte","LO","NHO","Nyheter","Politikk"]
0 0 0 0 1
62f12701a64edcf014abfcd9

["pfizer","oppkjøp","migrene","biohaven","Nyheter","Helse","Børs"]
0 0 0 0 1
62f12748a64edcf014ac093b

["pfizer","coronavaksine","coronapille","Nyheter","Helse"]
0 0 0 0 1
62f12755a64edcf014ac0b76

["Nyheter","Børs"]
0 0 0 0 1
62f127aaa64edcf014ac1a7b

["berkshire hathaway","warren buffett","aksjeposisjoner","apple","bank of america","chevron","coca-cola","american express","Nyheter","Børs"]
0 0 0 1 0
62f127b0a64edcf014ac1c2d

["berkshire hathaway","warren buffett","aksjeposisjoner","apple","bank of america","ch

In [None]:
labels = predictions > np.array(threshold)
p[p > np.array(threshold)] = 1

TypeError: ignored

In [None]:
def best_f2_score(true_labels, predictions):
    def f_neg(threshold):
        ## Scipy tries to minimize the function so we must get its inverse 
        return - fbeta_score(true_labels, predictions > threshold, beta=2, average='samples')

    # Initialization of best threshold search
    thr_0 = [0.20] * 5
    constraints = [(0., 1.)] * 5

    def bounds(**kwargs):
        x = kwargs["x_new"]
        tmax = bool(np.all(x <= 1))
        tmin = bool(np.all(x >= 0))
        return tmax and tmin

    # Search using L-BFGS-B, the epsilon step must be big otherwise there is no gradient
    minimizer_kwargs = {"method": "L-BFGS-B",
                        "bounds": constraints,
                        "options": {
                            "eps": 0.05
                        }
                        }

    # We combine L-BFGS-B with Basinhopping for stochastic search with random steps
    print("===> Searching optimal threshold for each label")
    # logger.info("===> Searching optimal threshold for each label")
    start_time = timer()

    opt_output = basinhopping(f_neg, thr_0,
                              stepsize=0.1,
                              minimizer_kwargs=minimizer_kwargs,
                              niter=10,
                              accept_test=bounds)

    end_time = timer()
    print("===> Optimal threshold for each label:\n{}".format(opt_output.x))
    print("Threshold found in: %s seconds" % (end_time - start_time))
    # logger.info("===> Optimal threshold for each label:\n{}".format(opt_output.x))
    # logger.info("Threshold found in: %s seconds" % (end_time - start_time))

    score = - opt_output.fun
    return score, opt_output.x