In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import shutil
import os
import logging
import sys
sys.path.append('..')

from textblob import TextBlob
from pprint import pprint
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from transformers import AutoModelForSequenceClassification

from finbert.finbert import *
import finbert.utils as tools
from finbert.custom_finbert import CustomFinBert

# Parent of the current working directory; used as the project root for the
# model paths defined later in the notebook.
project_dir = Path.cwd().parent

# Do not truncate cell contents when displaying DataFrames.
# None means "no limit" on pandas >= 1.0; the historical -1 sentinel is
# rejected by current pandas. Fall back to -1 only where None is not accepted
# (pandas < 1.0 validates this option as a plain int).
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Root logger: timestamped records, ERROR level and above.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.ERROR)

In [2]:
# Directory two levels above project_dir; the shared "Data" folder is resolved
# relative to it.
standard_dir = project_dir.parent.parent

# TODO: put the preprocessing code here and store the processed DataFrame.

# Prepare model

In [3]:
# Filesystem locations used by the rest of the notebook.
# lm_path: checkpoint the pretrained language model is loaded from.
lm_path = project_dir.joinpath('models', 'language_model', 'finbertTRC2')
# final_model_dir: passed to Config as model_dir (wiped before each run).
final_model_dir = project_dir.joinpath('models', 'classification_model_256_hidden_dim')
# data_dir: passed to Config as data_dir.
data_dir = standard_dir.joinpath("Data")
# classification_data_path= standard_dir/'Data'/'sentiment_data'

## Configure training parameters

In [4]:
# Start from an empty output directory (final_model_dir) so artifacts of an
# earlier run cannot mix with this one.
try:
    shutil.rmtree(final_model_dir)
except FileNotFoundError:
    # Nothing to clean, e.g. on the very first run.
    pass
except OSError as err:
    # Stay best-effort, but surface the problem instead of hiding it:
    # leftovers from a previous run may still be present in the directory.
    print(f"Warning: could not fully remove {final_model_dir}: {err}")

# Load the language model pretrained on Reuters TRC2 from lm_path as a
# sequence-classification model configured for 30 labels.
bertmodel = AutoModelForSequenceClassification.from_pretrained(
    lm_path,
    num_labels=30,
    cache_dir=None,
)

Some weights of the model checkpoint at /media/student/HDD 2/Michelle/MLS/advanced/finBERT_transaction_classification/models/language_model/finbertTRC2 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
S

In [5]:
# Wrap the loaded BERT classifier in the project's CustomFinBert, with hidden_dims=[256].
# NOTE(review): the positional 38 and 30 presumably are the number of numeric
# features (Config is given numeric_params=38) and the number of labels
# (from_pretrained is given num_labels=30) - confirm against CustomFinBert's signature.
custom_finbert = CustomFinBert(bertmodel, 38, 30, hidden_dims=[256])

In [6]:
# Training configuration handed to FinBert. All values are passed by keyword,
# grouped here by what they describe.
config = Config(
    # --- data and output locations ---
    data_dir=data_dir,
    model_dir=final_model_dir,
    label_colname="category",
    # --- model ---
    bert_model=custom_finbert,
    numeric_params=38,
    output_mode="classification",
    max_seq_length=48,
    # --- optimisation ---
    num_train_epochs=10,
    train_batch_size=32,
    learning_rate=2e-5,
    warm_up_proportion=0.2,
    discriminate=True,
    gradual_unfreeze=False,
    # --- runtime (the run log reports "distributed training: False") ---
    local_rank=-1,
)

In [7]:
# Build the FinBert wrapper from the config with transaction_classification enabled.
finbert = FinBert(config, transaction_classification=True)

In [8]:
# Label column of the evaluation split; its distinct values are what the
# notebook passes to prepare_model as the label list.
# Reuse data_dir rather than rebuilding the same path, and parse only the
# column that is actually needed.
y = pd.read_csv(data_dir/"eval.csv", usecols=["category"])["category"]

In [9]:
# Register the label set: the distinct values of the eval-split label column,
# in order of first appearance (Series.unique preserves that order).
# NOTE(review): the position in this list appears to become the label id
# (the run log prints e.g. "label: Loans (id = 0)") - confirm. Any label that
# does not occur in eval.csv would be absent from this list.
finbert.prepare_model(label_list=y.unique().tolist())

07/18/2024 11:13:47 - INFO - finbert.finbert -   device: cuda n_gpu: 2, distributed training: False, 16-bits training: False


In [10]:
# Fetch the "train" split; get_data returns two values, bound here as the
# training examples and their numeric features.
train_examples, numeric_feats = finbert.get_data("train")

In [11]:
# Create the model object that is handed to finbert.train.
model = finbert.create_the_model()

In [12]:
# Run training on the "train" split; the return value is kept as
# trained_model for the saving and evaluation cells.
trained_model = finbert.train(
    train_examples=train_examples,
    model=model,
    train_numeric_feats=numeric_feats,
)

07/18/2024 11:13:56 - INFO - finbert.utils -   *** Example ***
07/18/2024 11:13:56 - INFO - finbert.utils -   guid: 0
07/18/2024 11:13:56 - INFO - finbert.utils -   tokens: [CLS] ac ##h withdrawal am ##mar ##yse hem ##ant ##ne ##x ##press transfer [SEP]
07/18/2024 11:13:56 - INFO - finbert.utils -   input_ids: 101 9353 2232 10534 2572 7849 23274 19610 4630 2638 2595 20110 4651 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 11:13:56 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 11:13:56 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 11:13:56 - INFO - finbert.utils -   label: Transfer Debit (id = 2)
07/18/2024 11:14:08 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 11:14:08 - INFO - finbert.finbert -     Num examples = 160391
07/18

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 3.5488927364349365
(Iteration 100) Training loss: 3.3954172134399414
(Iteration 200) Training loss: 3.4377686977386475
(Iteration 300) Training loss: 3.3448245525360107
(Iteration 400) Training loss: 3.4722142219543457
(Iteration 500) Training loss: 3.2703239917755127
(Iteration 600) Training loss: 3.3134829998016357
(Iteration 700) Training loss: 3.195587158203125
(Iteration 800) Training loss: 3.125565528869629
(Iteration 900) Training loss: 3.1242454051971436
(Iteration 1000) Training loss: 3.1316120624542236
(Iteration 1100) Training loss: 3.1121156215667725
(Iteration 1200) Training loss: 2.914438247680664
(Iteration 1300) Training loss: 2.8431332111358643
(Iteration 1400) Training loss: 2.9618353843688965
(Iteration 1500) Training loss: 2.888730049133301
(Iteration 1600) Training loss: 2.765958547592163
(Iteration 1700) Training loss: 2.582751512527466
(Iteration 1800) Training loss: 2.3954269886016846
(Iteration 1900) Training loss: 2.292456865310669

07/18/2024 11:34:10 - INFO - finbert.utils -   *** Example ***
07/18/2024 11:34:10 - INFO - finbert.utils -   guid: 0
07/18/2024 11:34:10 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 11:34:10 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 11:34:10 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 11:34:10 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 11:34:10 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 11:34:15 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 11:34:15 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 11:34:15 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [1.0823279039673386]
Validation Accuracy: 0.7632
No best model found


Epoch:  10%|█         | 1/10 [22:30<3:22:34, 1350.51s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.39581629633903503
(Iteration 100) Training loss: 0.7217799425125122
(Iteration 200) Training loss: 0.9955273866653442
(Iteration 300) Training loss: 1.1689791679382324
(Iteration 400) Training loss: 1.8410831689834595
(Iteration 500) Training loss: 0.9113364815711975
(Iteration 600) Training loss: 0.7064817547798157
(Iteration 700) Training loss: 1.2638002634048462
(Iteration 800) Training loss: 0.6762069463729858
(Iteration 900) Training loss: 0.7368118762969971
(Iteration 1000) Training loss: 0.83381187915802
(Iteration 1100) Training loss: 1.8166134357452393
(Iteration 1200) Training loss: 0.6714252829551697
(Iteration 1300) Training loss: 0.7715001702308655
(Iteration 1400) Training loss: 0.7104709148406982
(Iteration 1500) Training loss: 1.0635074377059937
(Iteration 1600) Training loss: 0.9281696081161499
(Iteration 1700) Training loss: 0.48990073800086975
(Iteration 1800) Training loss: 0.2792249619960785
(Iteration 1900) Training loss: 0.306467890

07/18/2024 11:56:56 - INFO - finbert.utils -   *** Example ***
07/18/2024 11:56:56 - INFO - finbert.utils -   guid: 0
07/18/2024 11:56:56 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 11:56:56 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 11:56:56 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 11:56:56 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 11:56:56 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 11:57:01 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 11:57:01 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 11:57:01 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [1.0823279039673386, 0.567875035701726]
Validation Accuracy: 0.8477


Epoch:  20%|██        | 2/10 [45:15<3:01:14, 1359.30s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.30549052357673645
(Iteration 100) Training loss: 0.5448124408721924
(Iteration 200) Training loss: 0.19052715599536896
(Iteration 300) Training loss: 0.4199184775352478
(Iteration 400) Training loss: 0.9848079085350037
(Iteration 500) Training loss: 0.24357745051383972
(Iteration 600) Training loss: 0.31310519576072693
(Iteration 700) Training loss: 0.08498489856719971
(Iteration 800) Training loss: 0.13566680252552032
(Iteration 900) Training loss: 0.14553208649158478
(Iteration 1000) Training loss: 7.066809177398682
(Iteration 1100) Training loss: 0.505128026008606
(Iteration 1200) Training loss: 0.19817344844341278
(Iteration 1300) Training loss: 0.19511936604976654
(Iteration 1400) Training loss: 0.213861882686615
(Iteration 1500) Training loss: 0.22337782382965088
(Iteration 1600) Training loss: 0.2543811798095703
(Iteration 1700) Training loss: 0.2122201919555664
(Iteration 1800) Training loss: 0.5356157422065735
(Iteration 1900) Training loss: 0.18

07/18/2024 12:19:39 - INFO - finbert.utils -   *** Example ***
07/18/2024 12:19:39 - INFO - finbert.utils -   guid: 0
07/18/2024 12:19:39 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 12:19:39 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 12:19:39 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 12:19:39 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 12:19:39 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 12:19:44 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 12:19:44 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 12:19:44 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [1.0823279039673386, 0.567875035701726, 0.4626183950135518]
Validation Accuracy: 0.8664


Epoch:  30%|███       | 3/10 [1:07:59<2:38:48, 1361.24s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.5738397836685181
(Iteration 100) Training loss: 0.06752213835716248
(Iteration 200) Training loss: 0.07930409908294678
(Iteration 300) Training loss: 0.7711036205291748
(Iteration 400) Training loss: 3.281656503677368
(Iteration 500) Training loss: 4.675759792327881
(Iteration 600) Training loss: 0.1938289999961853
(Iteration 700) Training loss: 0.11264997720718384
(Iteration 800) Training loss: 0.5182145833969116
(Iteration 900) Training loss: 2.6670570373535156
(Iteration 1000) Training loss: 0.25726187229156494
(Iteration 1100) Training loss: 0.42612868547439575
(Iteration 1200) Training loss: 0.32882168889045715
(Iteration 1300) Training loss: 0.5022168755531311
(Iteration 1400) Training loss: 1.4693524837493896
(Iteration 1500) Training loss: 1.4580790996551514
(Iteration 1600) Training loss: 0.2898847162723541
(Iteration 1700) Training loss: 0.21638233959674835
(Iteration 1800) Training loss: 0.08048170059919357
(Iteration 1900) Training loss: 0.219

07/18/2024 12:42:24 - INFO - finbert.utils -   *** Example ***
07/18/2024 12:42:24 - INFO - finbert.utils -   guid: 0
07/18/2024 12:42:24 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 12:42:24 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 12:42:24 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 12:42:24 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 12:42:24 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 12:42:29 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 12:42:29 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 12:42:29 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [1.0823279039673386, 0.567875035701726, 0.4626183950135518, 0.4138067854482121]
Validation Accuracy: 0.8910


Epoch:  40%|████      | 4/10 [1:30:44<2:16:16, 1362.79s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.09224998205900192
(Iteration 100) Training loss: 0.7063512802124023
(Iteration 200) Training loss: 0.032641906291246414
(Iteration 300) Training loss: 0.3112999498844147
(Iteration 400) Training loss: 0.16028927266597748
(Iteration 500) Training loss: 0.26303568482398987
(Iteration 600) Training loss: 0.15707746148109436
(Iteration 700) Training loss: 0.15405327081680298
(Iteration 800) Training loss: 0.22371719777584076
(Iteration 900) Training loss: 0.08385369926691055
(Iteration 1000) Training loss: 0.22437544167041779
(Iteration 1100) Training loss: 0.14794720709323883
(Iteration 1200) Training loss: 0.262125164270401
(Iteration 1300) Training loss: 0.09381537139415741
(Iteration 1400) Training loss: 0.2401309758424759
(Iteration 1500) Training loss: 0.2160736322402954
(Iteration 1600) Training loss: 0.12703266739845276
(Iteration 1700) Training loss: 0.17918261885643005
(Iteration 1800) Training loss: 0.11988580971956253
(Iteration 1900) Training los

07/18/2024 13:05:11 - INFO - finbert.utils -   *** Example ***
07/18/2024 13:05:11 - INFO - finbert.utils -   guid: 0
07/18/2024 13:05:11 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 13:05:11 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 13:05:11 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 13:05:11 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 13:05:11 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 13:05:16 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 13:05:16 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 13:05:16 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [1.0823279039673386, 0.567875035701726, 0.4626183950135518, 0.4138067854482121, 0.4124975941745671]
Validation Accuracy: 0.8918


Epoch:  50%|█████     | 5/10 [1:53:31<1:53:40, 1364.17s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.05280433967709541
(Iteration 100) Training loss: 0.12844420969486237
(Iteration 200) Training loss: 0.3765615224838257
(Iteration 300) Training loss: 0.34861233830451965
(Iteration 400) Training loss: 0.032593365758657455
(Iteration 500) Training loss: 0.4293704330921173
(Iteration 600) Training loss: 0.01824577897787094
(Iteration 700) Training loss: 0.39228323101997375
(Iteration 800) Training loss: 0.15259604156017303
(Iteration 900) Training loss: 0.21731583774089813
(Iteration 1000) Training loss: 0.1337394416332245
(Iteration 1100) Training loss: 0.10545472055673599
(Iteration 1200) Training loss: 0.038970284163951874
(Iteration 1300) Training loss: 0.1684545874595642
(Iteration 1400) Training loss: 0.1392226666212082
(Iteration 1500) Training loss: 0.10277414321899414
(Iteration 1600) Training loss: 0.28994399309158325
(Iteration 1700) Training loss: 0.1833617389202118
(Iteration 1800) Training loss: 0.14214558899402618
(Iteration 1900) Training lo

07/18/2024 13:27:46 - INFO - finbert.utils -   *** Example ***
07/18/2024 13:27:46 - INFO - finbert.utils -   guid: 0
07/18/2024 13:27:46 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 13:27:46 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 13:27:46 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 13:27:46 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 13:27:46 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 13:27:52 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 13:27:52 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 13:27:52 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [1.0823279039673386, 0.567875035701726, 0.4626183950135518, 0.4138067854482121, 0.4124975941745671, 0.4031076118135075]
Validation Accuracy: 0.8988


Epoch:  60%|██████    | 6/10 [2:16:06<1:30:44, 1361.13s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.09898189455270767
(Iteration 100) Training loss: 0.8797031044960022
(Iteration 200) Training loss: 0.16311749815940857
(Iteration 300) Training loss: 0.030643673613667488
(Iteration 400) Training loss: 0.19094903767108917
(Iteration 500) Training loss: 0.2843332290649414
(Iteration 600) Training loss: 0.1117299273610115
(Iteration 700) Training loss: 0.18261203169822693
(Iteration 800) Training loss: 0.31628724932670593
(Iteration 900) Training loss: 0.058895424008369446
(Iteration 1000) Training loss: 0.11591161787509918
(Iteration 1100) Training loss: 0.12803539633750916
(Iteration 1200) Training loss: 0.1463954746723175
(Iteration 1300) Training loss: 0.10859749466180801
(Iteration 1400) Training loss: 0.2575448155403137
(Iteration 1500) Training loss: 0.11162347346544266
(Iteration 1600) Training loss: 0.22803185880184174
(Iteration 1700) Training loss: 0.8600932955741882
(Iteration 1800) Training loss: 0.23977893590927124
(Iteration 1900) Training lo

07/18/2024 13:50:28 - INFO - finbert.utils -   *** Example ***
07/18/2024 13:50:28 - INFO - finbert.utils -   guid: 0
07/18/2024 13:50:28 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 13:50:28 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 13:50:28 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 13:50:28 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 13:50:28 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 13:50:33 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 13:50:33 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 13:50:33 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [1.0823279039673386, 0.567875035701726, 0.4626183950135518, 0.4138067854482121, 0.4124975941745671, 0.4031076118135075, 0.38278619712868706]
Validation Accuracy: 0.8998


Epoch:  70%|███████   | 7/10 [2:38:48<1:08:03, 1361.30s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.13895215094089508
(Iteration 100) Training loss: 0.7056775093078613
(Iteration 200) Training loss: 0.229900524020195
(Iteration 300) Training loss: 0.08551844954490662
(Iteration 400) Training loss: 0.12910449504852295
(Iteration 500) Training loss: 0.15690405666828156
(Iteration 600) Training loss: 0.11753295361995697
(Iteration 700) Training loss: 0.07901570945978165
(Iteration 800) Training loss: 0.1002514436841011
(Iteration 900) Training loss: 0.05607883632183075
(Iteration 1000) Training loss: 0.036232903599739075
(Iteration 1100) Training loss: 0.08021773397922516
(Iteration 1200) Training loss: 0.11892934143543243
(Iteration 1300) Training loss: 0.6666170358657837
(Iteration 1400) Training loss: 0.07998702675104141
(Iteration 1500) Training loss: 0.13526561856269836
(Iteration 1600) Training loss: 0.13257406651973724
(Iteration 1700) Training loss: 0.06046886742115021
(Iteration 1800) Training loss: 0.3117300868034363
(Iteration 1900) Training los

07/18/2024 14:13:12 - INFO - finbert.utils -   *** Example ***
07/18/2024 14:13:12 - INFO - finbert.utils -   guid: 0
07/18/2024 14:13:12 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 14:13:12 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 14:13:12 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 14:13:12 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 14:13:12 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 14:13:17 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 14:13:17 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 14:13:17 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [1.0823279039673386, 0.567875035701726, 0.4626183950135518, 0.4138067854482121, 0.4124975941745671, 0.4031076118135075, 0.38278619712868706, 0.3806190205803063]
Validation Accuracy: 0.8996


Epoch:  80%|████████  | 8/10 [3:01:31<45:24, 1362.05s/it]  

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.11165734380483627
(Iteration 100) Training loss: 0.4312560558319092
(Iteration 200) Training loss: 0.16528509557247162
(Iteration 300) Training loss: 0.6599239110946655
(Iteration 400) Training loss: 0.13351383805274963
(Iteration 500) Training loss: 0.0836997702717781
(Iteration 600) Training loss: 0.16833074390888214
(Iteration 700) Training loss: 0.09944190829992294
(Iteration 800) Training loss: 0.17148827016353607
(Iteration 900) Training loss: 0.04040578752756119
(Iteration 1000) Training loss: 0.25558578968048096
(Iteration 1100) Training loss: 3.2754266262054443
(Iteration 1200) Training loss: 0.07482770085334778
(Iteration 1300) Training loss: 0.409658282995224
(Iteration 1400) Training loss: 0.2733098566532135
(Iteration 1500) Training loss: 0.028146326541900635
(Iteration 1600) Training loss: 0.17828018963336945
(Iteration 1700) Training loss: 0.0741192102432251
(Iteration 1800) Training loss: 0.1415064036846161
(Iteration 1900) Training loss: 

07/18/2024 14:35:54 - INFO - finbert.utils -   *** Example ***
07/18/2024 14:35:54 - INFO - finbert.utils -   guid: 0
07/18/2024 14:35:54 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 14:35:54 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 14:35:54 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 14:35:54 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 14:35:54 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 14:35:59 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 14:35:59 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 14:35:59 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [1.0823279039673386, 0.567875035701726, 0.4626183950135518, 0.4138067854482121, 0.4124975941745671, 0.4031076118135075, 0.38278619712868706, 0.3806190205803063, 0.3842699571994552]
Validation Accuracy: 0.9027


Epoch:  90%|█████████ | 9/10 [3:24:14<22:42, 1362.13s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.06415817141532898
(Iteration 100) Training loss: 0.06305783241987228
(Iteration 200) Training loss: 0.5804417729377747
(Iteration 300) Training loss: 0.20280034840106964
(Iteration 400) Training loss: 0.40526047348976135
(Iteration 500) Training loss: 0.188215970993042
(Iteration 600) Training loss: 0.3974580466747284
(Iteration 700) Training loss: 0.10390491038560867
(Iteration 800) Training loss: 0.13635551929473877
(Iteration 900) Training loss: 0.08073074370622635
(Iteration 1000) Training loss: 0.06034556031227112
(Iteration 1100) Training loss: 0.07767575234174728
(Iteration 1200) Training loss: 0.13206328451633453
(Iteration 1300) Training loss: 0.09452766925096512
(Iteration 1400) Training loss: 0.06325747072696686
(Iteration 1500) Training loss: 0.08373218774795532
(Iteration 1600) Training loss: 0.049672681838274
(Iteration 1700) Training loss: 0.10852455347776413
(Iteration 1800) Training loss: 0.05470076948404312
(Iteration 1900) Training loss

07/18/2024 14:58:37 - INFO - finbert.utils -   *** Example ***
07/18/2024 14:58:37 - INFO - finbert.utils -   guid: 0
07/18/2024 14:58:37 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 14:58:37 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 14:58:37 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 14:58:37 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 14:58:37 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 14:58:42 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 14:58:42 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 14:58:42 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [1.0823279039673386, 0.567875035701726, 0.4626183950135518, 0.4138067854482121, 0.4124975941745671, 0.4031076118135075, 0.38278619712868706, 0.3806190205803063, 0.3842699571994552, 0.3820825545351281]
Validation Accuracy: 0.9020


Epoch: 100%|██████████| 10/10 [3:46:57<00:00, 1361.76s/it]


In [13]:
import pickle

# Persist the object returned by training next to the notebook.
# Only unpickle this file from a trusted source: loading a pickle can execute
# arbitrary code.
with Path("./trained_model_3.pkl").open("wb") as pickle_file:
    pickle.dump(trained_model, pickle_file)

In [29]:
# NOTE(review): this cell carries execution count 29, i.e. it was run out of
# order relative to its neighbours (13 and 14). It calls
# .cpu().detach().numpy() on trained_model, while the surrounding cells pickle
# trained_model and pass it to finbert.evaluate as the model - so it likely
# reflects an earlier kernel state. Confirm and remove if stale.
np.argmax(trained_model.cpu().detach().numpy(), -1).shape

(128,)

In [14]:
# Fetch the evaluation data. Note that, despite the test_* names, the split
# requested here is "validation".
test_examples, test_numeric_feats = finbert.get_data("validation")

In [15]:
# Evaluate the trained model on the examples loaded from the "validation"
# split. Later cells read the "predictions" and "labels" entries of `results`.
results = finbert.evaluate(trained_model, test_examples, test_numeric_feats)

07/18/2024 15:14:02 - INFO - finbert.utils -   *** Example ***
07/18/2024 15:14:02 - INFO - finbert.utils -   guid: 0
07/18/2024 15:14:02 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 15:14:02 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 15:14:02 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 15:14:02 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 15:14:02 - INFO - finbert.utils -   label: Loans (id = 0)


07/18/2024 15:14:07 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 15:14:07 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 15:14:07 - INFO - finbert.finbert -     Batch size = 32
07/18/2024 15:14:07 - INFO - finbert.finbert -     Num steps = 21480
07/18/2024 15:14:07 - INFO - finbert.finbert -   ***** Running evaluation ***** 
07/18/2024 15:14:07 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 15:14:07 - INFO - finbert.finbert -     Batch size = 32


Testing:   0%|          | 0/2149 [00:00<?, ?it/s]

Evaluation loss: 0.2722044852638089


In [16]:
def _top_class_index(scores):
    """Return the index of the largest entry of ``scores`` along axis 0."""
    return np.argmax(scores, axis=0)


# One predicted class index per evaluated example.
preds = results["predictions"].apply(_top_class_index)

In [17]:
# Position in finbert.label_list -> label name.
label_mapping = dict(enumerate(finbert.label_list))

In [18]:
# Translate predicted class indices into label names.
# This rebinds `preds`, so the cell is not meant to be run twice in a row:
# a second run would look the label names up as keys of label_mapping.
preds = [label_mapping[class_index] for class_index in preds]

In [19]:
# Translate the label ids stored in the evaluation results into label names.
labels = [label_mapping[label_id] for label_id in results["labels"]]

In [20]:
# Per-class precision / recall / F1, printed to three decimals.
# classification_report returns plain text, so print it to keep the table layout.
report_text = classification_report(labels, preds, digits=3)
print(report_text)

  'precision', 'predicted', average, warn_for)


                            precision    recall  f1-score   support

                       ATM      0.995     0.999     0.997      1702
    Arts and Entertainment      0.872     0.975     0.921       119
                 Bank Fees      0.993     0.992     0.992      1940
             Check Deposit      0.939     0.984     0.961        63
  Clothing and Accessories      0.823     0.890     0.855       957
        Convenience Stores      0.713     0.694     0.703      5589
         Department Stores      0.636     0.842     0.724       601
     Digital Entertainment      0.928     0.936     0.932      1358
Food and Beverage Services      0.690     0.777     0.731       103
              Gas Stations      0.459     0.584     0.514      3876
  Gyms and Fitness Centers      0.633     0.905     0.745        21
                Healthcare      0.629     0.710     0.667        62
                 Insurance      0.955     0.892     0.922       526
                  Interest      1.000     1.000