In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import shutil
import os
import logging
import sys
sys.path.append('..')

from textblob import TextBlob
from pprint import pprint
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from transformers import AutoModelForSequenceClassification

from finbert.finbert import *
import finbert.utils as tools
from finbert.custom_finbert import CustomFinBert

# Parent of the current working directory, used as the project root by the cells below.
project_dir = Path.cwd().parent

# Do not truncate long cell contents when DataFrames are displayed.
# pandas >= 1.0 spells "no limit" as None (the legacy -1 was deprecated there and is
# rejected by recent releases); older releases only accept an integer, so fall back to -1.
# `pd` is not imported explicitly in this notebook — presumably it arrives through the
# star import from finbert.finbert; confirm.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# NOTE(review): INFO records from finbert.* still appear in the outputs below, so this
# ERROR level does not seem to take effect — presumably logging was already configured
# while importing finbert (basicConfig is a no-op once the root logger has handlers); confirm.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.ERROR)

In [2]:
# Two directory levels above the project root; the "Data" folder is read from here below.
standard_dir = project_dir.parent.parent

# TODO: put the preprocessing code here and store the processed DataFrame.

# Prepare model

In [3]:
# Directory of the pretrained language model that fine-tuning starts from.
lm_path = project_dir.joinpath('models', 'language_model', 'finbertTRC2')
# Output directory for the fine-tuned classifier; the suffix repeats the learning rate
# set in the Config cell.
final_model_dir = project_dir.joinpath('models', 'classification_model_lr_1e-4')
# Root folder of the input data.
data_dir = standard_dir.joinpath("Data")

## Configure training parameters

In [4]:
# Remove the output directory of a previous run so old artefacts are not mixed with new ones.
# A missing directory is the normal first-run case and is ignored. Any other filesystem
# failure is reported but not raised, which keeps the best-effort behaviour of this cell
# without hiding the problem (the former bare `except` also swallowed NameError and
# KeyboardInterrupt).
try:
    shutil.rmtree(final_model_dir)
except FileNotFoundError:
    pass
except OSError as cleanup_error:
    logging.error("Could not remove %s: %s", final_model_dir, cleanup_error)

## Load the pretrained language model (trained on Reuters TRC2) as a sequence classifier.
# The warning printed below lists the pre-training head weights that are not used when
# the checkpoint is loaded into BertForSequenceClassification.
# NOTE(review): num_labels=30 is hard-coded — presumably it has to equal the number of
# distinct categories later passed to prepare_model(); confirm.
bertmodel = AutoModelForSequenceClassification.from_pretrained(
    lm_path, cache_dir=None, num_labels=30)

Some weights of the model checkpoint at /media/student/HDD 2/Michelle/MLS/advanced/finBERT_transaction_classification/models/language_model/finbertTRC2 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
S

In [5]:
# Wrap the loaded BERT classifier in the project's CustomFinBert.
# NOTE(review): the positional 38 and 30 look like the numeric-feature count (Config is
# given numeric_params=38) and the label count (num_labels=30 when loading the model) —
# confirm against CustomFinBert's signature. hidden_dims is passed as an empty list.
custom_finbert = CustomFinBert(bertmodel, 38, 30, hidden_dims=[])

In [7]:
# Training configuration handed to FinBert in the next cell.
# - learning_rate=1e-4 matches the "lr_1e-4" suffix of final_model_dir.
# - train_batch_size=32 together with the 160391 training examples reported in the training
#   log gives the 5013 iterations per epoch shown by the progress bar.
# - numeric_params=38 repeats the value passed positionally to CustomFinBert.
# - label_colname="category" is the same column the label list is read from in eval.csv.
# NOTE(review): the exact semantics of discriminate, gradual_unfreeze, warm_up_proportion
# and local_rank are defined by finbert.finbert.Config and are not visible here.
config = Config(
    data_dir=data_dir,
    model_dir=final_model_dir,
    bert_model=custom_finbert,
    num_train_epochs=10,
    max_seq_length=48,
    train_batch_size=32,
    learning_rate=1e-4,
    output_mode="classification",
    warm_up_proportion=0.2,
    local_rank=-1,
    discriminate=True,
    gradual_unfreeze=False,
    numeric_params=38,
    label_colname="category"
)

In [8]:
# Create the FinBert wrapper that the remaining cells use for data loading, training
# and evaluation.
# NOTE(review): transaction_classification=True presumably selects the transaction-category
# task (text plus numeric features) — confirm in finbert.finbert.
finbert = FinBert(config, transaction_classification=True)

In [9]:
# "category" column of the evaluation split; used below to derive the label list.
y = pd.read_csv(data_dir / "eval.csv")["category"]

In [10]:
# Hand the label set to FinBert. unique() keeps first-appearance order, so label ids
# presumably follow the row order of eval.csv (the logs show "Loans (id = 0)" for the
# first eval example).
# NOTE(review): the labels come from the eval split only — confirm that every category
# in the training data also occurs there.
finbert.prepare_model(label_list=y.unique().tolist())

07/18/2024 15:48:21 - INFO - finbert.finbert -   device: cuda n_gpu: 2, distributed training: False, 16-bits training: False


In [11]:
# Load the "train" split. The second return value is passed to train() below as
# train_numeric_feats.
train_examples, numeric_feats = finbert.get_data("train")

In [12]:
# Ask FinBert to build the model object that train() receives in the next cell.
model = finbert.create_the_model()

In [13]:
# Run fine-tuning. The output below shows 10 epochs of roughly 23 minutes each, with a
# validation pass (loss history and accuracy) after every epoch.
# The returned model is pickled and evaluated in the following cells.
trained_model = finbert.train(train_examples = train_examples, model = model, train_numeric_feats=numeric_feats)

07/18/2024 15:48:30 - INFO - finbert.utils -   *** Example ***
07/18/2024 15:48:30 - INFO - finbert.utils -   guid: 0
07/18/2024 15:48:30 - INFO - finbert.utils -   tokens: [CLS] ac ##h withdrawal am ##mar ##yse hem ##ant ##ne ##x ##press transfer [SEP]
07/18/2024 15:48:30 - INFO - finbert.utils -   input_ids: 101 9353 2232 10534 2572 7849 23274 19610 4630 2638 2595 20110 4651 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 15:48:30 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 15:48:30 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 15:48:30 - INFO - finbert.utils -   label: Transfer Debit (id = 2)
07/18/2024 15:48:42 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 15:48:42 - INFO - finbert.finbert -     Num examples = 160391
07/18

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 3.4555718898773193
(Iteration 100) Training loss: 3.219036102294922
(Iteration 200) Training loss: 3.37111234664917
(Iteration 300) Training loss: 3.6875386238098145
(Iteration 400) Training loss: 3.483788013458252
(Iteration 500) Training loss: 2.8606741428375244
(Iteration 600) Training loss: 2.557075262069702
(Iteration 700) Training loss: 2.3294999599456787
(Iteration 800) Training loss: 1.8645200729370117
(Iteration 900) Training loss: 1.7264622449874878
(Iteration 1000) Training loss: 1.751303791999817
(Iteration 1100) Training loss: 2.0255682468414307
(Iteration 1200) Training loss: 1.5908957719802856
(Iteration 1300) Training loss: 1.3990073204040527
(Iteration 1400) Training loss: 1.4846954345703125
(Iteration 1500) Training loss: 1.2271440029144287
(Iteration 1600) Training loss: 1.1639950275421143
(Iteration 1700) Training loss: 0.8918479681015015
(Iteration 1800) Training loss: 0.6752336621284485
(Iteration 1900) Training loss: 1.021091341972351

07/18/2024 16:08:52 - INFO - finbert.utils -   *** Example ***
07/18/2024 16:08:52 - INFO - finbert.utils -   guid: 0
07/18/2024 16:08:52 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 16:08:52 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 16:08:52 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 16:08:52 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 16:08:52 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 16:08:57 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 16:08:57 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 16:08:57 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [0.5799220393079416]
Validation Accuracy: 0.8546
No best model found


Epoch:  10%|█         | 1/10 [22:39<3:23:53, 1359.30s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.19735576212406158
(Iteration 100) Training loss: 0.4083735942840576
(Iteration 200) Training loss: 1.1223801374435425
(Iteration 300) Training loss: 0.7589579224586487
(Iteration 400) Training loss: 0.04343169927597046
(Iteration 500) Training loss: 0.14386402070522308
(Iteration 600) Training loss: 0.6445583701133728
(Iteration 700) Training loss: 0.9569388628005981
(Iteration 800) Training loss: 0.18728171288967133
(Iteration 900) Training loss: 0.08831146359443665
(Iteration 1000) Training loss: 0.43327921628952026
(Iteration 1100) Training loss: 0.08476390689611435
(Iteration 1200) Training loss: 0.4250596761703491
(Iteration 1300) Training loss: 0.1763879358768463
(Iteration 1400) Training loss: 0.45743635296821594
(Iteration 1500) Training loss: 1.0057531595230103
(Iteration 1600) Training loss: 0.40737029910087585
(Iteration 1700) Training loss: 0.23466897010803223
(Iteration 1800) Training loss: 0.13774892687797546
(Iteration 1900) Training loss: 

07/18/2024 16:31:38 - INFO - finbert.utils -   *** Example ***
07/18/2024 16:31:38 - INFO - finbert.utils -   guid: 0
07/18/2024 16:31:38 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 16:31:38 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 16:31:38 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 16:31:38 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 16:31:38 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 16:31:43 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 16:31:43 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 16:31:43 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [0.5799220393079416, 0.5043733746258445]
Validation Accuracy: 0.8712


Epoch:  20%|██        | 2/10 [45:25<3:01:44, 1363.07s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.22933520376682281
(Iteration 100) Training loss: 0.5029081702232361
(Iteration 200) Training loss: 0.23294799029827118
(Iteration 300) Training loss: 0.764281690120697
(Iteration 400) Training loss: 2.0454235076904297
(Iteration 500) Training loss: 0.20611754059791565
(Iteration 600) Training loss: 0.2582716643810272
(Iteration 700) Training loss: 0.06681673973798752
(Iteration 800) Training loss: 0.05964310094714165
(Iteration 900) Training loss: 0.0814225897192955
(Iteration 1000) Training loss: 2.492708444595337
(Iteration 1100) Training loss: 0.13302473723888397
(Iteration 1200) Training loss: 0.1755986213684082
(Iteration 1300) Training loss: 0.21490976214408875
(Iteration 1400) Training loss: 0.20308583974838257
(Iteration 1500) Training loss: 0.18954770267009735
(Iteration 1600) Training loss: 0.36280113458633423
(Iteration 1700) Training loss: 0.15474313497543335
(Iteration 1800) Training loss: 0.5061920881271362
(Iteration 1900) Training loss: 0.

07/18/2024 16:54:22 - INFO - finbert.utils -   *** Example ***
07/18/2024 16:54:22 - INFO - finbert.utils -   guid: 0
07/18/2024 16:54:22 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 16:54:22 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 16:54:22 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 16:54:22 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 16:54:22 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 16:54:27 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 16:54:27 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 16:54:27 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [0.5799220393079416, 0.5043733746258445, 0.41001141623374143]
Validation Accuracy: 0.8792


Epoch:  30%|███       | 3/10 [1:08:08<2:39:03, 1363.35s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.32996633648872375
(Iteration 100) Training loss: 0.05202765017747879
(Iteration 200) Training loss: 0.09758801013231277
(Iteration 300) Training loss: 0.38786476850509644
(Iteration 400) Training loss: 0.09487371891736984
(Iteration 500) Training loss: 2.7276744842529297
(Iteration 600) Training loss: 0.16712257266044617
(Iteration 700) Training loss: 0.10514508187770844
(Iteration 800) Training loss: 0.17141558229923248
(Iteration 900) Training loss: 0.7726597785949707
(Iteration 1000) Training loss: 0.2766897678375244
(Iteration 1100) Training loss: 0.3778592050075531
(Iteration 1200) Training loss: 0.299282044172287
(Iteration 1300) Training loss: 0.9625679850578308
(Iteration 1400) Training loss: 1.3891234397888184
(Iteration 1500) Training loss: 0.04955466464161873
(Iteration 1600) Training loss: 0.2290286421775818
(Iteration 1700) Training loss: 0.1121704950928688
(Iteration 1800) Training loss: 0.07391663640737534
(Iteration 1900) Training loss: 0.

07/18/2024 17:16:53 - INFO - finbert.utils -   *** Example ***
07/18/2024 17:16:53 - INFO - finbert.utils -   guid: 0
07/18/2024 17:16:53 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 17:16:53 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 17:16:53 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 17:16:53 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 17:16:53 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 17:16:58 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 17:16:58 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 17:16:58 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [0.5799220393079416, 0.5043733746258445, 0.41001141623374143, 0.396769596618917]
Validation Accuracy: 0.8952


Epoch:  40%|████      | 4/10 [1:30:39<2:15:51, 1358.57s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.18411047756671906
(Iteration 100) Training loss: 0.5594586133956909
(Iteration 200) Training loss: 0.026097165420651436
(Iteration 300) Training loss: 0.10068661719560623
(Iteration 400) Training loss: 0.0466470904648304
(Iteration 500) Training loss: 0.18874841928482056
(Iteration 600) Training loss: 0.15717510879039764
(Iteration 700) Training loss: 0.16473020613193512
(Iteration 800) Training loss: 0.20020776987075806
(Iteration 900) Training loss: 0.07622480392456055
(Iteration 1000) Training loss: 0.2319982349872589
(Iteration 1100) Training loss: 0.10482260584831238
(Iteration 1200) Training loss: 0.29965120553970337
(Iteration 1300) Training loss: 0.092338427901268
(Iteration 1400) Training loss: 0.15209969878196716
(Iteration 1500) Training loss: 0.23314982652664185
(Iteration 1600) Training loss: 0.08971826732158661
(Iteration 1700) Training loss: 4.434072971343994
(Iteration 1800) Training loss: 0.14380566775798798
(Iteration 1900) Training loss

07/18/2024 17:39:36 - INFO - finbert.utils -   *** Example ***
07/18/2024 17:39:36 - INFO - finbert.utils -   guid: 0
07/18/2024 17:39:36 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 17:39:36 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 17:39:36 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 17:39:36 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 17:39:36 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 17:39:41 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 17:39:41 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 17:39:41 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [0.5799220393079416, 0.5043733746258445, 0.41001141623374143, 0.396769596618917, 0.42151122965939647]
Validation Accuracy: 0.9003


Epoch:  50%|█████     | 5/10 [1:53:22<1:53:20, 1360.06s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.0393543541431427
(Iteration 100) Training loss: 0.20521637797355652
(Iteration 200) Training loss: 0.2192939817905426
(Iteration 300) Training loss: 0.22629103064537048
(Iteration 400) Training loss: 0.036076173186302185
(Iteration 500) Training loss: 0.5149597525596619
(Iteration 600) Training loss: 0.012788048014044762
(Iteration 700) Training loss: 0.4310140907764435
(Iteration 800) Training loss: 0.14375582337379456
(Iteration 900) Training loss: 0.22181472182273865
(Iteration 1000) Training loss: 0.11014761030673981
(Iteration 1100) Training loss: 0.10138650983572006
(Iteration 1200) Training loss: 0.035772331058979034
(Iteration 1300) Training loss: 0.1269054412841797
(Iteration 1400) Training loss: 0.1546446979045868
(Iteration 1500) Training loss: 0.06530165672302246
(Iteration 1600) Training loss: 0.10202781111001968
(Iteration 1700) Training loss: 0.15742889046669006
(Iteration 1800) Training loss: 0.1287020742893219
(Iteration 1900) Training lo

07/18/2024 18:02:15 - INFO - finbert.utils -   *** Example ***
07/18/2024 18:02:15 - INFO - finbert.utils -   guid: 0
07/18/2024 18:02:15 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 18:02:15 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 18:02:15 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 18:02:15 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 18:02:15 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 18:02:20 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 18:02:20 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 18:02:20 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [0.5799220393079416, 0.5043733746258445, 0.41001141623374143, 0.396769596618917, 0.42151122965939647, 0.3938752062074414]
Validation Accuracy: 0.9037


Epoch:  60%|██████    | 6/10 [2:16:01<1:30:38, 1359.68s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.08332796394824982
(Iteration 100) Training loss: 0.8270719647407532
(Iteration 200) Training loss: 0.08714286237955093
(Iteration 300) Training loss: 0.022989608347415924
(Iteration 400) Training loss: 0.15587380528450012
(Iteration 500) Training loss: 0.28942233324050903
(Iteration 600) Training loss: 0.061030518263578415
(Iteration 700) Training loss: 0.1666271835565567
(Iteration 800) Training loss: 0.2464296668767929
(Iteration 900) Training loss: 0.05765983834862709
(Iteration 1000) Training loss: 0.03928649425506592
(Iteration 1100) Training loss: 0.07927814871072769
(Iteration 1200) Training loss: 0.1399872750043869
(Iteration 1300) Training loss: 0.08417807519435883
(Iteration 1400) Training loss: 0.32130175828933716
(Iteration 1500) Training loss: 0.09321905672550201
(Iteration 1600) Training loss: 0.157287135720253
(Iteration 1700) Training loss: 0.0726046934723854
(Iteration 1800) Training loss: 0.10453904420137405
(Iteration 1900) Training los

07/18/2024 18:24:56 - INFO - finbert.utils -   *** Example ***
07/18/2024 18:24:56 - INFO - finbert.utils -   guid: 0
07/18/2024 18:24:56 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 18:24:56 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 18:24:56 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 18:24:56 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 18:24:56 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 18:25:01 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 18:25:01 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 18:25:01 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [0.5799220393079416, 0.5043733746258445, 0.41001141623374143, 0.396769596618917, 0.42151122965939647, 0.3938752062074414, 0.37931189097748164]
Validation Accuracy: 0.9025


Epoch:  70%|███████   | 7/10 [2:38:42<1:08:00, 1360.21s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.1301930695772171
(Iteration 100) Training loss: 0.7445387244224548
(Iteration 200) Training loss: 0.21466606855392456
(Iteration 300) Training loss: 0.07792592793703079
(Iteration 400) Training loss: 0.1265583336353302
(Iteration 500) Training loss: 0.1257137656211853
(Iteration 600) Training loss: 0.026006558910012245
(Iteration 700) Training loss: 0.07039958983659744
(Iteration 800) Training loss: 0.08131920546293259
(Iteration 900) Training loss: 0.03178245574235916
(Iteration 1000) Training loss: 0.035932477563619614
(Iteration 1100) Training loss: 0.09253688901662827
(Iteration 1200) Training loss: 0.02757108584046364
(Iteration 1300) Training loss: 0.7135929465293884
(Iteration 1400) Training loss: 0.07531067728996277
(Iteration 1500) Training loss: 0.16770359873771667
(Iteration 1600) Training loss: 0.06487569957971573
(Iteration 1700) Training loss: 0.04261672496795654
(Iteration 1800) Training loss: 0.215719535946846
(Iteration 1900) Training los

07/18/2024 18:47:34 - INFO - finbert.utils -   *** Example ***
07/18/2024 18:47:34 - INFO - finbert.utils -   guid: 0
07/18/2024 18:47:34 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 18:47:34 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 18:47:34 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 18:47:34 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 18:47:34 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 18:47:39 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 18:47:39 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 18:47:39 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [0.5799220393079416, 0.5043733746258445, 0.41001141623374143, 0.396769596618917, 0.42151122965939647, 0.3938752062074414, 0.37931189097748164, 0.38214523738286754]
Validation Accuracy: 0.9040


Epoch:  80%|████████  | 8/10 [3:01:20<45:19, 1359.51s/it]  

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.08985567092895508
(Iteration 100) Training loss: 0.35107022523880005
(Iteration 200) Training loss: 0.015064402483403683
(Iteration 300) Training loss: 0.7840164303779602
(Iteration 400) Training loss: 0.08737221360206604
(Iteration 500) Training loss: 0.06067093834280968
(Iteration 600) Training loss: 0.08430716395378113
(Iteration 700) Training loss: 0.028842037543654442
(Iteration 800) Training loss: 0.11765879392623901
(Iteration 900) Training loss: 0.03995703533291817
(Iteration 1000) Training loss: 0.23261646926403046
(Iteration 1100) Training loss: 3.7266383171081543
(Iteration 1200) Training loss: 0.1545954793691635
(Iteration 1300) Training loss: 0.5340873003005981
(Iteration 1400) Training loss: 0.23913514614105225
(Iteration 1500) Training loss: 0.028092699125409126
(Iteration 1600) Training loss: 0.16820186376571655
(Iteration 1700) Training loss: 0.10539138317108154
(Iteration 1800) Training loss: 0.10141554474830627
(Iteration 1900) Training

07/18/2024 19:10:05 - INFO - finbert.utils -   *** Example ***
07/18/2024 19:10:05 - INFO - finbert.utils -   guid: 0
07/18/2024 19:10:05 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 19:10:05 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 19:10:05 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 19:10:05 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 19:10:05 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 19:10:10 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 19:10:10 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 19:10:10 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [0.5799220393079416, 0.5043733746258445, 0.41001141623374143, 0.396769596618917, 0.42151122965939647, 0.3938752062074414, 0.37931189097748164, 0.38214523738286754, 0.3901790154324485]
Validation Accuracy: 0.9078


Epoch:  90%|█████████ | 9/10 [3:23:51<22:36, 1356.79s/it]

Iteration:   0%|          | 0/5013 [00:00<?, ?it/s]

(Iteration 0) Training loss: 0.05954249948263168
(Iteration 100) Training loss: 0.08362004160881042
(Iteration 200) Training loss: 0.6835175156593323
(Iteration 300) Training loss: 0.1416778266429901
(Iteration 400) Training loss: 0.3615134656429291
(Iteration 500) Training loss: 0.17148441076278687
(Iteration 600) Training loss: 0.15235929191112518
(Iteration 700) Training loss: 0.09528184682130814
(Iteration 800) Training loss: 0.06382440775632858
(Iteration 900) Training loss: 0.07699044048786163
(Iteration 1000) Training loss: 0.05135718733072281
(Iteration 1100) Training loss: 0.0718797966837883
(Iteration 1200) Training loss: 0.14104001224040985
(Iteration 1300) Training loss: 0.09840114414691925
(Iteration 1400) Training loss: 0.04215167835354805
(Iteration 1500) Training loss: 0.036543816328048706
(Iteration 1600) Training loss: 0.010139521211385727
(Iteration 1700) Training loss: 0.10808117687702179
(Iteration 1800) Training loss: 0.04660505801439285
(Iteration 1900) Training 

07/18/2024 19:32:44 - INFO - finbert.utils -   *** Example ***
07/18/2024 19:32:44 - INFO - finbert.utils -   guid: 0
07/18/2024 19:32:44 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 19:32:44 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 19:32:44 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 19:32:44 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 19:32:44 - INFO - finbert.utils -   label: Loans (id = 0)
07/18/2024 19:32:49 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 19:32:49 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 19:32:49 - INFO - fin

Validating:   0%|          | 0/2149 [00:00<?, ?it/s]

Validation losses: [0.5799220393079416, 0.5043733746258445, 0.41001141623374143, 0.396769596618917, 0.42151122965939647, 0.3938752062074414, 0.37931189097748164, 0.38214523738286754, 0.3901790154324485, 0.39054488360537487]
Validation Accuracy: 0.9071


Epoch: 100%|██████████| 10/10 [3:46:31<00:00, 1359.11s/it]


In [14]:
import pickle

# Serialise the trained model object into the current working directory.
# Unpickling executes arbitrary code, so only load this file from a trusted source.
with open("./trained_model_4.pkl", mode="wb") as model_file:
    pickle.Pickler(model_file).dump(trained_model)

In [21]:
# Load the "eval" split for the final evaluation.
# NOTE(review): the per-epoch validation logs above report the same 68739 examples and the
# same first example as this split — confirm that validation and test data are not identical.
test_examples, test_numeric_feats = finbert.get_data("eval")

In [22]:
# Evaluate the trained model on the eval split. The cells below read the "predictions"
# and "labels" entries of the result.
results = finbert.evaluate(trained_model, test_examples, test_numeric_feats)

07/18/2024 20:04:41 - INFO - finbert.utils -   *** Example ***
07/18/2024 20:04:41 - INFO - finbert.utils -   guid: 0
07/18/2024 20:04:41 - INFO - finbert.utils -   tokens: [CLS] rt ##p received money ##lion ins ##ta ##cas ##h 48 ##8 [SEP]
07/18/2024 20:04:41 - INFO - finbert.utils -   input_ids: 101 19387 2361 2363 2769 18964 16021 2696 15671 2232 4466 2620 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 20:04:41 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 20:04:41 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/18/2024 20:04:41 - INFO - finbert.utils -   label: Loans (id = 0)


07/18/2024 20:04:46 - INFO - finbert.finbert -   ***** Loading data *****
07/18/2024 20:04:46 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 20:04:46 - INFO - finbert.finbert -     Batch size = 32
07/18/2024 20:04:46 - INFO - finbert.finbert -     Num steps = 21480
07/18/2024 20:04:46 - INFO - finbert.finbert -   ***** Running evaluation ***** 
07/18/2024 20:04:46 - INFO - finbert.finbert -     Num examples = 68739
07/18/2024 20:04:46 - INFO - finbert.finbert -     Batch size = 32


Testing:   0%|          | 0/2149 [00:00<?, ?it/s]

Evaluation loss: 0.2720436788037967


In [23]:
# Reduce every stored prediction to the index of its largest entry along axis 0.
preds = results["predictions"].map(lambda scores: np.argmax(scores, axis=0))

In [24]:
# Lookup table from integer class id (position in finbert.label_list) to label name.
label_mapping = dict(enumerate(finbert.label_list))

In [25]:
# Translate predicted class ids into label names.
# The names are computed straight from `results` instead of overwriting the integer
# `preds` of the earlier cell with a function of itself: the former `preds = f(preds)`
# form raised KeyError when this cell was re-run, because `preds` already held label names.
preds = [
    label_mapping[np.argmax(scores, axis=0)]
    for scores in results["predictions"]
]

In [26]:
# Ground-truth class ids from the evaluation results, mapped to their label names.
labels = list(map(label_mapping.__getitem__, results["labels"]))

In [27]:
# Per-class precision, recall and F1 (three decimals) for the eval-split predictions.
# NOTE(review): the truncated warning printed under this cell looks like scikit-learn's
# undefined-metric warning (a class that received no predictions) — confirm.
print(classification_report(labels, preds, digits=3))

  'precision', 'predicted', average, warn_for)


                            precision    recall  f1-score   support

                       ATM      0.996     0.999     0.998      1702
    Arts and Entertainment      0.921     0.975     0.947       119
                 Bank Fees      0.994     0.993     0.994      1940
             Check Deposit      0.940     1.000     0.969        63
  Clothing and Accessories      0.860     0.889     0.875       957
        Convenience Stores      0.721     0.711     0.716      5589
         Department Stores      0.685     0.844     0.756       601
     Digital Entertainment      0.946     0.932     0.939      1358
Food and Beverage Services      0.683     0.835     0.751       103
              Gas Stations      0.470     0.595     0.525      3876
  Gyms and Fitness Centers      0.679     0.905     0.776        21
                Healthcare      0.568     0.742     0.643        62
                 Insurance      0.959     0.888     0.922       526
                  Interest      1.000     1.000