In [1]:
from cltrier_prosem import Pipeline
import src.notification_sound_player as notification

pygame 2.5.2 (SDL 2.28.3, Python 3.10.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Sound check for the notification signal.
# The same call can also be used to mark the start of a training session.
signal_file = "./src/Signal.mp3"
notification.play_mp3(signal_file)

In [3]:
# Pipeline settings, assembled section by section and then handed to Pipeline.

# Encoder: identifier of the model to load.
encoder_config = {
    'model': 'deepset/gbert-base',
    # alternative: 'model': './data/results/querdenker_tu/model.bin'
}

# Dataset: location and column layout of the input parquet files.
dataset_config = {
    'path': './data/preparation_output',    # directory holding the input parquet files
    'text_column': 'text',                  # column containing the text / sentence
    'label_column': 'label',                # column containing the label

    # Labels expected within the datasets.
    'label_classes': ['corona', 'web2'],    # binary classification
    # alternative: 'label_classes': ['corona', 'web1', 'web2'],  # three-class classification
}

# Classifier head: hidden layer size and dropout probability.
classifier_config = {
    'hid_size': 512,
    'dropout': 0.2,
}

# Pooler: pooling form and the name of the span column.
pooler_config = {
    'form': 'cls',
    'span_column': 'span',
}

# Trainer: training schedule and output location.
trainer_config = {
    'num_epochs': 20,
    'batch_size': 32,
    'learning_rate': 1e-3,
    'export_path': './data/results',        # directory for the output files
}

pipeline = Pipeline({
    'encoder': encoder_config,
    'dataset': dataset_config,
    'classifier': classifier_config,
    'pooler': pooler_config,
    'trainer': trainer_config,
})

[--- SETUP ---]
> Computation Device: cuda
[--- LOAD ENCODER ---]
> Encoder Name: deepset/gbert-base
  Memory Usage: 419.3486 MB
> f(__init__) took: 1.7311 sec
[--- LOAD TRAINER ---]
> Dataset: train
  Samples: 7966
> Dataset: test
  Samples: 1992
> Model: Multilayer perceptron
  Memory Usage: 1.5059 MB 
  (0): Linear(in_features=768, out_features=512, bias=True)
  (1): Dropout(p=0.2, inplace=False)
  (2): LeakyReLU(negative_slope=0.01)
  (3): Linear(in_features=512, out_features=2, bias=True)



In [4]:
# Run the pipeline.
# It expects the files train.parquet and test.parquet in the folder
# data/preparation_output, so make sure the preprocessed datasets are
# stored there under exactly these names.
pipeline()

# Play the signal twice to mark the end of the training process.
end_signal = "./src/Signal.mp3"
notification.play_mp3(end_signal)
notification.play_mp3(end_signal)

[--- RUN TRAINER ---]
[@001]: 	loss_train=0.4718 	loss_test=0.4409 	f1_train=0.7673 	f1_test=0.7870 	duration=0:00:00
[@002]: 	loss_train=0.4219 	loss_test=0.4207 	f1_train=0.8013 	f1_test=0.8037 	duration=0:00:00
[@003]: 	loss_train=0.4025 	loss_test=0.4223 	f1_train=0.8067 	f1_test=0.7966 	duration=0:00:00
[@004]: 	loss_train=0.3843 	loss_test=0.4180 	f1_train=0.8142 	f1_test=0.7963 	duration=0:00:00
[@005]: 	loss_train=0.3755 	loss_test=0.4975 	f1_train=0.8198 	f1_test=0.7401 	duration=0:00:00
[@006]: 	loss_train=0.3597 	loss_test=0.4268 	f1_train=0.8286 	f1_test=0.7962 	duration=0:00:00
[@007]: 	loss_train=0.3445 	loss_test=0.4248 	f1_train=0.8333 	f1_test=0.8060 	duration=0:00:00
[@008]: 	loss_train=0.3264 	loss_test=0.4237 	f1_train=0.8479 	f1_test=0.8068 	duration=0:00:00
[@009]: 	loss_train=0.3119 	loss_test=0.4572 	f1_train=0.8503 	f1_test=0.7890 	duration=0:00:00
[@010]: 	loss_train=0.2982 	loss_test=0.4804 	f1_train=0.8602 	f1_test=0.7857 	duration=0:00:00
[@011]: 	loss_trai