# Weights and Biases Logger

This notebook logs the evaluation metrics of every model on the dev set and the test set to Weights & Biases.

In [1]:
import pandas as pd
import wandb
from configparser import ConfigParser

`notebook.cfg` is a config file that stores your personal W&B API key (this notebook reads the `auth_key` option from the `wandb_api_key` section).
The contents of the file should look like this:
```
[<YOUR_API_KEY_NAME>]
auth_key: <HERE_IS_YOUR_API_KEY>
```
The `.cfg` file is listed in `.gitignore`, so the key is not committed to the repository.

In [2]:
# Load the W&B API key from the git-ignored config file.
# The file is opened explicitly because ConfigParser.read() silently skips
# files it cannot find; with read_file() a missing or misplaced config fails
# right here with FileNotFoundError instead of a later NoSectionError.
parser = ConfigParser()
with open("../../notebook.cfg") as config_file:
    parser.read_file(config_file)
wandb_api_auth_key = parser.get("wandb_api_key", "auth_key")

In [3]:
# Authenticate this session with W&B using the key read from the config file;
# the call's return value is shown as the cell output.
wandb.login(key=wandb_api_auth_key)

[34m[1mwandb[0m: Currently logged in as: [33mhaydenchiush[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/haydenchiu/.netrc


True

# Validation set

In [4]:
# W&B project and entity read by wandb_log_eval_metrics when it opens a run.
PROJECT_NAME = 'BioLaySumm2024'  # wandb project name
ENTITY = 'bossy_beaver'  # change to your wandb team name
# Directory that wandb_log_eval_metrics reads score files from (looked up at call time).
PATH_TO_VAL_METRICS = './data/output/mini_dev_set/scores'
# File names passed to wandb_log_eval_metrics in the cells below; this list
# itself is not referenced in the visible cells.
VAL_METRICS_FILES = ['elife_scores.txt', 'plos_scores.txt', 'scores.txt']

In [5]:
# Example arguments for wandb_log_eval_metrics (defined in the next cell):
# file = 'elife_scores.txt'
# data_src = 'eLife_dev'
# sample_portion = 0.1
# name = 'dummy_baseline'
# tags = ['baseline']

In [6]:
# Initialize WandB
def wandb_log_eval_metrics(file, data_src, sample_portion, name, tags, job_type='eval', project=None, entity=None, metrics_dir=None):
    """Parse a ``<metric>: <value>`` scores file and log it as a single W&B run.

    Args:
        file: Name of the scores file inside ``metrics_dir``.
        data_src: Label of the evaluated data split; stored in the run config.
        sample_portion: Portion of the split that was scored (e.g. ``0.1``);
            stored in the run config.
        name: Display name of the W&B run.
        tags: List of tags attached to the run.
        job_type: W&B job type of the run.
        project: W&B project. ``None`` falls back to the notebook-level
            ``PROJECT_NAME``.
        entity: W&B entity (team). ``None`` falls back to the notebook-level
            ``ENTITY``.
        metrics_dir: Directory that contains ``file``. ``None`` falls back to
            the notebook-level ``PATH_TO_VAL_METRICS`` as set at call time.

    Returns:
        None. The parsed metrics are printed and logged to the run.

    Raises:
        FileNotFoundError: If the scores file does not exist.
        ValueError: If a non-blank line is not of the form ``<metric>: <number>``.
    """
    # Resolve the notebook-level defaults at call time so that editing the
    # constants keeps working, while explicit arguments are honoured.
    if project is None:
        project = PROJECT_NAME
    if entity is None:
        entity = ENTITY
    if metrics_dir is None:
        metrics_dir = PATH_TO_VAL_METRICS

    # Parse the file before opening a run: a missing or malformed file must
    # not leave an empty run behind in the W&B project.
    metrics_path = f"{metrics_dir}/{file}"
    metrics = {}
    with open(metrics_path, 'r') as scores_fh:
        for line_no, raw_line in enumerate(scores_fh, start=1):
            line = raw_line.strip()
            if not line:
                continue  # tolerate blank and trailing lines
            # Split on the last ':' so that only the numeric tail is the value.
            key, sep, value = line.rpartition(':')
            key = key.strip()
            try:
                if not sep or not key:
                    raise ValueError("missing '<metric>:' prefix")
                metrics[key] = float(value)
            except ValueError as exc:
                raise ValueError(
                    f"{metrics_path}, line {line_no}: expected '<metric>: <number>', got {line!r}"
                ) from exc

    # Attach the run metadata so runs can be filtered by data source / portion.
    run_config = {'data_src': data_src, 'sample_portion': sample_portion}
    run = wandb.init(project=project, entity=entity, config=run_config,
                     job_type=job_type, tags=tags, name=name)
    try:
        print(metrics)
        run.log(metrics)
    finally:
        # Always close the run, even if logging fails.
        run.finish()


In [7]:
# eLife scores on the 10% mini dev set, logged under the BioMistral_7B_4bit
# run name and tags (this is not the dummy-baseline run).
wandb_log_eval_metrics(file='elife_scores.txt', data_src='eLife_dev', sample_portion=0.1, 
                       name='elife_BioMistral_7B_4bits', tags=['milestone4','mini_dev_set','BioMistral_7B_4bit'])

[34m[1mwandb[0m: Currently logged in as: [33mhaydenchiush[0m ([33mbossy_beaver[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'ROUGE1': 0.2058151383298021, 'ROUGE2': 0.03188751898254138, 'ROUGEL': 0.1953632875703846, 'BERTScore': 0.7950564622879028, 'FKGL': 11.987499999999999, 'DCRS': 7.13, 'CLI': 12.106250000000001, 'LENS': 19.283961666987636, 'AlignScore': 0.5808112174272537, 'SummaC': 0.49667278801401454}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.58081
BERTScore,0.79506
CLI,12.10625
DCRS,7.13
FKGL,11.9875
LENS,19.28396
ROUGE1,0.20582
ROUGE2,0.03189
ROUGEL,0.19536
SummaC,0.49667


In [8]:
# PLOS scores on the 10% mini dev set, logged under the BioMistral_7B_4bit
# run name and tags (this is not the dummy-baseline run).
wandb_log_eval_metrics(file='plos_scores.txt', data_src='PLOS_dev', sample_portion=0.1, 
                       name='plos_BioMistral_7B_4bits', tags=['milestone4','mini_dev_set','BioMistral_7B_4bit'])

{'ROUGE1': 0.23096328029880522, 'ROUGE2': 0.05211601297580994, 'ROUGEL': 0.21588534589456596, 'BERTScore': 0.8106353352035301, 'FKGL': 14.751449275362317, 'DCRS': 8.05159420289855, 'CLI': 11.361086956521739, 'LENS': 26.181919402495303, 'AlignScore': 0.6073269683165826, 'SummaC': 0.5512916521317717}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.60733
BERTScore,0.81064
CLI,11.36109
DCRS,8.05159
FKGL,14.75145
LENS,26.18192
ROUGE1,0.23096
ROUGE2,0.05212
ROUGEL,0.21589
SummaC,0.55129


In [10]:
# Combined scores (scores.txt) on the 10% mini dev set, logged under the
# BioMistral_7B_4bit run name and tags (this is not the dummy-baseline run).
wandb_log_eval_metrics(file='scores.txt', data_src='combined_dev', sample_portion=0.1, 
                       name='BioMistral_7B_4bits', tags=['milestone4','mini_dev_set','BioMistral_7B_4bit', 'combined'])

{'ROUGE1': 0.21838920931430367, 'ROUGE2': 0.04200176597917566, 'ROUGEL': 0.20562431673247528, 'BERTScore': 0.8028458987457165, 'FKGL': 13.369474637681158, 'DCRS': 7.590797101449274, 'CLI': 11.73366847826087, 'LENS': 22.73294053474147, 'AlignScore': 0.5940690928719181, 'SummaC': 0.5239822200728931}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.59407
BERTScore,0.80285
CLI,11.73367
DCRS,7.5908
FKGL,13.36947
LENS,22.73294
ROUGE1,0.21839
ROUGE2,0.042
ROUGEL,0.20562
SummaC,0.52398


# Test Set

In [11]:
# Point the metrics directory at the test-set scores. wandb_log_eval_metrics
# looks this global up at call time, so the calls below this cell load
# test-set files.
# NOTE: the name still says VAL; re-running the dev-set cells after this cell
# would read test-set scores and log them under the dev-set run names.
PATH_TO_VAL_METRICS = './data/output/test_set/scores'
# Same file names as the dev-set list; not referenced in the visible cells.
TEST_METRICS_FILES = ['elife_scores.txt', 'plos_scores.txt', 'scores.txt']

In [12]:
# eLife scores on the test set (sample_portion=1), logged under the
# BioMistral_7B_4bit run name and tags (this is not the dummy-baseline run).
wandb_log_eval_metrics(file='elife_scores.txt', data_src='eLife_test', sample_portion=1, 
                       name='elife_BioMistral_7B_4bits_test', tags=['milestone4','BioMistral_7B_4bit', 'test'])

{'ROUGE1': 0.2658170189667828, 'ROUGE2': 0.048088032711473404, 'ROUGEL': 0.24802597448928654, 'BERTScore': 0.8134527907405101, 'FKGL': 14.349295774647887, 'DCRS': 8.43056338028169, 'CLI': 15.087535211267605, 'LENS': 54.63684429708165, 'AlignScore': 0.5911370053834898, 'SummaC': 0.4232159972610608}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.59114
BERTScore,0.81345
CLI,15.08754
DCRS,8.43056
FKGL,14.3493
LENS,54.63684
ROUGE1,0.26582
ROUGE2,0.04809
ROUGEL,0.24803
SummaC,0.42322


In [13]:
# PLOS scores on the test set (sample_portion=1), logged under the
# BioMistral_7B_4bit run name and tags (this is not the dummy-baseline run).
wandb_log_eval_metrics(file='plos_scores.txt', data_src='PLOS_test', sample_portion=1, 
                       name='plos_BioMistral_7B_4bits_test', tags=['milestone4','BioMistral_7B_4bit', 'test'])

{'ROUGE1': 0.34778996253378136, 'ROUGE2': 0.09508331730337351, 'ROUGEL': 0.31575959220879085, 'BERTScore': 0.8386020064353943, 'FKGL': 12.290845070422533, 'DCRS': 8.755915492957747, 'CLI': 13.4, 'LENS': 57.02802431965428, 'AlignScore': 0.6461510611829203, 'SummaC': 0.4562869989116427}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.64615
BERTScore,0.8386
CLI,13.4
DCRS,8.75592
FKGL,12.29085
LENS,57.02802
ROUGE1,0.34779
ROUGE2,0.09508
ROUGEL,0.31576
SummaC,0.45629


In [14]:
# Combined scores (scores.txt) on the test set (sample_portion=1), logged under
# the BioMistral_7B_4bit run name and tags (this is not the dummy-baseline run).
wandb_log_eval_metrics(file='scores.txt', data_src='combined_test', sample_portion=1, 
                       name='BioMistral_7B_4bits_test', tags=['milestone4','BioMistral_7B_4bit', 'combined', 'test'])

{'ROUGE1': 0.30680349075028207, 'ROUGE2': 0.07158567500742345, 'ROUGEL': 0.28189278334903867, 'BERTScore': 0.8260273985879523, 'FKGL': 13.32007042253521, 'DCRS': 8.593239436619719, 'CLI': 14.243767605633803, 'LENS': 55.832434308367965, 'AlignScore': 0.618644033283205, 'SummaC': 0.4397514980863517}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.61864
BERTScore,0.82603
CLI,14.24377
DCRS,8.59324
FKGL,13.32007
LENS,55.83243
ROUGE1,0.3068
ROUGE2,0.07159
ROUGEL,0.28189
SummaC,0.43975
