# Weights and Biases Logger

This notebook logs the evaluation metrics of every model on the dev set and the test set to Weights & Biases.

In [None]:
import os
from configparser import ConfigParser

import pandas as pd
import wandb

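The `.cfg` file is a config file that stores your personal API key. This notebook reads it from `../notebook.cfg` and looks up the `auth_key` entry in the `wandb_api_key` section.
The format inside the file looks like this: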
```
[<YOUR_API_KEY_NAME>]
auth_key: <HERE_IS_YOUR_API_KEY>
```
`.cfg` files are listed in `.gitignore`, so the key file is not committed.

In [7]:
# Read the W&B API key from the git-ignored config file at ../notebook.cfg
# (resolved relative to the current working directory).
parser = ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files it
# actually parsed, so an empty list means the config file was not found. Failing
# here gives a clearer error than the NoSectionError raised by parser.get() below.
loaded_files = parser.read("../notebook.cfg")
if not loaded_files:
    raise FileNotFoundError(
        "Could not read '../notebook.cfg'; create it with a [wandb_api_key] "
        "section containing an auth_key entry."
    )
wandb_api_auth_key = parser.get("wandb_api_key", "auth_key")

In [8]:
# Authenticate this session with Weights & Biases using the key read from the config file.
wandb.login(key=wandb_api_auth_key)

[34m[1mwandb[0m: Currently logged in as: [33mhaydenchiush[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/haydenchiu/.netrc


True

# Validation set

In [18]:
# Weights & Biases destination used for the logged runs.
PROJECT_NAME = "BioLaySumm2024"  # wandb project name
ENTITY = "bossy_beaver"  # change to your wandb team name

# Directory holding the validation score files, and the file names expected in it.
PATH_TO_VAL_METRICS = "./Proxy_val_Results"
VAL_METRICS_FILES = [
    "elife_scores.txt",
    "plos_scores.txt",
    "scores.txt",
]

In [26]:
# Example argument values for wandb_log_eval_metrics (illustration only; nothing is executed here):
# file = 'elife_scores.txt'
# data_src = 'eLife_dev'
# sample_portion = 0.1
# name = 'dummy_baseline'
# tags = ['baseline']

In [20]:
# Helpers for logging one score file to Weights & Biases as a single run.
def _parse_metrics_file(path):
    """Parse a score file of ``<metric>: <value>`` lines into a dict of floats.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of crashing the parser.

    Raises:
        ValueError: if a non-blank line has no ':' separator or its value is
            not a valid float.
    """
    metrics = {}
    with open(path, 'r') as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            if not line:
                continue
            # Split on the LAST colon so a metric name that itself contains ':'
            # is kept intact; the value is always the trailing number.
            key, sep, value = line.rpartition(':')
            if not sep:
                raise ValueError(
                    f"{path}:{line_no}: expected '<metric>: <value>', got {line!r}"
                )
            metrics[key.strip()] = float(value)
    return metrics


def wandb_log_eval_metrics(file, data_src, sample_portion, name, tags, job_type='eval', project='BioLaySumm2024', entity='bossy_beaver', metrics_dir=None):
    """Log the metrics stored in one score file to Weights & Biases as a single run.

    Args:
        file: Name of the score file inside ``metrics_dir``.
        data_src: Label of the evaluated data; stored in the run config.
        sample_portion: Fraction of the data that was evaluated; stored in the
            run config.
        name: Display name of the W&B run.
        tags: List of tags attached to the run.
        job_type: W&B job type of the run.
        project: W&B project the run is logged to.
        entity: W&B entity (user or team) that owns the project.
        metrics_dir: Directory containing ``file``. When ``None`` the
            module-level ``PATH_TO_VAL_METRICS`` is used, as before.

    Raises:
        OSError: if the score file cannot be opened.
        ValueError: if the score file contains a malformed line.
    """
    if metrics_dir is None:
        metrics_dir = PATH_TO_VAL_METRICS

    # Parse BEFORE creating the run so a missing or malformed file does not
    # leave an empty, unfinished run behind on the W&B server.
    metrics = _parse_metrics_file(os.path.join(metrics_dir, file))

    run_config = {'data_src': data_src, 'sample_portion': sample_portion}
    # Honour the caller-supplied project/entity and attach the run config so the
    # data source and sample portion are recorded with the run.
    run = wandb.init(project=project, entity=entity,
                     job_type=job_type, tags=tags, name=name,
                     config=run_config)
    try:
        print(metrics)
        run.log(metrics)
    finally:
        # Always close the run, even if logging fails.
        run.finish()


In [22]:
# Log the eLife dummy-baseline scores (10% sample of the dev set).
wandb_log_eval_metrics(
    file='elife_scores.txt',
    data_src='eLife_dev',
    sample_portion=0.1,
    name='elife_dummy_baseline',
    tags=['baseline'],
)

[34m[1mwandb[0m: Currently logged in as: [33mhaydenchiush[0m ([33mbossy_beaver[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'ROUGE1': 0.32312883506775875, 'ROUGE2': 0.065595999095099, 'ROUGEL': 0.3008124655909614, 'BERTScore': 0.8375861893097559, 'FKGL': 15.133333333333335, 'DCRS': 11.590416666666664, 'CLI': 16.845416666666665, 'LENS': 39.63814759728811, 'AlignScore': 0.9943027173479398, 'SummaC': 0.9525209615627924}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.9943
BERTScore,0.83759
CLI,16.84542
DCRS,11.59042
FKGL,15.13333
LENS,39.63815
ROUGE1,0.32313
ROUGE2,0.0656
ROUGEL,0.30081
SummaC,0.95252


In [23]:
# Log the PLOS dummy-baseline scores (10% sample of the dev set).
wandb_log_eval_metrics(
    file='plos_scores.txt',
    data_src='PLOS_dev',
    sample_portion=0.1,
    name='plos_dummy_baseline',
    tags=['baseline'],
)

{'ROUGE1': 0.5125303136128874, 'ROUGE2': 0.21055582506879417, 'ROUGEL': 0.4767545280909615, 'BERTScore': 0.8725961204888164, 'FKGL': 14.94927536231884, 'DCRS': 10.976449275362318, 'CLI': 16.293840579710146, 'LENS': 38.619040578940265, 'AlignScore': 0.9880770807680876, 'SummaC': 0.9578714357770007}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.98808
BERTScore,0.8726
CLI,16.29384
DCRS,10.97645
FKGL,14.94928
LENS,38.61904
ROUGE1,0.51253
ROUGE2,0.21056
ROUGEL,0.47675
SummaC,0.95787


In [24]:
# Log the combined (eLife + PLOS) dummy-baseline scores (10% sample of the dev set).
wandb_log_eval_metrics(
    file='scores.txt',
    data_src='combined_dev',
    sample_portion=0.1,
    name='dummy_baseline',
    tags=['baseline', 'combined'],
)

{'ROUGE1': 0.4178295743403231, 'ROUGE2': 0.13807591208194658, 'ROUGEL': 0.3887834968409615, 'BERTScore': 0.8550911548992861, 'FKGL': 15.041304347826088, 'DCRS': 11.283432971014491, 'CLI': 16.569628623188407, 'LENS': 39.128594088114184, 'AlignScore': 0.9911898990580137, 'SummaC': 0.9551961986698966}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.99119
BERTScore,0.85509
CLI,16.56963
DCRS,11.28343
FKGL,15.0413
LENS,39.12859
ROUGE1,0.41783
ROUGE2,0.13808
ROUGEL,0.38878
SummaC,0.9552


# Test Set

In [25]:
# Point the score-file directory at the test-set results.
# NOTE: this rebinds PATH_TO_VAL_METRICS, the directory wandb_log_eval_metrics reads
# from at call time, so any validation cell re-run after this one will also read
# from './scoring_result'.
PATH_TO_VAL_METRICS = "./scoring_result"
TEST_METRICS_FILES = [
    "elife_scores.txt",
    "plos_scores.txt",
    "scores.txt",
]

In [27]:
# Log the eLife dummy-baseline scores on the full test set.
wandb_log_eval_metrics(
    file='elife_scores.txt',
    data_src='eLife_test',
    sample_portion=1,
    name='elife_dummy_baseline_test',
    tags=['baseline', 'test'],
)

{'ROUGE1': 0.3089138993381871, 'ROUGE2': 0.06957613212293816, 'ROUGEL': 0.2813751343330106, 'BERTScore': 0.8407322109585077, 'FKGL': 15.080985915492958, 'DCRS': 11.753802816901409, 'CLI': 17.258943661971827, 'LENS': 30.24145106148018, 'AlignScore': 0.9781640558175637, 'SummaC': 0.9402011343291108}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.97816
BERTScore,0.84073
CLI,17.25894
DCRS,11.7538
FKGL,15.08099
LENS,30.24145
ROUGE1,0.30891
ROUGE2,0.06958
ROUGEL,0.28138
SummaC,0.9402


In [28]:
# Log the PLOS dummy-baseline scores on the full test set.
wandb_log_eval_metrics(
    file='plos_scores.txt',
    data_src='PLOS_test',
    sample_portion=1,
    name='plos_dummy_baseline_test',
    tags=['baseline', 'test'],
)

{'ROUGE1': 0.461164102179743, 'ROUGE2': 0.16057957580294918, 'ROUGEL': 0.4178593622680423, 'BERTScore': 0.8693121696861696, 'FKGL': 14.751408450704227, 'DCRS': 11.793098591549297, 'CLI': 16.704154929577463, 'LENS': 35.30267832931986, 'AlignScore': 0.9772088964220503, 'SummaC': 0.9488216630170043}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.97721
BERTScore,0.86931
CLI,16.70415
DCRS,11.7931
FKGL,14.75141
LENS,35.30268
ROUGE1,0.46116
ROUGE2,0.16058
ROUGEL,0.41786
SummaC,0.94882


In [29]:
# Log the combined (eLife + PLOS) dummy-baseline scores on the full test set.
wandb_log_eval_metrics(
    file='scores.txt',
    data_src='combined_test',
    sample_portion=1,
    name='dummy_baseline_test',
    tags=['baseline', 'combined', 'test'],
)

{'ROUGE1': 0.385039000758965, 'ROUGE2': 0.11507785396294368, 'ROUGEL': 0.34961724830052643, 'BERTScore': 0.8550221903223387, 'FKGL': 14.916197183098593, 'DCRS': 11.773450704225354, 'CLI': 16.981549295774645, 'LENS': 32.77206469540002, 'AlignScore': 0.977686476119807, 'SummaC': 0.9445113986730576}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.97769
BERTScore,0.85502
CLI,16.98155
DCRS,11.77345
FKGL,14.9162
LENS,32.77206
ROUGE1,0.38504
ROUGE2,0.11508
ROUGEL,0.34962
SummaC,0.94451
