# Weights and Biases Logger

This notebook logs the evaluation metrics on the dev set and the test set for each model to Weights & Biases.

In [1]:
import pandas as pd
import wandb
from configparser import ConfigParser

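The `.cfg` file is a config file that stores your personal API key.
The format inside the file looks like this:
```
[<YOUR_API_KEY_NAME>]
auth_key: <HERE_IS_YOUR_API_KEY>
```
This notebook reads the `auth_key` option from the section named `wandb_api_key`, so use that as the section name.
`.cfg` files are listed in `.gitignore`, so the key file is not committed.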

In [2]:
# Load the W&B API key from the git-ignored config file.
CONFIG_PATH = "../../notebook.cfg"

parser = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the config is missing
# (typically because the notebook was started from a different directory).
if not parser.read(CONFIG_PATH):
    raise FileNotFoundError(f"Config file not found or unreadable: {CONFIG_PATH}")
wandb_api_auth_key = parser.get("wandb_api_key", "auth_key")

In [3]:
# Log in to W&B with the API key read from the config file.
wandb.login(key=wandb_api_auth_key)

[34m[1mwandb[0m: Currently logged in as: [33mhaydenchiush[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/haydenchiu/.netrc


True

# Validation set

In [4]:
# --- W&B destination ---
PROJECT_NAME = "BioLaySumm2024"  # wandb project name
ENTITY = "bossy_beaver"  # change to your wandb team name

# --- Dev-set score files ---
# Directory that wandb_log_eval_metrics reads score files from.
PATH_TO_VAL_METRICS = "./scores/full_dev"
# One score file per data source: eLife, PLOS, and the combined scores.
VAL_METRICS_FILES = [
    "elife_scores.txt",
    "plos_scores.txt",
    "scores.txt",
]

In [5]:
# Example argument values for wandb_log_eval_metrics:
# file = 'elife_scores.txt'
# data_src = 'eLife_dev'
# sample_portion = 0.1
# name = 'dummy_baseline'
# tags = ['baseline']

In [6]:
# Initialize WandB
def wandb_log_eval_metrics(file, data_src, sample_portion, name, tags, job_type='eval', project='BioLaySumm2024', entity='bossy_beaver'):
    run_config = {'data_src':data_src, 'sample_portion':sample_portion}
    run = wandb.init(project=PROJECT_NAME, entity=ENTITY, 
                     job_type=job_type, tags=tags, name=name)
    
    # Open the file in read mode
    with open(PATH_TO_VAL_METRICS + "/" + file, 'r') as file:
        
        lines = file.readlines()
    
    # Create an empty dictionary to store the metrics
    metrics = {}
    
    # Iterate over each line
    for line in lines:
        # Split the line into key and value using ':' as the delimiter
        key, value = line.strip().split(': ')
        # Store the key-value pair in the dictionary
        metrics[key] = float(value)
    print(metrics)
    
    # Log metrics to WandB
    run.log(metrics)
    
    # Finish WandB run
    run.finish()


In [7]:
# eLife scores for the BioMistral 5-shot run (tagged 4-bit) on the full dev set (sample_portion=1)
wandb_log_eval_metrics(file='elife_scores.txt', data_src='eLife_dev', sample_portion=1, 
                       name='elife_biomistral_5_shot', tags=['BioMistral','5-shot','full_dev','4-bit'])

[34m[1mwandb[0m: Currently logged in as: [33mhaydenchiush[0m ([33mbossy_beaver[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'ROUGE1': 0.34968394667052005, 'ROUGE2': 0.08623673025178037, 'ROUGEL': 0.3258108671291219, 'BERTScore': 0.8394361432657202, 'FKGL': 12.529875518672199, 'DCRS': 9.967883817427387, 'CLI': 14.386514522821575, 'LENS': 68.98347203767327, 'AlignScore': 0.7683963356423674, 'SummaC': 0.6029898216368251}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.7684
BERTScore,0.83944
CLI,14.38651
DCRS,9.96788
FKGL,12.52988
LENS,68.98347
ROUGE1,0.34968
ROUGE2,0.08624
ROUGEL,0.32581
SummaC,0.60299


In [8]:
# PLOS scores for the BioMistral 5-shot run (tagged 4-bit) on the full dev set (sample_portion=1)
wandb_log_eval_metrics(file='plos_scores.txt', data_src='PLOS_dev', sample_portion=1, 
                       name='plos_biomistral_5_shot', tags=['BioMistral','5-shot','full_dev','4-bit'])

{'ROUGE1': 0.4711352955852023, 'ROUGE2': 0.16758313999895275, 'ROUGEL': 0.43334700459732256, 'BERTScore': 0.8632942395674628, 'FKGL': 13.93015988372093, 'DCRS': 10.798684593023255, 'CLI': 15.557122093023255, 'LENS': 62.15900723817345, 'AlignScore': 0.8491566793013061, 'SummaC': 0.7427245225800678}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.84916
BERTScore,0.86329
CLI,15.55712
DCRS,10.79868
FKGL,13.93016
LENS,62.15901
ROUGE1,0.47114
ROUGE2,0.16758
ROUGEL,0.43335
SummaC,0.74272


In [9]:
# Combined scores for the BioMistral 5-shot run (tagged 4-bit) on the full dev set (sample_portion=1)
wandb_log_eval_metrics(file='scores.txt', data_src='combined_dev', sample_portion=1, 
                       name='biomistral_5_shot', tags=['BioMistral','5-shot','full_dev','4-bit', 'combined'])

{'ROUGE1': 0.41040962112786117, 'ROUGE2': 0.12690993512536655, 'ROUGEL': 0.3795789358632222, 'BERTScore': 0.8513651914165915, 'FKGL': 13.230017701196566, 'DCRS': 10.383284205225321, 'CLI': 14.971818307922415, 'LENS': 65.57123963792336, 'AlignScore': 0.8087765074718367, 'SummaC': 0.6728571721084464}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.80878
BERTScore,0.85137
CLI,14.97182
DCRS,10.38328
FKGL,13.23002
LENS,65.57124
ROUGE1,0.41041
ROUGE2,0.12691
ROUGEL,0.37958
SummaC,0.67286


# Test Set

In [13]:
# --- Test-set score files ---
# NOTE: this rebinds the same global that wandb_log_eval_metrics reads, so every
# call made after this cell loads its scores from the test-set directory.
PATH_TO_VAL_METRICS = "./data/scores/scoring_result"
TEST_METRICS_FILES = [
    "elife_scores.txt",
    "plos_scores.txt",
    "scores.txt",
]

In [14]:
# eLife scores for the Mixtral 8x7B run (tagged 4-bit, 'train_of_thought', milestone6) on the test set
wandb_log_eval_metrics(file='elife_scores.txt', data_src='eLife_test', sample_portion=1, 
                       name='elife_mixtral_tot_test', tags=['milestone6','test','mixtral_8x7B','4-bit','train_of_thought'])

[34m[1mwandb[0m: Currently logged in as: [33mhaydenchiush[0m ([33mbossy_beaver[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'ROUGE1': 0.4425931080994125, 'ROUGE2': 0.10160663478728887, 'ROUGEL': 0.40545993723056156, 'BERTScore': 0.8398589647152055, 'FKGL': 14.361971830985917, 'DCRS': 10.352957746478873, 'CLI': 16.54338028169014, 'LENS': 63.42497698785683, 'AlignScore': 0.757516194817046, 'SummaC': 0.5766379052484539}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.75752
BERTScore,0.83986
CLI,16.54338
DCRS,10.35296
FKGL,14.36197
LENS,63.42498
ROUGE1,0.44259
ROUGE2,0.10161
ROUGEL,0.40546
SummaC,0.57664


In [15]:
# PLOS scores for the Mixtral 8x7B run (tagged 4-bit, 'train_of_thought', milestone6) on the test set
wandb_log_eval_metrics(file='plos_scores.txt', data_src='PLOS_test', sample_portion=1, 
                       name='plos_mixtral_tot_test', tags=['milestone6','test','mixtral_8x7B','4-bit','train_of_thought'])

{'ROUGE1': 0.41757900834429124, 'ROUGE2': 0.13639125926488382, 'ROUGEL': 0.38143811561150187, 'BERTScore': 0.8536000835223937, 'FKGL': 13.499295774647887, 'DCRS': 10.309647887323946, 'CLI': 15.867957746478874, 'LENS': 63.888943618717285, 'AlignScore': 0.7414850959895363, 'SummaC': 0.6068648966685147}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.74149
BERTScore,0.8536
CLI,15.86796
DCRS,10.30965
FKGL,13.4993
LENS,63.88894
ROUGE1,0.41758
ROUGE2,0.13639
ROUGEL,0.38144
SummaC,0.60686


In [16]:
# Combined scores for the Mixtral 8x7B run (tagged 4-bit, 'train_of_thought', milestone6) on the test set
wandb_log_eval_metrics(file='scores.txt', data_src='combined_test', sample_portion=1, 
                       name='mixtral_tot_test', tags=['milestone6','test','mixtral_8x7B','4-bit','train_of_thought', 'combined'])

{'ROUGE1': 0.43008605822185186, 'ROUGE2': 0.11899894702608635, 'ROUGEL': 0.39344902642103174, 'BERTScore': 0.8467295241187995, 'FKGL': 13.930633802816903, 'DCRS': 10.33130281690141, 'CLI': 16.205669014084506, 'LENS': 63.65696030328706, 'AlignScore': 0.7495006454032911, 'SummaC': 0.5917514009584843}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.7495
BERTScore,0.84673
CLI,16.20567
DCRS,10.3313
FKGL,13.93063
LENS,63.65696
ROUGE1,0.43009
ROUGE2,0.119
ROUGEL,0.39345
SummaC,0.59175
