# Weights and Biases Logger

This notebook logs each model's evaluation metrics on the dev set and the test set to Weights & Biases.

In [1]:
import pandas as pd
import wandb
from configparser import ConfigParser

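The `.cfg` file is a config file that stores your personal API key. This notebook reads `../../notebook.cfg` and looks up the `auth_key` option in the `wandb_api_key` section.
The file should look like this: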
```
[<YOUR_API_KEY_NAME>]
auth_key: <HERE_IS_YOUR_API_KEY>
```
`.cfg` files are listed in `.gitignore`, so your key is not committed to the repository.

In [3]:
# Location of the INI-style file that holds the W&B API key.
CONFIG_PATH = "../../notebook.cfg"

parser = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list here instead of failing on
# the next line with a less helpful NoSectionError.
loaded_files = parser.read(CONFIG_PATH)
if not loaded_files:
    raise FileNotFoundError(f"Could not read config file: {CONFIG_PATH}")

# Section "wandb_api_key", option "auth_key" (see the format described above).
wandb_api_auth_key = parser.get("wandb_api_key", "auth_key")

In [4]:
# Authenticate with Weights & Biases using the key read from the config file.
wandb.login(key=wandb_api_auth_key)

[34m[1mwandb[0m: Currently logged in as: [33mhaydenchiush[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/haydenchiu/.netrc


True

# Validation set

In [10]:
# Weights & Biases destination used for the runs logged by this notebook.
PROJECT_NAME = "BioLaySumm2024"  # wandb project name
ENTITY = "bossy_beaver"  # change to your wandb team name

# Directory that holds the validation score files, and the score file names.
PATH_TO_VAL_METRICS = "./Proxy_val_Results"
VAL_METRICS_FILES = [
    "elife_scores.txt",
    "plos_scores.txt",
    "scores.txt",
]

In [11]:
# Example arguments for wandb_log_eval_metrics (defined in the next cell):
# file = 'elife_scores.txt'
# data_src = 'eLife_dev'
# sample_portion = 0.1
# name = 'dummy_baseline'
# tags = ['baseline']

In [12]:
def _parse_metric_lines(lines, source='<metrics>'):
    """Parse ``<name>: <number>`` lines into a ``{name: float}`` dict, skipping blank lines."""
    metrics = {}
    for line_no, raw_line in enumerate(lines, start=1):
        text = raw_line.strip()
        if not text:
            # Tolerate trailing or separating blank lines in the score file.
            continue
        # Split on the LAST colon so a metric name may itself contain ':'.
        key, sep, value = text.rpartition(':')
        if not sep:
            raise ValueError(
                f"{source}, line {line_no}: expected '<metric>: <value>', got {text!r}"
            )
        metrics[key.strip()] = float(value)
    return metrics


def wandb_log_eval_metrics(file, data_src, sample_portion, name, tags, job_type='eval',
                           project=None, entity=None, metrics_dir=None):
    """Log the metrics stored in one score file as a single Weights & Biases run.

    The score file is plain text with one ``<metric name>: <number>`` pair per
    line; blank lines are ignored. The parsed metrics are printed and then
    logged to a new run, which is always finished before returning.

    Parameters
    ----------
    file : str
        Name of the score file inside ``metrics_dir``.
    data_src : str
        Label of the evaluated data; stored in the run config as ``data_src``.
    sample_portion : float
        Portion of the data that was scored; stored in the run config as
        ``sample_portion``.
    name : str
        Display name of the W&B run.
    tags : list of str
        Tags attached to the W&B run.
    job_type : str, optional
        W&B job type (default ``'eval'``).
    project : str, optional
        W&B project. Defaults to the notebook-level ``PROJECT_NAME``.
    entity : str, optional
        W&B entity (team). Defaults to the notebook-level ``ENTITY``.
    metrics_dir : str or path-like, optional
        Directory containing ``file``. Defaults to the notebook-level
        ``PATH_TO_VAL_METRICS`` as it is set at call time.

    Raises
    ------
    OSError
        If the score file cannot be opened.
    ValueError
        If a non-blank line has no ``:`` separator or a non-numeric value.
    """
    # Resolve notebook-level defaults at call time so edits to the config cell
    # (and later re-assignments of the path) are picked up.
    if project is None:
        project = PROJECT_NAME
    if entity is None:
        entity = ENTITY
    if metrics_dir is None:
        metrics_dir = PATH_TO_VAL_METRICS
    metrics_path = f"{metrics_dir}/{file}"

    # Read and parse BEFORE starting the run, so a missing or malformed file
    # does not leave an empty run behind.
    with open(metrics_path, 'r') as metrics_file:
        metrics = _parse_metric_lines(metrics_file, source=metrics_path)
    print(metrics)

    # Record which data the scores were computed on alongside the metrics.
    run_config = {'data_src': data_src, 'sample_portion': sample_portion}
    run = wandb.init(project=project, entity=entity, job_type=job_type,
                     tags=tags, name=name, config=run_config)
    try:
        run.log(metrics)
    finally:
        # Close the run even if logging fails.
        run.finish()


In [22]:
# Log the eLife dummy-baseline scores (10% sample of the dev set).
wandb_log_eval_metrics(
    file='elife_scores.txt',
    data_src='eLife_dev',
    sample_portion=0.1,
    name='elife_dummy_baseline',
    tags=['baseline'],
)

[34m[1mwandb[0m: Currently logged in as: [33mhaydenchiush[0m ([33mbossy_beaver[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'ROUGE1': 0.32312883506775875, 'ROUGE2': 0.065595999095099, 'ROUGEL': 0.3008124655909614, 'BERTScore': 0.8375861893097559, 'FKGL': 15.133333333333335, 'DCRS': 11.590416666666664, 'CLI': 16.845416666666665, 'LENS': 39.63814759728811, 'AlignScore': 0.9943027173479398, 'SummaC': 0.9525209615627924}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.9943
BERTScore,0.83759
CLI,16.84542
DCRS,11.59042
FKGL,15.13333
LENS,39.63815
ROUGE1,0.32313
ROUGE2,0.0656
ROUGEL,0.30081
SummaC,0.95252


In [23]:
# Log the PLOS dummy-baseline scores (10% sample of the dev set).
wandb_log_eval_metrics(
    file='plos_scores.txt',
    data_src='PLOS_dev',
    sample_portion=0.1,
    name='plos_dummy_baseline',
    tags=['baseline'],
)

{'ROUGE1': 0.5125303136128874, 'ROUGE2': 0.21055582506879417, 'ROUGEL': 0.4767545280909615, 'BERTScore': 0.8725961204888164, 'FKGL': 14.94927536231884, 'DCRS': 10.976449275362318, 'CLI': 16.293840579710146, 'LENS': 38.619040578940265, 'AlignScore': 0.9880770807680876, 'SummaC': 0.9578714357770007}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.98808
BERTScore,0.8726
CLI,16.29384
DCRS,10.97645
FKGL,14.94928
LENS,38.61904
ROUGE1,0.51253
ROUGE2,0.21056
ROUGEL,0.47675
SummaC,0.95787


In [24]:
# Log the combined (eLife + PLOS) dummy-baseline scores (10% sample of the dev set).
wandb_log_eval_metrics(
    file='scores.txt',
    data_src='combined_dev',
    sample_portion=0.1,
    name='dummy_baseline',
    tags=['baseline', 'combined'],
)

{'ROUGE1': 0.4178295743403231, 'ROUGE2': 0.13807591208194658, 'ROUGEL': 0.3887834968409615, 'BERTScore': 0.8550911548992861, 'FKGL': 15.041304347826088, 'DCRS': 11.283432971014491, 'CLI': 16.569628623188407, 'LENS': 39.128594088114184, 'AlignScore': 0.9911898990580137, 'SummaC': 0.9551961986698966}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.99119
BERTScore,0.85509
CLI,16.56963
DCRS,11.28343
FKGL,15.0413
LENS,39.12859
ROUGE1,0.41783
ROUGE2,0.13808
ROUGEL,0.38878
SummaC,0.9552


# Test Set

In [13]:
# WARNING: this rebinds PATH_TO_VAL_METRICS, which wandb_log_eval_metrics reads
# at call time, so every call made after this cell loads score files from the
# directory below. Re-run the validation-set config cell before re-logging dev
# metrics.
PATH_TO_VAL_METRICS = './data/scores/scoring_result'
# Score file names for the test set (not referenced elsewhere in the visible notebook).
TEST_METRICS_FILES = ['elife_scores.txt', 'plos_scores.txt', 'scores.txt']

In [14]:
# eLife scores on the full test set, logged as run 'elife_mixtral_tot_test'.
# NOTE(review): the run name and tags describe a 4-bit Mixtral 8x7B run, not a
# dummy baseline — confirm. Also confirm the 'train_of_thought' tag spelling
# against the 'tot' in the run name.
# Relies on PATH_TO_VAL_METRICS having been re-pointed by the cell above.
wandb_log_eval_metrics(file='elife_scores.txt', data_src='eLife_test', sample_portion=1, 
                       name='elife_mixtral_tot_test', tags=['milestone6','test','mixtral_8x7B','4-bit','train_of_thought'])

[34m[1mwandb[0m: Currently logged in as: [33mhaydenchiush[0m ([33mbossy_beaver[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'ROUGE1': 0.4425931080994125, 'ROUGE2': 0.10160663478728887, 'ROUGEL': 0.40545993723056156, 'BERTScore': 0.8398589647152055, 'FKGL': 14.361971830985917, 'DCRS': 10.352957746478873, 'CLI': 16.54338028169014, 'LENS': 63.42497698785683, 'AlignScore': 0.757516194817046, 'SummaC': 0.5766379052484539}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.75752
BERTScore,0.83986
CLI,16.54338
DCRS,10.35296
FKGL,14.36197
LENS,63.42498
ROUGE1,0.44259
ROUGE2,0.10161
ROUGEL,0.40546
SummaC,0.57664


In [15]:
# PLOS scores on the full test set, logged as run 'plos_mixtral_tot_test'.
# NOTE(review): the run name and tags describe a 4-bit Mixtral 8x7B run, not a
# dummy baseline — confirm. Also confirm the 'train_of_thought' tag spelling
# against the 'tot' in the run name.
# Relies on PATH_TO_VAL_METRICS pointing at the test-set score directory.
wandb_log_eval_metrics(file='plos_scores.txt', data_src='PLOS_test', sample_portion=1, 
                       name='plos_mixtral_tot_test', tags=['milestone6','test','mixtral_8x7B','4-bit','train_of_thought'])

{'ROUGE1': 0.41757900834429124, 'ROUGE2': 0.13639125926488382, 'ROUGEL': 0.38143811561150187, 'BERTScore': 0.8536000835223937, 'FKGL': 13.499295774647887, 'DCRS': 10.309647887323946, 'CLI': 15.867957746478874, 'LENS': 63.888943618717285, 'AlignScore': 0.7414850959895363, 'SummaC': 0.6068648966685147}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.74149
BERTScore,0.8536
CLI,15.86796
DCRS,10.30965
FKGL,13.4993
LENS,63.88894
ROUGE1,0.41758
ROUGE2,0.13639
ROUGEL,0.38144
SummaC,0.60686


In [16]:
# Combined scores on the full test set, logged as run 'mixtral_tot_test'.
# NOTE(review): the run name and tags describe a 4-bit Mixtral 8x7B run, not a
# dummy baseline — confirm. Also confirm the 'train_of_thought' tag spelling
# against the 'tot' in the run name.
# Relies on PATH_TO_VAL_METRICS pointing at the test-set score directory.
wandb_log_eval_metrics(file='scores.txt', data_src='combined_test', sample_portion=1, 
                       name='mixtral_tot_test', tags=['milestone6','test','mixtral_8x7B','4-bit','train_of_thought', 'combined'])

{'ROUGE1': 0.43008605822185186, 'ROUGE2': 0.11899894702608635, 'ROUGEL': 0.39344902642103174, 'BERTScore': 0.8467295241187995, 'FKGL': 13.930633802816903, 'DCRS': 10.33130281690141, 'CLI': 16.205669014084506, 'LENS': 63.65696030328706, 'AlignScore': 0.7495006454032911, 'SummaC': 0.5917514009584843}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
AlignScore,▁
BERTScore,▁
CLI,▁
DCRS,▁
FKGL,▁
LENS,▁
ROUGE1,▁
ROUGE2,▁
ROUGEL,▁
SummaC,▁

0,1
AlignScore,0.7495
BERTScore,0.84673
CLI,16.20567
DCRS,10.3313
FKGL,13.93063
LENS,63.65696
ROUGE1,0.43009
ROUGE2,0.119
ROUGEL,0.39345
SummaC,0.59175
