# Evaluation


This notebook runs the official evaluation script `evaluate.py` on Google Colab.

In [None]:
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn import preprocessing
import joblib
from torch.optim import AdamW

In [2]:
# Mount Google Drive so that the files under /content/drive are reachable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Set up the evaluation script

# 1. Download the evaluation models

In [3]:
# Run this cell only once to set up the models folder.
# It takes quite a while (depending on network speed, usually around 15 minutes).
!bash /content/drive/MyDrive/BioLaySumm2024-evaluation_scripts/get_models.sh

Cloning into 'LENS'...
remote: Enumerating objects: 178, done.[K
remote: Counting objects: 100% (85/85), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 178 (delta 25), reused 64 (delta 15), pack-reused 93[K
Receiving objects: 100% (178/178), 509.44 KiB | 4.95 MiB/s, done.
Resolving deltas: 100% (64/64), done.
Processing ./LENS/lens
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pandas==1.1.5 (from lens-metric==0.1.1)
  Downloading pandas-1.1.5.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytorch-lightning==1.6.0 (from lens-metric==0.1.1)
  Downloading pytorch_lightning-1.6.0-py3-none-any.whl (582 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m582

# 2. Install the environment dependencies

In [6]:
# Set up the environment for running evaluate.py by installing the packages listed in requirements.txt.
!pip install -r /content/drive/MyDrive/BioLaySumm2024-evaluation_scripts/requirements.txt

Collecting textstat (from -r /content/drive/MyDrive/BioLaySumm2024-evaluation_scripts/requirements.txt (line 1))
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting rouge-score (from -r /content/drive/MyDrive/BioLaySumm2024-evaluation_scripts/requirements.txt (line 3))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score (from -r /content/drive/MyDrive/BioLaySumm2024-evaluation_scripts/requirements.txt (line 4))
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting summac (from -r /content/drive/MyDrive/BioLaySumm2024-evaluation_scripts/requirements.txt (line 5))
  Downloading summac-0.0.4-py3-none-any.whl (30 kB)
Collecting pyphen (from textstat-

In [7]:
# Set CUBLAS_WORKSPACE_CONFIG so that the results are deterministic.
# NOTE(review): commands launched afterwards with `!` are assumed to inherit this
# variable from the kernel process - confirm before relying on it.
import os

os.environ.update({"CUBLAS_WORKSPACE_CONFIG": ":4096:8"})

# 3. Evaluation
- The evaluation takes around 12 hours to run on a T4 GPU.  
- The outputs are saved under `/content` (see the list below).  

Provided evaluation code:  
- evaluate.py

Calling the code from the notebook:  
`!python PATH/TO/evaluate.py /path/to/predicted/summaries /path/to/validation/data`

Outputs:
1. /content/summac_conv_vitc_sent_perc_e.bin
2. /content/elife_scores.txt
3. /content/plos_scores.txt
4. /content/scores.txt

In [None]:
# Invoke evaluate.py with two arguments: the predicted-summaries path, then the validation-data path.
# NOTE(review): both arguments point to the same baseline_evaluation folder - confirm this is intended.
!python /content/drive/MyDrive/BioLaySumm2024-evaluation_scripts/evaluate.py /content/drive/MyDrive/BioLaySumm2024-evaluation_scripts/baseline_evaluation /content/drive/MyDrive/BioLaySumm2024-evaluation_scripts/baseline_evaluation

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Downloading tokenizer_config.json: 100% 25.0/25.0 [00:00<00:00, 113kB/s]
Downloading config.json: 100% 482/482 [00:00<00:00, 3.01MB/s]
Downloading vocab.json: 100% 899k/899k [00:00<00:00, 10.2MB/s]
Downloading merges.txt: 100% 456k/456k [00:00<00:00, 18.3MB/s]
Downloading tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 18.5MB/s]
Downloading model.safetensors: 100% 1.42G/1.42G [00:10<00:00, 134MB/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
calculating scores...
computing bert embedding.
100% 8/8 [00:29<00:00,  3.64s/it]
computing greedy matching.
100% 4/4 [00:00<00:00, 14.60it/s]
done in 29.44 seconds, 8.19 sentences/sec
Lightning automa