In [None]:
pip install nltk




In [None]:
!pip install evaluate rouge rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evalua

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import nltk
import itertools
import re
import string

from evaluate import load
from nltk.corpus import stopwords
from nltk.translate.bleu_score import sentence_bleu

Given a sentence of label and a sentence of prediction, compare them such that the metric indicates accuracy of the generated text being accurate to description. Since this is medical image captioning, importance of sentence can be captured in major description words and medical terminology, such as body parts and diagnosis.

1. Filter both label and prediction of stopwords (unneccesary content)
2. Extract important words from the label
3. Compare

In [None]:
nltk.download('stopwords', quiet=True)

class Evaluation:
  def __init__(self, references, predictions):
    self._stopwords = list(itertools.islice(stopwords.words('english'), 100)) #grab first 100 stopwords
    self._rouge = load("rouge", trust_remote_code=True)
    self._ref = [" ".join(self.filter(r)).strip() for r in references]
    self._pred = [" ".join(self.filter(p)).strip() for p in predictions]

  def filter(self, caption: str):
    '''Tokenize and filter text into raw data'''
    cleaned_caption = re.sub(r'[^a-zA-Z0-9\s]', '', caption)
    words = cleaned_caption.split(" ")
    return [word.lower() for word in words if word.lower() not in self._stopwords] #normalize the data to lowercase

  def default(self):
    '''All metrics'''
    rouge_1_total, rouge_2_total, rouge_l_total, bleu_total = 0, 0, 0, 0

    for p, r in zip(self._pred, self._ref):
      calc = self._rouge.compute(predictions=[p], references=[r])
      rouge_1_total += calc["rouge1"]
      rouge_2_total += calc["rouge2"]
      rouge_l_total += calc["rougeL"]
      bleu_total += sentence_bleu(p.split(" "), r.split(" "))

    total_pred = len(self._pred)
    avg_rouge_1 = rouge_1_total / total_pred if total_pred > 0 else 0 #avoid division by 0 errors
    avg_rouge_2 = rouge_2_total / total_pred if total_pred > 0 else 0
    avg_rouge_l = rouge_l_total / total_pred if total_pred > 0 else 0
    avg_bleu = bleu_total / total_pred if total_pred > 0 else 0
    return {"rouge_1": avg_rouge_1, "rouge_2": avg_rouge_2, "rouge_l": avg_rouge_l, "bleu": avg_bleu}

  def rogue(self):
    '''Given labels and predictions, calculate rouge scores'''
    rouge_1_total, rouge_2_total, rouge_l_total = 0, 0, 0

    for r,p in zip(self._ref, self._pred):
      calc = self._rouge.compute(predictions=[p], references=[r])
      rouge_1_total += calc["rouge1"]
      rouge_2_total += calc["rouge2"]
      rouge_l_total += calc["rougeL"]

    total_pred = len(self._pred)
    avg_rouge_1 = rouge_1_total / total_pred if total_pred > 0 else 0 #avoid division by 0 errors
    avg_rouge_2 = rouge_2_total / total_pred if total_pred > 0 else 0
    avg_rouge_l = rouge_l_total / total_pred if total_pred > 0 else 0
    return {"rouge_1": avg_rouge_1, "rouge_2": avg_rouge_2, "rouge_l": avg_rouge_l}

  def baseline(self): #do this for every single prediction, not for the whole thing
    individual_accuracy = []

    for r, p in zip(self._ref, self._pred):
      total = 0
      length = len(p)
      for word in p:
        if word in r:
          total += 1

      accuracy = total / length if length > 0 else 0
      individual_accuracy.append(accuracy)

    return sum(individual_accuracy) / len(individual_accuracy) if len(individual_accuracy) > 0 else 0

  def param_sweep(self, weights, scores):
    '''Given a series of weights and scores (of the same length), output the final score'''
    if len(weights) != len(scores):
      raise ValueError()

    total = 0
    for i, (metric, score) in enumerate(scores.items()): #since python dictionaries maintain order, we can use enumerate
      total += score * weights[i]
    return total/len(scores) if len(scores) > 0 else 0 #normalize weighted score

## **Fetching Label Samples**

In [None]:
!pip install kaggle
from google.colab import drive
drive.mount('/content/drive')
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/Florence_2/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download virajbagal/roco-dataset
! unzip roco-dataset.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: all_data/validation/radiology/images/PMC3870636_CRIM.OTOLARYNGOLOGY2013-650428.002.jpg  
  inflating: all_data/validation/radiology/images/PMC3870648_CRIM.DENTISTRY2013-378062.012.jpg  
  inflating: all_data/validation/radiology/images/PMC3871037_enm-28-326-g001.jpg  
  inflating: all_data/validation/radiology/images/PMC3872022_CRIM.MEDICINE2013-653925.001.jpg  
  inflating: all_data/validation/radiology/images/PMC3872161_CRIM.SURGERY2013-209494.001.jpg  
  inflating: all_data/validation/radiology/images/PMC3872390_CRIM.OBGYN2013-906351.001.jpg  
  inflating: all_data/validation/radiology/images/PMC3872571_OJO-6-193-g005.jpg  
  inflating: all_data/validation/radiology/images/PMC3872571_OJO-6-193-g007.jpg  
  inflating: all_data/validation/radiology/images/PMC3872649_SNI-4-150-g001.jpg  
  inflating: all_data/validation/radiology/images/PMC3872649_SNI-4-150-g014.jpg  
  inflating: all_data/validation/radiolog

In [None]:
df_train = pd.read_csv('/content/all_data/train/radiologytraindata.csv', delimiter=',')
df_train.dataframeName = 'radiologytraindata.csv'
rows, cols = df_train.shape
print(f'There are {rows} rows and {cols} columns')

There are 65450 rows and 3 columns


In [None]:
mask = df_train['caption'].str.contains('chest x-ray', case=False)
captions = df_train[mask]['caption'].tolist()

In [None]:
predictions = ['chest x-ray demonstrating normal lung fields with clear airway, no visible abnormalities in the heart or bony structures' for i in range(len(captions))]

In [None]:
eval = Evaluation(captions, predictions)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
print(eval._ref)
print(eval._pred)

['chest xray confirmed position guidewire extending right internal jugular vein inferior vena cava', 'chest xray findings chest radiograph revealed engorged pulmonary trunk abrupt cutoff pulmonary vascularity distal portions bilaterally indicative westermark sign arrows', 'chest xray pa showing position gun nails', 'chest xray showing rightsided pneumothorax', 'chest xray day admission showing diffuse bilateral haziness air bronchogram', 'chest xray patient chest xray showed faint patchy opacity periphery right upper lung zone black arrow', 'chest xray 17 days admission', 'chest xray posterioranterior view surgical removal intermediate lobe right lung drain right pleural cavity postoperative chest radiograph revealed no pneumothorax', 'chest xray posteroanterior view shows bilateral lower zone consolidation bilateral pleural effusion', 'chest xray pa view showing bilateral reticulonodular infiltrates', 'full range cardiac support technology plain chest xray shows jarvik pump apex left 

In [None]:
print(eval.baseline())

0.9321619428667354


In [None]:
scores = eval.default()

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
print(scores)
print(eval.param_sweep([0.4, 0.2, 0.4, 0], scores))

{'rouge_1': 0.1946135668021863, 'rouge_2': 0.09676711486508709, 'rouge_l': 0.19229709712669793, 'bleu': 1.0726443194895598e-233}
0.04352942213614278


In [None]:
eval2 = Evaluation(captions[0:50], [captions[0] for i in range(50)])

In [None]:
scores2 = eval.default()

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
print(scores)
print(eval.param_sweep([0.4, 0.2, 0.4, 0], scores))

## Graph

In [None]:
#use matplotlib

In [None]:
import matplotlib

In [None]:
# Stub data
iterations = list(range(1, 11)) #number of predictions
model_performance = [0.65, 0.68, 0.72, 0.75, 0.78, 0.81, 0.85, 0.88, 0.91, 0.94]

# Create plot
plt.figure(figsize=(8, 5))
plt.plot(iterations, model_performance, marker='o', linestyle='-', color='b', label='Model Performance')
plt.title('Model Performance vs. Iterations')
plt.xlabel('Iterations')
plt.ylabel('Accuracy')

BLEU EVALUATION METRIC

In [None]:
split_sentences = [[word.lower().strip(",.") for word in sentence.split()] for sentence in captions]
split_pred = [[word.lower().strip(",.") for word in sentence.split()] for sentence in predictions]

bleu_score = sentence_bleu([split_pred[0]], split_pred[0]) # these are the same sentences so it should output 1
print(f"BLEU score -> {bleu_score}")

# this should have a low bleu score because structure and words aren't the same
bleu_score2 = sentence_bleu([split_sentences[0]], split_pred[0])
print(f"BLEU score -> {bleu_score2}")


# random sentence chat-gpt generated that should produce higher bleu score
sentence3 = "Chest X-ray confirming the guidewire position, extending from the right internal jugular vein to the inferior vena cava"
split_pred3 = [word.lower().strip(",.") for word in sentence3.split()]

bleu_score3 = sentence_bleu([split_sentences[0]], split_pred3);
print(f"BLEU score -> {bleu_score3}")


BLEU score -> 1.0
BLEU score -> 4.200173498122231e-155
BLEU score -> 0.41677851600972055



---

### TODO:


medical dictionary

ensure better runtime of retrieval