In [12]:
import sys
import os
import math
import csv
import numpy as np
import torch

# Put the parent of the current working directory on the import path so the
# `visual_genome` package imported just below can be resolved.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guarded so that re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from visual_genome.local import VisualGenome

In [2]:
# Instantiate the local Visual Genome driver. Per the output recorded for this
# cell, construction resolves the data directory and loads the data.
vg = VisualGenome()

Script directory: C:\Users\karab\Desktop\Visual Genome Driver\visual_genome
Data directory: C:\Users\karab\Desktop\Visual Genome Driver\data
Loading data...
Data loaded.


In [3]:
# Flatten the relationships of every image into a single list of plain dicts,
# one per relationship, tagged with the image id and the relationship id.
# Commas are stripped from the three text fields.
relationships = [
    {
        "image_id": image,
        "relationship_id": relation.id,
        "object": relation.object_name.replace(",", ""),
        "predicate": relation.predicate.replace(",", ""),
        "subject": relation.subject_name.replace(",", ""),
    }
    for image in vg.get_images()
    for relation in vg.get_image_relationships(image)
]


# surprisal vs. lm-scorer vs. minicons

To get surprisal values and run experiments with language models, there seem to be three main libraries that can be used: [surprisal](https://github.com/aalok-sathe/surprisal), [lm-scorer](https://github.com/simonepri/lm-scorer) and [minicons](https://github.com/kanishkamisra/minicons). `lm-scorer` is an old library that is no longer maintained, and I couldn't get it to work due to dependency issues. `surprisal` was inspired by `lm-scorer`, and their surprisal outputs seem to match (based on the input sentences provided in the `lm-scorer` repository). However, masked language models are not supported in `surprisal`. `minicons` supports both auto-regressive and masked language models and is actively maintained. So, I decided to use `minicons` for this notebook.

In its default usage, `minicons` produces slightly different surprisal values compared to `surprisal` and `lm-scorer`. It is discussed in detail in the following issue: https://github.com/kanishkamisra/minicons/issues/29 

To put it simply, the main difference is that `minicons`, by default, ignores the first token and starts calculating probabilities from the second token, $p(w_1 \mid w_0)$. `lm-scorer`, on the other hand, also calculates the probability of the first token, using $p(w_0 \mid \text{<|endoftext|>})$. According to the author of `minicons`, his approach is more consistent with how experiments are done in psycholinguistics. To match the probabilities produced by `lm-scorer` exactly, one can pass the `bos_token=True, eos_token=False` parameters in `minicons`.


In [4]:
# write relationships to csv and json, with a txt fallback if the csv export fails
def write(relationships, filename = 'relationships'):
    """Save the relationship records to disk.

    Writes ``<filename>.csv`` and ``<filename>.json``. If the CSV export
    fails, a space-separated ``<filename>.txt`` is written as a fallback.

    Args:
        relationships: list of dicts. The CSV and txt exports use the keys
            'image_id', 'relationship_id', 'subject', 'predicate', 'object'
            and 'average surprisal'; the JSON export dumps the dicts as-is.
        filename: output path without extension.

    Failures of the CSV and txt exports are reported with ``print`` and do not
    propagate. Errors from the JSON export are not caught and reach the caller.
    """
    # Local import: json is only needed here and is not part of the
    # notebook's top-level import cell.
    import json

    field_order = ['image_id', 'relationship_id', 'subject', 'predicate', 'object', 'average surprisal']

    csv_failed = False
    try:
        # newline="" is required by the csv module; the explicit encoding keeps
        # the output independent of the platform's default locale encoding.
        with open(f'{filename}.csv', 'w', newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, field_order)
            writer.writeheader()
            writer.writerows(relationships)
    except Exception as exc:
        # Best-effort export: report the reason and fall back to txt below.
        # `Exception` (not a bare except) so Ctrl-C still interrupts the cell.
        print(f"Error writing to csv: {exc}")
        csv_failed = True

    # write relationships to json
    with open(f'{filename}.json', 'w') as f:
        json.dump(relationships, f)

    # write relationships to txt, only when the csv export did not succeed
    if csv_failed:
        try:
            with open(f'{filename}.txt', 'w', encoding="utf-8") as f:
                for rel in relationships:
                    f.write(f"{rel['image_id']} {rel['relationship_id']} {rel['subject']} {rel['predicate']} {rel['object']} {rel['average surprisal']}\n")
        except Exception as exc:
            print(f"Error writing to txt: {exc}")

## Causal Surprisal (minicons)

The Python package [minicons](https://github.com/kanishkamisra/minicons) supports both causal and masked surprisal. Make sure the package is installed:
```bash
pip install minicons
```

In [5]:
from minicons import scorer

# Causal (GPT-2) scorer. Use the GPU when PyTorch can see one and fall back to
# the CPU otherwise, so the cell also runs on machines without CUDA.
ilm_model = scorer.IncrementalLMScorer('gpt2', 'cuda' if torch.cuda.is_available() else 'cpu')

# Quick sanity check on a single sentence.
sentences = ["I like this package."]
print(ilm_model.sequence_score(sentences, reduction = lambda x: -x.mean(0).item())) # average surprisal of the sequence (normalized by sequence length)


  from .autonotebook import tqdm as notebook_tqdm


[5.194177627563477]


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [6]:
def get_sentences(relationships):
    """Render each relationship as a "subject predicate object" string.

    Args:
        relationships: iterable of mappings with 'subject', 'predicate' and
            'object' entries.

    Returns:
        A list with one space-joined sentence per relationship, in input order.
    """
    template = "{subject} {predicate} {object}"
    return [template.format_map(entry) for entry in relationships]

def get_surprisals(relationships, sentences, model, PLL_metric = 'within_word_l2r', batch_size=256):
    """Score the sentences in batches and store each score on its relationship.

    Args:
        relationships: list of dicts; each one receives an 'average surprisal'
            entry (the list is modified in place).
        sentences: list of strings, aligned one-to-one with `relationships`.
        model: a minicons `IncrementalLMScorer` or `MaskedLMScorer`.
        PLL_metric: forwarded to `sequence_score` for masked models only.
        batch_size: number of sentences passed to `sequence_score` per call.

    Returns:
        The same `relationships` list that was passed in.

    Raises:
        TypeError: if `model` is neither of the two supported scorer types.
        ValueError: if `relationships` and `sentences` differ in length.
    """
    # Validate before scoring anything, so a bad call fails immediately rather
    # than printing a message and returning None (which callers that rebind
    # `relationships` to the return value would then store).
    if isinstance(model, scorer.IncrementalLMScorer):
        extra_kwargs = {}
    elif isinstance(model, scorer.MaskedLMScorer):
        extra_kwargs = {"PLL_metric": PLL_metric}
    else:
        raise TypeError(f"Model not recognized: {type(model).__name__}")

    if len(relationships) != len(sentences):
        raise ValueError(
            f"relationships ({len(relationships)}) and sentences ({len(sentences)}) must have the same length"
        )

    def _average_surprisal(token_scores):
        # average surprisal of the sequence (normalized by sequence length)
        return -token_scores.mean(0).item()

    for start in range(0, len(sentences), batch_size):
        end = start + batch_size
        results = model.sequence_score(sentences[start:end], reduction=_average_surprisal, **extra_kwargs)
        # Pair each relationship in this batch with the score of its sentence.
        for rel, surprisal in zip(relationships[start:end], results):
            rel['average surprisal'] = surprisal
    return relationships

In [7]:
# Score every relationship with the causal (GPT-2) scorer. get_surprisals writes
# the score into each dict in place and returns the same list.
sentences = get_sentences(relationships)
relationships = get_surprisals(relationships, sentences, ilm_model)

In [9]:
# Save the causal-model results under the default base name "relationships".
write(relationships)

## Masked Surprisal (minicons)

In [23]:
# Masked (BERT) scorer. Use the GPU when PyTorch can see one and fall back to
# the CPU otherwise, so the cell also runs on machines without CUDA.
# NOTE(review): `ilm_model` from the causal section is still referenced at this
# point, so both models may be resident on the same device - confirm this fits
# in the available memory.
mlm_model = scorer.MaskedLMScorer('bert-base-uncased', 'cuda' if torch.cuda.is_available() else 'cpu')

# Quick sanity check on a single sentence.
sentences = ["I like this package."]
print(mlm_model.sequence_score(sentences, reduction = lambda x: -x.mean(0).item(), PLL_metric='within_word_l2r'))

[2.4668994903564454]


In [22]:
import gc

# Run the garbage collector first so that unreachable Python objects (and any
# tensors they own) are released, then ask PyTorch to hand its cached, unused
# CUDA blocks back. In the opposite order, memory freed by the collector would
# stay in PyTorch's cache.
gc.collect()
torch.cuda.empty_cache()

0

In [20]:
# Re-score every relationship with the masked scorer, using a smaller batch
# (64 instead of the default 256). This overwrites the 'average surprisal'
# value that the causal run stored in each dict.
sentences = get_sentences(relationships)
relationships = get_surprisals(relationships, sentences, mlm_model, batch_size=64)

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.37 GiB. GPU 0 has a total capacity of 4.00 GiB of which 909.30 MiB is free. Of the allocated memory 1.32 GiB is allocated by PyTorch, and 1020.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save the masked-model results under a different base name so the files
# written for the causal run are not overwritten.
write(relationships,filename = "relationships (masked - minicons)")