<a href="https://colab.research.google.com/github/JesseTNRoberts/AAAI-paper-2024/blob/main/Sweep_rate_of_dropout_for_PopulationLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the two project dependencies into the running kernel's environment
# (%pip, unlike !pip, always targets the interpreter backing this notebook).
# PopulationLM is pinned to the commit shown as "Resolved ... to commit" in this
# cell's saved output, so a fresh runtime gets the same code as the recorded run.
%pip install git+https://github.com/JesseTNRoberts/PopulationLM@328a6628146ff22e2e0eb8261007b0e9b7af0d0c
# TODO: pin minicons_modded to a commit as well; its resolved hash is not
# visible in the (truncated) saved output of this cell.
%pip install git+https://github.com/JesseTNRoberts/minicons_modded

Collecting git+https://github.com/JesseTNRoberts/PopulationLM
  Cloning https://github.com/JesseTNRoberts/PopulationLM to /tmp/pip-req-build-57tooq11
  Running command git clone --filter=blob:none --quiet https://github.com/JesseTNRoberts/PopulationLM /tmp/pip-req-build-57tooq11
  Resolved https://github.com/JesseTNRoberts/PopulationLM to commit 328a6628146ff22e2e0eb8261007b0e9b7af0d0c
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: PopulationLM
  Building wheel for PopulationLM (setup.py) ... [?25l[?25hdone
  Created wheel for PopulationLM: filename=PopulationLM-0.1-py3-none-any.whl size=3527 sha256=b1c4d4643701b9d0e15dd9d0234649cc8530ed1a0bbc846508e00318870097bd
  Stored in directory: /tmp/pip-ephem-wheel-cache-trlbgdk2/wheels/76/d3/b7/994b47d195d749b9752de5e275517c30ab4ef097e508ec8f2f
Successfully built PopulationLM
Installing collected packages: PopulationLM
Successfully installed PopulationLM-0.1
Collecting git+https://github.com/Jess

In [None]:
import os
import argparse
import csv
from tqdm import tqdm
import numpy as np
import random

import torch
from torch.utils.data import DataLoader

from minicons import scorer
import PopulationLM as pop

from google.colab import drive
# Mount Google Drive at /content/drive; the input CSV path configured later in
# this cell points below that mount point.
drive.mount('/content/drive')

# Size of the dropout population ("committee"): passed as committee_size= to
# pop.generate_dropout_population and used as the progress-bar total.
committee_size = 50

def shuffle_sentence(sentence, word):
    """Return ``sentence`` with its whitespace-separated tokens in random order.

    Every "." is removed before shuffling. If ``word`` consists of several
    words, each occurrence of it in the sentence is temporarily joined with
    "@" so the expression is shuffled as one token and keeps its internal
    order. The shuffled tokens are joined with single spaces, the "@" glue is
    turned back into spaces, ``str.capitalize`` is applied (first character
    upper-cased, the rest lower-cased) and a single "." is appended.
    """
    stripped = sentence.replace(".", "")
    focus_parts = word.split()
    if len(focus_parts) > 1:
        # Glue the multi-word expression together so split() keeps it whole.
        stripped = stripped.replace(word, "@".join(focus_parts))
    tokens = stripped.split()
    random.shuffle(tokens)

    shuffled = " ".join(tokens)
    return shuffled.replace("@", " ").capitalize() + "."

# --- Run configuration -------------------------------------------------------
inpath = '/content/drive/MyDrive/Data/rosch1975_alternate.csv'
model_name = 'bert-base-uncased'
batch_size = 565  # NOTE(review): looks like the full dataset in one batch -- confirm
device = 'cpu'
lm_type = 'mlm'

# Results directory: <dir of inpath>/results/<dataset>_popLM_<committee_size>.
# The suffix is derived from committee_size so the folder name cannot drift
# from the population size actually used.
components = inpath.split("/")
data_dir = "/".join(components[0:-1])
dataset_name = components[-1].split(".")[0]
results_dir = f"{data_dir}/results/{dataset_name}_popLM_{committee_size}"
# Actually create the directory; otherwise writing the result files fails with
# FileNotFoundError on a Drive where it does not exist yet.
os.makedirs(results_dir, exist_ok=True)

# Load every CSV row as a list of its values (in header order) and remember
# the header so it can be re-emitted in the output files.
dataset = []
with open(inpath, "r", newline="") as f:  # newline="" as recommended by the csv module
    reader = csv.DictReader(f)
    column_names = reader.fieldnames
    for row in reader:
        dataset.append(list(row.values()))

# Build the scorer matching the requested language-model type.
if lm_type == "masked" or lm_type == "mlm":
    transformer = scorer.MaskedLMScorer(model_name, device)
elif lm_type == "incremental" or lm_type == "causal":
    # Fixed class-name typo (was `IncrementaLMScorer`), which would raise
    # AttributeError as soon as a causal model is selected.
    transformer = scorer.IncrementalLMScorer(model_name, device)
else:
    # Fail here rather than with a NameError on `transformer` further down.
    raise ValueError(
        f"Unsupported lm_type {lm_type!r}; expected one of "
        "'masked', 'mlm', 'incremental', 'causal'."
    )


# Hub names such as "org/model" contain a slash, which must not end up in the
# output file name.
if "/" in model_name:
    model_name = model_name.replace("/", "_")

# Total parameter count of the model, repeated once per dataset row so it can
# be written as a column.
num_params = [sum(p.numel() for p in transformer.model.parameters())] * len(dataset)

stimuli_loader = DataLoader(dataset, batch_size = batch_size, num_workers=0)

def call_me(prefixes, queries):
    """Score ``queries`` given ``prefixes`` with the current model.

    The reduction keeps, for each stimulus, a tuple of
    (sum, mean, full list) of the values handed to it by
    ``transformer.conditional_score``.
    """
    return transformer.conditional_score(
        prefixes,
        queries,
        reduction=lambda x: (x.sum(0).item(), x.mean(0).item(), x.tolist()),
    )


# Sweep the value passed as `random=` to activate_mc_dropout (presumably the
# dropout rate -- confirm against PopulationLM) and write one CSV per value.
for val in np.arange(0.1, 0.9, 0.1):
    # np.arange accumulates float error (e.g. 0.30000000000000004); round so
    # the value used for the model and in the file name is the intended one.
    rate = round(float(val), 2)

    # convert the internal model to use MC Dropout
    pop.DropoutUtils.convert_dropouts(transformer.model)
    pop.DropoutUtils.activate_mc_dropout(transformer.model, activate=True, random=rate)

    results = []

    for batch in stimuli_loader:
        # The first two fields of each batch are used as premise/conclusion
        # (presumably the first two CSV columns after default collation).
        premise = list(batch[0])
        conclusion = list(batch[1])

        # create the population identities
        population = pop.generate_dropout_population(
            transformer.model,
            lambda: call_me(premise, conclusion),
            committee_size=committee_size,
        )

        print(len(premise))
        print(len(conclusion))

        print(conclusion[0])

        # One entry per population member; each entry holds one score tuple
        # per stimulus in the batch.
        outs = [
            item
            for item in tqdm(
                pop.call_function_with_population(
                    transformer.model, population, lambda: call_me(premise, conclusion)
                ),
                total=committee_size,
            )
        ]
        # Transpose to one row per stimulus, each listing the scores from
        # every population member.
        results.extend([[member[i] for member in outs] for i in range(len(outs[0]))])

    data_out = list(zip(*dataset))
    data_out.append(results)
    data_out.append(num_params)
    data_out.append([model_name] * len(results))

    # Build a fresh header each iteration. The previous code aliased
    # `column_names` and extended it in place, so every file after the first
    # got a header with duplicated score/params/model columns.
    new_col_names = list(column_names) + ["score (sum, mean, [list)", "params", "model"]

    with open(f"{results_dir}/{model_name}_dropout_{rate}.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(new_col_names)
        writer.writerows(list(zip(*data_out)))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

565
565
toy.


100%|██████████| 50/50 [1:08:26<00:00, 82.13s/it]
