<a href="https://colab.research.google.com/github/JesseTNRoberts/Do-Language-Models-Have-Rational-Preference/blob/main/Simple_Value_Assignment_PopulationLM_Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the notebook's dependencies into the Colab runtime.
# The first two are installed straight from GitHub: minicons_modded builds the
# `minicons` package (imported below as `from minicons import scorer`) and
# PopulationLM is imported below as `pop`.
# NOTE(review): none of these installs is pinned to a version or commit, so a
# re-run may pick up different code than the run that produced saved results.
!pip install git+https://github.com/JesseTNRoberts/minicons_modded
!pip install git+https://github.com/JesseTNRoberts/PopulationLM
!pip install accelerate transformers

Collecting git+https://github.com/JesseTNRoberts/minicons_modded
  Cloning https://github.com/JesseTNRoberts/minicons_modded to /tmp/pip-req-build-udn9h18t
  Running command git clone --filter=blob:none --quiet https://github.com/JesseTNRoberts/minicons_modded /tmp/pip-req-build-udn9h18t
  Resolved https://github.com/JesseTNRoberts/minicons_modded to commit 2aa9e4e05fb5c2af99f10dc75cc1e2968b5eaceb
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: minicons
  Building wheel for minicons (setup.py) ... [?25l[?25hdone
  Created wheel for minicons: filename=minicons-0.1-py3-none-any.whl size=19102 sha256=fbab2beda41e917474eb0477794db27557dad2a25e76ee3f4e8f6d88b2f25157
  Stored in directory: /tmp/pip-ephem-wheel-cache-rqzjlz1x/wheels/15/0e/ec/7fb33880467af25612dba7aec20864fb5523bca9247bf3e09b
Successfully built minicons
Installing collected packages: minicons
Successfully installed minicons-0.1
Collecting git+https://github.com/JesseTNRoberts/Popu

In [2]:

import os
import argparse
import csv
from tqdm import tqdm
import numpy as np
import random
import accelerate

import transformers
import os
import shutil
import glob

import torch
from transformers import AutoModelForMaskedLM, AutoModelForCausalLM
from transformers import AutoModelForMaskedLM, AutoModelForCausalLM
from torch.utils.data import DataLoader

from minicons import scorer
import PopulationLM as pop

# Hugging Face access token, read from the Colab user data entry named
# 'HF_TOKEN' instead of being hard-coded in the notebook. It is passed as
# `token=` to the scorer constructors in the experiment loop.
from google.colab import userdata
token = userdata.get('HF_TOKEN')

import gc

# Mount Google Drive at /content/drive. The input CSV, the model cache and the
# results directory configured below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Number of MC-dropout committee members scored per stimulus when
# `use_population` is enabled in the experiment loop.
committee_size = 50
add_dropout_layers = True

# Input stimuli (CSV with a header row) and the cache directory handed to the
# scorers as `cache_dir`.
inpath = '/content/drive/MyDrive/Data/rational.csv'
cachepath = '/content/drive/MyDrive/Data/cache/'

# Results are written to <directory of inpath>/results/rational/<model_name>.csv
components = inpath.split("/")
data_dir = "/".join(components[0:-1])
dataset_name = components[-1].split(".")[0]
results_dir = f"{data_dir}/results/rational"
# Create the results directory up front so the experiment loop cannot fail at
# write time, after a model has already been downloaded and scored.
os.makedirs(results_dir, exist_ok=True)

# Load every row as a plain list of column values (header order preserved).
# newline="" lets the csv module handle newlines embedded in quoted fields.
dataset = []
with open(inpath, "r", newline="") as f:
    reader = csv.DictReader(f)
    column_names = reader.fieldnames
    for row in reader:
        dataset.append(list(row.values()))

batch_size = 18
device = 'cuda'
# The experiment loop reads batch[0] and batch[1] from this loader as the
# premise and conclusion columns of each batch.
stimuli_loader = DataLoader(dataset, batch_size = batch_size, num_workers=0)




# Models to evaluate. Each entry is [Hugging Face model id, scorer type], where
# the scorer type is 'mlm' (scored with scorer.MaskedLMScorer) or 'incremental'
# (scored with scorer.IncrementalLMScorer); any other type raises ValueError in
# the experiment loop.
# NOTE: every entry below is commented out, so as written the list is empty and
# the experiment loop does nothing. Uncomment the models you want to run.
models = [
    # ['bert-base-uncased',  'mlm'],
    # ['bert-large-uncased', 'mlm'],
    # ['distilbert-base-uncased', 'mlm'],
    # ['roberta-base', 'mlm'],
    # ['roberta-large', 'mlm'],
    # ["google/electra-large-generator", 'mlm'],
    # ['FacebookAI/xlm-roberta-large', 'mlm'],
    # ['distilgpt2', 'incremental'],
    # ['gpt2-medium',  'incremental'],
    # ['gpt2',  'incremental'],
    # ['openai-gpt',  'incremental'],
    # ["daryl149/llama-2-7b-hf", 'incremental'],
    # ["mistralai/Mistral-7B-v0.1",  'incremental'],
    # ["google/gemma-7b",  'incremental'],
    # ['PKU-Alignment/alpaca-7b-reproduced', 'incremental'],
    # ['openlm-research/open_llama_13b',  'incremental'],
    # ['upstage/SOLAR-10.7B-v1.0',  'incremental'],
    # ['microsoft/phi-2',  'incremental'],
    # ['meta-llama/Llama-2-13b-hf',  'incremental'],
]

# When True, the experiment loop converts the model's dropout layers with
# PopulationLM and scores each batch with a committee of `committee_size`
# members; when False, each batch is scored with a single call.
use_population = False
# Forwarded to the scorers as `local_files_only`.
local_only = False

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:


# Scorer class to instantiate for each supported model type.
SCORER_CLASSES = {
  "mlm": scorer.MaskedLMScorer,
  "incremental": scorer.IncrementalLMScorer,
}


def load_scorer(model_id, model_type):
  """Build the minicons scorer for one model.

  Args:
    model_id: Hugging Face model identifier (first element of a `models` entry).
    model_type: "mlm" or "incremental" (second element of a `models` entry).

  Returns:
    A scorer instance whose `.model` and `.conditional_score` are used below.

  Raises:
    ValueError: if `model_type` is not a supported type.
  """
  if model_type not in SCORER_CLASSES:
    raise ValueError("Invalid model type")
  scorer_cls = SCORER_CLASSES[model_type]

  # Try to load the model with fast parameters
  try:
    return scorer_cls(model_id, device, token=token, cache_dir=cachepath,
                      local_files_only=local_only, low_cpu_mem_usage=True,
                      torch_dtype=torch.float16, device_map="auto")
  # Sometimes the above fails because old models don't implement the necessary
  # functions; call without fast parameters. Only `Exception` is caught so that
  # interrupting a long download (KeyboardInterrupt) stops the run instead of
  # silently starting a second load.
  except Exception as err:
    print(f"Fast load of {model_id} failed ({err!r}); retrying with default parameters")
    return scorer_cls(model_id, device, token=token, cache_dir=cachepath, local_files_only=local_only)


for model in models:
  # don't repeat completed experiments
  model_name = model[0].replace("/", "_")
  results_path = os.path.join(results_dir, f"{model_name}.csv")
  if os.path.exists(results_path):
    continue

  print(model[0])

  transformer = load_scorer(model[0], model[1])
  param_count = sum(p.numel() for p in transformer.model.parameters())

  # Defined up front so the cleanup at the end of the iteration is valid even
  # when no dropout population is generated.
  population = None

  for val in [0.1]:

    # convert the internal model to use MC Dropout with populationLM
    if use_population:
      pop.DropoutUtils.convert_dropouts(transformer.model)
      pop.DropoutUtils.activate_mc_dropout(transformer.model, activate=True, random=val)

    # One entry per stimulus: a list with one (sum, mean, per-token list) tuple
    # for each committee member (a single tuple when use_population is False).
    results = []

    # alias for the call that scores the queries given the prefixes; the
    # reduction packs each item's scores as (sum, mean, full list)
    call_me = lambda prefixes, queries: transformer.conditional_score(prefixes, queries, reduction=lambda x: (x.sum(0).item(), x.mean(0).item(), x.tolist()))

    for batch in stimuli_loader:
      premise = list(batch[0])
      conclusion = list(batch[1])

      # this line generates the dropout population
      if use_population:
        population = pop.generate_dropout_population(transformer.model, lambda: call_me(premise, conclusion), committee_size=committee_size)

      print(len(premise), len(conclusion))

      # this line gets the scores for the dropout population
      if use_population:
        outs = [item for item in tqdm(pop.call_function_with_population(transformer.model, population, lambda: call_me(premise, conclusion)),
                                      total=committee_size)]
      else:
        outs = [call_me(premise, conclusion)]

      # outs is indexed [member][stimulus]; transpose to [stimulus][member]
      results.extend([member[i] for member in outs] for i in range(len(outs[0])))

    # Column-major view of the input rows, followed by the new columns.
    data_out = list(zip(*dataset))
    data_out.append(results)
    data_out.append([param_count] * len(results))
    data_out.append([model_name] * len(results))

    # Build a fresh header for every model. Extending `column_names` in place
    # would make each later model's CSV header grow by three more columns.
    new_col_names = list(column_names) + ["score (sum, mean, [list)", "params", "model"]

    # newline="" stops the csv writer from emitting blank lines on platforms
    # that translate line endings.
    with open(results_path, "w", newline="") as f:
      writer = csv.writer(f)
      writer.writerow(new_col_names)
      writer.writerows(zip(*data_out))

  # Memory cleanup after model use
  del population, transformer
  gc.collect()
  torch.cuda.empty_cache()

  # for file in glob.glob(transformers.utils.hub.default_cache_path + '/models*'):
  #   try:
  #       shutil.rmtree(file)
  #   except:
  #       os.remove(file)

meta-llama/Llama-2-70b-hf


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/66.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/15 [00:00<?, ?it/s]

model-00001-of-00015.safetensors:   0%|          | 0.00/9.85G [00:00<?, ?B/s]

model-00002-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00003-of-00015.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

KeyboardInterrupt: 