In [1]:
%pip install datasets
%pip install accelerate
%pip install transformers
%pip install torch
%pip install evaluate 

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Funcction s

In [1]:
import os
import json
import hashlib
import datasets
import random 
import torch
from evaluate import load
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fb2a0104a70>

In [3]:
def load_ds(dataset_name, seed, add_options=None):
    """Load dataset."""

    train_dataset, validation_dataset = None, None

    if dataset_name == "trivia_qa":
            dataset = datasets.load_dataset('TimoImhof/TriviaQA-in-SQuAD-format')['unmodified']
            dataset = dataset.train_test_split(test_size=0.2, seed=seed)
            train_dataset = dataset['train']
            validation_dataset = dataset['test']
    return train_dataset, validation_dataset 

In [4]:
train_dataset, validation_dataset = load_ds( "trivia_qa", 42)


In [5]:
def split_dataset(dataset):
    """Get indices of answerable and unanswerable questions."""

    def clen(ex):
        return len(ex["answers"]["text"])

    answerable_indices = [i for i, ex in enumerate(dataset) if clen(ex) > 0]
    unanswerable_indices = [i for i, ex in enumerate(dataset) if clen(ex) == 0]

    # union == full dataset
    assert set(answerable_indices) | set(
        unanswerable_indices) == set(range(len(dataset)))
    # no overlap
    assert set(answerable_indices) - \
        set(unanswerable_indices) == set(answerable_indices)

    return answerable_indices, unanswerable_indices

In [6]:
answerable_indices, unanswerable_indices = split_dataset(train_dataset)

unanswerable_indices = []
val_answerable, val_unanswerable = split_dataset(validation_dataset)
del val_unanswerable
validation_dataset = [validation_dataset[i] for i in val_answerable]

In [7]:
num_few_shot = 3
prompt_indices = random.sample(answerable_indices, num_few_shot)
remaining_answerable = list(set(answerable_indices) - set(prompt_indices))

In [8]:
def get_make_prompt():
  def make_prompt(context, question, answer, brief, brief_always):
            prompt = ''
            if brief_always:
                prompt += brief
            if (context is not None):
                prompt += f"Context: {context}\n"
            prompt += f"Question: {question}\n"
            if answer:
                prompt += f"Answer: {answer}\n\n"
            else:
                prompt += 'Answer:'
            return prompt
        
  return make_prompt

In [9]:
def construct_fewshot_prompt_from_indices(dataset, example_indices, brief, brief_always, make_prompt):
    """Given a dataset and indices, construct a fewshot prompt."""
    if not brief_always:
        prompt = brief
    else:
        prompt = ''

    for example_index in example_indices:

        example = dataset[example_index]
        context = example["context"]
        question = example["question"]
        answer = example["answers"]["text"][0]

        prompt = prompt + make_prompt(context, question, answer, brief, brief_always)

    return prompt

# HugginFaceModel

In [10]:
from abc import ABC, abstractmethod
from typing import List, Text


STOP_SEQUENCES = ['\n\n\n\n', '\n\n\n', '\n\n', '\n', 'Question:', 'Context:']


class BaseModel(ABC):

    stop_sequences: List[Text]

    @abstractmethod
    def predict(self, input_data, temperature):
        pass

    @abstractmethod
    def get_p_true(self, input_data):
        pass

In [16]:
"""Implement HuggingfaceModel models."""
import copy
import logging
from collections import Counter
import torch

import accelerate

from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from transformers import StoppingCriteria
from transformers import StoppingCriteriaList
from huggingface_hub import snapshot_download


class StoppingCriteriaSub(StoppingCriteria):
    """Stop generations when they match a particular text or token."""
    def __init__(self, stops, tokenizer, match_on='text', initial_length=None):
        super().__init__()
        self.stops = stops
        self.initial_length = initial_length
        self.tokenizer = tokenizer
        self.match_on = match_on
        if self.match_on == 'tokens':
            self.stops = [torch.tensor(self.tokenizer.encode(i)).to('cuda') for i in self.stops]
            print(self.stops)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        del scores  # `scores` arg is required by StoppingCriteria but unused by us.
        for stop in self.stops:
            if self.match_on == 'text':
                generation = self.tokenizer.decode(input_ids[0][self.initial_length:], skip_special_tokens=False)
                match = stop in generation
            elif self.match_on == 'tokens':
                # Can be dangerous due to tokenizer ambiguities.
                match = stop in input_ids[0][-len(stop):]
            else:
                raise
            if match:
                return True
        return False


def remove_split_layer(device_map_in):
    """Modify device maps s.t. individual layers are not spread across devices."""

    device_map = copy.deepcopy(device_map_in)
    destinations = list(device_map.keys())

    counts = Counter(['.'.join(i.split('.')[:2]) for i in destinations])

    found_split = False
    for layer, count in counts.items():
        if count == 1:
            continue

        if found_split:
            # Only triggers if we find more than one split layer.
            raise ValueError(
                'More than one split layer.\n'
                f'Currently at layer {layer}.\n'
                f'In map: {device_map_in}\n'
                f'Out map: {device_map}\n')

        logging.info(f'Split layer is {layer}.')

        # Remove split for that layer.
        for name in list(device_map.keys()):
            if name.startswith(layer):
                print(f'pop {name}')
                device = device_map.pop(name)

        device_map[layer] = device
        found_split = True

    return device_map


class HuggingfaceModel(BaseModel):
    """Hugging Face Model."""

    def __init__(self, model_name, stop_sequences=None, max_new_tokens=None):
        if max_new_tokens is None:
            raise
        self.max_new_tokens = max_new_tokens

        if stop_sequences == 'default':
            stop_sequences = STOP_SEQUENCES

        if 'llama' in model_name.lower():
            if model_name.endswith('-8bit'):
                kwargs = {'quantization_config': BitsAndBytesConfig(
                    load_in_8bit=True,)}
                model_name = model_name[:-len('-8bit')]
                eightbit = True
            else:
                kwargs = {}
                eightbit = False

            if 'Llama-2' in model_name or "Llama-3" in model_name:
                base = 'meta-llama'
                if 'Llama-2' in model_name:
                    model_name = model_name + '-hf'
            else:
                base = 'huggyllama'

            self.tokenizer = AutoTokenizer.from_pretrained(
                f"{base}/{model_name}", device_map="auto",
                token_type_ids=None)

            llama65b = '65b' in model_name and base == 'huggyllama'
            llama2_70b = '70b' in model_name and base == 'meta-llama'

            if ('7b' in model_name or "8B" in model_name or '13b' in model_name) or eightbit:
                self.model = AutoModelForCausalLM.from_pretrained(
                    f"{base}/{model_name}", device_map="auto",
                    max_memory={0: '80GIB'}, **kwargs,)

            elif llama2_70b or llama65b:
                path = snapshot_download(
                    repo_id=f'{base}/{model_name}',
                    allow_patterns=['*.json', '*.model', '*.safetensors'],
                    ignore_patterns=['pytorch_model.bin.index.json']
                )

                config = AutoConfig.from_pretrained(f"{base}/{model_name}")
                with accelerate.init_empty_weights():
                    self.model = AutoModelForCausalLM.from_config(config)
                self.model.tie_weights()
                max_mem = 15 * 4686198491

                device_map = accelerate.infer_auto_device_map(
                    self.model.model,
                    max_memory={0: max_mem, 1: max_mem},
                    dtype='float16'
                )
                device_map = remove_split_layer(device_map)
                full_model_device_map = {f"model.{k}": v for k, v in device_map.items()}
                full_model_device_map["lm_head"] = 0

                self.model = accelerate.load_checkpoint_and_dispatch(
                    self.model, path, device_map=full_model_device_map,
                    dtype='float16', skip_keys='past_key_values')
            else:
                raise ValueError

        elif 'mistral' in model_name.lower():

            if model_name.endswith('-8bit'):
                kwargs = {'quantization_config': BitsAndBytesConfig(
                    load_in_8bit=True,)}
                model_name = model_name[:-len('-8bit')]
            if model_name.endswith('-4bit'):
                kwargs = {'quantization_config': BitsAndBytesConfig(
                    load_in_4bit=True,)}
                model_name = model_name[:-len('-4bit')]
            else:
                kwargs = {}

            model_id = f'mistralai/{model_name}'
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_id, device_map='auto', token_type_ids=None,
                clean_up_tokenization_spaces=False)

            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map='auto',
                max_memory={0: '80GIB'},
                **kwargs,
            )

        elif 'falcon' in model_name:
            

            model_id = f'tiiuae/{model_name}'
            
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_id, device_map='auto', token_type_ids=None,
                clean_up_tokenization_spaces=False)

            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                dtype=torch.float16,
                device_map='auto',
            )
            
        elif 'Falcon' in model_name:
            model_id = f'tiiuae/{model_name}'

            self.tokenizer = AutoTokenizer.from_pretrained(
                model_id, device_map='auto', token_type_ids=None,
                clean_up_tokenization_spaces=False)

            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                dtype=torch.float16,
                device_map='auto',
            )
        
        elif "Qwen" in model_name:

            model_id = f'Qwen/{model_name}'
            
            self.tokenizer = AutoTokenizer.from_pretrained(model_id,device_map='auto', token_type_ids=None,
                clean_up_tokenization_spaces=False)

            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                dtype=torch.float16,
                device_map="auto",
            )

            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.config.pad_token_id = self.tokenizer.eos_token_id
        
        else:
            raise ValueError

        self.model_name = model_name
        self.stop_sequences = stop_sequences + [self.tokenizer.eos_token]
        self.token_limit = 4096 if 'Llama-2' in model_name or "Llama-3" in model_name else 2048

    def predict(self, input_data, temperature, return_full=False):

        # Implement prediction.
        inputs = self.tokenizer(input_data, return_tensors="pt").to("cuda")

        if 'Qwen' in self.model_name or 'falcon' in self.model_name or 'mistral' in self.model_name.lower():
            if 'token_type_ids' in inputs:  # Some HF models have changed.
                del inputs['token_type_ids']
            pad_token_id = self.tokenizer.eos_token_id
        else:
            pad_token_id = None

        if self.stop_sequences is not None:
            stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(
                stops=self.stop_sequences,
                initial_length=len(inputs['input_ids'][0]),
                tokenizer=self.tokenizer)])
        else:
            stopping_criteria = None

        logging.debug('temperature: %f', temperature)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=self.max_new_tokens,
                return_dict_in_generate=True,
                output_scores=True,
                output_hidden_states=True,
                temperature=temperature,
                do_sample=True,
                stopping_criteria=stopping_criteria,
                pad_token_id=pad_token_id,
            )

        if len(outputs.sequences[0]) > self.token_limit:
            raise ValueError(
                'Generation exceeding token limit %d > %d',
                len(outputs.sequences[0]), self.token_limit)

        full_answer = self.tokenizer.decode(
            outputs.sequences[0], skip_special_tokens=True)

        if return_full:
            return full_answer

        # For some models, we need to remove the input_data from the answer.
        if full_answer.startswith(input_data):
            input_data_offset = len(input_data)
        else:
            #raise ValueError('Have not tested this in a while.')
            logging.error(f"Full answer should start from input_data. Setting input_data offset to 0")
            logging.error(f"Full answer is {full_answer}")
            logging.error(f"Input data is {input_data}")
            input_data_offset = 0

        # Remove input from answer.
        answer = full_answer[input_data_offset:]

        # Remove stop_words from answer.
        stop_at = len(answer)
        sliced_answer = answer
        if self.stop_sequences is not None:
            for stop in self.stop_sequences:
                if answer.endswith(stop):
                    stop_at = len(answer) - len(stop)
                    sliced_answer = answer[:stop_at]
                    break
            if not all([stop not in sliced_answer for stop in self.stop_sequences]):
                error_msg = 'Error: Stop words not removed successfully!'
                error_msg += f'Answer: >{answer}< '
                error_msg += f'Sliced Answer: >{sliced_answer}<'
                if 'falcon' not in self.model_name.lower():
                    raise ValueError(error_msg)
                else:
                    logging.error(error_msg)

        # Remove whitespaces from answer (in particular from beginning.)
        sliced_answer = sliced_answer.strip()

        # Get the number of tokens until the stop word comes up.
        # Note: Indexing with `stop_at` already excludes the stop_token.
        # Note: It's important we do this with full answer, since there might be
        # non-trivial interactions between the input_data and generated part
        # in tokenization (particularly around whitespaces.)
        token_stop_index = self.tokenizer(full_answer[:input_data_offset + stop_at], return_tensors="pt")['input_ids'].shape[1]
        n_input_token = len(inputs['input_ids'][0])
        n_generated = token_stop_index - n_input_token

        if n_generated == 0:
            logging.warning('Only stop_words were generated. For likelihoods and embeddings, taking stop word instead.')
            n_generated = 1

        # Get the last hidden state (last layer) and the last token's embedding of the answer.
        # Note: We do not want this to be the stop token.

        # outputs.hidden_state is a tuple of len = n_generated_tokens.
        # The first hidden state is for the input tokens and is of shape
        #     (n_layers) x (batch_size, input_size, hidden_size).
        # (Note this includes the first generated token!)
        # The remaining hidden states are for the remaining generated tokens and is of shape
        #    (n_layers) x (batch_size, 1, hidden_size).

        # Note: The output embeddings have the shape (batch_size, generated_length, hidden_size).
        # We do not get embeddings for input_data! We thus subtract the n_tokens_in_input from
        # token_stop_index to arrive at the right output.

        if 'decoder_hidden_states' in outputs.keys():
            hidden = outputs.decoder_hidden_states
        else:
            hidden = outputs.hidden_states

        if len(hidden) == 1:
            logging.warning(
                'Taking first and only generation for hidden! '
                'n_generated: %d, n_input_token: %d, token_stop_index %d, '
                'last_token: %s, generation was: %s',
                n_generated, n_input_token, token_stop_index,
                self.tokenizer.decode(outputs['sequences'][0][-1]),
                full_answer,
                )
            last_input = hidden[0]
        elif ((n_generated - 1) >= len(hidden)):
            # If access idx is larger/equal.
            logging.error(
                'Taking last state because n_generated is too large'
                'n_generated: %d, n_input_token: %d, token_stop_index %d, '
                'last_token: %s, generation was: %s, slice_answer: %s',
                n_generated, n_input_token, token_stop_index,
                self.tokenizer.decode(outputs['sequences'][0][-1]),
                full_answer, sliced_answer
                )
            last_input = hidden[-1]
        else:
            last_input = hidden[n_generated - 1]

        # Then access last layer for input
        last_layer = last_input[-1]
        # Then access last token in input.
        last_token_embedding = last_layer[:, -1, :].cpu()

        # Get log_likelihoods.
        # outputs.scores are the logits for the generated token.
        # outputs.scores is a tuple of len = n_generated_tokens.
        # Each entry is shape (bs, vocabulary size).
        # outputs.sequences is the sequence of all tokens: input and generated.
        transition_scores = self.model.compute_transition_scores(
            outputs.sequences, outputs.scores, normalize_logits=True)
        # Transition_scores[0] only contains the scores for the first generated tokens.

        log_likelihoods = [score.item() for score in transition_scores[0]]
        if len(log_likelihoods) == 1:
            logging.warning('Taking first and only generation for log likelihood!')
            log_likelihoods = log_likelihoods
        else:
            log_likelihoods = log_likelihoods[:n_generated]

        if len(log_likelihoods) == self.max_new_tokens:
            logging.warning('Generation interrupted by max_token limit.')

        if len(log_likelihoods) == 0:
            raise ValueError

        return sliced_answer, log_likelihoods, last_token_embedding

    def get_p_true(self, input_data):
        """Get the probability of the model anwering A (True) for the given input."""

        input_data += ' A'
        tokenized_prompt_true = self.tokenizer(input_data, return_tensors='pt').to('cuda')['input_ids']
        # The computation of the negative log likelihoods follows:
        # https://huggingface.co/docs/transformers/perplexity.

        target_ids_true = tokenized_prompt_true.clone()
        # Set all target_ids except the last one to -100.
        target_ids_true[0, :-1] = -100

        with torch.no_grad():
            model_output_true = self.model(tokenized_prompt_true, labels=target_ids_true)

        loss_true = model_output_true.loss

        return -loss_true.item()

# suite 

In [17]:
BRIEF_PROMPTS = {
    'default': "Answer the following question as briefly as possible.\n",
    'chat': 'Answer the following question in a single brief but complete sentence.\n'}

In [18]:
make_prompt = get_make_prompt()
BRIEF = BRIEF_PROMPTS['default']
prompt = construct_fewshot_prompt_from_indices(
train_dataset, prompt_indices, BRIEF, False, make_prompt)


In [19]:
def init_model(model_name, model_max_new_tokens):
    if 'Qwen' in model_name or 'falcon' in model_name or 'mistral' in model_name.lower():
        model = HuggingfaceModel(
            model_name, stop_sequences='default',
            max_new_tokens=model_max_new_tokens)
    else:
        raise ValueError(f'Unknown model_name `{model_name}`.')
    return model

In [20]:
#model_name = "falcon-7b-instruct"
#model_name = "Qwen2.5-1.5B-Instruct"
model_name = "Qwen2.5-0.5B-Instruct"
model_max_new_tokens = 100
model = init_model(model_name, model_max_new_tokens)

In [21]:
free, total = torch.cuda.mem_get_info()
print("GPU libre  (GB):", free / 1024**3)
print("GPU total  (GB):", total / 1024**3)

GPU libre  (GB): 13.51171875
GPU total  (GB): 14.52947998046875


In [22]:
p_true_num_fewshot = 4 
num_generations = 5
metric = 'squad'

In [23]:
def get_metric(metric):
    if metric == 'squad':

        squad_metric = load("squad_v2")

        def metric_fct(response, example, *args, **kwargs):
            # Compatibility with recomputation.
            if 'id' in example:
                exid = example['id']
            elif 'id' in example['reference']:
                exid = example['reference']['id']
            else:
                raise ValueError

            prediction = {'prediction_text': response, 'no_answer_probability': 0.0, 'id': exid}
            results = squad_metric.compute(
                predictions=[prediction],
                references=[get_reference(example)])
            return 1.0 if (results['f1'] >= 50.0) else 0.0
    return metric_fct
    


In [24]:
metric = get_metric(metric)

In [25]:
"""Compute p_true uncertainty metric."""
import logging


def construct_few_shot_prompt(
        *, model, dataset, indices, prompt, brief, brief_always, make_prompt,
        num_generations, metric):
    """Construct few shot prompt for p_true uncertainty metric."""

    # Call model n_shots many times.
    few_shot_prompt = []
    all_responses = dict()
    for it, i in enumerate(indices):
        prompt_candidate = []
        example = dataset[i]
        question = example["question"]
        context = example["context"]
        if it != 0:
            prompt_candidate += ['\n']
        prompt_candidate += ['Question: ' + question]
        prompt_candidate += ['\nBrainstormed Answers: ']
        current_question = make_prompt(context, question, None, brief, brief_always)
        local_prompt = prompt + current_question
        logging.info('P_TRUE >> Current Question: '.ljust(25) + current_question)

        responses = []
        for j in range(num_generations + 1):

            if j == 0:
                temperature = 0.1
            else:
                temperature = 1.0

            response, _, _ = model.predict(local_prompt, temperature)
            logging.info('P_TRUE >> Current Response: '.ljust(25) + response)

            responses.append(response)
            prompt_candidate += [f'{response.strip()} \n']
            if j == 0:
                # Save most likely response and compute correctness metric for it.
                most_likely_response = response
                is_correct = metric(response, example, model)
                answers = [answer for answer in example['answers']['text']]
                logging.info('P_TRUE >> LOW-T >> true answer: '.ljust(35) + str(answers))
                logging.info('P_TRUE >> LOW-T >> acc: '.ljust(35) + str(is_correct))

        all_responses[i] = dict(
            responses=responses, most_likely_response=most_likely_response,
            is_correct=is_correct)

        prompt_candidate += ['Possible answer: ' + most_likely_response + '\n']
        prompt_candidate += ['Is the possible answer:\n']
        prompt_candidate += ['A) True\n']
        prompt_candidate += ['B) False\n']
        prompt_candidate += ['The possible answer is:']
        prompt_candidate += [' A' if is_correct else ' B']

        prompt_len = len(model.tokenizer.encode(''.join(few_shot_prompt + prompt_candidate)))
        # At test time, get a maximum of `num_generations * model.token_limit` extra tokens
        # 200 buffer for question and 'Possible Answer'.
        max_input_len = prompt_len + num_generations * model.max_new_tokens + 200

        if max_input_len < model.token_limit:
            few_shot_prompt.extend(prompt_candidate)
        else:
            logging.warning('Cutting of p_true prompt at length %d.', it)
            break

    return ''.join(few_shot_prompt), all_responses, it


def calculate_p_true(
        model, question, most_probable_answer, brainstormed_answers,
        few_shot_prompt, hint=False):
    """Calculate p_true uncertainty metric."""

    if few_shot_prompt:
        prompt = few_shot_prompt + '\n'
    else:
        prompt = ''

    prompt += 'Question: ' + question
    prompt += '\nBrainstormed Answers: '
    for answer in brainstormed_answers + [most_probable_answer]:
        prompt += answer.strip() + '\n'
    prompt += 'Possible answer: ' + most_probable_answer + '\n'
    if not hint:
        prompt += 'Is the possible answer:\n'
        prompt += 'A) True\n'
        prompt += 'B) False\n'
        prompt += 'The possible answer is:'
    else:
        prompt += 'Do the brainstormed answers match the possible answer? Respond with A if they do, if they do not respond with B. Answer:'

    log_prob = model.get_p_true(prompt)

    return log_prob

In [26]:
def get_reference(example):
    if 'answers' not in example:
        example = example['reference']
    answers = example['answers']
    answer_starts = answers.get('answer_start', [])
    reference = {'answers': {'answer_start': answer_starts, 'text': answers['text']}, 'id': example['id']}
    return reference

In [27]:
p_true_indices = random.sample(answerable_indices, p_true_num_fewshot)
remaining_answerable = list(set(remaining_answerable) - set(p_true_indices))
p_true_few_shot_prompt, p_true_responses, len_p_true = construct_few_shot_prompt(
            model=model, dataset=train_dataset, indices=p_true_indices,
            prompt=prompt, brief=BRIEF,
            brief_always=False,
            make_prompt=make_prompt, num_generations=num_generations,
            metric=metric)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for

In [None]:
p_true_few_shot_prompt

"Question: Which of the 'Classic' horse races, run at Epsom for three year old fillies on the Friday after the derby, is named after the estate then owned by the Earl of Derby?\nBrainstormed Answers: oaks \nOaks \nOaks \nOaks \nOaks race \nOaks race Answer: \nPossible answer: oaks\nIs the possible answer:\nA) True\nB) False\nThe possible answer is: A\nQuestion: In which country is the most northerly point on mainland Africa?\nBrainstormed Answers: tunisia \ntunisia \ntunisia \ntunisia \ntunisia \ntunisia \nPossible answer: tunisia\nIs the possible answer:\nA) True\nB) False\nThe possible answer is: A\nQuestion: Michelangelo Merisi (or Amerighi) is the birth name of which artist?\nBrainstormed Answers: caravaggio \ncaravaggio \ncaravaggio \ncaravaggio \ncaravaggio \ncaravaggio \nPossible answer: caravaggio\nIs the possible answer:\nA) True\nB) False\nThe possible answer is: B\nQuestion: According to figures released by the ONS what was the most popular name for baby boys in 2012?\nBrain

In [None]:
p_true_responses

{12149: {'responses': ['oaks',
   'Oaks',
   'Oaks',
   'Oaks',
   'Oaks race',
   'Oaks race Answer:'],
  'most_likely_response': 'oaks',
  'is_correct': 1.0},
 4506: {'responses': ['tunisia',
   'tunisia',
   'tunisia',
   'tunisia',
   'tunisia',
   'tunisia'],
  'most_likely_response': 'tunisia',
  'is_correct': 1.0},
 4012: {'responses': ['caravaggio',
   'caravaggio',
   'caravaggio',
   'caravaggio',
   'caravaggio',
   'caravaggio'],
  'most_likely_response': 'caravaggio',
  'is_correct': 0.0},
 3657: {'responses': ['harry', 'harry', 'Harry', 'harry', 'Harry', 'harrison'],
  'most_likely_response': 'harry',
  'is_correct': 1.0}}

Start generation answer 

In [None]:
get_training_set_generations = True
temperature = 0.1
get_training_set_generations_most_likely_only = True
num_samples = 10
compute_p_true = False
p_true_hint = True

In [None]:
for dataset_split in ['train', 'validation']:

        # This will store all input data and model predictions.
        accuracies, generations, results_dict, p_trues = [], {}, {}, []

        if dataset_split == 'train':
            if not get_training_set_generations:
                logging.info('Skip training data.')
                continue
            dataset = train_dataset
            possible_indices = list(set(remaining_answerable) | set(unanswerable_indices))

        else:
            dataset = validation_dataset
            possible_indices = range(0, len(dataset))

        # Evaluate over random subset of the datasets.
        indices = random.sample(possible_indices, min(num_samples, len(dataset)))

        #if args.num_samples > len(dataset):
        #    logging.warning('Not enough samples in dataset. Using all %d samples.', len(dataset))

        it = 0
        for index in tqdm(indices):
            if (it + 1 % 10) == 0:
                gc.collect()
                torch.cuda.empty_cache()
            it += 1

            # Grab example at index.
            example = dataset[index]
            question, context = example["question"], example['context']
            generations[example['id']] = {'question': question, 'context': context}
            correct_answer = example['answers']['text']

            current_input = make_prompt(
                context, question, None, BRIEF, True)
            local_prompt = prompt + current_input

            logging.info('Current input: '.ljust(15) + current_input)

            full_responses = []

            # We sample one low temperature answer on which we will compute the
            # accuracy and args.num_generation high temperature answers which will
            # be used to estimate the entropy variants.

            if dataset_split == 'train' and get_training_set_generations_most_likely_only:
                num_generations = 1
            else:
                num_generations = num_generations + 1 # be careful num_generations is the same as above
            
            for i in range(num_generations):

                # Temperature for first generation is always `0.1`.
                temperature = 0.1 if i == 0 else temperature

                predicted_answer, token_log_likelihoods, embedding = model.predict(
                    local_prompt, temperature)
                embedding = embedding.cpu() if embedding is not None else None

                # Only compute accuracy if question is answerable.
                compute_acc = True or (i == 0)
                if correct_answer and compute_acc:
                    acc = metric(predicted_answer, example, model)
                else:
                    acc = 0.0  # pylint: disable=invalid-name

                if i == 0:
                    logging.info('Iteration ' + str(it) + ':  ' + 80*'#')
                    #if args.use_context:
                    #    logging.info('context: '.ljust(15) + str(context))
                    logging.info('question: '.ljust(15) + question)
                    logging.info('low-t prediction: '.ljust(15) + predicted_answer)
                    logging.info('correct answer: '.ljust(15) + str(correct_answer))
                    logging.info('accuracy: '.ljust(15) + str(acc))

                    accuracies.append(acc)

                    most_likely_answer_dict = {
                        'response': predicted_answer,
                        'token_log_likelihoods': token_log_likelihoods,
                        'embedding': embedding,
                        'accuracy': acc}
                    generations[example['id']].update({
                        'most_likely_answer': most_likely_answer_dict,
                        'reference': get_reference(example)})
                    
                else:
                    logging.info('high-t prediction '.ljust(15) + str(i) + ' : ' + predicted_answer)
                    # Aggregate predictions over num_generations.
                    full_responses.append(
                        (predicted_answer, token_log_likelihoods, embedding, acc))

            # Append all predictions for this example to `generations`.
            generations[example['id']]['responses'] = full_responses

            if compute_p_true and dataset_split == 'validation':
                # Already compute p_true here. Avoid cost of generations in compute_uncertainty script.
                p_true = calculate_p_true(
                    model, question, most_likely_answer_dict['response'],
                    [r[0] for r in full_responses], p_true_few_shot_prompt,
                    hint=p_true_hint)
                p_trues.append(p_true)
                logging.info('p_true: %s', p_true)
            
            if dataset_split == "train":
                train_generations = generations
            if dataset_split == "validation":
                validation_generations = generations
                
            # Log overall accuracy.
            accuracy = np.mean(accuracies)
            print(f"Overall {dataset_split} split accuracy: {accuracy}")
            if dataset_split == 'validation':
                results_dict['uncertainty_measures'] = {'p_false':  [1 - p for p in p_trues],'p_false_fixed':  [1 - np.exp(p) for p in p_trues],}

                print(results_dict)


  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
 10%|█         | 1/10 [00:00<00:04,  2.23it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall train split accuracy: 0.0


 20%|██        | 2/10 [00:00<00:03,  2.06it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall train split accuracy: 0.5


 30%|███       | 3/10 [00:01<00:03,  1.97it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall train split accuracy: 0.6666666666666666


 40%|████      | 4/10 [00:02<00:03,  1.97it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall train split accuracy: 0.5


 50%|█████     | 5/10 [00:02<00:02,  2.03it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall train split accuracy: 0.6


 60%|██████    | 6/10 [00:03<00:02,  1.96it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall train split accuracy: 0.6666666666666666


 70%|███████   | 7/10 [00:03<00:01,  2.01it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall train split accuracy: 0.7142857142857143


 80%|████████  | 8/10 [00:03<00:00,  2.03it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall train split accuracy: 0.625


 90%|█████████ | 9/10 [00:04<00:00,  1.88it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall train split accuracy: 0.6666666666666666


100%|██████████| 10/10 [00:05<00:00,  1.97it/s]


Overall train split accuracy: 0.7


  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
 10%|█         | 1/10 [00:00<00:08,  1.08it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall validation split accuracy: 0.0
{'uncertainty_measures': {'p_false': [], 'p_false_fixed': []}}


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
 20%|██        | 2/10 [00:02<00:09,  1.20s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall validation split accuracy: 0.5
{'uncertainty_measures': {'p_false': [], 'p_false_fixed': []}}


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
 30%|███       | 3/10 [00:04<00:11,  1.61s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall validation split accuracy: 0.6666666666666666
{'uncertainty_measures': {'p_false': [], 'p_false_fixed': []}}


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
 40%|████      | 4/10 [00:06<00:11,  1.89s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall validation split accuracy: 0.5
{'uncertainty_measures': {'p_false': [], 'p_false_fixed': []}}


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
 50%|█████     | 5/10 [00:09<00:10,  2.17s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall validation split accuracy: 0.6
{'uncertainty_measures': {'p_false': [], 'p_false_fixed': []}}


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
 60%|██████    | 6/10 [00:12<00:10,  2.60s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall validation split accuracy: 0.6666666666666666
{'uncertainty_measures': {'p_false': [], 'p_false_fixed': []}}


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
 70%|███████   | 7/10 [00:17<00:10,  3.33s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall validation split accuracy: 0.5714285714285714
{'uncertainty_measures': {'p_false': [], 'p_false_fixed': []}}


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
 80%|████████  | 8/10 [00:22<00:07,  3.71s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall validation split accuracy: 0.625
{'uncertainty_measures': {'p_false': [], 'p_false_fixed': []}}


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
 90%|█████████ | 9/10 [00:26<00:03,  3.89s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Overall validation split accuracy: 0.6666666666666666
{'uncertainty_measures': {'p_false': [], 'p_false_fixed': []}}


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
100%|██████████| 10/10 [00:31<00:00,  3.19s/it]

Overall validation split accuracy: 0.7
{'uncertainty_measures': {'p_false': [], 'p_false_fixed': []}}





# uncertainty measure

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


def get_p_ik(train_embeddings, is_false, eval_embeddings=None, eval_is_false=None):
    """Fit linear classifier to embeddings to predict model correctness."""

    logging.info('Accuracy of model on Task: %f.', 1 - torch.tensor(is_false).mean())  # pylint: disable=no-member

    # Convert the list of tensors to a 2D tensor.
    train_embeddings_tensor = torch.cat(train_embeddings, dim=0)  # pylint: disable=no-member
    # Convert the tensor to a numpy array.
    embeddings_array = train_embeddings_tensor.cpu().numpy()

    # Split the data into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(  # pylint: disable=invalid-name
        embeddings_array, is_false, test_size=0.2, random_state=42)  # pylint: disable=invalid-name

    # Fit a logistic regression model.
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Predict deterministically and probabilistically and compute accuracy and auroc for all splits.
    X_eval = torch.cat(eval_embeddings, dim=0).cpu().numpy()  # pylint: disable=no-member,invalid-name
    y_eval = eval_is_false

    Xs = [X_train, X_test, X_eval]  # pylint: disable=invalid-name
    ys = [y_train, y_test, y_eval]  # pylint: disable=invalid-name
    suffixes = ['train_train', 'train_test', 'eval']

    metrics, y_preds_proba = {}, {}

    for suffix, X, y_true in zip(suffixes, Xs, ys):  # pylint: disable=invalid-name

        # If suffix is eval, we fit a new model on the entire training data set
        # rather than just a split of the training data set.
        if suffix == 'eval':
            model = LogisticRegression()
            model.fit(embeddings_array, is_false)
            convergence = {
                'n_iter': model.n_iter_[0],
                'converged': (model.n_iter_ < model.max_iter)[0]}

        y_pred = model.predict(X)
        y_pred_proba = model.predict_proba(X)
        y_preds_proba[suffix] = y_pred_proba
        acc_p_ik_train = accuracy_score(y_true, y_pred)
        auroc_p_ik_train = roc_auc_score(y_true, y_pred_proba[:, 1])
        split_metrics = {
            f'acc_p_ik_{suffix}': acc_p_ik_train,
            f'auroc_p_ik_{suffix}': auroc_p_ik_train}
        metrics.update(split_metrics)

    logging.info('Metrics for p_ik classifier: %s.', metrics)

    # Return model predictions on the eval set.
    return y_preds_proba['eval'][:, 1]

In [77]:
use_all_generations = False
use_num_generations = 1

In [78]:
validation_generations

{'sfq_12419--81/81_1632714.txt#0_0': {'question': "Britain's first jet fighter, The Meteor was made by which company?",
  'context': "[DOC] [TLE] h2g2 - The Gloster Meteor: Britain's First Operational ...h2g2 - The Gloster Meteor: Britain's First Operational Military Jet - Edited Entry [PAR] To The Galaxy [PAR] Earth Edition [PAR] The Gloster Meteor: Britain's First Operational Military Jet Content from the guide to life, the universe and everything [PAR] The Gloster Meteor: Britain's First Operational Military Jet [PAR] Created [PAR] 4 Conversations [PAR] The Gloster Meteor was the first jet fighter aircraft of the British Royal Air Force, and was introduced into service in August, 1944, only weeks after the Third Reich's Messerschmitt Me262 jet aircraft. It was thus the world's second jet fighter aircraft, and the first and only one of the WWII Allied powers. [PAR] The Gloster Meteor was developed in order to combat the V1 flying bombs, themselves powered by a type of jet engine know

In [79]:
for idx, tid in enumerate(validation_generations):
    print(idx,tid)

0 sfq_12419--81/81_1632714.txt#0_0
1 odql_13272--106/106_2325364.txt#0_2
2 dpql_4988--47/47_120295.txt#0_1
3 dpql_2134--125/125_2647258.txt#0_0
4 qb_9879--45/45_2639890.txt#0_0
5 sfq_2452--77/77_2081353.txt#0_1
6 wh_125--148/148_727507.txt#0_1
7 sfq_19406--6/6_253619.txt#0_1
8 sfq_9178--181/181_42561.txt#0_2
9 bb_5945--146/146_963971.txt#0_0
10 bb_810--80/80_847824.txt#0_1
11 bt_805--114/114_2376528.txt#0_0
12 qg_2269--95/95_2861224.txt#0_0
13 sfq_24275--187/187_73696.txt#0_2
14 bb_2494--157/157_888075.txt#0_0
15 sfq_17698--20/20_2198218.txt#0_1
16 sfq_646--117/117_583579.txt#0_1
17 qw_12905--18/18_1287729.txt#0_0
18 qb_7738--168/168_481285.txt#0_0
19 qf_85--33/33_1632330.txt#0_0
20 dpql_3790--45/45_2651731.txt#0_1
21 bb_7554--37/37_998743.txt#0_1
22 dpql_2467--20/20_619177.txt#0_2
23 odql_13002--100/100_1406227.txt#0_1
24 jp_2757--99/99_78205.txt#0_1
25 odql_12511--3/3_2311203.txt#0_2
26 sfq_25255--0/0_2027211.txt#0_1
27 qb_9602--35/35_531143.txt#0_0
28 sfq_11409--174/174_548373.txt#0

In [80]:
validation_embeddings, validation_is_true, validation_answerable = [], [], []
p_trues = []
count = 0  # pylint: disable=invalid-name

def is_answerable(generation):
        return len(generation['reference']['answers']['text']) > 0

    # Loop over datapoints and compute validation embeddings and entropies.
for idx, tid in enumerate(validation_generations):
        example = validation_generations[tid]
        question = example['question']
        context = example['context']
        full_responses = example["responses"]
        most_likely_answer = example['most_likely_answer']

        if not use_all_generations:
            if use_num_generations == -1:
                raise ValueError
            responses = [fr[0] for fr in full_responses[:use_num_generations]]
        else:
            responses = [fr[0] for fr in full_responses]

        #if args.recompute_accuracy:
         #   logging.info('Recomputing accuracy!')
          #  if is_answerable(example):
           #     try:
            #        acc = metric(most_likely_answer['response'], example, metric_model)
             #   except Exception as e:
              #      logging.error("Unable to calculate metric due to an error from the model. Rollback to previous acc")
               #     logging.error(str(e))
                #    acc = most_likely_answer['accuracy']
           # else:
           #     acc = 0.0  # pylint: disable=invalid-name
           # validation_generations[tid]['most_likely_answer']["accuracy"] = acc
           # validation_is_true.append(acc)
           # logging.info('Recomputed accuracy!')
        #else:
         #   validation_is_true.append(most_likely_answer['accuracy'])

        validation_answerable.append(is_answerable(example))
        validation_embeddings.append(most_likely_answer['embedding'])
        #logging.info('validation_is_true: %f', validation_is_true[-1])

    

KeyError: 'responses'

In [82]:
validation_is_false = [1.0 - is_t for is_t in validation_is_true]
results_dict['validation_is_false'] = validation_is_false

validation_unanswerable = [1.0 - is_a for is_a in validation_answerable]
results_dict['validation_unanswerable'] = validation_unanswerable
logging.info('Unanswerable prop on validation: %f', np.mean(validation_unanswerable))


In [83]:
# Assemble training data for embedding classification.
train_is_true, train_embeddings, train_answerable = [], [], []
for tid in train_generations:
    most_likely_answer = train_generations[tid]['most_likely_answer']
    train_embeddings.append(most_likely_answer['embedding'])
    train_is_true.append(most_likely_answer['accuracy'])
    train_answerable.append(is_answerable(train_generations[tid]))
train_is_false = [0.0 if is_t else 1.0 for is_t in train_is_true]
train_unanswerable = [0.0 if is_t else 1.0 for is_t in train_answerable]
logging.info('Unanswerable prop on p_ik training: %f', np.mean(train_unanswerable))

In [84]:
print(train_embeddings)


[tensor([[-1.1338,  2.8438,  1.1396,  ...,  2.1699,  2.0684, -2.9160]],
       dtype=torch.float16), tensor([[ 1.5312,  1.7178,  1.9268,  ...,  0.0306, -3.9277,  1.6230]],
       dtype=torch.float16), tensor([[-2.9727, -0.8647,  2.2051,  ...,  0.6875, -2.3984,  2.8223]],
       dtype=torch.float16), tensor([[ 0.4609, -1.1826,  0.0132,  ..., -2.4102,  1.0469, -0.6650]],
       dtype=torch.float16), tensor([[ 1.8252, -2.6016, -1.4883,  ..., -1.2969,  0.0414,  5.1680]],
       dtype=torch.float16), tensor([[-2.8984, -1.9199,  1.2510,  ..., -0.8281,  3.2305,  4.5195]],
       dtype=torch.float16), tensor([[ 1.7500, -0.5732, -0.2842,  ...,  0.1616, -5.4180, -1.2832]],
       dtype=torch.float16), tensor([[-3.0762, -3.7910,  0.1038,  ..., -1.5498, -0.5112, -0.4670]],
       dtype=torch.float16), tensor([[-2.4727,  1.2090,  2.7480,  ..., -1.5693,  0.6182, -0.6035]],
       dtype=torch.float16), tensor([[-1.7930, -2.1406, -4.8008,  ..., -2.3984, -2.2871, -2.1445]],
       dtype=torch.float16),

In [85]:
print(len(train_embeddings))

50


In [86]:
print(validation_embeddings)

[tensor([[-3.2988,  2.7344, -0.5352,  ..., -1.3262, -3.9336, -0.7207]],
       dtype=torch.float16), tensor([[-0.9033,  0.9478, -0.5645,  ...,  1.2842,  1.7930, -2.0645]],
       dtype=torch.float16), tensor([[ 0.4058,  1.8350, -1.7148,  ..., -0.5117, -2.0762, -0.6523]],
       dtype=torch.float16), tensor([[ 1.1182, -3.2773,  2.9902,  ..., -1.9912, -0.5615, -1.0117]],
       dtype=torch.float16), tensor([[-1.7607,  0.9526,  0.2219,  ..., -0.3979, -0.9233, -0.0197]],
       dtype=torch.float16), tensor([[-0.7207,  2.0742,  3.6191,  ...,  1.6367, -1.1699, -1.6475]],
       dtype=torch.float16), tensor([[-0.9390,  1.1572, -0.8691,  ...,  1.5195,  2.3379, -0.9121]],
       dtype=torch.float16), tensor([[ 1.0898, -0.1343,  7.4766,  ...,  0.2776, -0.0545,  1.6250]],
       dtype=torch.float16), tensor([[ 2.2539, -2.2539,  0.3357,  ...,  3.3359,  0.9341, -5.3633]],
       dtype=torch.float16), tensor([[-0.7100,  0.8540,  1.5801,  ..., -0.0584, -0.3848, -0.9614]],
       dtype=torch.float16),

In [87]:
train_answerable

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

ERROR:root:Cannot comput_p_ik_answerable


name 'validation_is_false' is not defined


# Train model

In [88]:
try:
    logging.info('Starting training p_ik on train embeddings.')
    # Train classifier of correct/incorrect from embeddings.
    p_ik_predictions = get_p_ik(
        train_embeddings=train_embeddings, is_false=train_is_false,
        eval_embeddings=validation_embeddings, eval_is_false=validation_is_false)
    result_dict['uncertainty_measures']['p_ik'] = p_ik_predictions
    logging.info('Finished training p_ik on train embeddings.')
except Exception as e:
    logging.error("Cannot comput_p_ik_answerable")
    print(str(e))

ERROR:root:Cannot comput_p_ik_answerable


Found input variables with inconsistent numbers of samples: [0, 48]


In [89]:
train_is_false

[1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0]

# Annexe 

In [None]:
        
    if args.compute_p_ik:
        try:
            logging.info('Starting training p_ik on train embeddings.')
            # Train classifier of correct/incorrect from embeddings.
            p_ik_predictions = get_p_ik(
                train_embeddings=train_embeddings, is_false=train_is_false,
                eval_embeddings=validation_embeddings, eval_is_false=validation_is_false)
            result_dict['uncertainty_measures']['p_ik'] = p_ik_predictions
            logging.info('Finished training p_ik on train embeddings.')
        except Exception as e:
            logging.error("Cannot comput_p_ik_answerable")
            print(str(e))

    if args.compute_p_ik_answerable:
        # Train classifier of answerable/unanswerable.
        try:
            p_ik_predictions = get_p_ik(
                train_embeddings=train_embeddings, is_false=train_unanswerable,
                eval_embeddings=validation_embeddings, eval_is_false=validation_unanswerable)
        except Exception as e:
            logging.error("Cannot compute_p_ik_answerable")
            print(str(e))
        result_dict['uncertainty_measures']['p_ik_unanswerable'] = p_ik_predictions


In [None]:
import bitsandbytes as bnb; print(bnb.__version__)

In [None]:
import torch; print(torch.version.cuda, torch.cuda.is_available())

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "tiiuae/falcon-7b-instruct"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0  # optionnel, peut stabiliser la mémoire
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",       # répartit sur GPU(s)
    trust_remote_code=True,
)


In [5]:
import torch

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "tiiuae/falcon-7b-instruct"

#bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float16,
    device_map="auto",
)


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "Qwen/Qwen2.5-1.5B-Instruct"

#bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float16,
    device_map="auto",
)


In [9]:
free, total = torch.cuda.mem_get_info()
print("GPU libre  (GB):", free / 1024**3)
print("GPU total  (GB):", total / 1024**3)

GPU libre  (GB): 11.492919921875
GPU total  (GB): 14.56805419921875


In [None]:
%pip install bitsandbytes

In [None]:
import torch

print("CUDA disponible :", torch.cuda.is_available())
print("Nombre de GPU :", torch.cuda.device_count())

if torch.cuda.is_available():
    idx = torch.cuda.current_device()
    print("GPU actuel :", torch.cuda.get_device_name(idx))
    print("VRAM totale (GB) :", torch.cuda.get_device_properties(idx).total_memory / 1024**3)


In [None]:
bnb_config

In [None]:
%pip uninstall bitsandbytes -y
%pip install bitsandbytes-cuda128
