In [1]:
!pip install datasets
!pip install evaluate
!pip install transformers==4.28.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collec

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# imports:
from transformers import DataCollatorWithPadding
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
from datasets import load_dataset
from transformers import AutoConfig , AutoTokenizer , AutoModelForMultipleChoice, TrainingArguments, Trainer, DefaultDataCollator
from evaluate import load
import numpy as np
import plotly.express as px
import pickle

In [4]:
MODEL_NAME = "bert-base-uncased"

In [5]:
config = AutoConfig.from_pretrained("bert-base-uncased",hidden_dropout_prob = 0.3, attention_probs_dropout_prob=0.3)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
accuracy = load("accuracy")
print(config)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.3,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.3,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [6]:
label_mapping_dict = {'A':0, 'B':1, "C":2, "D":3, "E":4}
num_to_label_dict = {0:'A', 1:"B", 2:"C", 3:"D", 4:"E"}

def mapping_func(n):
  return label_mapping_dict[n]


In [7]:
def preprocess_function(examples):
    question_headers = examples["question"]
    second_sentences = [
        [f"{header}[SEP]{examples['choices'][i]['text'][j]}" for j in range(5)] for i, header in enumerate(question_headers)
    ]

    second_sentences = sum(second_sentences, [])

    tokenized_examples = tokenizer(second_sentences, truncation=True)
    return {k: [v[i : i + 5] for i in range(0, len(v), 5)] for k, v in tokenized_examples.items()}

In [8]:
riddleSense_train = load_dataset('riddle_sense', split='train')
riddleSense_train_map = load_dataset('riddle_sense', split='train')
riddleSense_val = load_dataset('riddle_sense', split='validation')

Downloading builder script:   0%|          | 0.00/5.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.11k [00:00<?, ?B/s]

Downloading and preparing dataset riddle_sense/default to /root/.cache/huggingface/datasets/riddle_sense/default/0.1.0/1b311d24c97e1fd41975315faf11fd918a56db0289367a99944ef0fa3dfd6811...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/414k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3510 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1021 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1184 [00:00<?, ? examples/s]

Dataset riddle_sense downloaded and prepared to /root/.cache/huggingface/datasets/riddle_sense/default/0.1.0/1b311d24c97e1fd41975315faf11fd918a56db0289367a99944ef0fa3dfd6811. Subsequent calls will reuse this data.




In [9]:
preprocessed_train = riddleSense_train.map(preprocess_function, batched=True)
riddleSense_train_map_pre = riddleSense_train_map.map(preprocess_function, batched=True)
val_pre = riddleSense_val.map(preprocess_function, batched=True)

Map:   0%|          | 0/3510 [00:00<?, ? examples/s]

Map:   0%|          | 0/3510 [00:00<?, ? examples/s]

Map:   0%|          | 0/1021 [00:00<?, ? examples/s]

In [10]:
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = 'answerKey'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        new_labels = []
        for label in labels:
          new_labels.append(label_mapping_dict[label])
        batch["labels"] = torch.tensor(new_labels, dtype=torch.int64)
        return batch

In [11]:
class CustomTrainer(Trainer):
    def __init__(self, model, args, train_dataset, eval_dataset, tokenizer, compute_metrics, data_collator):
        super().__init__(model=model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset,
                         tokenizer=tokenizer, compute_metrics=compute_metrics, data_collator=data_collator)
        self.acc = []
        self.eval_dataset = eval_dataset
        self.labels = torch.tensor([label_mapping_dict[i] for i in self.eval_dataset["answerKey"]])


    def evaluate(self, ignore_keys=None, metric_key_prefix="eval"):
        output = super().evaluate(self.eval_dataset, ignore_keys, metric_key_prefix)
        predictions = self.predict(self.eval_dataset)
        probs = torch.nn.functional.softmax(torch.tensor(predictions.predictions[1]), dim=1)
        pred_labels = torch.argmax(probs, dim=1)
        true_label_probs.append(probs.gather(1, self.labels.unsqueeze(1)).squeeze())
        correctness.append((pred_labels == self.labels).float())
        return output

In [12]:
model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased", config=config)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [13]:
true_label_probs = []
correctness = []

In [14]:
for i in range(1, 6):
  training_args = TrainingArguments("riddle_sense_check", save_strategy="no", label_names=['answerKey'], evaluation_strategy="epoch", num_train_epochs=i)
  model = AutoModelForMultipleChoice.from_pretrained(MODEL_NAME, config=config)
  trainer = CustomTrainer(model=model,
                    args=training_args,
                    train_dataset=preprocessed_train.shuffle(),
                    eval_dataset=riddleSense_train_map_pre,
                    tokenizer=tokenizer,
                    compute_metrics=None,
                    data_collator = DataCollatorForMultipleChoice(tokenizer))
  trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

Epoch,Training Loss,Validation Loss
1,No log,No log


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

Epoch,Training Loss,Validation Loss
1,No log,No log
2,1.106800,No log


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

Epoch,Training Loss,Validation Loss
1,No log,No log
2,1.037000,No log
3,0.670400,No log


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

Epoch,Training Loss,Validation Loss
1,No log,No log
2,1.097700,No log
3,0.703200,No log
4,0.500000,No log


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

Epoch,Training Loss,Validation Loss
1,No log,No log
2,1.086500,No log
3,0.703500,No log
4,0.496900,No log
5,0.367600,No log


In [15]:
# saving correctness and true_label_prob
data_to_dump = {
    'correctness': torch.mean(torch.stack(correctness), keepdim=True, dim=0),
    'true_label_probs': torch.stack(true_label_probs,dim=1)

}
with open('/content/drive/MyDrive/ANLP/stats.pkl', 'wb') as f:
    pickle.dump(data_to_dump, f)

In [16]:
# load correctness and true_label_prob
with open('/content/drive/MyDrive/ANLP/stats.pkl', 'rb') as f:
    data_loaded = pickle.load(f)

# Access
correctness = data_loaded['correctness']
true_label_probs = data_loaded['true_label_probs']
confidence = torch.mean(true_label_probs, dim=1) # mean prob of TRUE labels across all instances in eval_data
variability = torch.std(true_label_probs, dim=1, correction=0) # std prob of TRUE labels across all instances in in eval_data


#DataMAP visualization

In [17]:
# Plot the Data Map.

# Compute bins of correctness
bins = np.digitize(correctness, bins=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

# Create a dictionary to map bin numbers to bin labels
bin_labels = {1: '0.0-0.1', 2: '0.1-0.2', 3: '0.2-0.3', 4: '0.3-0.4', 5: '0.4-0.5', 6: '0.5-0.6',7: '0.6-0.7',8: '0.7-0.8',9:'0.8-0.9',10: '>0.9'}

# Apply the mapping to bins to get bin labels
bins_str = np.vectorize(bin_labels.get)(bins)


fig = px.scatter(
    x=variability.numpy().squeeze(),
    y=confidence.numpy().squeeze(),
    color=bins_str.squeeze(),
    symbol=bins_str.squeeze(),
    labels={
        "color": "Bin number",
        "symbol": "Bin number",
        "y": "Confidence",
        "x": "variability"
    },
    category_orders = {"color": list(bin_labels.values())[::-1]}
)
fig.show()

In [None]:
import matplotlib.pyplot as plt

confidence_np = confidence.numpy().squeeze()
variability_np = variability.numpy().squeeze()
correctness_np = correctness.numpy().squeeze()

# Create subplots for each histogram
fig, axs = plt.subplots(3, figsize=(10, 15))

# Plot histogram for confidence
axs[0].hist(confidence_np, bins=10, color='blue', alpha=0.7)
axs[0].set_title('Confidence')

# Plot histogram for variability
axs[1].hist(variability_np, bins=10, color='green', alpha=0.7)
axs[1].set_title('Variability')

# Plot histogram for correctness
axs[2].hist(correctness_np, bins=10, color='red', alpha=0.7)
axs[2].set_title('Correctness')

# Display the plots
plt.tight_layout()
plt.show()

The scatter plot will contain one point for each evaluation sample.

The x-coordinate of each point will be the variability of the model's predictions for that sample across all epochs, the y-coordinate will be the confidence (the mean model probability of the true label across epochs.) for that sample, and the color and symbol of the point will indicate the bin of the correctness for that sample (how often the model correctly predicted the label of that sample across all epochs).

This visualization will help you to identify patterns in how your model's predictions vary across epochs for different samples. For example, you might see that samples that the model finds harder to classify (those with greater variability and lower confidence) are more likely to be misclassified more often (higher correctness bin).

In [41]:
def classify_points(confidence_threshold = 0.2, variability_threshold = 0.2):
    hard_to_learn = []
    easy_to_learn=[]
    ambiguous=[]
    for i, datapoint in enumerate(riddleSense_train_map_pre):
      conf = confidence[i]
      var = variability[i]
      # Hard to learn
      if conf < confidence_threshold and var<variability_threshold:
        hard_to_learn.append(i)

      # Easy to learn
      elif conf>= confidence_threshold and var<variability_threshold:
        easy_to_learn.append(i)
      # Ambigious
      else:
        ambiguous.append(i)

    data_dict = {"easy_to_learn": easy_to_learn,
                  "hard_to_learn": hard_to_learn,
                  "ambiguous": ambiguous}
    return data_dict


##TODO: need to decide the right confidence and variability thresholds. currently set at 0.2.

In [42]:
data_dict = classify_points(confidence_threshold = 0.2,variability_threshold = 0.2)


easy_to_learn_idx = data_dict["easy_to_learn"]
hard_to_learn_idx = data_dict["hard_to_learn"]
ambiguous_idx = data_dict["ambiguous"]

# assert result == len(trainset)
print(len(easy_to_learn_idx)+len(hard_to_learn_idx) + len(ambiguous_idx))


easy_to_learn_data = riddleSense_train_map_pre.select(easy_to_learn_idx)
hard_to_learn_data = riddleSense_train_map_pre.select(hard_to_learn_idx)
ambiguous_data = riddleSense_train_map_pre.select(ambiguous_idx)

# print quantity and percentages
total_data_num = riddleSense_train_map_pre.num_rows
print(f"Total datapoints: {total_data_num}")
print(f"Easy datapoints: {easy_to_learn_data.num_rows}")
print(f"Hard datapoints: {hard_to_learn_data.num_rows}")
print(f"Ambiguous datapoints: {ambiguous_data.num_rows}")

easy_data_perc = easy_to_learn_data.num_rows/total_data_num
hard_data_perc = hard_to_learn_data.num_rows/total_data_num
ambiguous_data_perc = ambiguous_data.num_rows/total_data_num

print(f"Easy datapoints percentage: {easy_data_perc}")
print(f"Hard_datapoints percentage: {hard_data_perc}")
print(f"Ambiguous datapoints percentage: {ambiguous_data_perc}")


3510
Total datapoints: 3510
Easy datapoints: 2222
Hard datapoints: 102
Ambiguous datapoints: 1186
Easy datapoints percentage: 0.6330484330484331
Hard_datapoints percentage: 0.02905982905982906
Ambiguous datapoints percentage: 0.3378917378917379


In [45]:
# Plot the Data Map based on Hard, Easy, Ambiguous regions


categories = np.empty_like(confidence,dtype = np.str)
categories[easy_to_learn_idx] = "easy_to_learn"
categories[hard_to_learn_idx] = "hard_to_learn"
categories[ambiguous_idx] = "ambiguous"

# Now we plot the Data Map
fig = px.scatter(
    x=variability.numpy().squeeze(),
    y=confidence.numpy().squeeze(),
    color=categories,
    labels={
        "color": "Learning Difficulty",
        "y": "Confidence",
        "x": "Variability"
    },
    category_orders = {"color": ["Easy_to_learn", "ambiguous", "hard_to_learn"]}
)
fig.show()


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



In [23]:
E# visualization
print(easy_to_learn_data[2])
print(hard_to_learn_data[2])
print(ambiguous_data[2])
#print mean of length for each


{'answerKey': 'A', 'question': 'If you allow me to live I shall not live long.   But if you kill me I shall live longer', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['candle', 'spring forth', 'end life', 'faith', 'pen']}, 'input_ids': [[101, 2065, 2017, 3499, 2033, 2000, 2444, 1045, 4618, 2025, 2444, 2146, 1012, 2021, 2065, 2017, 3102, 2033, 1045, 4618, 2444, 2936, 102, 13541, 102], [101, 2065, 2017, 3499, 2033, 2000, 2444, 1045, 4618, 2025, 2444, 2146, 1012, 2021, 2065, 2017, 3102, 2033, 1045, 4618, 2444, 2936, 102, 3500, 5743, 102], [101, 2065, 2017, 3499, 2033, 2000, 2444, 1045, 4618, 2025, 2444, 2146, 1012, 2021, 2065, 2017, 3102, 2033, 1045, 4618, 2444, 2936, 102, 2203, 2166, 102], [101, 2065, 2017, 3499, 2033, 2000, 2444, 1045, 4618, 2025, 2444, 2146, 1012, 2021, 2065, 2017, 3102, 2033, 1045, 4618, 2444, 2936, 102, 4752, 102], [101, 2065, 2017, 3499, 2033, 2000, 2444, 1045, 4618, 2025, 2444, 2146, 1012, 2021, 2065, 2017, 3102, 2033, 1045, 4618, 2444, 2936, 102, 7279,

Finetune Bert model only on part of the data (easy/hard/ambigious)

In [None]:
data = ambiguous_data
training_args = TrainingArguments("riddle_sense_check", save_strategy="no", label_names=['answerKey'], evaluation_strategy="no", num_train_epochs=9)
model = AutoModelForMultipleChoice.from_pretrained(MODEL_NAME, config=config)
trainer = CustomTrainer(model=model,
                  args=training_args,
                  train_dataset=data.shuffle(),
                  eval_dataset=riddleSense_train_map_pre, #TODO: check if can remove
                  tokenizer=tokenizer,
                  compute_metrics=None,
                  data_collator = DataCollatorForMultipleChoice(tokenizer))

trainer.train()


# get predictions
predictions, label_ids, _ = trainer.predict(val_pre)
labels = np.array([i for i in map(mapping_func, val_pre['answerKey'])])
predicted_classes = np.argmax(predictions[1], axis=1)
accuracy = np.mean(labels == predicted_classes)
# Accuracy
print(f'Accuracy: {accuracy}')



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

Step,Training Loss
500,1.3926


##TODO: we dont have alot of hard_to_lean data. -->explore datapoints. check labeling mistakes as surmised by DATA-MAP paper

In [None]:
#TODO:
print(hard_to_learn_data)

#Data Analysis

Answer stats

- datapoints (Quest & Ans) are unique
- number of datapoints with full answer overlap
- what about Questions with same true label
- distribution of true labels
- avrg length of true answer related to rest answers
- Identify for each categry with a set of variables

we found that a mere 2 percets of words appear in true answers and wrong ones
Thus even a small correlation between ture answers words may greatly help the model.