In [None]:
!pip install datasets
import numpy as np
from datasets import load_dataset
# Load the "riddle_sense" dataset; later cells read its "train" split via data["train"].
data = load_dataset("riddle_sense")

**Task 1:**
Create a new dataset where each possible answer appears only once.

In [None]:
def _select_unique_choice_indices(samples):
  """Greedily pick samples whose answer choices were not used by an earlier pick.

  A sample is kept only if none of its choice texts already belongs to a
  previously kept sample and its own choices contain no duplicates. Words are
  recorded only after the whole sample has been accepted, so a rejected sample
  never blocks later samples.

  Args:
    samples: iterable of records exposing sample["choices"]["text"] as a list
      of strings.

  Returns:
    (selected, removed, seen): the indices that were kept, the indices that
    were dropped, and the set of choice texts of all kept samples.
  """
  selected = []
  removed = []
  seen = set()
  for index, sample in enumerate(samples):
    choices = sample["choices"]["text"]
    unique_choices = set(choices)
    has_internal_repeat = len(unique_choices) != len(choices)
    if has_internal_repeat or not seen.isdisjoint(unique_choices):
      removed.append(index)
      continue
    # Commit the words only now that the sample is known to be accepted.
    seen.update(unique_choices)
    selected.append(index)
  return selected, removed, seen


listOfIndicesToSelect, listOfIndicesToRemove, allWordSet = _select_unique_choice_indices(data["train"])

# print(f"number of samples to add: {len(listOfIndicesToSelect)}")
# print(f"number of samples to remove: {len(listOfIndicesToRemove)}")

newDataset = data["train"].select(listOfIndicesToSelect)


In [None]:
# Display the filtered dataset's summary (feature names and number of rows).
newDataset

Dataset({
    features: ['answerKey', 'question', 'choices'],
    num_rows: 657
})

In [None]:
# Show the indices of the training samples kept by the Task 1 filter.
listOfIndicesToSelect

In [None]:
import pickle
import torch
# Load the precomputed statistics ('correctness' and 'true_label_probs') from a pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only load a stats.pkl from a trusted source.
# NOTE(review): this is a hard-coded absolute path; the file must exist there before the cell runs.
with open('/content/stats.pkl', 'rb') as f:
    data_loaded = pickle.load(f)

# Access
correctness = data_loaded['correctness']
true_label_probs = data_loaded['true_label_probs']
# Mean and population standard deviation (correction=0) of the true-label
# probabilities, both reduced along dim=1.
# NOTE(review): presumably dim 0 indexes instances and dim 1 indexes training
# epochs, giving one confidence/variability value per instance — confirm
# against the code that produced stats.pkl.
confidence = torch.mean(true_label_probs, dim=1)
variability = torch.std(true_label_probs, dim=1, correction=0)


In [None]:
# Peek at the first five confidence and variability values.
print(confidence[:5])
print(variability[:5])


tensor([0.7555, 0.7266, 0.7194, 0.7498, 0.7353])
tensor([0.2531, 0.3038, 0.3392, 0.2963, 0.3065])


In [None]:
# Restrict the per-sample statistics to the samples kept by the Task 1 filter.
# The index tensor is built once with an explicit integer dtype: torch.take
# requires a LongTensor index, and torch.tensor([]) would otherwise default to
# a float tensor when no sample was selected.
# NOTE(review): torch.take indexes the *flattened* input, so this assumes each
# statistic is 1-D and ordered like data["train"] — confirm.
selectedIndexTensor = torch.tensor(listOfIndicesToSelect, dtype=torch.long)
newCorrectness = torch.take(correctness, selectedIndexTensor)
newConfidence = torch.take(confidence, selectedIndexTensor)
newVariability = torch.take(variability, selectedIndexTensor)


In [None]:
import plotly.express as px

# Bucket every correctness score. np.digitize returns an index in 0..11:
# 0 for values below the first edge, 11 for values at or above the last edge.
bin_edges = [0.0000001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
bins = np.digitize(newCorrectness, bins=bin_edges)

# Human-readable label for each bucket index.
bin_labels = dict(enumerate([
    '0', '(0.0-0.1)', '[0.1-0.2)', '[0.2-0.3)', '[0.3-0.4)', '[0.4-0.5)',
    '[0.5-0.6)', '[0.6-0.7)', '[0.7-0.8)', '[0.8-0.9)', '[0.9-1)', '1',
]))

# Translate bucket indices into their labels, element by element.
label_lookup = np.vectorize(bin_labels.get)
bins_str = label_lookup(bins)

# Twelve evenly spaced samples of the "Jet" colorscale, one per bucket.
sample_points = [n / 11 for n in range(12)]
colors = px.colors.sample_colorscale("Jet", sample_points)

axis_and_legend_labels = {
    "color": "newCorrectness",
    "symbol": "newCorrectness",
    "y": "Confidence",
    "x": "variability",
}
# Legend order: highest correctness bucket first.
legend_order = [bin_labels[key] for key in sorted(bin_labels, reverse=True)]

fig = px.scatter(
    x=newVariability.numpy().squeeze(),
    y=newConfidence.numpy().squeeze(),
    color=bins_str.squeeze(),
    symbol=bins_str.squeeze(),
    color_discrete_sequence=colors,
    labels=axis_and_legend_labels,
    category_orders={"color": legend_order},
)
fig.show()

In [None]:
def classify_points(confidence_threshold = 0.2, variability_threshold = 0.2,
                    confidence_values=None, variability_values=None):
    """Split sample indices into easy-to-learn, hard-to-learn and ambiguous groups.

    A sample is "hard to learn" when both its confidence and its variability
    are below their thresholds, "easy to learn" when its confidence is at or
    above the threshold while its variability is below the threshold, and
    "ambiguous" otherwise.

    Args:
        confidence_threshold: cut-off applied to each confidence value.
        variability_threshold: cut-off applied to each variability value.
        confidence_values: per-sample confidence sequence; defaults to the
            notebook-level ``confidence``.
        variability_values: per-sample variability sequence; defaults to the
            notebook-level ``variability``.

    Returns:
        Dict with keys "easy_to_learn", "hard_to_learn" and "ambiguous", each
        mapping to a list of positional sample indices.

    Raises:
        ValueError: if the two sequences have different lengths.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if confidence_values is None:
        confidence_values = confidence
    if variability_values is None:
        variability_values = variability

    if len(confidence_values) != len(variability_values):
        raise ValueError(
            "confidence_values and variability_values must have the same length"
        )

    hard_to_learn = []
    easy_to_learn = []
    ambiguous = []
    # Indices come from the statistics themselves, so no separate dataset
    # object is needed just to count samples.
    for i, (conf, var) in enumerate(zip(confidence_values, variability_values)):
        # Hard to learn
        if conf < confidence_threshold and var < variability_threshold:
            hard_to_learn.append(i)
        # Easy to learn
        elif conf >= confidence_threshold and var < variability_threshold:
            easy_to_learn.append(i)
        # Ambiguous
        else:
            ambiguous.append(i)

    data_dict = {"easy_to_learn": easy_to_learn,
                 "hard_to_learn": hard_to_learn,
                 "ambiguous": ambiguous}
    return data_dict


In [None]:
# Sanity check on the filtered dataset: list every *correct* answer text that
# occurs more than once. Nothing is printed when all correct answers are unique.

# Answer letter -> position inside the sample's list of choices.
mapDict = dict(zip("ABCDE", range(5)))

result = np.array([
  entry["choices"]["text"][mapDict[entry["answerKey"]]]
  for entry in newDataset
])
a = np.unique(result, return_counts=True)

for word, times in zip(*a):
  if times != 1:
    print(f"word: {word}, times: {times}")


In [None]:
# Test that each possible answer appears only once. Result should be all 1's
# Collect every candidate answer text from the filtered dataset (the strings in
# choices["text"], not the keys of the `choices` dict) and count how often each
# distinct text occurs.
check = list()
for sample in newDataset:
  for word in sample["choices"]["text"]:
    check.append(word)

check = np.array(check)
# Count over `check` (all candidate answers), not over the correct answers only.
a = np.unique(check, return_counts=True)
print(a[1])

Task 2: Create the following statistic about each possible answer:
Number of times that the word/phrase appeared as the correct answer, out of the number of times the word/phrase appeared as a possible answer


In [None]:
# Answer letter -> position inside the sample's list of choices.
mapDict = dict(zip("ABCDE", range(5)))

# Correct answer text of every training sample, then the distinct texts
# together with how many times each one was the correct answer.
result = np.array([
  entry["choices"]["text"][mapDict[entry["answerKey"]]]
  for entry in data["train"]
])
a = np.unique(result, return_counts=True)

# probs[word] = [times the word was the correct answer, times it was offered as a choice]
probs = {word: [timesCorrect, 0] for word, timesCorrect in zip(*a)}

for entry in data["train"]:
  for word in entry["choices"]["text"]:
    # Words that were never a correct answer start from [0, 0].
    probs.setdefault(word, [0, 0])[1] += 1

# Ratio of correct occurrences to total appearances for every word.
realProbsDict = {
  word: counts[0] / counts[1]
  for word, counts in probs.items()
}

# Split the words by ratio: always correct, never correct, or in between.
always1 = {word: 1 for word, ratio in realProbsDict.items() if ratio == 1}
always0 = {word: 0 for word, ratio in realProbsDict.items() if ratio == 0}
others = {
  word: ratio
  for word, ratio in realProbsDict.items()
  if not (ratio == 1 or ratio == 0)
}


In [None]:
# Total number of distinct answer words/phrases across the three groups.
x = len(always1) + len(always0) + len(others)
x

10772

In [None]:
# Fraction of words that were the correct answer every time they appeared.
len(always1)/x


0.2702376531748979

In [None]:
# Fraction of words that were never the correct answer.
len(always0)/x

0.7078536947642035

In [None]:
# Fraction of words that were the correct answer in some appearances but not all.
len(others)/x

0.021908652060898627

Task 3: Create a dataset that doesn't include similar words (such as "hole" and "holes") as answers to different riddles.