In [11]:
import os
import pandas as pd
import sqlite3
from copy import deepcopy

# Locations of the raw SQLite dump, the exported answer CSVs, and the
# directory the per-question reports are written to.
DATABASE_DIR = "../data/raw/"
RAW_CSV_DIR = "../data/raw_csv/"
QUEST_CSV_DIR = "../data/questions/"

# Load the whole `main_question` table, then release the database handle.
# The connection is closed explicitly: it is only needed for this one query,
# and sqlite3 connections are not closed automatically when they go unused.
cnx = sqlite3.connect(os.path.join(DATABASE_DIR, "mlinpl.sqlite3"))
try:
    questions = pd.read_sql_query("SELECT * FROM main_question", cnx)
finally:
    cnx.close()

# Answer logs from the two CSV exports; the first CSV column is used as the index.
answers_mlinpl = pd.read_csv(os.path.join(RAW_CSV_DIR, "mlinpl_answers.csv"), index_col=0)
answers_dpm = pd.read_csv(os.path.join(RAW_CSV_DIR, "dpm_answers.csv"), index_col=0)

# Stack both logs into one frame with a fresh 0..n-1 index.
answers = pd.concat([answers_mlinpl, answers_dpm], ignore_index=True)

In [18]:
# Export the (question_id, hint_ans) pairs whose summed `decepted` column
# equals the number of answers recorded for the pair.
# NOTE(review): presumably `decepted` is a 0/1 flag, so this means "every
# answer in the group was deceived" - confirm against the answers schema.

# DataFrame.to_csv does not create missing directories.
os.makedirs(QUEST_CSV_DIR, exist_ok=True)

# Build an id -> row mapping once instead of re-scanning `questions` with a
# boolean mask for every output row. drop_duplicates keeps the first row per
# id, which is the row the former `.values[0]` lookup selected.
questions_by_id = (
    questions.drop_duplicates(subset="id").set_index("id").to_dict("index")
)

# Only the columns that are actually used below are aggregated.
t = (
    answers.groupby(["question_id", "hint_ans"])
    .agg({"id": "count", "decepted": "sum"})
    .sort_values("decepted", ascending=False)
    .rename(columns={"id": "count"})
)

# .copy() so the column assignments below write to an independent frame.
t = t.loc[t["count"] == t["decepted"], ["decepted"]].copy()

# The index holds (question_id, hint_ans) tuples; hint_ans selects which
# ans_*/hint_* column of the question row to read.
t["question"] = [questions_by_id[qid]["content"] for qid, _ in t.index]
t["answer"] = [questions_by_id[qid][f"ans_{letter}"] for qid, letter in t.index]
t["hint"] = [questions_by_id[qid][f"hint_{letter}"] for qid, letter in t.index]

t = t.loc[t["decepted"] >= 1, :]

t.to_csv(os.path.join(QUEST_CSV_DIR, "hints_deceptive.csv"))

In [19]:
# Export the (question_id, hint_ans) pairs whose summed `followed_wrong`
# column equals the number of answers recorded for the pair.
# NOTE(review): presumably `followed_wrong` is a 0/1 flag, so this means
# "every answer in the group followed a wrong hint" - confirm.

# DataFrame.to_csv does not create missing directories.
os.makedirs(QUEST_CSV_DIR, exist_ok=True)

# One pass over `questions` to build an id -> row mapping, replacing a
# boolean-mask scan per output row. Keeping the first row per id matches
# the former `.values[0]` lookup.
questions_by_id = (
    questions.drop_duplicates(subset="id").set_index("id").to_dict("index")
)

# Aggregate only what is used: the group size and the two reported sums.
t = (
    answers.groupby(["question_id", "hint_ans"])
    .agg({"id": "count", "followed_wrong": "sum", "decepted": "sum"})
    .sort_values("followed_wrong", ascending=False)
    .rename(columns={"id": "count"})
)

# .copy() so the column assignments below write to an independent frame.
t = t.loc[t["count"] == t["followed_wrong"], ["followed_wrong", "decepted"]].copy()

# Index entries are (question_id, hint_ans); hint_ans picks the ans_*/hint_*
# column of the matching question row.
t["question"] = [questions_by_id[qid]["content"] for qid, _ in t.index]
t["answer"] = [questions_by_id[qid][f"ans_{letter}"] for qid, letter in t.index]
t["hint"] = [questions_by_id[qid][f"hint_{letter}"] for qid, letter in t.index]

t = t.loc[t["followed_wrong"] >= 1, :]

t.to_csv(os.path.join(QUEST_CSV_DIR, "hints_followed_wrong.csv"))

In [20]:
# Export the (question_id, hint_ans) pairs where the summed `hint_imposed`
# is positive while the summed `followed` and `hint_correct` are both zero.
# NOTE(review): presumably these are 0/1 flags, i.e. an incorrect hint was
# imposed at least once and never followed - confirm against the schema.

# DataFrame.to_csv does not create missing directories.
os.makedirs(QUEST_CSV_DIR, exist_ok=True)

# Build the id -> row mapping once rather than masking `questions` for every
# output row. The first row per id is kept, as `.values[0]` did before.
questions_by_id = (
    questions.drop_duplicates(subset="id").set_index("id").to_dict("index")
)

# Aggregate only the columns used for sorting, filtering, or output.
t = (
    answers.groupby(["question_id", "hint_ans"])
    .agg(
        {
            "id": "count",
            "hint_imposed": "sum",
            "followed": "sum",
            "followed_wrong": "sum",
            "hint_correct": "sum",
        }
    )
    .sort_values("followed_wrong", ascending=True)
    .rename(columns={"id": "count"})
)

never_followed = (t["followed"] == 0) & (t["hint_imposed"] > 0) & (t["hint_correct"] == 0)

# .copy() so the column assignments below write to an independent frame.
t = t.loc[never_followed, ["count", "followed", "hint_imposed"]].copy()

# Index entries are (question_id, hint_ans); hint_ans picks the ans_*/hint_*
# column of the matching question row.
t["question"] = [questions_by_id[qid]["content"] for qid, _ in t.index]
t["answer"] = [questions_by_id[qid][f"ans_{letter}"] for qid, letter in t.index]
t["hint"] = [questions_by_id[qid][f"hint_{letter}"] for qid, letter in t.index]

t.to_csv(os.path.join(QUEST_CSV_DIR, "hints_not_decepted.csv"))

In [5]:
# DECEPTED
# 2642 - 3/5 decepted (hint used in 5/5 answers)
# 1453 - 2/2 decepted
# 1900 - 2/2 decepted
# 182 - 2/2 decepted

# FOLLOWED
# 1748 - hint followed in 5/5 answers that used one (hint used in 5/6 answers)

In [6]:
# Per-question totals: number of answers plus the sum of every outcome
# column, showing the 30 questions with the highest `followed_wrong` sum.
(
    answers.groupby(["question_id"])
    .agg(
        {
            "id": "count",
            **dict.fromkeys(
                [
                    "correct",
                    "hint_asked",
                    "hint_imposed",
                    "hint_used",
                    "changed_answer",
                    "changed_to_hint",
                    "followed",
                    "followed_wrong",
                    "hint_correct",
                    "decepted",
                ],
                "sum",
            ),
        }
    )
    .sort_values(by="followed_wrong", ascending=False)
    .head(30)
)

Unnamed: 0_level_0,id,correct,hint_asked,hint_imposed,hint_used,changed_answer,changed_to_hint,followed,followed_wrong,hint_correct,decepted
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1748,6,1,4,1,5,1.0,1.0,5.0,4.0,1.0,1.0
2642,5,2,1,4,5,3.0,3.0,4.0,3.0,1.0,3.0
1956,3,0,3,0,3,0.0,0.0,3.0,3.0,0.0,0.0
1419,3,1,2,1,3,1.0,1.0,2.0,2.0,0.0,1.0
1741,3,0,1,1,2,1.0,1.0,2.0,2.0,0.0,1.0
1711,3,1,1,2,3,2.0,2.0,2.0,2.0,0.0,2.0
2301,2,0,2,0,2,0.0,0.0,2.0,2.0,0.0,0.0
467,2,0,1,1,2,1.0,1.0,2.0,2.0,0.0,0.0
2980,2,0,2,0,2,0.0,0.0,2.0,2.0,0.0,0.0
66,2,0,2,0,2,0.0,0.0,2.0,2.0,0.0,0.0


In [7]:
def print_question(question_id):
    """Print a question with its answers, hints, and answer statistics.

    Reads the module-level ``questions`` and ``answers`` frames. Output goes
    to stdout: the question text, the four answer/hint pairs (A-D), the
    correct answer, per-column counts of answers whose value equals 1, and
    finally the ``answer``, ``hint_ans`` and ``answer_before_prompt`` columns
    of every answer recorded for the question.

    Args:
        question_id: Value matched against ``questions["id"]``.

    Raises:
        IndexError: If no row of ``questions`` has this id.
    """
    matches = questions.loc[questions["id"] == question_id, :]
    if matches.empty:
        # Same exception type as the bare `.iloc[0]` would raise, but with a
        # message that names the offending id.
        raise IndexError(f"No question with id {question_id!r} in `questions`")
    row = matches.iloc[0]

    print(f"Question no. {row['id']}")
    print(row["content"])
    for letter in "ABCD":
        answer_text = row["ans_" + letter]
        hint_text = row["hint_" + letter]
        print(f"{letter}. {answer_text}")
        print(f"hint {letter}: {hint_text}")
    print(f"Correct answer: {row['correct_ans']}")

    print("Stats:")
    answers_temp = answers[answers["question_id"] == row["id"]]
    print(f"Total answers: {len(answers_temp)}")

    # (label, column) pairs; each line reports how many answers have the
    # column equal to 1.
    counted_columns = [
        ("Correct answers", "correct"),
        ("Hints asked", "hint_asked"),
        ("Hints used", "hint_used"),
        ("Changed answer", "changed_answer"),
        ("Changed to hint", "changed_to_hint"),
        ("Followed", "followed"),
        ("Followed wrong", "followed_wrong"),
        ("Decepted", "decepted"),
    ]
    for label, column in counted_columns:
        print(f"{label}: {int((answers_temp[column] == 1).sum())}")

    print(answers_temp.loc[:, ["answer", "hint_ans", "answer_before_prompt"]])

In [8]:
print_question(1956)

Question no. 1956
Which go together to mean totally opposite?
A. Bread and water
hint A: The correct answer is A. Bread and water, because the phrase "bread and water" refers to the basic necessities of life, but are vastly different in terms of their composition and properties. Bread, a staple food made from flour, water, and other ingredients, is rich in carbohydrates and provides energy. On the other hand, water is a vital liquid that sustains life, but has no nutritional value and cannot be used as a source of energy.  The phrase "bread and water" has often been used to describe a situation where only the bare essentials are available, implying scarcity and deprivation.
B. Chalk and cheese
hint B: The correct answer is B. Chalk and cheese, because chalk and cheese are two things that are often used to represent complete opposites, and they are considered to be very different in their properties and characteristics.
C. Rock and flower
hint C: The correct answer is C. Rock and flower

In [None]:
# 2642 is hard (2/5 correct, 5/5 hints used, 4/5 followed hint, 3/5 followed a wrong hint, 3/5 decepted!)
# 1453: hint B is convincing (correct answer: A)
# 1900: hint B is convincing (correct answer: C)
# 1748 is hard (1/6 correct, 5/6 hints used, 5/6 followed hint, 1/6 decepted)

In [37]:
# for further analysis
# Export the questions for which a (question_id, hint_ans) group has exactly
# REQUIRED_COUNT answers and both the `followed_wrong` and `decepted` sums
# equal that same number.
# NOTE(review): presumably both columns are 0/1 flags, i.e. both respondents
# followed the wrong hint and were deceived - confirm against the schema.
REQUIRED_COUNT = 2

# DataFrame.to_csv does not create missing directories.
os.makedirs(QUEST_CSV_DIR, exist_ok=True)

# Build the id -> row mapping once instead of masking `questions` on every
# lookup. Keeping the first row per id matches the former `.values[0]`.
questions_by_id = (
    questions.drop_duplicates(subset="id").set_index("id").to_dict("index")
)

t = (
    answers.groupby(["question_id", "hint_ans"])
    .agg({"id": "count", "followed_wrong": "sum", "decepted": "sum"})
    .sort_values("followed_wrong", ascending=False)
    .rename(columns={"id": "count"})
)

believable = (
    (t["count"] == REQUIRED_COUNT)
    & (t["followed_wrong"] == REQUIRED_COUNT)
    & (t["decepted"] == REQUIRED_COUNT)
)

# reset_index() returns a new frame with question_id / hint_ans as columns
# and a fresh 0..n-1 index, which is what ends up in the CSV's first column.
t = (
    t.loc[believable]
    .reset_index()
    .loc[:, ["question_id", "hint_ans"]]
    .rename(columns={"hint_ans": "answer_LLM"})
)

t["question"] = [questions_by_id[qid]["content"] for qid in t["question_id"]]
t["answer_correct"] = [questions_by_id[qid]["correct_ans"] for qid in t["question_id"]]
for letter in "ABCD":
    t[letter] = [questions_by_id[qid][f"ans_{letter}"] for qid in t["question_id"]]

t = t.loc[:, ["question_id", "question", "A", "B", "C", "D", "answer_LLM", "answer_correct"]]
t.to_csv(os.path.join(QUEST_CSV_DIR, "believable_answers.csv"))