In [1]:
import pandas as pd
import pickle
import os
import json


In [2]:
with open('shuffled_QA_pairs_econ.json') as f:
    qa_pairs = json.load(f)


with open('log_all.pkl', 'rb') as f:
    result = pickle.load(f)

with open('log_pick_wrong.pkl', 'rb') as f:
    result_wrong = pickle.load(f)

with open('log_pick_correct.pkl', 'rb') as f:
    result_correct = pickle.load(f)

with open('classification_log_all.pkl', 'rb') as f:
    classification = pickle.load(f)


In [3]:
question_set = set(result.keys())

qa_result = []
for q in question_set:
    temp_dict = {
        'question': q,
        'gpt_choice': result[q]['gpt_choice'],
        'incorrect_choices': result_wrong[q]['incorrect_choices'],
        'correct_choices': result_correct[q]['correct_choices'],
        'correct_choice': result[q]['correct_choice'],
        'completeness_choice': classification[q]['completeness_choice'],
        'type_choice': classification[q]['type_choice'],
        'mcq_choice': classification[q]['mcq_choice'],
        'completeness_reason': classification[q]['completeness_reason'],
        'type_reason': classification[q]['type_reason'],
        'mcq_reason': classification[q]['mcq_reason']
    }
    qa_result.append(temp_dict)

In [10]:
# qa_result to dataframe
df = pd.DataFrame(qa_result)

completeness_name_map = {
    1: 'Incomplete',
    2: 'Missing reference',
    3: 'Missing context',
    9: 'Other',
    0: 'Complete'
}

type_name_map = {
    1: 'Conceptual',
    2: 'Computation',
    3: 'True/False',
    4: 'Graphical',
    5: 'Reasoning',
    9: 'Other'
}

mcq_name_map = {
    1: 'Fit for MCQ',
    0: 'Not fit for MCQ'
}

df['completeness_name'] = df['completeness_choice'].map(completeness_name_map)
df['type_name'] = df['type_choice'].map(type_name_map)
df['mcq_name'] = df['mcq_choice'].map(mcq_name_map)

df['reject'] = df.apply(lambda row: row['correct_choice'] not in row['incorrect_choices'], axis=1)
df['within'] = df.apply(lambda row: row['correct_choice'] in row['correct_choices'], axis=1)



Completeness
- 1. Incomplete: Is the question complete? Are there any missing words or phrases, or references to other parts of the text that are not included?
- 2. Missing reference: Does the question contain all the reference materials? Are there any missing tables, charts, or other information that is necessary to answer the question?
- 3. Missing context: Does the question contain all the necessary context? Does it ask for information from sections in the textbook, which are not included in the question?
- 9. Other: The question does not have enough information, but it does not fit into any of the above categories.
- 0. Complete: The question has all the necessary information to be answered.

Type
- 1. Conceptual: The question asks for an explanation of a concept or theory.
- 2. Computation: The question asks for a calculation or numerical answer.
- 3. True/False: The question asks for a true or false answer.
- 4. Graphical: The question asks for a graph or chart to be drawn.
- 5. Reasoning: The question asks for a logical explanation or reasoning.
- 9. Other: The question does not fit into any of the above categories.

MCQ
- 1. Yes: The question is fit for use as a multiple-choice question.
- 0. No: The question is not fit for use as a multiple-choice question.


In [5]:
df.head()

Unnamed: 0,question,gpt_choice,incorrect_choices,correct_choices,correct_choice,completeness_choice,type_choice,mcq_choice,completeness_reason,type_reason,mcq_reason,completeness_name,type_name,mcq_name
0,\n6. \n\nDo the jobs for workers in low-income...,B,"[1, 3, 4]","[1, 2]",2,0,1,1,The question is complete as it provides all th...,The question is conceptual because it asks for...,This question is fit for use as a multiple-cho...,Complete,Conceptual,Fit for MCQ
1,"\n12. \n\n In a recession, does the actual bu...",2,[4],"[1, 2, 3]",2,0,1,1,The question is complete as it provides all th...,The question is conceptual as it asks for an e...,This question is fit for use as a multiple-cho...,Complete,Conceptual,Fit for MCQ
2,\n15. \n\nWhat is the difference between a fre...,4,"[1, 2, 3]","[1, 4]",4,0,1,0,The question is complete as it provides all th...,The question is asking for an explanation of c...,This question is not fit for use as a multiple...,Complete,Conceptual,Not fit for MCQ
3,\n7. \n\n List the areas where governme...,3,"[1, 2, 4]","[2, 3, 4]",3,0,1,0,The question is complete as it asks for a list...,The question is conceptual because it asks for...,This question is not fit for use as a multiple...,Complete,Conceptual,Not fit for MCQ
4,\n19. \n\nHow would a balanced budget amendmen...,2,"[1, 3, 4]",[2],2,0,1,0,The question is complete as it provides all th...,The question is conceptual because it asks for...,This question is not fit for use as a multiple...,Complete,Conceptual,Not fit for MCQ


### Overview

Reject rate:
reject means the gpt doesn't select the correct answer

In [6]:
# accuracy of gpt choice
overall_accuracy = (df['gpt_choice'] == df['correct_choice']).mean()
print(f"Overall accuracy: {overall_accuracy:.4f}\n")

# reject rate
reject_rate = (df.apply(lambda row: row['correct_choice'] not in row['incorrect_choices'], axis=1)).mean()
print(f"Reject rate: {reject_rate:.4f}\n")

# mean of correct choice in selected correct choices
within_rate = (df.apply(lambda row: row['correct_choice'] in row['correct_choices'], axis=1)).mean()
print(f"within rate: {within_rate:.4f}\n")

# distribution of completeness choice
completeness_dist = df['completeness_name'].value_counts()
print(f"Distribution of completeness: {completeness_dist}\n")

# distribution of type choice
type_dist = df['type_name'].value_counts()
print(f"Distribution of type: {type_dist}\n")

# distribution of mcq choice
mcq_dist = df['mcq_name'].value_counts()
print(f"Distribution of mcq: {mcq_dist}\n")

Overall accuracy: 0.4970

Reject rate: 0.8254

within rate: 0.9438

Distribution of completeness: completeness_name
Complete             281
Missing reference     47
Missing context        7
Incomplete             3
Name: count, dtype: int64

Distribution of type: type_name
Conceptual     222
Computation     53
Reasoning       35
Other           18
Graphical        7
True/False       3
Name: count, dtype: int64

Distribution of mcq: mcq_name
Not fit for MCQ    228
Fit for MCQ        110
Name: count, dtype: int64



### Accuracy by category

In [13]:
# accuracy by completeness
completeness_accuracy = df.groupby('completeness_name')[['gpt_choice', 'correct_choice']].apply(lambda x: (x['gpt_choice'] == x['correct_choice']).mean())
print(f"Accuracy by completeness: {completeness_accuracy}\n")

# reject rate by completeness
completeness_reject_rate = df.groupby('completeness_name')[['reject']].apply(lambda x: x['reject'].mean())
print(f"Reject rate by completeness: {completeness_reject_rate}\n")

# within rate by completeness
completeness_within_rate = df.groupby('completeness_name')[['within']].apply(lambda x: x['within'].mean())
print(f"Within rate by completeness: {completeness_within_rate}\n")

print('=='*20)
# accuracy by type
type_accuracy = df.groupby('type_name')[['gpt_choice', 'correct_choice']].apply(lambda x: (x['gpt_choice'] == x['correct_choice']).mean())
print(f"Accuracy by type: {type_accuracy}\n")

# reject rate by type
type_reject_rate = df.groupby('type_name')[['reject']].apply(lambda x: x['reject'].mean())
print(f"Reject rate by type: {type_reject_rate}\n")

# within rate by type
type_within_rate = df.groupby('type_name')[['within']].apply(lambda x: x['within'].mean())
print(f"Within rate by type: {type_within_rate}\n")

print('=='*20)
# accuracy by mcq
mcq_accuracy = df.groupby('mcq_name')[['gpt_choice', 'correct_choice']].apply(lambda x: (x['gpt_choice'] == x['correct_choice']).mean())
print(f"Accuracy by mcq: {mcq_accuracy}\n")

Accuracy by completeness: completeness_name
Complete             0.505338
Incomplete           0.333333
Missing context      0.571429
Missing reference    0.446809
dtype: float64

Reject rate by completeness: completeness_name
Complete             0.832740
Incomplete           0.666667
Missing context      0.857143
Missing reference    0.787234
dtype: float64

Within rate by completeness: completeness_name
Complete             0.946619
Incomplete           0.666667
Missing context      1.000000
Missing reference    0.936170
dtype: float64

Accuracy by type: type_name
Computation    0.358491
Conceptual     0.563063
Graphical      0.000000
Other          0.333333
Reasoning      0.514286
True/False     0.000000
dtype: float64

Reject rate by type: type_name
Computation    0.679245
Conceptual     0.873874
Graphical      0.571429
Other          0.666667
Reasoning      0.885714
True/False     0.666667
dtype: float64

Within rate by type: type_name
Computation    0.867925
Conceptual     0.959

### Accuracy of our filter

- filter 1: completeness in [0] and type in [1, 2, 5]
- filter 2: completeness in [0] and type in [1, 2, 5] and mcq in [1]


In [14]:
df['filter_1'] = (df['completeness_choice'] == 0) & (df['type_choice'].isin([1, 2, 5]))
df['filter_2'] = (df['completeness_choice'] == 0) & (df['type_choice'].isin([1, 2, 5])) & (df['mcq_choice'] == 1)

In [15]:
# accuracy by fitler
filter_1_accuracy = df.groupby('filter_1')[['gpt_choice', 'correct_choice']].apply(lambda x: (x['gpt_choice'] == x['correct_choice']).mean())
filter_1_reject_rate = df.groupby('filter_1')[['reject']].apply(lambda x: x['reject'].mean())
filter_1_within_rate = df.groupby('filter_1')[['within']].apply(lambda x: x['within'].mean())


print(f"Accuracy by filter 1: {filter_1_accuracy}\n")
print(f"Reject rate by filter 1: {filter_1_reject_rate}\n")
print(f"Within rate by filter 1: {filter_1_within_rate}\n")




Accuracy by filter 1: filter_1
False    0.373333
True     0.532319
dtype: float64

Reject rate by filter 1: filter_1
False    0.760000
True     0.844106
dtype: float64

Within rate by filter 1: filter_1
False    0.92000
True     0.95057
dtype: float64

