So, I want to consider all jsons with 
- "type": "isaacQuestionPage"
- "published": true

I want to record

- question_page_id
- skills taken from tags [tag]
- audience stage(s) - optional
- audience difficulty(ies) - optional

then for each question part:
- question_part_id
- skills taken from hints [skill_id]

all skills will be separated by |

What should I do with questions with deprecated=True field?

In [1]:
from config import *
isaac_data_folder = parentdir + "/data/isaac/"
isaac_content_directory = parentdir + "/data/isaac/content/"

In [2]:
from typing import Dict, Optional, List
import json
import pandas as pd

In [3]:
def extract_skills_from_hint_value(hint: str) -> List[str]:
    skills = []
    if "{/concepts/" in hint:
        for element in hint.split("{/concepts/")[1:]:
            skills.append(element[:element.find("}")])
    return skills

print(extract_skills_from_hint_value("**\\link{Glossary}{/glossary}**\n\n<GLOSS> Steady state\n\n**Concepts**\n\n\\link{Capacitors}{/concepts/cp_capacitor}\n\n\\link{Kirchhoff's Laws}{/concepts/cp_kirchhoffs_laws}"))


['cp_capacitor', 'cp_kirchhoffs_laws']


In [4]:
def extract_question_part_entries_from_json(path: str) -> Optional[List[Dict[str, str]]]:
    #print(f"extract_question_part_entries_from_json called with {path}")
    if ".json" != path[-5:]:
        return None
    with open(path, "r") as f:
        data = json.load(f)

        if ('type' not in data or data['type'] != 'isaacQuestionPage' or 
                'published' not in data or data['published'] != True or
                    'children' not in data):
            return None

        question_page_id = data['id']
        tag = '|'.join(data['tags'])
        whole_question_dict = {
            "question_page_id": question_page_id,
            "tag": tag,
        }

        audiences = []
        if 'audience' in data:
            audience_stage = None
            audience_difficulty = None
            for audience_record in data['audience']:
                if 'stage' in audience_record:
                    audience_stage = '|'.join(audience_record['stage'])
                if 'difficulty' in audience_record:
                    audience_difficulty = '|'.join(audience_record['difficulty'])
            audiences.append({
                "audience_stage": audience_stage,
                "audience_difficulty": audience_difficulty,
            })

        question_parts = []
        for child in data['children']:
            if child['type'] in ["isaacMultiChoiceQuestion", "isaacItemQuestion", "isaacReorderQuestion", "isaacParsonsQuestion", "isaacNumericQuestion", "isaacSymbolicQuestion", "isaacSymbolicChemistryQuestion", "isaacStringMatchQuestion", "isaacFreeTextQuestion", "isaacSymbolicLogicQuestion", "isaacGraphSketcherQuestion", "isaacClozeQuestion"]:
                first_hint = ""
                if 'hints' in child and len(child['hints']) > 0:
                    if 'children' in child['hints'][0]:
                        if 'value' in child['hints'][0]['children'][0]:
                            first_hint = child['hints'][0]['children'][0]['value']
                    else:
                        if 'value' in child['hints'][0]:
                            first_hint = child['hints'][0]['value']
                
                skills = extract_skills_from_hint_value(first_hint)
                major_skills = {skill.split('#')[0] for skill in skills}
                first_skill = None
                if len(skills) > 0:
                    first_skill = skills[0].split('#')[0]
                question_parts.append({
                    "skill_id": '|'.join(skills),
                    "question_part_id": child['id'],
                    "major_skills": '|'.join(major_skills),
                    "first_skill": first_skill,
                })

        question_part_list = []
        if len(question_parts) > 0:
            assert len(audiences) <= 1
            if len(audiences) == 1:
                for i in range(len(question_parts)):
                    question_part_list.append({**whole_question_dict, **audiences[0], **question_parts[i]})
            else:
                #ignore audiences
                for i in range(len(question_parts)):
                    question_part_list.append({**whole_question_dict, **question_parts[i]})
        
        return question_part_list

print(extract_question_part_entries_from_json(isaac_content_directory + "questions/physics/circuits/capacitors/level4/a_capacitor_conundrum.json"))


[{'question_page_id': 'a_capacitor_conundrum', 'tag': 'capacitors|electricity|physics|problem_solving', 'audience_stage': 'a_level', 'audience_difficulty': 'challenge_2', 'skill_id': 'cp_capacitor|cp_kirchhoffs_laws', 'question_part_id': '426a8866-7e8b-456f-b9f8-b623ba50fe01', 'major_skills': 'cp_kirchhoffs_laws|cp_capacitor', 'first_skill': 'cp_capacitor'}]


In [18]:
def iterate_directory(directory: str) -> List[Dict[str, str]]:
    #print(f"iterate_directory called with: {directory}")
    entries = []
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if os.path.isfile(f):
            question_part_entries = extract_question_part_entries_from_json(f)
            if question_part_entries is not None:
                entries.extend(question_part_entries)
        else:
            entries.extend(iterate_directory(f))
    return entries

result = iterate_directory(isaac_content_directory)
df = pd.DataFrame(result)
df.head()

          question_page_id                                                tag  \
0                jury_duty       maths|probability|problem_solving|statistics   
1          integration_3_2  calculus|integration|maths|maths_book|problem_...   
2          integration_3_6  calculus|integration|maths|maths_book|problem_...   
3          integration_3_4  calculus|integration|maths|maths_book|problem_...   
4     int_substitution_new  calculus|integration|maths|maths_book|problem_...   
...                    ...                                                ...   
2809          chem_16_c2_9  book|chemistry_16|chemistry|foundations|stoich...   
2810          chem_16_c2_3  book|chemistry_16|chemistry|foundations|stoich...   
2811          chem_16_c2_5  book|chemistry_16|chemistry|foundations|stoich...   
2812          chem_16_c2_7  book|chemistry_16|chemistry|foundations|stoich...   
2813          chem_16_c2_2  book|chemistry_16|chemistry|foundations|stoich...   

     audience_stage audienc

In [20]:
df_to_file = df
df_to_file['question_id'] = df_to_file['question_page_id'] + '|' + df_to_file['question_part_id']

In [21]:
df_to_file.to_csv(isaac_data_folder + 'question_entries.csv', columns=["question_id", "skill_id", "major_skills", "first_skill", "tag", "audience_stage", "audience_difficulty"])


In [131]:
print(f"number of questions: {len(df)}")
print(f"number of questions with no skills: {sum(df['skill_id'] == '')}")
print(f"number of records with empty audience: {sum(df['audience_stage'].isna())}")

number of questions: 2814
number of questions with no skills: 2095
number of records with empty audience: 447
