In [2]:
import pandas as pd
import unicodedata

In [3]:
sced_master_list_file = "../data/SCEDMasterList_v7.0_508.xlsx"
sced_v7_file = "../data/SCED_Version_7_FINAL_508.xlsx"

In [4]:
# Dataframe holding course titles and descriptions for active courses
# Defines list of active codes
sced_info_df = pd.read_excel(sced_v7_file, 
                             sheet_name="SCED 7.0",
                             converters={"SCED Course Code": lambda x: str(x)})
sced_info_df = sced_info_df[~(sced_info_df["Change Status"] == "Archived Course")]
sced_info_df.head()

Unnamed: 0,Course Title,SCED Course Code,Course Description,Change Status
0,English/Language Arts I (9th grade),1001,English/Language Arts I (9th grade) courses bu...,No Change
1,English/Language Arts II (10th grade),1002,English/Language Arts II (10th grade) courses ...,No Change
2,English/Language Arts III (11th grade),1003,English/Language Arts III (11th grade) courses...,No Change
3,English/Language Arts IV (12th grade),1004,English/Language Arts IV (12th grade) courses ...,No Change
4,AP English Language and Composition,1005,Following the College Board’s suggested curric...,No Change


In [5]:
# Read additional information from MasterList files
# Make sure to tell pandas which columns are SCED codes to prevent casting as INTs which trims leading zeros
sced_master_df = pd.read_excel(sced_master_list_file, 
                               sheet_name=["Recodes", "SCED Master List", "Archived"],
                               converters={"Initial code": lambda x: str(x),
                                           "Current code": lambda x: str(x),
                                           "SCED Course Code": lambda x: str(x),
                                           "SCED Code": lambda x: str(x)})

In [6]:
# Sheet containing all SCED codes ever (inluding archived/changed)
sced_code_v7_df = sced_master_df["SCED Master List"]
sced_code_v7_df.head()

Unnamed: 0,SCED Course Code,Course Title,1.0 Status,Change from 1.0 to 2.0,Change from 2.0 to 3.0,Change from 3.0 to 4.0,Change from 4.0 to 5.0,Change from 5.0 to 6.0,Change from 6.0 to 7.0
0,1001,English/Language Arts I (9th grade),Original,No change,No change,No change,No change,No change,No change
1,1002,English/Language Arts II (10th grade),Original,No change,No change,No change,No change,No change,No change
2,1003,English/Language Arts III (11th grade),Original,No change,No change,No change,No change,No change,No change
3,1004,English/Language Arts IV (12th grade),Original,No change,No change,No change,No change,No change,No change
4,1005,AP English Language and Composition,Original,No change,No change,No change,No change,No change,No change


In [7]:
# Sheet containing crosswalks between past/current SCED codes
recode_df = sced_master_df["Recodes"]
recode_df.head()

Unnamed: 0,Initial code,Initial course name,Current code,Current course name
0,6039,Foreign Language—General,24039,World Language—General
1,6100,Spanish (prior-to-secondary),24050;\n24051,Spanish for Young Learners (prior-to-secondary...
2,6101,Spanish I,24052,Spanish I
3,6102,Spanish II,24053,Spanish II
4,6103,Spanish III,24054,Spanish III


In [8]:
# Sheet containing code descriptions for archived codes
# These can be helpful for parsing additional crosswalks when not specified in the 'Recode' sheet
archived_df = sced_master_df["Archived"]
archived_df.head()

Unnamed: 0,Course Title,SCED Code,Course Description,Change Status
0,Exploration in Drama,5054,Exploration in Drama courses are now archived....,SCED 1.0 Archived Course
1,AP Studio Art—General Portfolio,5171,AP Studio Art—General Portfolio courses are no...,SCED 1.0 Archived Course
2,AP French Literature,6133,AP French Literature courses are no longer off...,SCED 1.0 Archived Course
3,AP Computer Science AB,10158,AP Computer Science AB courses are no longer o...,SCED 1.0 Archived Course
4,Forestry Harvesting,18503,Forestry Harvesting courses are now archived. ...,SCED 1.0 Archived Course


## Look to see how many codes are missing crosswalks to latest SCED version

In [9]:
total_codes = sced_code_v7_df["SCED Course Code"]
print("All codes: %s" % len(total_codes))

# Active v7 codes that haven't been archived
active_codes = sced_code_v7_df[~sced_code_v7_df["Change from 6.0 to 7.0"].isin(["Previously Archived", "Archived; code updated"])]["SCED Course Code"]
active_codes_final = sced_info_df["SCED Course Code"]
print("Active (v7 MasterList): %s" % len(active_codes))
print("Active (v7 FINAL): %s" % len(active_codes_final))
active_codes = active_codes_final
print("Expected number of crosswalks needed: %s" % (len(total_codes)-len(active_codes)))

# Archived codes as of v7
archived_codes = sced_code_v7_df[~sced_code_v7_df["SCED Course Code"].isin(active_codes)]["SCED Course Code"]
print("Total codes in spreadsheet marked as archived (should be == # expected crosswalks): %s" % len(archived_codes))

# Archived codes that have a crosswalk
crosswalked_codes = recode_df["Initial code"]
print("Archived codes that have a crosswalk: %s" % len(crosswalked_codes))
print("Expected missing crosswalk: %s" % (len(archived_codes) - len(crosswalked_codes)))

# Archived codes that have no crosswalk
missing_crosswalk_codes = archived_codes[~archived_codes.isin(crosswalked_codes)]
print("Actual # archived codes that don't have a crosswalk: %s" % len(missing_crosswalk_codes))

# Archived codes that have archive info that a crosswalk can be rescued from
archive_info_codes = archived_df["SCED Code"]
print("Archive codes with code description: %s" % len(archive_info_codes))

# Archived codes with no crosswalk but can possibly be rescued using archived description
missing_crosswalk_with_archive_info_codes = missing_crosswalk_codes[missing_crosswalk_codes.isin(archive_info_codes)]
print("# of archived codes with no crosswalks that could be rescued from code descriptions: %s" % len(missing_crosswalk_with_archive_info_codes))

# Archived codes we have no way to crosswalk at all because they don't have a crosswalk or an archived code description
missing_crosswalk_no_info_codes = missing_crosswalk_codes[~missing_crosswalk_codes.isin(archive_info_codes)]
print("# of archived codes with no way of cross-walking: %s" % len(missing_crosswalk_no_info_codes))

All codes: 2703
Active (v7 MasterList): 1734
Active (v7 FINAL): 1736
Expected number of crosswalks needed: 967
Total codes in spreadsheet marked as archived (should be == # expected crosswalks): 969
Archived codes that have a crosswalk: 760
Expected missing crosswalk: 209
Actual # archived codes that don't have a crosswalk: 209
Archive codes with code description: 478
# of archived codes with no crosswalks that could be rescued from code descriptions: 198
# of archived codes with no way of cross-walking: 11


In [10]:
# Write out info to files for data provenance
missing_crosswalk = sced_code_v7_df[sced_code_v7_df["SCED Course Code"].isin(missing_crosswalk_codes)]
#missing_crosswalk.to_excel("../data/archived_courses_missing_crosswalk.xlsx", index=False)

missing_crosswalk_rescuable = archived_df[archived_df["SCED Code"].isin(missing_crosswalk_with_archive_info_codes)]
#missing_crosswalk_rescuable.to_excel("../data/archived_courses_missing_crosswalk_possible.xlsx", index=False)

missing_crosswalk_not_rescuable = sced_code_v7_df[sced_code_v7_df["SCED Course Code"].isin(missing_crosswalk_no_info_codes)]
#missing_crosswalk_not_rescuable.to_excel("../data/archived_courses_crosswalk_impossible.xlsx", index=False)

In [11]:
# Take a look at courses that you have no way to crosswalk
# In current version (sced 7.0) it's just the grade-level Foreign language catch-all classes removed in SCED v2.0
missing_crosswalk_not_rescuable

Unnamed: 0,SCED Course Code,Course Title,1.0 Status,Change from 1.0 to 2.0,Change from 2.0 to 3.0,Change from 3.0 to 4.0,Change from 4.0 to 5.0,Change from 5.0 to 6.0,Change from 6.0 to 7.0
2599,56028,Foreign Language (early childhood education),Existed in pSCED,Archived; code updated,Previously archived,Previously archived,Previously archived,Previously Archived,Previously Archived
2600,56029,Foreign Language (pre-kindergarten),Existed in pSCED,Archived; code updated,Previously archived,Previously archived,Previously archived,Previously Archived,Previously Archived
2601,56030,Foreign Language (kindergarten),Existed in pSCED,Archived; code updated,Previously archived,Previously archived,Previously archived,Previously Archived,Previously Archived
2602,56031,Foreign Language (grade 1),Existed in pSCED,Archived; code updated,Previously archived,Previously archived,Previously archived,Previously Archived,Previously Archived
2603,56032,Foreign Language (grade 2),Existed in pSCED,Archived; code updated,Previously archived,Previously archived,Previously archived,Previously Archived,Previously Archived
2604,56033,Foreign Language (grade 3),Existed in pSCED,Archived; code updated,Previously archived,Previously archived,Previously archived,Previously Archived,Previously Archived
2605,56034,Foreign Language (grade 4),Existed in pSCED,Archived; code updated,Previously archived,Previously archived,Previously archived,Previously Archived,Previously Archived
2606,56035,Foreign Language (grade 5),Existed in pSCED,Archived; code updated,Previously archived,Previously archived,Previously archived,Previously Archived,Previously Archived
2607,56036,Foreign Language (grade 6),Existed in pSCED,Archived; code updated,Previously archived,Previously archived,Previously archived,Previously Archived,Previously Archived
2608,56037,Foreign Language (grade 7),Existed in pSCED,Archived; code updated,Previously archived,Previously archived,Previously archived,Previously Archived,Previously Archived


### Create standardized master sheet of courses that need to be added to crosswalk

In [12]:
def make_crosswalk_df(df, colname_map):
    
    # Check colname map contains right columns
    colname_order = ["archived_code", "active_code", "course_name",  "course_desc"]
    for col in colname_order:
        if col not in colname_map:
            raise IOError("Colname map must include mapping for required column: {0}".format(col))
            
    # Check that columns actually exist in data frame
    for new_col, old_col in colname_map.items():
        if old_col is not None and old_col not in df.columns:
            raise IOError("Invalid '{0}' mapping! '{1}' doesn't appear in dataframe!".format(new_col, 
                                                                                           old_col))
    
    present_cols = [colname_map[col] for col in colname_order if colname_map[col] is not None]
    # Subset to include only required columns
    cw_df = df[present_cols].copy()
    
    # Rename columns to standard names
    cw_df.columns = [col for col in colname_order if colname_map[col] is not None]
    
    # Add missing columns
    missing_cols = [col for col in colname_order if colname_map[col] is None]
    for missing_col in missing_cols:
        cw_df[missing_col] = None
        
    # Reorder
    return cw_df[colname_order]

In [13]:
# Create colname maps to indicate which columns map to required data types
    # archived_code: SCED code for archived courses
    # active_code: Current SCED code for a course
    # course_name: SCED code course title
    # course_description: SCED code course description 
active_colname_map = {"archived_code": None, 
                      "active_code": "SCED Course Code",
                      "course_name": "Course Title",
                      "course_desc": "Course Description"}

recode_colname_map = {"archived_code": "Initial code", 
                      "active_code": "Current code",
                      "course_name": "Initial course name",
                      "course_desc": None}

archived_colname_map = {"archived_code": "SCED Code", 
                        "active_code": None,
                        "course_name": "Course Title",
                        "course_desc": "Course Description"}

null_colname_map =   {"archived_code": "SCED Course Code", 
                      "active_code": None,
                      "course_name": "Course Title",
                      "course_desc": None}

In [14]:
# Creat dataframe with all information needed for active courses
active_course_df = make_crosswalk_df(sced_info_df, active_colname_map)
assert len(active_course_df) == len(active_codes), "Active course df differs from active courses: {0} codes".format(len(active_course_df))

# Create dataframe with all information needed for recoded courses
recoded_course_df = make_crosswalk_df(recode_df, recode_colname_map)
assert len(recoded_course_df) == len(crosswalked_codes), "Recoded course df differs from crosswalked courses: {0} codes".format(len(recoded_course_df))

# Create dataframe with all information needed for archived courses
archived_course_df = make_crosswalk_df(archived_df, archived_colname_map)
assert len(archived_course_df) == len(archive_info_codes), "Archived course df differs from archive_info courses: {0} codes".format(len(archived_course_df))  

# Create dataframe of subset of courses that can't be crosswalked
null_course_df = sced_code_v7_df[sced_code_v7_df["SCED Course Code"].isin(missing_crosswalk_no_info_codes)].copy()
null_course_df = make_crosswalk_df(null_course_df, null_colname_map)
assert len(null_course_df) == len(missing_crosswalk_no_info_codes), "Null course df differs from archive_missing_info courses: {0} codes".format(len(null_course_df))

In [15]:
sced_codes_df = pd.concat([active_course_df,
                           recoded_course_df,
                           archived_course_df,
                           null_course_df])
print(len(sced_codes_df))
sced_codes_df.head()

2985


Unnamed: 0,archived_code,active_code,course_name,course_desc
0,,1001,English/Language Arts I (9th grade),English/Language Arts I (9th grade) courses bu...
1,,1002,English/Language Arts II (10th grade),English/Language Arts II (10th grade) courses ...
2,,1003,English/Language Arts III (11th grade),English/Language Arts III (11th grade) courses...
3,,1004,English/Language Arts IV (12th grade),English/Language Arts IV (12th grade) courses ...
4,,1005,AP English Language and Composition,Following the College Board’s suggested curric...


## Parse and validate SCED code crosswalks

In [16]:
import re
import copy

class CrossWalkerInitError(BaseException):
    pass

class InvalidCrosswalkError(BaseException):
    pass
        
class CrossWalker:
    def __init__(self, active_codes, archived_codes):
        # List of latest active SCED codes
        self.active_codes = active_codes
        if isinstance(self.active_codes, pd.Series):
            self.active_codes = active_codes.tolist()
        # Strip for sanity
        self.active_codes = [code.strip() for code in self.active_codes]
        
        # List of archived SCED codes that need to be crosswalked
        self.archived_codes = archived_codes
        if isinstance(self.archived_codes, pd.Series):
            self.archived_codes = archived_codes.tolist()
        self.archived_codes = [code.strip() for code in self.archived_codes]
        
        # Make sure active/archived codes are mutually exclusive
        invalid_codes = [code for code in self.active_codes if code in self.archived_codes]
        if invalid_codes:
            raise CrossWalkerInitError("Invald CrossWalker! Codes can't be "
                                       "both archived and active: {0}".format(", ".join(invalid_codes)))
        
        # Initialize crosswalk with active codes pointing to themselves
        self.crosswalk = {active_code: [active_code] for active_code in active_codes}
        
        # Initialize empty crosswalks for archived codes that will be populated
        for archived_code in self.archived_codes:
            self.crosswalk[archived_code] = []
        self.archived_crosswalk = {}
        
    def add_crosswalk(self, archived_code, new_codes, strict=True, overwrite=False):
        # Add exiting crosswalk to current crosswalk
        
        # Raise error if you're trying to crosswalk a code that hasn't been archived
        if archived_code not in self.crosswalk:
            raise InvalidCrosswalkError("Cannot add crosswalk for code not found in list of archived"
                                       " codes: {0}".format(archived_code))
            
        # Raise error/warn if trying to overwrite active code (these can't be crosswalked)
        if archived_code in self.active_codes:
            err_msg = "Cannot add code {0} to crosswalk as it's already an active code!".format(archived_code)
            print(err_msg)
            if strict:
                raise InvalidCroswalkError(err_msg)
        
        # Return immediately if code already has a valid crosswalk and you don't want to overwrite
        if self.crosswalk[archived_code] and not overwrite:
            return
            
        # Convert to list if not already a list
        if not isinstance(new_codes, list):
            new_codes = [new_codes]
            
        # Check to make sure new codes list isn't empty
        # Handles cases when attempting to add crosswalk parsed from description and no SCEDs found
        if not new_codes:
            err_msg = "Empty crosswalk for archived code: {0}".format(archived_code)
            if not strict:
                # Print warning message and return if not strict, otherwise throw an error
                print(err_msg)
                return
            raise InvalidCrosswalkError(err_msg)
            
        # Validate that new codes are actually latest SCED version
        valid_codes = set()
        archived_codes = set()
        for code in new_codes:
            if code in self.active_codes:
                # Add code to crosswalk if it maps onto an active SCED code
                valid_codes.add(code.strip())
            elif code in self.archived_codes:
                # Add code to archived crosswalk if it maps onto archived SCED code
                # Will attempt to resue these later
                archived_codes.add(code.strip())
                    
        # Convert sets back into lists for easier manipulation
        valid_codes = list(valid_codes)
        archived_codes = list(archived_codes)
        if valid_codes:
            self.crosswalk[archived_code] = valid_codes
        elif archived_codes:
            self.archived_crosswalk[archived_code] = archived_codes
            print("Archived code {0} maps to another archived code(s): {1}".format(archived_code, 
                                                                                   ", ".join(new_codes)))
        else:
            # SCED code maps to an invalid code. Raise error or print warning. 
            err_msg = "Invalid code(s) in crosswalk for {0}: {1}".format(archived_code, 
                                                                         ", ".join(new_codes))
            if strict:
                raise InvalidCrosswalkError(err_msg)
            else:
                print(err_msg)
    
    def rescue_archived_codes(self):
        # Try to update archived SCED crosswalks to latest SCED code
        # E.g. archived code 72249 => archived code 22249 => active code 19299
        rescued = []
        for archived_code, new_codes in self.archived_crosswalk.items():
            rescued_codes = []
            for new_code in new_codes:
                rescued_codes += self.rescue_code(new_code)
            
            if rescued_codes:
                print("Rescued archived code {0} to point to active code(s): {1}".format(archived_code,
                                                                                         ", ".join(rescued_codes)))
                self.crosswalk[archived_code] = rescued_codes
                rescued.append(archived_code)
            else:
                print("Currently unable to rescue code: {0}".format(archived_code))
        
        for archived_code in rescued:
            self.archived_crosswalk.pop(archived_code)
    
    def rescue_code(self, archived_code):
        if archived_code in self.crosswalk:
            # Code can be rescued because it points to an active code in crosswalk
            return self.crosswalk[archived_code]
        
        if archived_code not in self.archived_crosswalk:
            # Code can't be rescued from current crosswalk or archived crosswalks
            return []
        
        # Recursively determine if any codes eventually map to an active code 
        valid_codes = []
        for mapped_code in self.archived_crosswalk[archived_code]:
            valid_codes += self.rescue_code(mapped_code)
        return list(set(valid_codes))
    
    @staticmethod
    def parse_crosswalk(new_code):
        # Parses crosswalked code field which sometimes contains multiple codes 
        return [x.strip().strip("\n") for x in new_code.split(";")]
    
    @staticmethod
    def parse_crosswalk_from_desc(code_desc):
        # Function for parsing valid codes from archived course descriptions
        # Used to handle courses that don't have a valid crosswalk in spreadsheet but who might have that info in desc
        # e.g. "This code is now archived; use code 24917."
    
        # Get all strings in description that match SCED regex
        return re.findall(r"\D(\d{5})\D", " "+code_desc+" ")
        
    def get_orphaned_codes(self):
        # Return list of archived codes that don't currently have a valid crosswalk
        return [x for x in self.archived_codes+self.active_codes if not self.crosswalk[x]] 
    
    def get_active_code_crosswalk(self):
        # Return crosswalk where keys are active codes that point to list of archived synonyms
        rev_crosswalk = {active_code: set([]) for active_code in self.active_codes}
        for archived_code, active_codes in self.crosswalk.items():
            if archived_code in self.active_codes:
                continue
            for active_code in active_codes:
                rev_crosswalk[active_code].add(archived_code)
        
        rev_crosswalk = {k:list(v) for k,v in rev_crosswalk.items()}
        
        err_msg = "Invalid reverse crosswalk contains {0} entries but there are {1} active codes in this crosswalk!".format(len(rev_crosswalk),
                                                                                                                            len(self.active_codes))
        assert len(rev_crosswalk) == len(self.active_codes), err_msg
        return rev_crosswalk
            
        

In [17]:
crosswalk = CrossWalker(active_codes, archived_codes)

# Create validated crosswalks for archived courses to translate to latest SCED version
for code_info in sced_codes_df.to_dict(orient="records"):
    archived_code = code_info["archived_code"]
    active_code = code_info["active_code"]
    code_desc = code_info["course_desc"]
    
    if archived_code is None and active_code is not None:
        # No need to crosswalk active courses as they already appear in crosswalk
        continue

    if active_code is not None:
        # Parse crosswalk directly from recoded courses
        active_code = CrossWalker.parse_crosswalk(active_code)
    elif code_desc is not None:
        # Try to parse crosswalk from course description from archived courses
        active_code = CrossWalker.parse_crosswalk_from_desc(code_desc)
    
    if not active_code or active_code is None:
        # No information is available to actually crosswalk code to current version
        # Will happen either when no crosswalk could be parsed from course_description
        # Can also happen in a handful of early version courses that have no archived description AND have not been recoded
        continue
        
    # Add crosswalk for archived code 
    crosswalk.add_crosswalk(archived_code, active_code, strict=False)

# See if you can recursively update any crosswalks that pointed to an archived course
# e.g. course 10001 was listed as mapping to archived course 20001 but 20001 was listed as mapping to active course 30001
# This function traces those dependencies in order to say code 10001 is an archived version of code 30001
crosswalk.rescue_archived_codes()

print(len(crosswalk.crosswalk))
print(len(crosswalk.archived_crosswalk))
print(crosswalk.archived_crosswalk)


Archived code 52061 maps to another archived code(s): 02061
Archived code 55054 maps to another archived code(s): 05054
Archived code 58007 maps to another archived code(s): 08007
Archived code 72201 maps to another archived code(s): 22201
Archived code 72202 maps to another archived code(s): 22202
Archived code 72203 maps to another archived code(s): 22203
Archived code 72205 maps to another archived code(s): 22205
Archived code 72206 maps to another archived code(s): 22206
Archived code 72207 maps to another archived code(s): 22207
Archived code 72208 maps to another archived code(s): 22208
Archived code 72209 maps to another archived code(s): 22209
Archived code 72210 maps to another archived code(s): 22210
Archived code 72249 maps to another archived code(s): 22249
Archived code 06133 maps to another archived code(s): 06129, 06132
Rescued archived code 52061 to point to active code(s): 02063, 02064, 02065, 02062
Rescued archived code 55054 to point to active code(s): 05052, 05051
C

In [18]:
# Total number of codes that couldn't be crosswalked for any reason
missing_codes = pd.Series(crosswalk.get_orphaned_codes())
print("Total orphaned archived codes: %s" % len(missing_codes))

# Total number of codes orphaned because archived description didn't contain a valid crosswalk
missing_codes_df = sced_codes_df[sced_codes_df["archived_code"].isin(missing_codes) & 
                                 ~(sced_codes_df["course_desc"].isnull())]
print("Total codes orphaned because description didn't contain valid crosswalk: %s" % len(missing_codes_df))
missing_codes_df.to_excel("../data/archived_codes_no_crosswalk_unable_to_parse_from_description.xlsx", index=False)

# Total number of codes orphaned because we had absolutely no info to work with
missing_codes_df = sced_codes_df[sced_codes_df["archived_code"].isin(missing_codes) & 
                                 sced_codes_df["course_desc"].isnull()]
print("Total missing codes where description parsing wasn't possible: %s" % len(missing_codes_df))
missing_codes_df.to_excel("../data/archived_codes_no_crosswalk_or_description.xlsx", index=False)

Total orphaned archived codes: 87
Total codes orphaned because description didn't contain valid crosswalk: 75
Total missing codes where description parsing wasn't possible: 12


In [19]:
# Take a look at the reverse crosswalk to see if it makes sense
rev_crosswalk = crosswalk.get_active_code_crosswalk()
print(len(rev_crosswalk))
codes_with_synonyms = {act_code: pd.Series(arch_codes) for act_code, arch_codes in rev_crosswalk.items() if len(arch_codes) > 0}
print(len(codes_with_synonyms))

1736
748


In [20]:
pd.DataFrame(codes_with_synonyms).transpose().tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
24989,6199,6259.0,6859.0,6359.0,6759.0,6279.0,6499.0,6699.0,6599.0,6659.0,6799.0,6879.0,6679.0
24995,6995,,,,,,,,,,,,
24997,6997,56997.0,,,,,,,,,,,
24998,6998,,,,,,,,,,,,
24999,56999,6999.0,,,,,,,,,,,


In [21]:
pd.DataFrame(codes_with_synonyms).transpose().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
1008,51008,,,,,,,,,,,,
1009,51009,51991.0,,,,,,,,,,,
1010,51007,,,,,,,,,,,,
1026,51026,,,,,,,,,,,,
1027,51027,,,,,,,,,,,,


## Create course information lookup table

In [22]:
def clean_string(s):
    # Replaces pesky unicode strings that appear in SCED code files
    return s.strip().replace(chr(0x2014), 
                             "--").replace(chr(0x2019), 
                                           "'").replace(chr(0x201c), 
                                                        "'").replace(chr(0x201d), 
                                                                     "'").replace(chr(0x2026),
                                                                                 "...").replace(chr(0x00a0),
                                                                                               " ").replace(chr(0x2013),
                                                                                                            "--").replace(chr(0x00e9),
                                                                                                                         "e")

class CourseRecordMaker:
    def __init__(self, crosswalk):
        self.crosswalk = crosswalk
        self.archive_crosswalk = crosswalk.crosswalk
        self.active_crosswalk = crosswalk.get_active_code_crosswalk()
        
    def get_course_record(self, code, title=None, desc=None):
        if code not in self.crosswalk.active_codes and code not in self.crosswalk.archived_codes:
            raise IOError("Code {0} isn't found in archived or active codes!".format(code))
        
        # Whether code is active or archived
        is_archived = code in self.crosswalk.archived_codes
        
        # Strip any whitespace
        code = clean_string(code)
        title = clean_string(title) if title is not None else title
        desc = clean_string(desc) if desc is not None else desc
        warnings = None
        
        # Include special warning for small subset of archived codes that couldn't be crosswalked
        if is_archived and not self.archive_crosswalk[code]:
            warnings = "Archived SCED code could not be crosswalked to current SCED version!"
    
        record = {"sced_code": code,
                  "course_title": title,
                  "course_description": desc,
                  "is_archived": is_archived,
                  "warnings": warnings}
        
        record["v7_substitutes"] = self.archive_crosswalk[code] if is_archived else []
        record["archived_versions"] = self.active_crosswalk[code] if not is_archived else []
        return record

### Create lookup tables for all SCED codes ever with embedded crosswalk information

In [23]:
# Initialize instance of CourseRecordMaker to generate course records for courses based on crosswalk info
record_maker = CourseRecordMaker(crosswalk)

# Gather active course info 
code_info = {}

# Gather active course info
for course_info in sced_codes_df.to_dict(orient="records"):
    archived_code = course_info["archived_code"]
    active_code = course_info["active_code"]
    title = course_info["course_name"]
    desc = course_info["course_desc"]
    
    # Only use active codes for the actual records for the active codes
    code = active_code if archived_code is None else archived_code

    # Generate course code data entry
    info = record_maker.get_course_record(code, title, desc)
    code_info[code] = info

In [24]:
print(len(code_info))

2705


In [25]:
pd.DataFrame(code_info).transpose().head()

Unnamed: 0,archived_versions,course_description,course_title,is_archived,sced_code,v7_substitutes,warnings
1001,[],English/Language Arts I (9th grade) courses bu...,English/Language Arts I (9th grade),False,1001,[],
1002,[],English/Language Arts II (10th grade) courses ...,English/Language Arts II (10th grade),False,1002,[],
1003,[],English/Language Arts III (11th grade) courses...,English/Language Arts III (11th grade),False,1003,[],
1004,[],English/Language Arts IV (12th grade) courses ...,English/Language Arts IV (12th grade),False,1004,[],
1005,[],Following the College Board's suggested curric...,AP English Language and Composition,False,1005,[],


In [26]:
pd.DataFrame(code_info).transpose().tail()

Unnamed: 0,archived_versions,course_description,course_title,is_archived,sced_code,v7_substitutes,warnings
56034,[],,Foreign Language (grade 4),True,56034,[],Archived SCED code could not be crosswalked to...
56035,[],,Foreign Language (grade 5),True,56035,[],Archived SCED code could not be crosswalked to...
56036,[],,Foreign Language (grade 6),True,56036,[],Archived SCED code could not be crosswalked to...
56037,[],,Foreign Language (grade 7),True,56037,[],Archived SCED code could not be crosswalked to...
56038,[],,Foreign Language (grade 8),True,56038,[],Archived SCED code could not be crosswalked to...


### Validate final annotated crosswalk

In [27]:
# Make sure equal number of active coures
active = 0
archived = 0
crosswalked = 0
not_crosswalked = 0
active_with_archived = 0

for code, course_info in code_info.items():
    rec_code = course_info["sced_code"]
    is_archived = course_info["is_archived"]
    subs = course_info["v7_substitutes"]
    arc_vers = course_info["archived_versions"]
    
    assert rec_code == code, "Code mismatch key:{0} with code:{1}".format(code, rec_code)
    if is_archived:
        archived += 1
        if subs:
            crosswalked += 1
        else:
            not_crosswalked += 1
        assert code in archived_codes.tolist(), "Code archive mismatch: {0}".format(code)
        assert not arc_vers, "Archived code can't have archived versions: {0}".format(code)
        assert "_".join(sorted(crosswalk.crosswalk[code])) == "_".join(sorted(subs)), "Archived code subs don't match crosswalk: {0}".format(code)
    else:
        active += 1
        if arc_vers:
            active_with_archived += 1
        assert code in active_codes.tolist(), "Code active mismatch: {0}".format(code)
        assert not subs, "Active code can't have v7 substitution: {0}".format(code)


print("Active: %s" % active)    
print("archived: %s" % archived) 
print("crosswalked: %s" % crosswalked) 
print("not_crosswalked: %s" % not_crosswalked) 
print("active_with_archive: %s" % active_with_archived) 
assert active == len(active_codes), "Different number of active codes"
assert archived == len(archived_codes), "Different number of archived codes"
assert crosswalked == (len([x for x in crosswalk.crosswalk if x in archived_codes.tolist()]) - len(crosswalk.get_orphaned_codes())), "Different number of crosswalked"
assert not_crosswalked == len(crosswalk.get_orphaned_codes()), "Different number of orphaned codes"
assert crosswalked + not_crosswalked == len(archived_codes), "Crosswalked + Not-Crosswalked don't add up to total number of archived codes"
assert active_with_archived == len([k for k,v in crosswalk.get_active_code_crosswalk().items() if v]), "Different number of active codes with crosswalks"
    

Active: 1736
archived: 969
crosswalked: 882
not_crosswalked: 87
active_with_archive: 748


In [28]:
## Write crosswalk to json output
import json
outfile = "../data/crosswalked_sced_v7.json"
with open(outfile, "w") as fh:
    json.dump(code_info, fh, indent=1)