In [3]:
from bs4 import BeautifulSoup

In [4]:
raw_content = None

# Read the saved catalog page into memory.
# The encoding is pinned so that decoding does not depend on the platform's
# default locale encoding (which differs between macOS/Linux and Windows).
# NOTE(review): assumes the saved page is UTF-8 -- confirm against the file.
with open('fas_crse_cat.html', 'r', encoding='utf-8') as f:
    raw_content = f.read()

# Parse with the pure-Python parser that ships with the standard library.
soup = BeautifulSoup(raw_content, 'html.parser')

In [5]:
# Every piece of catalog content sits inside a <div> carrying the class `t`,
# so collect all of those elements up front.
elts = soup.find_all('div', attrs={'class': 't'})

In [6]:
# Import the project-local `course` and `parser` modules and reload each one
# right after importing it, so that edits made to the .py files are picked up
# without restarting the kernel.
# reload() doesn't always work without importing it first
# The last expression, reload(parser), is what the cell echoes as its output.
from importlib import reload
import course
reload(course)
import parser
reload(parser)

<module 'parser' from '/Users/alexgiglio/Documents/HODP Local/course-catalog/parser.py'>

In [7]:
import re

# Elements classed .fs2 carry the title line of a course;
# the .fs3 elements that follow carry its descriptive strings.

# Accumulator for every parsed course, plus a handle on the course that is
# currently being filled in (None until the first course header is seen).
courses, current_course = [], None

def elt_has_classes(elt, class_list):
    """
    Returns true if the given HTML element has all of the given
    classes (it may have additional classes as well).

    An element without a `class` attribute is treated as having no
    classes, so it only matches an empty `class_list` instead of
    raising a KeyError.

    Usage: elt_has_classes(my_element, ['btn', 'btn-primary'])
    """
    # .get() avoids a KeyError on elements that carry no class attribute;
    # `or []` also covers an attribute that is present but empty/None.
    return set(class_list).issubset(elt.get('class') or [])

# Matches a parenthesised numeric course id such as `(123591)`; the capture
# group holds just the digits. Raw string so `\(` and `\d` are not treated as
# (invalid) string escapes, and compiled once instead of on every iteration.
COURSE_ID_PATTERN = re.compile(r"\((\d+)\)")

# Page-footer strings that must not be recorded as course text.
BANNED_STRINGS = (
    'HARVARD UNIVERSITY',
    '9/4/2017 0:39 AM',
    '9/24/2017 0:39 AM',
    'HARVARD UNIVERSITY 9/24/2017 0:39 AM'
)

# Walk the content elements in document order. A course-number element opens
# a new course; every later element is attached to that course until the next
# course-number element appears.
for elt in elts:

    # first, ensure there's something actually here
    # FIXME could be bugs here from page breaks
    elt_text = (elt.text or "").strip()
    if not elt_text:
        continue

    if elt_has_classes(elt, ['x5','fs2']):
        # .x5.fs2 marks course numbers (e.g. COMPSCI 50)
        # and begins the start of a section for the course

        # start a new course
        current_course = course.Course()
        courses.append(current_course)

        current_course.set_course_number(elt_text)
        continue

    # error checking: anything that shows up before the first course header
    # has no course to attach to, so skip it rather than failing on None
    if current_course is None:
        continue

    if elt_has_classes(elt, ['x5','hc','ff1','fs3']):
        # this is course name (e.g. Introduction to Computer Science)
        # actually, it contains [course name, course id]
        # course id might not be in there, but don't worry about that for now

        # ASIDE:
        # one problem is that sometimes the course name
        # is written in a .x5.h6.ff1.fs3 (note h6 not hc) (these are classes, btw)
        # and we can't just search for those b/c many other types of data
        # use those same classes. so we'd have to search for those classes
        # and THEN apply some kind of regex or other conditional to make sure we're extracting
        # a course title and not anything else. basically it's messy and a TODO.

        course_name_parts = list(elt)

        # if it doesn't have exactly 2 elements, it's malformed... skip!
        if len(course_name_parts) != 2:
            continue

        # now unpack it
        course_name, course_id_container = course_name_parts

        # write the name
        current_course.name = course_name.strip()

        # now the id
        current_course.set_id(course_id_container.text)

    elif elt_has_classes(elt, ['h10','ff1','fs5']):
        # this is an alternative way for course ids to be placed
        # now, this includes BAD text like `FAS: Meets Foreign Lang Req Hausa`
        # as well as legit things which are like
        # `(123591)`
        id_match = COURSE_ID_PATTERN.search(elt_text)
        if id_match is not None:
            # then we have a legit id number!!!!
            # take only the digits captured between the parens, so any text
            # around the id cannot leak into it
            # TODO: factor out to Course class
            current_course.id = id_match.group(1)
    elif elt_has_classes(elt, ['x5','ff3']):
        # this is an instructor name!
        current_course.instructors.append(elt_text)
    else:
        # these are descriptive strings

        # remove page footers
        if elt_text in BANNED_STRINGS:
            continue

        current_course.strings.append(elt_text)

In [8]:
# Before processing: spot-check a single parsed record. At this point
# `semester` and `schedule` are still None and the collected text for the
# course sits unprocessed in `strings`.
courses[2000]

{'number': 'Computer Science 152', 'name': 'Programming Languages', 'instructors': ['Stephen Chong'], 'semester': None, 'schedule': None, 'strings': ['2018 Spring (4 Credits)', 'Schedule:', 'TR 1000 AM - 1129 AM', 'Instructor Permissions:', 'None', 'Enrollment Cap:', 'n/a', 'Comprehensive introduction to the principal features and overall design of both traditional and modern programming', 'languages, including syntax, formal semantics, abstraction mechanisms, modularity, type systems, naming,', 'polymorphism, closures, continuations, and concurrency. Provides the intellectual tools needed to design, evaluate,', 'choose, and use programming languages.', 'Recommended Prep:', 'Computer Science 51; Computer Science 121 is recommended. Students', 'must have good programming skills, be very comfortable with recursion,', 'proofs, basic mathematical ideas and notations, including sets, relations,', 'functions, and induction.', 'Additional Course Attributes:', 'Attribute Value(s)'], 'id': '119

In [18]:
# Now try to pull meaningful info out of the strings
# The loop variable is deliberately not named `course`: that name is bound to
# the imported `course` module, and rebinding it here would break
# `course.Course()` if the parsing cell were run again afterwards.
for course_record in courses:
    # raw data is there, now process it
    course_record.process_strings()

In [20]:
# After processing: the same record again, to confirm that process_strings()
# has filled in fields such as `semester` and `schedule`.
courses[2000]

{'number': 'Computer Science 152', 'name': 'Programming Languages', 'instructors': ['Stephen Chong'], 'semester': '2018 Spring', 'schedule': 'TR 1000 AM - 1129 AM', 'strings': ['2018 Spring (4 Credits)', 'Schedule:', 'TR 1000 AM - 1129 AM', 'Instructor Permissions:', 'None', 'Enrollment Cap:', 'n/a', 'Comprehensive introduction to the principal features and overall design of both traditional and modern programming', 'languages, including syntax, formal semantics, abstraction mechanisms, modularity, type systems, naming,', 'polymorphism, closures, continuations, and concurrency. Provides the intellectual tools needed to design, evaluate,', 'choose, and use programming languages.', 'Recommended Prep:', 'Computer Science 51; Computer Science 121 is recommended. Students', 'must have good programming skills, be very comfortable with recursion,', 'proofs, basic mathematical ideas and notations, including sets, relations,', 'functions, and induction.', 'Additional Course Attributes:', 'Attri

In [8]:
# Dump the parsed courses to CSV using the project-local parser module
parser.dump_csv(courses)

In [21]:
# Dump the same list of courses to JSON using the project-local parser module
# (Good for internet stuff!)
parser.dump_json(courses)