In [14]:
from bs4 import BeautifulSoup

In [15]:
# Read the saved catalog page into memory; the `with` block closes the
# file handle before parsing starts.
with open('fas_crse_cat.html', 'r') as f:
    raw_content = f.read()

# Parse with the parser that ships with the standard library.
soup = BeautifulSoup(raw_content, 'html.parser')

In [16]:
# Every piece of catalog text sits in a <div> carrying the class "t",
# so collect those divs (in document order) for the passes below.
elts = soup.find_all(name='div', attrs={'class': 't'})

In [104]:
class Course(object):
    """One catalog entry assembled from the scraped page elements.

    Every field starts out as None; the parsing cells fill them in later.
    """

    # Instance attributes, in the order they are created.
    _FIELDS = ('number', 'name', 'instructors', 'semester', 'schedule', 'strings')

    def __init__(self):
        """Create an empty course with every field set to None."""
        for field in self._FIELDS:
            setattr(self, field, None)

    def __str__(self):
        """Return the course name formatted as text."""
        return "{}".format(self.name)

    def __repr__(self):
        """Show all fields, which makes IPython output easier to debug."""
        return repr(vars(self))

    def to_dict(self):
        """Return the instance's own attribute dict.

        This is the live ``__dict__`` (an alias, not a copy), so changes made
        to the returned dict are visible on the instance.
        """
        return vars(self)

In [128]:
# Walk the div.t elements in document order and group them into courses.
# An element carrying the .fs2 class holds a course number (e.g. COMPSCI 50)
# and opens a new course; the elements that follow are attached to that
# course until the next .fs2 element shows up.

# Page-footer strings that must not be recorded as course text.
# Defined once here instead of being rebuilt on every loop iteration.
BANNED_STRINGS = (
    'HARVARD UNIVERSITY',
    '9/4/2017 0:39 AM',
    '9/24/2017 0:39 AM',
    'HARVARD UNIVERSITY 9/24/2017 0:39 AM'
)

# Class combinations used to recognise the different kinds of element.
NAME_CLASSES = set(['hc', 'ff1', 'fs3'])
INSTRUCTOR_CLASSES = set(['x5', 'ff3'])

courses = []
current_course = None

for elt in elts:

    # first, ensure there's something actually here
    # FIXME could be bugs here from page breaks
    elt_text = (elt.text or "").strip()
    if elt_text == "":
        continue

    elt_classes = set(elt['class'])

    if 'fs2' in elt_classes:
        # .fs2 marks course numbers and begins the section for a new course
        current_course = Course()
        courses.append(current_course)

        current_course.number = elt_text
        current_course.strings = []
        current_course.instructors = []
        continue

    # Anything seen before the first course number has no course to attach
    # to. Checking once here protects every branch below, not only the
    # descriptive-string branch.
    if current_course is None:
        continue

    if NAME_CLASSES.issubset(elt_classes):
        # FIXME this should run but it doesn't...
        # this is course name (e.g. Introduction to Computer Science)
        # NOTE(review): the recorded output further down shows `name` still
        # None with the title as the first entry of `strings` -- confirm
        # which classes the title element really carries.
        print(elt_text)
        current_course.name = elt_text
    elif INSTRUCTOR_CLASSES.issubset(elt_classes):
        # this is an instructor name!
        current_course.instructors.append(elt_text)
    else:
        # these are descriptive strings

        # .string is None when the element has more than one child node;
        # fall back to the stripped text so that None never ends up in
        # `strings` (the regex pass over `strings` cannot handle None).
        descriptive = elt.string if elt.string is not None else elt_text

        # remove page footers (also when they carry surrounding whitespace)
        if descriptive in BANNED_STRINGS or elt_text in BANNED_STRINGS:
            continue

        current_course.strings.append(descriptive)

In [119]:
import re 

# Now try to pull meaningful info out of the strings

# Compiled once, outside the loops. Raw strings keep the \d escapes from
# being treated as (invalid) string escapes.
SCHEDULE_PATTERN = re.compile(r"[MTWRF]{1,5} \d{4} [AP]M - \d{4} [AP]M")
SEMESTER_PATTERN = re.compile(r"20\d\d ((Fall)|(Spring))")

for course in courses:
    # Ignore missing entries so one None cannot abort the whole pass.
    course_strings = [s for s in (course.strings or []) if s is not None]

    # try to extract a course schedule: keep the first match, else None
    course.schedule = None
    for string in course_strings:
        schedule = SCHEDULE_PATTERN.search(string)
        if schedule is not None:
            course.schedule = schedule.group()
            break

    # try to extract a semester
    for string in course_strings:
        semester = SEMESTER_PATTERN.search(string)
        if semester is not None:
            # just find the first one then quit
            course.semester = semester.group()
            break
        

In [107]:
# Spot-check ten parsed courses; each entry is displayed through Course.__repr__.
courses[1200:1210]

[{'name': None, 'schedule': u'F 0100 PM - 0359 PM', 'instructors': [u'Radhika Nagpal'], 'number': u'Computer Science  189', 'semester': u'2018 Spring', 'strings': [u'Autonomous Robot Systems', u'2018 Spring (4 Credits)', u'Schedule: ', u'F 0100 PM - 0359 PM', u'Instructor Permissions:', u'Instructor', u'Enrollment Cap:', u'15', u'Building autonomous robotic systems requires understanding how to make robots that observe, reason, and act.  ', u'Each component uses many engineering principles: how to fuse, multiple, noisy sensors; how to balance short-term', u'versus long-term goals; how to control one&apos;s actions and how to coordinate with others. This year theme will be ', u'"Robots Roam the Halls", where we will focus on kinect-based robots that move in the SEAS buildings, to do ', u'applications like navigating, map building, and interacting with people. The class format will have a mixed lecture ', u'and lab format, and have a final project component.', u'Course Notes:', u'Prefere

In [108]:
# dump data to file
import json

# Turn every Course into the plain dict that json can serialise.
course_dicts = []
for c in courses:
    course_dicts.append(c.to_dict())

# Write the whole catalog as a single JSON array.
with open('course_catalog_dump.json', 'w') as outfile:
    json.dump(course_dicts, outfile)

In [109]:
# TODO: extract more info

In [23]:
# OLD: to turn a date into numbers
# Day letters, start clock + AM/PM, end clock + AM/PM, e.g. "R 0400 PM - 0559 PM".
_SCHEDULE_RE = re.compile(r"([MTWRF]{1,5}) (\d{4}) ([AP]M) - (\d{4}) ([AP]M)")


def _clock_to_minutes(hhmm, meridiem):
    """Convert a 12-hour 'HHMM' string plus 'AM'/'PM' to minutes after midnight."""
    hours = int(hhmm[0:2])
    minutes = int(hhmm[2:4])
    if hours == 12:
        # 12:xx AM is just after midnight and 12:xx PM is just after noon,
        # so treat the 12 as 0 before applying the PM offset.
        hours = 0
    if meridiem == "PM":
        hours += 12
    return 60 * hours + minutes


def get_time(contents):
    """Parse the first meeting time found in ``contents``.

    Args:
        contents: text that may contain a schedule such as
            "R 0400 PM - 0559 PM".

    Returns:
        A dict with ``days`` (the day letters), ``start`` and ``end``
        (minutes after midnight), or None when no schedule is found.
    """
    found = _SCHEDULE_RE.search(contents)
    if found is None:
        # no times found
        return None

    days, start_clock, start_meridiem, end_clock, end_meridiem = found.groups()
    return dict(
        days=days,
        start=_clock_to_minutes(start_clock, start_meridiem),
        end=_clock_to_minutes(end_clock, end_meridiem)
    )

In [9]:
# Sanity check: 04:00 PM and 05:59 PM should come back as 960 and 1079 minutes after midnight.
get_time("R 0400 PM - 0559 PM")

{'days': 'R', 'end': 1079, 'start': 960}

In [58]:
# Inspect a single parsed course as a plain dict.
courses[1200].to_dict()

{'instructors': [u'Radhika Nagpal'],
 'name': u'Computer Science  189 ',
 'schedule': None,
 'semester': None,
 'strings': [u'Autonomous Robot Systems',
  u'2018 Spring (4 Credits)',
  u'Schedule: ',
  u'F 0100 PM - 0359 PM',
  u'Instructor Permissions:',
  u'Instructor',
  u'Enrollment Cap:',
  u'15',
  u'Building autonomous robotic systems requires understanding how to make robots that observe, reason, and act.  ',
  u'Each component uses many engineering principles: how to fuse, multiple, noisy sensors; how to balance short-term',
  u'versus long-term goals; how to control one&apos;s actions and how to coordinate with others. This year theme will be ',
  u'"Robots Roam the Halls", where we will focus on kinect-based robots that move in the SEAS buildings, to do ',
  u'applications like navigating, map building, and interacting with people. The class format will have a mixed lecture ',
  u'and lab format, and have a final project component.',
  u'Course Notes:',
  u'Preference will b

In [132]:
# Debugging aid: visit every element that carries all of the course-name
# classes, remember the last one in `e`, and show what .string yields.
name_marker_classes = set(['hc', 'ff1', 'fs3'])

e = None
for elt in elts:
    if not name_marker_classes.issubset(elt['class']):
        continue
    # FIXME this should run but it doesn't...
    # this is course name (e.g. Introduction to Computer Science)
    e = elt
    print(elt.string)

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [140]:
# `e` is the last element the loop above matched; look at its .string value.
e.string