In [1]:
from bs4 import BeautifulSoup

In [2]:
# Read the saved catalog page into memory, then parse it with the
# stdlib-backed 'html.parser' backend.
raw_content = None

with open('fas_crse_cat.html', mode='r') as catalog_file:
    raw_content = catalog_file.read()

soup = BeautifulSoup(markup=raw_content, features='html.parser')

In [3]:
# Every piece of catalog text sits in a <div> carrying the class `t`,
# so collect those divs (in document order) for the parsing pass.
elts = soup.find_all('div', attrs={'class': 't'})

In [16]:
class Course(object):
    """One catalog entry, filled in piece by piece while the page is parsed.

    Attributes:
        number: course number text, or None until set.
        name: course title, or None until set.
        instructors: list of instructor name strings.
        semester: semester text, or None.
        schedule: meeting-time text, or None.
        strings: list of descriptive strings collected for the course.
        id: course id text, or None.
    """

    # static
    # this is the list of fields that are exported to CSV
    # see to_dict_for_csv()
    CSV_FIELDS = [
        'number',
        'semester',
        'name',
        'instructors',
        'schedule',
        'id',
    ]

    def __init__(self):
        self.number = None
        self.name = None
        self.instructors = []
        self.semester = None
        self.schedule = None
        self.strings = []
        self.id = None

    def __str__(self):
        return "{}".format(self.name)

    def __repr__(self):
        # for nicer ipython debugging
        return str(self.__dict__)

    def to_dict(self):
        """Return the course's attributes as a new dict.

        The result is a snapshot: the dict itself and any list values in it
        are copies, so editing the returned data never changes the course.
        """
        return {
            key: (list(value) if isinstance(value, list) else value)
            for key, value in self.__dict__.items()
        }

    def to_dict_for_csv(self):
        """Return a dict with exactly the CSV_FIELDS keys, ready for a CSV row.

        The instructors list is flattened into one " & "-separated string;
        every other field is passed through unchanged.
        """
        row = {}
        for field in self.CSV_FIELDS:
            value = getattr(self, field)
            if field == 'instructors':
                value = " & ".join(value)
            row[field] = value
        return row
    
    


In [None]:
import re

# Walk the `div.t` elements in document order and group them into courses.
#
# The converted page tags each piece of text with CSS classes:
#   .x5.fs2          course number (e.g. COMPSCI 50); starts a new course
#   .x5.hc.ff1.fs3   course name, followed by the course id in parentheses
#   .h10.ff1.fs5     alternative place for a course id
#   .x5.ff3          instructor name
#   anything else    descriptive string belonging to the current course

# page footers; these are never stored as descriptive strings
BANNED_STRINGS = (
    'HARVARD UNIVERSITY',
    '9/4/2017 0:39 AM',
    '9/24/2017 0:39 AM',
    'HARVARD UNIVERSITY 9/24/2017 0:39 AM'
)

# a parenthesised run of digits such as `(123591)`; group 1 is the bare id.
# Compiled once, as a raw string so the backslashes reach the regex engine.
COURSE_ID_PATTERN = re.compile(r"\((\d+)\)")

courses = []
current_course = None

for elt in elts:

    # first, ensure there's something actually here
    # FIXME could be bugs here from page breaks
    elt_text = (elt.text or "").strip()
    if elt_text == "":
        continue

    elt_classes = set(elt['class'])

    if {'x5', 'fs2'}.issubset(elt_classes):
        # .x5.fs2 marks course numbers (e.g. COMPSCI 50)
        # and begins the start of a section for the course
        current_course = Course()
        courses.append(current_course)
        current_course.number = elt_text
        continue

    # Everything below attaches data to the current course. Content that
    # shows up before the first course number has nothing to attach to,
    # so skip it instead of failing on `None`.
    if current_course is None:
        continue

    if {'x5', 'hc', 'ff1', 'fs3'}.issubset(elt_classes):
        # this is course name (e.g. Introduction to Computer Science)
        # actually, it contains [course name, course id]
        # course id might not be in there, but don't worry about that for now
        name_parts = list(elt)

        # if it doesn't have exactly 2 elements, it's malformed... skip!
        if len(name_parts) != 2:
            continue

        course_name, course_id_container = name_parts

        # the container holds the id wrapped in parentheses;
        # first trim whitespace...
        course_id = course_id_container.text.strip()

        if len(course_id) <= 2:
            # must not be anything here, except optionally parentheses
            # NOTE(review): this also skips recording the name -- confirm
            # that is intended.
            continue

        current_course.name = course_name.strip()
        # ...then drop the surrounding parentheses
        # TODO: factor out to Course class
        current_course.id = course_id[1:-1]
    elif {'h10', 'ff1', 'fs5'}.issubset(elt_classes):
        # this is an alternative way for course ids to be placed
        # now, this includes BAD text like `FAS: Meets Foreign Lang Req Hausa`
        # as well as legit things which are like
        # `(123591)`
        id_match = COURSE_ID_PATTERN.search(elt_text)
        if id_match is not None:
            # take the digits from the match itself, so text around the
            # parentheses can never end up inside the id
            current_course.id = id_match.group(1)
    elif {'x5', 'ff3'}.issubset(elt_classes):
        # this is an instructor name!
        current_course.instructors.append(elt_text)
    elif elt_text not in BANNED_STRINGS:
        # these are descriptive strings (page footers are dropped)
        current_course.strings.append(elt_text)

In [None]:
import re 

# Now try to pull meaningful info out of the strings

# Patterns are raw strings (so `\d` is a regex escape, not a string escape)
# and are compiled once instead of once per descriptive string.
# meeting time, e.g. `MWF 1000 AM - 1100 AM`
SCHEDULE_PATTERN = re.compile(r"[MTWRF]{1,5} \d{4} [AP]M - \d{4} [AP]M")
# semester, e.g. `2017 Fall`
SEMESTER_PATTERN = re.compile(r"20\d\d ((Fall)|(Spring))")

for course in courses:
    # try to extract a course schedule:
    # the first match in the first string that has one wins, else None
    course.schedule = None
    for string in course.strings:
        schedule = SCHEDULE_PATTERN.search(string)
        if schedule is not None:
            course.schedule = schedule.group()
            break

    # try to extract a semester
    # (left untouched when no string mentions one)
    for string in course.strings:
        semester = SEMESTER_PATTERN.search(string)
        if semester is not None:
            # just find the first one then quit
            course.semester = semester.group()
            break
        

In [None]:
# spot-check: display the parsed course at index 10
courses[10]

In [None]:
# dump data to file
import json

# every course is exported through its to_dict() view
course_dicts = [parsed.to_dict() for parsed in courses]

# serialise first, then write the finished text in one go
with open('course_catalog_dump.json', 'w') as outfile:
    outfile.write(json.dumps(course_dicts))

In [None]:
# small sample (at most the first 50 courses) for trying out the export
test_courses = courses[:50]

In [None]:
# write a csv
import csv

# newline='' is what the csv module asks for on files handed to a writer:
# the writer emits its own line endings, and without this flag platforms
# that translate '\n' end up with an extra blank line after every row.
# NOTE: only the `test_courses` subset is written here, not all of `courses`.
with open('course_catalog_dump.csv', 'w', newline='') as csvfile:
    fieldnames = Course.CSV_FIELDS
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    for course in test_courses:
        writer.writerow(course.to_dict_for_csv())

In [None]:
# spot-check: display the first parsed course
courses[0]