In [3]:
from bs4 import BeautifulSoup

In [4]:
# Read the saved HTML page (presumably the FAS course catalog, going by the
# file name) into memory as a single string.
raw_content = None
with open('fas_crse_cat.html', 'r') as catalog_file:
    raw_content = catalog_file.read()

# Parse the markup into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(raw_content, 'html.parser')

In [5]:
# Per the author's observation, all of the page's content lives in <div>
# elements that carry the CSS class `t`, so collect every such element.
# `elts` is what the parsing loop further down iterates over.
elts = soup.find_all('div', class_='t')

In [6]:
# Helper modules used by the cells below (`course.Course`, `parser.dump_csv`).
# Each one is reloaded right after import so that edits made to the module
# source are picked up without restarting the kernel.
# NOTE(review): `reload` is a builtin only on Python 2. The name `parser` is
# also the name of a standard-library module — the recorded output
# (`parser.pyc`) suggests the local file is the one being loaded; confirm.
import course
reload(course)
import parser
reload(parser)

<module 'parser' from 'parser.pyc'>

In [7]:
import re

# Markup layout, as observed by the author:
#   - an element with class .fs2 holds the title of a course
#   - the .fs3 elements that follow it hold descriptive strings

# every Course created by the parsing loop below, in the order encountered
courses = []
# the Course currently being filled in; stays None until the first one is created
current_course = None

def elt_has_classes(elt, class_list):
    """
    Returns True if the given HTML element has all of the given
    classes. The element may carry additional classes as well.

    elt: an element exposing `.get('class')`, e.g. a BeautifulSoup Tag,
        whose 'class' value is a list of class names.
    class_list: iterable of class names that must all be present.

    An element without a `class` attribute is treated as having no
    classes (instead of raising a KeyError), so the result is False
    unless class_list is empty.

    Usage: elt_has_classes(my_element, ['btn','btn-primary'])
    """
    # `or []` also covers a 'class' entry that is present but empty/None
    present_classes = elt.get('class') or []
    return set(class_list).issubset(present_classes)

# Loop-invariant helpers, built once instead of on every iteration.

# Finds a parenthesised numeric course id such as `(123591)`; the capture
# group holds just the digits, so no manual paren-stripping is needed.
course_id_matcher = re.compile(r"\((\d+)\)")

# page footers that must never be stored as course text
BANNED_STRINGS = (
    'HARVARD UNIVERSITY',
    '9/4/2017 0:39 AM',
    '9/24/2017 0:39 AM',
    'HARVARD UNIVERSITY 9/24/2017 0:39 AM'
)

# Walk the elements in order. A course-number element opens a new Course;
# every other non-empty element is attached to the most recently opened one.
for elt in elts:

    # first, ensure there's something actually here
    # FIXME could be bugs here from page breaks
    if elt.text is None or elt.text.strip() == "":
        continue
    elt_text = elt.text.strip()

    if elt_has_classes(elt, ['x5','fs2']):
        # .x5.fs2 marks course numbers (e.g. COMPSCI 50)
        # and begins the start of a section for the course

        # start a new course
        current_course = course.Course()
        courses.append(current_course)

        current_course.set_course_number(elt_text)
        continue

    # Everything below writes onto the current course. Anything that shows
    # up before the first course number has nowhere to go, so skip it
    # rather than failing on `None`.
    if current_course is None:
        continue

    if elt_has_classes(elt, ['x5','hc','ff1','fs3']):
        # this is course name (e.g. Introduction to Computer Science)
        # actually, it contains [course name, course id]
        # course id might not be in there, but don't worry about that for now

        # ASIDE:
        # one problem is that sometimes the course name
        # is written in a .x5.h6.ff1.fs3 (note h6 not hc) (these are classes, btw)
        # and we can't just search for those b/c many other types of data
        # use those same classes. so we'd have to search for those classes
        # and THEN apply some kind of regex or other conditional to make sure we're extracting
        # a course title and not anything else. basically it's messy and a TODO.

        # the element's direct children
        course_name_list = list(elt)

        # if it doesn't have exactly 2 elements, it's malformed... skip!
        if len(course_name_list) != 2:
            continue

        # now unpack it (reusing the list built above)
        course_name, course_id_container = course_name_list

        # write the name
        current_course.name = course_name.strip()

        # now the id
        current_course.set_id(course_id_container.text)

    elif elt_has_classes(elt, ['h10','ff1','fs5']):
        # this is an alternative way for course ids to be placed
        # now, this includes BAD text like `FAS: Meets Foreign Lang Req Hausa`
        # as well as legit things which are like
        # `(123591)`
        id_match = course_id_matcher.search(elt_text)
        if id_match is not None:
            # then we have a legit id number!!!!
            # Take the digits from the match itself, so surrounding text
            # (if any) can never leak into the id.
            # TODO: factor out to Course class
            current_course.id = id_match.group(1)

    elif elt_has_classes(elt, ['x5','ff3']):
        # this is an instructor name!
        current_course.instructors.append(elt_text)

    else:
        # these are descriptive strings

        # remove page footers
        if elt_text in BANNED_STRINGS:
            continue

        current_course.strings.append(elt_text)

In [8]:
# Now try to pull meaningful info out of the strings.
# The loop variable is deliberately NOT named `course`: that name is the
# imported `course` module, and rebinding it here would make the parsing
# cell (which calls `course.Course()`) fail when it is run again.
for parsed_course in courses:
    # raw data is there, now process it
    parsed_course.process_strings()

In [9]:
# Spot-check a single parsed course by displaying it.
# NOTE(review): 2000 looks like an arbitrary sample index — this raises
# IndexError if fewer than 2001 courses were parsed.
courses[2000]

{'name': u'Programming Languages', 'schedule': u'TR 1000 AM - 1129 AM', 'instructors': [u'Stephen Chong'], 'number': u'Computer Science 152', 'id': u'119629', 'semester': u'2018 Spring', 'actual_number': u'152', 'level': 'Undergraduate', 'strings': [u'2018 Spring (4 Credits)', u'Schedule:', u'TR 1000 AM - 1129 AM', u'Instructor Permissions:', u'None', u'Enrollment Cap:', u'n/a', u'Comprehensive introduction to the principal features and overall design of both traditional and modern programming', u'languages, including syntax, formal semantics, abstraction mechanisms, modularity, type systems, naming,', u'polymorphism, closures, continuations, and concurrency. Provides the intellectual tools needed to design, evaluate,', u'choose, and use programming languages.', u'Recommended Prep:', u'Computer Science 51; Computer Science 121 is recommended. Students', u'must have good programming skills, be very comfortable with recursion,', u'proofs, basic mathematical ideas and notations, including

In [10]:
# Dump the processed courses via the `parser` module's `dump_csv` helper
# (presumably to a CSV file, going by its name).
# NOTE(review): the output location and columns are decided inside
# parser.dump_csv, which is not visible here.
parser.dump_csv(courses)

In [31]:
import re

def time_to_military(time_string, meridian_string):
    """
    Convert a 12-hour clock time into a 24-hour ("military") integer.

    time_string: digits in `HHMM` form, e.g. '0130'.
    meridian_string: 'AM' or 'PM'.

    Returns the time as an int in HHMM form; being an int, leading zeros
    are dropped (e.g. 12:05 AM => 5).

    Usage: `time_to_military('0130','PM') => 1330
    """

    cleaned_time = int(time_string)

    # first off, 12:xx should actually be 0:xx
    # (floor division stays an integer on both Python 2 and Python 3)
    if cleaned_time // 100 == 12:
        cleaned_time -= 1200

    # now add 1200 to PMs
    if meridian_string == "PM":
        # e.g. 1:30pm == 1330 hours
        cleaned_time += 1200

    return cleaned_time


# sample schedule strings to try the parsing on
schedules = [
    "MW 1100 AM - 1159 AM",
    "TR 1000 AM - 1059 AM",
    "W 0200 PM - 0359 PM",
    "TR 1130 AM - 1259 PM",
    "TBD"
]

# Captures, in order: days, start time, start AM/PM, end time, end AM/PM.
# Raw string so that `\d` reaches the regex engine untouched.
schedule_matcher = re.compile(r"([MTWRF]{1,5}) (\d{4}) ([AP]M) - (\d{4}) ([AP]M)")

# convert "M" to "Monday", etc (built once; it does not change per schedule)
abbreviation_dict = {
    "M": "Monday",
    "T": "Tuesday",
    "W": "Wednesday",
    "R": "Thursday",
    "F": "Friday"
}

for schedule in schedules:
    matches = schedule_matcher.findall(schedule)
    if len(matches) != 1:
        # nothing parseable here (e.g. "TBD"), so there is nothing to print
        continue

    # we found something!
    # this contains, in order:
    # 0. Days
    # 1. Start time
    # 2. Start AM or PM
    # 3. End time
    # 4. End AM or PM
    # e.g. ('MW', '1100', 'AM', '1159', 'AM')
    schedule_chunks = matches[0]

    # convert start and end time to military time
    # e.g. 0130 PM => 1330
    start_time = time_to_military(schedule_chunks[1], schedule_chunks[2])
    end_time = time_to_military(schedule_chunks[3], schedule_chunks[4])

    # Printing one tuple object gives `(start, end)` on Python 2 and 3 alike.
    print((start_time, end_time))

    # convert the days into better-readable ones
    # "MW" => ["Monday", "Wednesday"]
    # e.g. "MW"
    days_abbreviation_string = schedule_chunks[0]
    # e.g. ["M","W"]
    days_abbreviation_list = list(days_abbreviation_string)

    days_full_list = [abbreviation_dict[d] for d in days_abbreviation_list]
    print(days_full_list)


(1100, 1159)
['Monday', 'Wednesday']
(1000, 1059)
['Tuesday', 'Thursday']
(1400, 1559)
['Wednesday']
(1130, 1259)
['Tuesday', 'Thursday']
