# Web scraper for UM academic calendar data

@Author: [Jeff Lockhart](http://www-personal.umich.edu/~jwlock/)

### Example URLs: 

- http://ro.umich.edu/calendar/ss17.php
- http://ro.umich.edu/calendar/fa18.php
- http://ro.umich.edu/calendar/wn10.php

### Imports

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import re

## Downloading one web page

In [None]:
# Fetch a single term page (fall 2018) to explore interactively.
url = "http://ro.umich.edu/calendar/fa18.php"
# requests applies no timeout by default; without one a stalled server
# would leave this cell hanging forever.
r = requests.get(url, timeout=30)
# Bare expression: the notebook echoes the Response object.
r

In [None]:
# HTTP status code of the response; the full scraper further down treats
# 200 as "the page exists".
r.status_code

In [None]:
# Raw body of the response; this is what gets handed to BeautifulSoup later.
r.content

## Downloading many web pages

In [None]:
# Years to scrape: 2003 through 2018 (range() excludes its upper bound).
years = range(2003, 2019)
# Materialize the range only so the notebook displays its values.
list(years)

In [None]:
# The first year, converted to a string.
str(years[0])

In [None]:
# Dropping the first two characters leaves the two-digit year that the
# calendar URLs use (e.g. '2003' -> '03').
str(years[0])[2:]

In [None]:
# Pieces of a calendar URL: <base_url><term code><two-digit year><end_url>
base_url = 'http://ro.umich.edu/calendar/'
end_url = '.php'
terms = ['fa', 'wn']

# Preview every URL the scraper will request; nothing is downloaded here.
for y in years:
    for t in terms:
        url = '{}{}{}{}'.format(base_url, t, str(y)[2:], end_url)
        print(url)

## Working with the HTML

In [None]:
# Body of the page downloaded in the first section.
page = r.content

# Parse the page with bs4, using the 'html.parser' backend.
soup = BeautifulSoup(page, 'html.parser')
# Bare expression: the notebook displays the parsed document.
soup

In [None]:
# Select just the table of interest: find() returns the first <table> it
# encounters in the parsed page.
table = soup.find('table')
table

In [None]:
# Select just the rows (<tr> elements) in the table, then show the first one.
rows = table.find_all('tr')
rows[0]

In [None]:
# Cells (<td> elements) of the first row; show the first cell.
cells = rows[0].find_all(['td'])
cells[0]

In [None]:
# The text inside the first cell, without the surrounding HTML tags.
cells[0].text

## Getting a table from the HTML

In [None]:
# Column names for the first and second <td> of each row.
labels = ['event', 'times']
data = []
rows = table.find_all('tr')

# Build one dict per table row. The loop variable is `row`, not `r`, so the
# Response object stored in `r` by the download cell is not overwritten.
for row in rows:
    # zip() pairs cells with labels and ignores any cell beyond the labelled
    # ones, where indexing labels[i] would raise IndexError.
    tmp = {label: cell.text for label, cell in zip(labels, row.find_all('td'))}
    data.append(tmp)

# Convert our findings to a dataframe; rows with missing cells become NaN.
df = pd.DataFrame(data)
df

### Converting the text to dates
- This particular text is messy, so getting dates is hard
- We don't have time to go into `regular expressions` during the talk, but they are the solution.
- The code below uses regular expressions to get the dates from the text.
- Don't worry about how it works right now, just look to see that it does.

In [None]:
def get_dates(txt, y):
    """Parse the leading "<word> <number>" of a calendar cell into a date.

    Args:
        txt: Cell text such as "Dec 10, Mon". Any value is accepted because
            it is converted with str() first.
        y: Year to attach, since the matched text only holds a month and day.

    Returns:
        The result of pd.to_datetime() on "<word> <number>, <y>", or None
        when the text does not start with a word, one whitespace character
        and digits.
    """
    # Raw string: '\w', '\s' and '\d' are invalid escape sequences in a
    # normal string literal and trigger a warning on recent Python versions.
    m = re.search(r'^(\w+\s\d+)', str(txt))
    if m is None:
        return None

    return pd.to_datetime(m.group(1) + ', ' + str(y))

# The page parsed above is fa18, so its dates belong to 2018. Pass the year
# explicitly rather than reusing `y`, which only exists as a leftover of the
# URL-printing loop.
df['date'] = df.times.apply(get_dates, y=2018)
df

## Some helpful functions for selecting information out of the text

In [None]:
def exam(txt):
    """Return 1 if the text mentions "exam" (case-insensitive), else 0."""
    return int('exam' in str(txt).lower())

def class_start(txt):
    """Return 1 if the text says classes begin or resume, else 0.

    Matching is case-insensitive and purely substring based: the text must
    contain 'classes' together with 'begin' or 'resume'.
    """
    lowered = str(txt).lower()
    mentions_classes = 'classes' in lowered
    begins_or_resumes = 'begin' in lowered or 'resume' in lowered
    return 1 if mentions_classes and begins_or_resumes else 0

def class_stop(txt):
    """Return 1 if the text marks a point where classes stop, else 0.

    Case-insensitive substring rules, checked in this order:
      * text containing 'classes' counts only if it also contains 'end';
      * otherwise any text containing 'recess' counts;
      * otherwise 'vacation' counts only together with 'begin'.
    """
    lowered = str(txt).lower()

    # Once 'classes' is present, the recess/vacation rules are not consulted.
    if 'classes' in lowered:
        return int('end' in lowered)
    if 'recess' in lowered:
        return 1
    return int('vacation' in lowered and 'begin' in lowered)

def get_table(page, y):
    """Turn one term's calendar page into a DataFrame of dated events.

    Args:
        page: HTML of the calendar page, as accepted by BeautifulSoup.
        y: Year passed on to get_dates() for every row.

    Returns:
        DataFrame with the columns event, times, date, exams, class_start
        and class_stop, holding only the rows in which none of them is null.
        It is empty when the page has no <table> or no row yields a date.
    """
    labels = ['event', 'times']

    # parse page with bs4 and select just the table of interest
    table = BeautifulSoup(page, 'html.parser').find('table')
    # find() returns None when the page has no <table>; treat that as zero
    # rows instead of failing with AttributeError.
    rows = table.find_all('tr') if table is not None else []

    # Each row is one calendar entry: first cell -> event, second -> times.
    # zip() ignores cells beyond the labelled ones, where indexing labels[i]
    # would raise IndexError.
    data = [
        {label: cell.text for label, cell in zip(labels, row.find_all('td'))}
        for row in rows
    ]

    # Naming the columns keeps 'event' and 'times' present even when no row
    # supplied them, so the column lookups below cannot fail.
    df = pd.DataFrame(data, columns=labels)

    df['date'] = df.times.apply(get_dates, y=y)
    df['exams'] = df.event.apply(exam)
    df['class_start'] = df.event.apply(class_start)
    df['class_stop'] = df.event.apply(class_stop)

    # drop the pesky null rows (rows without cells or without a date)
    df = df.dropna(axis=0, how='any')

    return df

## The full scraper

In [None]:
terms = ['fa', 'wn']
years = range(2003, 2019)
base_url = 'http://ro.umich.edu/calendar/'
end_url = '.php'

# Collect each term's table and concatenate once at the end; concatenating
# inside the loop re-copies all previously gathered rows for every page.
frames = []

for y in years:
    for t in terms:
        #get the page for this term
        url = base_url+t+str(y)[2:]+end_url
        # requests applies no timeout by default; set one so a stalled
        # server cannot hang the whole scrape.
        r = requests.get(url, timeout=30)

        #if the page exists
        if r.status_code == 200:
            print('Processing', url)
            frames.append(get_table(r.content, y))
        else:
            #some years don't have data. Ignore them and move on.
            print('Error with', url)

        #wait to be a polite lil spider
        time.sleep(2)

# pd.concat() rejects an empty list, so fall back to an empty frame when no
# page could be processed.
df = pd.concat(frames) if frames else pd.DataFrame()
df.shape

In [None]:
# Show the first rows of the combined table.
df.head()

## Exploring and saving the data

In [None]:
# Sort our data by date and peek at it.
df = df.sort_values(by='date')
df.head()

In [None]:
# Save the calendar as tab-separated text, leaving out the index column.
# NOTE(review): presumably the data/ directory must already exist — confirm.
df.to_csv('data/UM_academic_calendar_no_summer.tsv', 
          sep='\t', index=False)

## Selecting just one year

In [None]:
# Boolean mask on the year of the date column keeps only the 2015 rows.
df[df.date.dt.year == 2015]

## Figuring out when class is in session

In [None]:
# Variables for storing data: `data` collects one dict per class period,
# `tmp` is the period currently being built by the loop in the next cell.
data = []
tmp = {}

In [None]:
# Walk the calendar rows (sorted by date in an earlier cell) and pair each
# class start with the stop that follows it.
for index, row in df.iterrows():
    if row.class_start == 1:
        # A new period begins: store the one collected so far, if any.
        if tmp:
            data.append(tmp)
        tmp = {'class_start': row.date}
    elif row.class_stop == 1:
        # The last stop seen before the next start wins.
        tmp['class_end'] = row.date

# The final period is not followed by another start, so store it here;
# without this step it would be silently lost.
if tmp:
    data.append(tmp)
    tmp = {}

data

In [None]:
# Convert the list of period dicts to a dataframe (one row per dict).
classes = pd.DataFrame(data)
classes.head()

In [None]:
# Rearrange columns so the start comes before the end.
classes = classes[['class_start', 'class_end']]
# Drop rows with a missing value, i.e. periods lacking a start or an end.
classes = classes.dropna(axis=0)
classes.head()

In [None]:
# Save the class periods as tab-separated text, leaving out the index column.
# NOTE(review): presumably the data/ directory must already exist — confirm.
classes.to_csv('data/UM_class_periods_no_summer.tsv', 
          sep='\t', index=False)