In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pprint import pprint

main_url = 'https://catalog.upenn.edu/courses/'
page_data = requests.get(main_url)

soup = BeautifulSoup(page_data.content, 'html.parser')

pages = soup.find(id='content')
depts = pages.find_all('a')

link_extensions = []
for dept in depts:
    link = dept.get('href')
    if isinstance(link, str) == True and '/courses/' in link:
        link_extensions.append(link)

base_url = 'https://catalog.upenn.edu'

courses_dict = {
    'department' : [],
    'course_name' : [],
    'course_description' : [],
    'instructor' : [],
    'course_type' : [],
    'credit_units' : [],
    'course_prereqs' : [],
    'term' : [],
    'also_offered_as' : []
}
course_counter = 0
dept_counter = 0
for link in link_extensions:
    full_url = base_url + link
    dept_abbreviation = link.replace('/courses/','').replace('/','').strip()
    
    page_data = requests.get(full_url)

    soup = BeautifulSoup(page_data.content, 'html.parser')

    results = soup.find(id='content')

    courses = results.find_all('div', class_='courseblock')

    for course in courses:
        course_name = str(course.find('p', class_='courseblocktitle noindent').text.strip())
        course_elements = course.find_all('p', class_='courseblockextra noindent')
        course_description = 'Unknown'
        instructor = 'Unknown'
        course_type = 'Unknown'
        credit_units = 'Unknown'
        course_prereqs = 'No Pre-Requisites for this Course'
        term = 'Unknown'
        also_offered_as = 'No Other Courses Match This Course'
        for element in course_elements:
            if 'Taught by' in element.text:
                instructor = element.text.replace('Taught by:','').strip()
            elif 'Activity:' in element.text:
                course_type = element.text.replace('Activity:', '').strip()
            elif 'Course Unit' in element.text:
                credit_units = element.text.replace('Course Unit', '').replace('s','').strip()
            elif 'Prerequisite:' in element.text:
                prereq_text = element.text.split('Prerequisite:',1)[1].strip()
#                 course_prereqs = element.text.replace('Prerequisite:', '').strip()
            elif 'course' in element.text.lower() and 'term' in element.text.lower() and 'offered' in element.text.lower():
                term = element.text
            elif 'Also Offered As:' in element.text:
                also_offered_as = element.text.replace('Also Offered As:', '').strip()
            else:
                course_description = element.text
        courses_dict['department'].append(dept_abbreviation)
        courses_dict['course_name'].append(str(course_name))
        courses_dict['course_description'].append(course_description)
        courses_dict['instructor'].append(instructor)
        courses_dict['course_type'].append(course_type)
        courses_dict['credit_units'].append(credit_units)
        courses_dict['course_prereqs'].append(prereq_text)
        courses_dict['term'].append(term)
        courses_dict['also_offered_as'].append(also_offered_as)
        course_counter += 1
    dept_counter += 1
    if dept_counter % 25 == 0:
        print('{} courses for {} departments scraped!'.format(course_counter, dept_counter))

courses_df = pd.DataFrame(courses_dict)
courses_df

1151 courses for 25 departments scraped!
2540 courses for 50 departments scraped!
3103 courses for 75 departments scraped!
4541 courses for 100 departments scraped!
5620 courses for 125 departments scraped!
6407 courses for 150 departments scraped!
7339 courses for 175 departments scraped!
8542 courses for 200 departments scraped!
9356 courses for 225 departments scraped!


Unnamed: 0,department,course_name,course_description,instructor,course_type,credit_units,course_prereqs,term,also_offered_as
0,acfd,ACFD 600 Personal Development I,Unknown,"Jessica Dine, MD",Hybrid Course,0.5,"Faculty interested in enrolling in ACFD 600, P...",Unknown,No Other Courses Match This Course
1,acfd,ACFD 601 Scholarship Development I,Unknown,"Jessica Dine, MD",Hybrid Course,0.5,"Faculty interested in enrolling in ACFD 601, S...",Unknown,No Other Courses Match This Course
2,acfd,ACFD 602 Personal Development II,Unknown,"Jessica Dine, MD",Hybrid Course,0.5,"Faculty interested in enrolling in ACFD 602, P...",Unknown,No Other Courses Match This Course
3,acfd,ACFD 603 Scholarship Development II,Unknown,"Jessica Dine, MD",Hybrid Course,0.5,"Faculty interested in enrolling in ACFD 603, S...",Unknown,No Other Courses Match This Course
4,acct,ACCT 101 Accounting and Financial Reporting,This course is an introduction to the basic co...,Unknown,Lecture,1.0,"Faculty interested in enrolling in ACFD 603, S...",One-term course offered either term,No Other Courses Match This Course
...,...,...,...,...,...,...,...,...,...
9746,zulu,ZULU 251 Intermediate Zulu II,For BA Students: Last Language Course,Unknown,Lecture,1.0,Offered through Penn Language Center,Course usually offered in spring term,No Other Courses Match This Course
9747,zulu,ZULU 350 Advanced Zulu I,For BA Students: Advanced Language Course,Mbeje,Lecture,1.0,Offered through Penn Language Center,Course usually offered in fall term,No Other Courses Match This Course
9748,zulu,ZULU 351 Advanced Zulu II,For BA Students: Advanced Language Course,Mbeje,Lecture,1.0,Offered through Penn Language Center,Course usually offered in spring term,No Other Courses Match This Course
9749,zulu,ZULU 450 Zulu Language and Culture I,Unknown,Unknown,Lecture,1.0,Offered through Penn Language Center,Course usually offered in spring term,No Other Courses Match This Course


In [2]:
def extract_course_num(course_name):
    course_num = [int(i) for i in course_name.split() if i.isdigit()][0]
    return course_num

courses_df['course_number'] = courses_df['course_name'].apply(extract_course_num)
courses_df = courses_df[['department','course_number','course_name','course_description','instructor','course_type','credit_units','course_prereqs','term','also_offered_as']]
courses_df.to_csv('grad_courses.csv', index=False)