## Web Scraping

In [1]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup, Tag, NavigableString
from IPython.display import display

In [2]:
# Download the catalog page and parse it into a BeautifulSoup document.
url = 'https://catalog.mit.edu/subjects/6/'
# A timeout keeps this cell from hanging indefinitely if the server stalls.
page = requests.get(url, timeout=30)
# Stop here on an HTTP error status rather than parsing an error page.
page.raise_for_status()
doc = BeautifulSoup(page.text, 'html.parser')

In [3]:
# Locate the <div> with id "sc_sccoursedescs"; its children are walked in the next cell.
main = doc.find("div", attrs={"id": "sc_sccoursedescs"})

In [4]:
def _clean_text(node):
    """Return the text of ``node`` with each fragment stripped and joined by
    single spaces, or None when ``node`` is None (element not found)."""
    if node is None:
        return None
    return node.get_text(strip=True, separator=' ')


# Walk the direct children of `main`, remembering the latest section heading
# and emitting one dict per course block into `table`.
table = []
sectionhead = None  # most recent "sectionhead" text; attached to each following course

for tag in list(main.children):
    # Skip bare strings that sit between elements.
    if not isinstance(tag, Tag):
        continue

    # .get() avoids a KeyError on elements that carry no class attribute.
    tag_classes = tag.get('class') or []

    if "sectionhead" in tag_classes:
        sectionhead = tag.string

    elif "courseblock" in tag_classes:
        title = prereq = terms = hours = optional = desc = instructor = None
        for child in tag.children:
            if not isinstance(child, Tag):
                continue

            child_classes = child.get('class') or []

            if "courseblocktitle" in child_classes:
                title = _clean_text(child.find("strong"))

            elif "courseblockextra" in child_classes:
                # Any sub-element that is absent is recorded as None instead of
                # raising AttributeError and aborting the whole scrape.
                prereq = _clean_text(child.find(class_="courseblockprereq"))
                terms = _clean_text(child.find(class_="courseblockterms"))
                hours = _clean_text(child.find(class_="courseblockhours"))
                optional = _clean_text(child.find(class_="courseblockoptional"))

            elif "courseblockdesc" in child_classes:
                desc = _clean_text(child)

            elif "courseblockinstructors" in child_classes:
                instructor = _clean_text(child.find("i"))

        table.append({
            "title": title, "prereq": prereq,
            "terms": terms, "hours": hours, "optional": optional,
            "description": desc, "instructor": instructor,
            "section_head": sectionhead})

In [5]:
# Build one row per scraped course, then preview the first rows and the summary statistics.
df = pd.DataFrame(data=table)
display(df.head(), df.describe())

Unnamed: 0,title,prereq,terms,hours,optional,description,instructor,section_head
0,6.100A Introduction to Computer Science Progra...,Prereq: None,"U (Fall, Spring; first half of term)",2-0-4 units,Credit cannot also be received for 6.100L,Introduction to computer science and programmi...,A. Bell,Programming & Software Engineering
1,6.100B Introduction to Computational Thinking ...,Prereq: 6.100A or permission of instructor,"U (Fall, Spring; second half of term)",2-0-4 units,"Credit cannot also be received for 9.C20[J] , ...",Provides an introduction to using computation ...,"A. Bell, J. V. Guttag",Programming & Software Engineering
2,6.100L Introduction to Computer Science and Pr...,Prereq: None,"U (Fall, Spring)",2-0-4 units,Credit cannot also be received for 6.100A,Introduction to computer science and programmi...,"A. Bell, J. V. Guttag",Programming & Software Engineering
3,6.1010 Fundamentals of Programming,Prereq: 6.100A,"U (Fall, Spring)",2-4-6 units. Institute LAB,,Introduces fundamental concepts of programming...,"D. S. Boning, A. Chlipala, S. Devadas, A. Hartz",Programming & Software Engineering
4,6.1020 Software Construction,Prereq: 6.1010,U (Spring),3-0-12 units,,Introduces fundamental principles and techniqu...,"M. Goldman, R. C. Miller",Programming & Software Engineering


Unnamed: 0,title,prereq,terms,hours,optional,description,instructor,section_head
count,406,406,406,406,143,406,406,406
unique,406,179,40,70,12,286,219,32
top,6.C85[J] Interactive Data Visualization and So...,Prereq: None,G (Fall),3-0-9 units,Can be repeated for credit.,Covers subject matter not offered in the regul...,Consult Department,Special Subjects
freq,1,95,56,126,130,76,95,108


In [6]:
# Sanity check: list every row whose prereq text does not begin with "Prereq: ".
lacks_prefix = df['prereq'].str.slice(stop=8).ne("Prereq: ")
df[lacks_prefix]

Unnamed: 0,title,prereq,terms,hours,optional,description,instructor,section_head


In [7]:
# Remove the leading "Prereq: " label from every entry. removeprefix only strips
# the label when it is actually present, so re-running this cell is harmless
# (a positional slice would chop eight more characters off on every re-run).
df['prereq'] = df['prereq'].str.removeprefix("Prereq: ")

In [8]:
# Split each title at its first run of whitespace: the leading token becomes
# the subject code and the remainder stays as the title.
title_parts = df['title'].str.split(n=1)
df['code'] = title_parts.str[0]
df['title'] = title_parts.str[1]
del title_parts

# Move the code column to the front by popping it and re-inserting it at position 0.
code_column = df.pop('code')
df.insert(0, 'code', code_column)
df.head()

Unnamed: 0,code,title,prereq,terms,hours,optional,description,instructor,section_head
0,6.100A,Introduction to Computer Science Programming i...,,"U (Fall, Spring; first half of term)",2-0-4 units,Credit cannot also be received for 6.100L,Introduction to computer science and programmi...,A. Bell,Programming & Software Engineering
1,6.100B,Introduction to Computational Thinking and Dat...,6.100A or permission of instructor,"U (Fall, Spring; second half of term)",2-0-4 units,"Credit cannot also be received for 9.C20[J] , ...",Provides an introduction to using computation ...,"A. Bell, J. V. Guttag",Programming & Software Engineering
2,6.100L,Introduction to Computer Science and Programming,,"U (Fall, Spring)",2-0-4 units,Credit cannot also be received for 6.100A,Introduction to computer science and programmi...,"A. Bell, J. V. Guttag",Programming & Software Engineering
3,6.1010,Fundamentals of Programming,6.100A,"U (Fall, Spring)",2-4-6 units. Institute LAB,,Introduces fundamental concepts of programming...,"D. S. Boning, A. Chlipala, S. Devadas, A. Hartz",Programming & Software Engineering
4,6.1020,Software Construction,6.1010,U (Spring),3-0-12 units,,Introduces fundamental principles and techniqu...,"M. Goldman, R. C. Miller",Programming & Software Engineering


In [9]:
# Tally the first character of every terms string.
terms_first_char = df['terms'].str.get(0)
terms_first_char.value_counts()

terms
U    198
G    174
A     34
Name: count, dtype: int64

In [10]:
# Preview the rows whose terms string begins with 'A'.
starts_with_a = df['terms'].str.get(0).eq('A')
df[starts_with_a].head()

Unnamed: 0,code,title,prereq,terms,hours,optional,description,instructor,section_head
13,6.5120,Formal Reasoning About Programs,6.1020 and 6.1200[J],Acad Year 2024-2025: Not offered Acad Year 202...,3-0-9 units,,Surveys techniques for rigorous mathematical r...,A. Chlipala,Programming Languages
22,6.1420,Fixed Parameter and Fine-grained Computation,"6.1200[J] , 6.1210 , and ( 6.1220[J] , 6.1400[...",Acad Year 2024-2025: U (Fall) Acad Year 2025-2...,3-0-9 units,,An overview of the theory of parameterized alg...,"R. Williams, V. Williams",Theoretical Computer Science
24,6.5220[J],Randomized Algorithms,( 6.1200[J] or 6.3700 ) and ( 6.1220[J] or 6.5...,Acad Year 2024-2025: G (Spring) Acad Year 2025...,5-0-7 units,,Studies how randomization can be used to make ...,D. R. Karger,Theoretical Computer Science
25,6.5230,Advanced Data Structures,6.1220[J],Acad Year 2024-2025: G (Spring) Acad Year 2025...,3-0-9 units,,More advanced and powerful data structures for...,E. D. Demaine,Theoretical Computer Science
26,6.5240,Sublinear Time Algorithms,6.1220[J] or permission of instructor,Acad Year 2024-2025: G (Fall) Acad Year 2025-2...,3-0-9 units,,Sublinear time algorithms understand parameter...,R. Rubinfeld,Theoretical Computer Science


In [11]:
def GU_extract(term):
    """Report which of the standalone tokens 'G' and 'U' occur in ``term``.

    ``term`` is split on whitespace and only whole tokens are matched.

    Returns:
        The single matching letter as a string when exactly one of the two
        is present; otherwise a list of the matches in the order G, U
        (``['G', 'U']`` when both occur, ``[]`` when neither does).
    """
    tokens = term.split()
    matches = [letter for letter in ('G', 'U') if letter in tokens]
    return matches[0] if len(matches) == 1 else matches
    
# Derive the offered_to column from terms, then show its value counts and a preview of the frame.
df['offered_to'] = df['terms'].apply(GU_extract)
display(df['offered_to'].value_counts(), df.head())

offered_to
U    206
G    200
Name: count, dtype: int64

Unnamed: 0,code,title,prereq,terms,hours,optional,description,instructor,section_head,offered_to
0,6.100A,Introduction to Computer Science Programming i...,,"U (Fall, Spring; first half of term)",2-0-4 units,Credit cannot also be received for 6.100L,Introduction to computer science and programmi...,A. Bell,Programming & Software Engineering,U
1,6.100B,Introduction to Computational Thinking and Dat...,6.100A or permission of instructor,"U (Fall, Spring; second half of term)",2-0-4 units,"Credit cannot also be received for 9.C20[J] , ...",Provides an introduction to using computation ...,"A. Bell, J. V. Guttag",Programming & Software Engineering,U
2,6.100L,Introduction to Computer Science and Programming,,"U (Fall, Spring)",2-0-4 units,Credit cannot also be received for 6.100A,Introduction to computer science and programmi...,"A. Bell, J. V. Guttag",Programming & Software Engineering,U
3,6.1010,Fundamentals of Programming,6.100A,"U (Fall, Spring)",2-4-6 units. Institute LAB,,Introduces fundamental concepts of programming...,"D. S. Boning, A. Chlipala, S. Devadas, A. Hartz",Programming & Software Engineering,U
4,6.1020,Software Construction,6.1010,U (Spring),3-0-12 units,,Introduces fundamental principles and techniqu...,"M. Goldman, R. C. Miller",Programming & Software Engineering,U


In [17]:
# Turn the literal string "None" into a real missing value. The dict form is
# used because older pandas releases treat replace("None", None) as "no value
# given" and pad-fill the matches from the previous row instead of nulling them.
df['prereq'] = df['prereq'].replace({"None": None})

In [18]:
# Save to csv for future reference.
output_path = Path("data/Cleaned_EECE.csv")
# Create the target directory first; to_csv raises if it does not exist.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)