# Import Libraries:

In [1]:
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests

# <center><i><b>Environment & Sustainability

### <center><b> Web Scraping the Data (Environment & Sustainability: Earth Systems and Climate Science)

## Extracting Course Title, Course Level, Course Description

In [5]:
driver = webdriver.Chrome() 

# Load the webpage
url = "https://ocw.mit.edu/course-lists/environment-earth-systems-and-climate-science-5/" 
driver.get(url)

# Find all course cards
course_cards = driver.find_elements(By.CSS_SELECTOR, ".card.course-collection-row.mb-1")
courses_data = []
for card in course_cards:
    # Extract information from each card
    course_title = card.find_element(By.CSS_SELECTOR, ".course-title h4").text
    course_number = card.find_element(By.CSS_SELECTOR, ".coursenum").text
    course_level = card.find_element(By.CSS_SELECTOR, ".level").text
    course_link = card.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
    
    driver.execute_script(f"window.open('{course_link}', '_blank');")
    driver.switch_to.window(driver.window_handles[-1])

    try:
        
        show_more_button = driver.find_element(By.ID, "expand-description")
        if show_more_button:
            show_more_button.click()
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#expanded-description"))
            )
            course_description = driver.find_element(By.CSS_SELECTOR, "#expanded-description").text
            courses_data.append({
                'Title':course_title, 
                'Difficult':course_level, 
                'Description':course_description,
                'Link': course_link
            })
        else:
            course_description = driver.find_element(By.CSS_SELECTOR, "#full-description").text
            
            courses_data.append({'Description': np.nan})
        
        # print(f"Course Title: {course_title}")
        # print(f"Course Description: {course_description}")
    except:
        print(f"Error scraping data for: {course_link}")


    driver.close()
    driver.switch_to.window(driver.window_handles[0])

driver.quit()

Error scraping data for: https://ocw.mit.edu/courses/1-72-groundwater-hydrology-fall-2005/
Error scraping data for: https://ocw.mit.edu/courses/12-009j-theoretical-environmental-analysis-spring-2015/
Error scraping data for: https://ocw.mit.edu/courses/12-842-climate-physics-and-chemistry-fall-2008/


In [6]:
courses_data_df = pd.DataFrame(courses_data)

In [7]:
courses_data_df.head()

Unnamed: 0,Title,Difficult,Description,Link
0,Ecology I: The Earth System,Undergraduate,"We will cover fundamentals of ecology, conside...",https://ocw.mit.edu/courses/1-018j-ecology-i-t...
1,Ecology II: Engineering for Sustainability,Undergraduate,"This course provides a review of physical, che...",https://ocw.mit.edu/courses/1-020-ecology-ii-e...
2,Transport Processes in the Environment,Undergraduate,This class serves as an introduction to mass t...,https://ocw.mit.edu/courses/1-061-transport-pr...
3,Advanced Fluid Dynamics of the Environment,Graduate,Designed to familiarize students with theories...,https://ocw.mit.edu/courses/1-63-advanced-flui...
4,"Land, Water, Food, and Climate",Graduate,"This reading seminar examines land, water, foo...",https://ocw.mit.edu/courses/1-74-land-water-fo...


## Extracting Course Title, Course Level, Course Description For Links facing Error

In [8]:
driver = webdriver.Chrome() 

url = "https://ocw.mit.edu/course-lists/environment-earth-systems-and-climate-science-5/"

driver.get(url)

course_cards = driver.find_elements(By.CSS_SELECTOR, ".card.course-collection-row.mb-1")
courses_data_1 = []
for card in course_cards:

    
    course_title = card.find_element(By.CSS_SELECTOR, ".course-title h4").text
    course_number = card.find_element(By.CSS_SELECTOR, ".coursenum").text
    course_level = card.find_element(By.CSS_SELECTOR, ".level").text
    course_link = card.find_element(By.CSS_SELECTOR, "a").get_attribute("href")

    driver.execute_script(f"window.open('{course_link}', '_blank');")
    driver.switch_to.window(driver.window_handles[-1])

    try:

        course_description = driver.find_element(By.ID, "full-description").text
        courses_data_1.append({
                'Title':course_title, 
                'Difficult':course_level, 
                'Description':course_description,
                'Link':course_link
            })
        # print(f"Course Title: {course_title}")
        # print(f"Course Description: {course_description}")
    except:
        print(f"Error scraping data for: {course_link}")
    
    driver.close()
    
    driver.switch_to.window(driver.window_handles[0])
    
driver.quit()

Error scraping data for: https://ocw.mit.edu/courses/1-018j-ecology-i-the-earth-system-fall-2009/
Error scraping data for: https://ocw.mit.edu/courses/1-020-ecology-ii-engineering-for-sustainability-spring-2008/
Error scraping data for: https://ocw.mit.edu/courses/1-061-transport-processes-in-the-environment-fall-2008/
Error scraping data for: https://ocw.mit.edu/courses/1-63-advanced-fluid-dynamics-of-the-environment-fall-2002/
Error scraping data for: https://ocw.mit.edu/courses/1-74-land-water-food-and-climate-fall-2020/
Error scraping data for: https://ocw.mit.edu/courses/1-84j-atmospheric-chemistry-fall-2013/
Error scraping data for: https://ocw.mit.edu/courses/5-60-thermodynamics-kinetics-spring-2008/
Error scraping data for: https://ocw.mit.edu/courses/8-21-the-physics-of-energy-fall-2009/
Error scraping data for: https://ocw.mit.edu/courses/12-000-solving-complex-problems-fall-2009/
Error scraping data for: https://ocw.mit.edu/courses/12-001-introduction-to-geology-fall-2013/
E

In [9]:
courses_data_df_1 = pd.DataFrame(courses_data_1)

In [10]:
courses_data_df_1

Unnamed: 0,Title,Difficult,Description,Link
0,Groundwater Hydrology,Graduate,This course covers fundamentals of subsurface ...,https://ocw.mit.edu/courses/1-72-groundwater-h...
1,Theoretical Environmental Analysis,Undergraduate,This course analyzes cooperative processes tha...,https://ocw.mit.edu/courses/12-009j-theoretica...
2,Climate Physics and Chemistry,Graduate,This course introduces students to climate stu...,https://ocw.mit.edu/courses/12-842-climate-phy...


## Concatinating both DataFrame consist of Title, Difficult, Description, Link

In [11]:
compelte_df = pd.concat([courses_data_df, courses_data_df_1])
compelte_df.head()

Unnamed: 0,Title,Difficult,Description,Link
0,Ecology I: The Earth System,Undergraduate,"We will cover fundamentals of ecology, conside...",https://ocw.mit.edu/courses/1-018j-ecology-i-t...
1,Ecology II: Engineering for Sustainability,Undergraduate,"This course provides a review of physical, che...",https://ocw.mit.edu/courses/1-020-ecology-ii-e...
2,Transport Processes in the Environment,Undergraduate,This class serves as an introduction to mass t...,https://ocw.mit.edu/courses/1-061-transport-pr...
3,Advanced Fluid Dynamics of the Environment,Graduate,Designed to familiarize students with theories...,https://ocw.mit.edu/courses/1-63-advanced-flui...
4,"Land, Water, Food, and Climate",Graduate,"This reading seminar examines land, water, foo...",https://ocw.mit.edu/courses/1-74-land-water-fo...


## Extracting Course Department

In [12]:
def extract_department_name(url):
    
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        department_element = soup.find('a', class_='course-info-department')
        department_name = department_element.text.strip()
        dep_names.append(department_name)
    except AttributeError:
        dep_names.append(np.nan)

    return dep_names

course_urls = compelte_df['Link'].values
dep_names = []
for url in course_urls:
    department_names = extract_department_name(url)
    print(f"Department for {url}: {department_names}")

Department for https://ocw.mit.edu/courses/1-018j-ecology-i-the-earth-system-fall-2009/: ['Civil and Environmental Engineering']
Department for https://ocw.mit.edu/courses/1-020-ecology-ii-engineering-for-sustainability-spring-2008/: ['Civil and Environmental Engineering', 'Civil and Environmental Engineering']
Department for https://ocw.mit.edu/courses/1-061-transport-processes-in-the-environment-fall-2008/: ['Civil and Environmental Engineering', 'Civil and Environmental Engineering', 'Civil and Environmental Engineering']
Department for https://ocw.mit.edu/courses/1-63-advanced-fluid-dynamics-of-the-environment-fall-2002/: ['Civil and Environmental Engineering', 'Civil and Environmental Engineering', 'Civil and Environmental Engineering', 'Civil and Environmental Engineering']
Department for https://ocw.mit.edu/courses/1-74-land-water-food-and-climate-fall-2020/: ['Civil and Environmental Engineering', 'Civil and Environmental Engineering', 'Civil and Environmental Engineering', 'Ci

In [13]:
compelte_df['Departments'] = dep_names

In [14]:
compelte_df.head(3)

Unnamed: 0,Title,Difficult,Description,Link,Departments
0,Ecology I: The Earth System,Undergraduate,"We will cover fundamentals of ecology, conside...",https://ocw.mit.edu/courses/1-018j-ecology-i-t...,Civil and Environmental Engineering
1,Ecology II: Engineering for Sustainability,Undergraduate,"This course provides a review of physical, che...",https://ocw.mit.edu/courses/1-020-ecology-ii-e...,Civil and Environmental Engineering
2,Transport Processes in the Environment,Undergraduate,This class serves as an introduction to mass t...,https://ocw.mit.edu/courses/1-061-transport-pr...,Civil and Environmental Engineering


## Extracting Course Topics

In [None]:
def extract_course_topics(url):
    response = requests.get(url=url)
    soup = BeautifulSoup(response.content, 'html.parser')

    topics = []
    
    try:
        topic_element = soup.find_all('span', class_ = "topic-text-wrapper")
        for topic in topic_element:

            course_topic = topic.find('a', class_ = "text-black course-info-topic strip-link-offline").text.strip()
            topics.append(course_topic)
    except:
        topics.append(np.nan)
        print(f"No course topic elements found for URL: {url}")
        
    return topics

def extract_topic_from_multiple_links(urls):
    all_course_topics = []
    for url in urls:
        course_topics = extract_course_topics(url)
        all_course_topics.append(course_topics)
    return all_course_topics

In [16]:
all_course_topics = extract_topic_from_multiple_links(compelte_df['Link'].values)
for i, url in enumerate(compelte_df['Link'].values):
    print(f"Topics for URL {url}: {all_course_topics[i]}")

Topics for URL https://ocw.mit.edu/courses/1-018j-ecology-i-the-earth-system-fall-2009/: ['Science', 'Biology', 'Ecology', 'Earth Science', 'Science', 'Biology', 'Ecology', 'Earth Science']
Topics for URL https://ocw.mit.edu/courses/1-020-ecology-ii-engineering-for-sustainability-spring-2008/: ['Engineering', 'Civil Engineering', 'Science', 'Biology', 'Ecology', 'Earth Science', 'Environmental Science', 'Sustainability', 'Engineering', 'Civil Engineering', 'Science', 'Biology', 'Ecology', 'Earth Science', 'Environmental Science', 'Sustainability']
Topics for URL https://ocw.mit.edu/courses/1-061-transport-processes-in-the-environment-fall-2008/: ['Engineering', 'Chemical Engineering', 'Transport Processes', 'Environmental Engineering', 'Aquatic Sciences and Water Quality Control', 'Hydrology and Water Resource Systems', 'Engineering', 'Chemical Engineering', 'Transport Processes', 'Environmental Engineering', 'Aquatic Sciences and Water Quality Control', 'Hydrology and Water Resource S

## Final Data for Environment & Sustainability: Earth Systems and Climate Science

In [37]:
## Final Data for Environment & Sustainability: Earth Systems and Climate Science
compelte_df['Topics'] = all_course_topics

In [18]:
compelte_df.to_csv("Environment & Sustainability: Earth Systems and Climate Science.csv", index=False)

### <center><b> Web Scraping the Data (Environment & Sustainability: Engineering)

## Extracting Course Title, Course Level, Course Description

In [20]:
driver = webdriver.Chrome() 

# Load the webpage
url = "https://ocw.mit.edu/course-lists/environment-engineering-for-sustainability-5/" 
driver.get(url)

# Find all course cards
course_cards = driver.find_elements(By.CSS_SELECTOR, ".card.course-collection-row.mb-1")
courses_data = []
for card in course_cards:
    # Extract information from each card
    course_title = card.find_element(By.CSS_SELECTOR, ".course-title h4").text
    course_number = card.find_element(By.CSS_SELECTOR, ".coursenum").text
    course_level = card.find_element(By.CSS_SELECTOR, ".level").text
    course_link = card.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
    
    driver.execute_script(f"window.open('{course_link}', '_blank');")
    driver.switch_to.window(driver.window_handles[-1])

    try:
        show_more_button = driver.find_element(By.ID, "expand-description")
        if show_more_button:
            show_more_button.click()
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#expanded-description"))
            )
            course_description = driver.find_element(By.CSS_SELECTOR, "#expanded-description").text
            courses_data.append({
                'Title':course_title, 
                'Difficult':course_level, 
                'Description':course_description,
                'Link': course_link
            })
        else:
            course_description = driver.find_element(By.ID, "full-description").text
            
            courses_data.append({'Description': np.nan})
        
        # print(f"Course Title: {course_title}")
        # print(f"Course Description: {course_description}")
    except:
        print(f"Error scraping data for: {course_link}")


    driver.close()
    driver.switch_to.window(driver.window_handles[0])

driver.quit()

Error scraping data for: https://ocw.mit.edu/courses/4-406-ecologies-of-construction-spring-2007/


In [21]:
courses_data_df = pd.DataFrame(courses_data)

In [23]:
courses_data_df.head()

Unnamed: 0,Title,Difficult,Description,Link
0,Introduction to Civil and Environmental Engine...,Undergraduate,"In this sophomore design course, you will be c...",https://ocw.mit.edu/courses/1-101-introduction...
1,Public Transportation Systems,Graduate,This course discusses the evolution and role o...,https://ocw.mit.edu/courses/1-258j-public-tran...
2,Urban Energy Systems and Policy,"Undergraduate, Graduate",This class is about figuring out together what...,https://ocw.mit.edu/courses/11-165j-urban-ener...
3,Urban Transportation Planning,Graduate,"This course examines the policy, politics, pla...",https://ocw.mit.edu/courses/1-252j-urban-trans...
4,Water Resource Systems,Graduate,This subject is concerned with quantitative me...,https://ocw.mit.edu/courses/1-731-water-resour...


## Extracting Course Title, Course Level, Course Description For Links facing Error

In [24]:
driver = webdriver.Chrome() 

url = "https://ocw.mit.edu/course-lists/environment-engineering-for-sustainability-5/"

driver.get(url)

course_cards = driver.find_elements(By.CSS_SELECTOR, ".card.course-collection-row.mb-1")
courses_data_1 = []
for card in course_cards:

    
    course_title = card.find_element(By.CSS_SELECTOR, ".course-title h4").text
    course_number = card.find_element(By.CSS_SELECTOR, ".coursenum").text
    course_level = card.find_element(By.CSS_SELECTOR, ".level").text
    course_link = card.find_element(By.CSS_SELECTOR, "a").get_attribute("href")

    driver.execute_script(f"window.open('{course_link}', '_blank');")
    driver.switch_to.window(driver.window_handles[-1])

    try:

        course_description = driver.find_element(By.ID, "full-description").text
        courses_data_1.append({
                'Title':course_title, 
                'Difficult':course_level, 
                'Description':course_description,
                'Link':course_link
            })
        # print(f"Course Title: {course_title}")
        # print(f"Course Description: {course_description}")
    except:
        print(f"Error scraping data for: {course_link}")
    
    driver.close()
    
    driver.switch_to.window(driver.window_handles[0])
    
driver.quit()

Error scraping data for: https://ocw.mit.edu/courses/1-101-introduction-to-civil-and-environmental-engineering-design-i-fall-2006/
Error scraping data for: https://ocw.mit.edu/courses/1-258j-public-transportation-systems-spring-2017/
Error scraping data for: https://ocw.mit.edu/courses/11-165j-urban-energy-systems-and-policy-fall-2022/
Error scraping data for: https://ocw.mit.edu/courses/1-252j-urban-transportation-planning-fall-2016/
Error scraping data for: https://ocw.mit.edu/courses/1-731-water-resource-systems-fall-2006/
Error scraping data for: https://ocw.mit.edu/courses/1-85-water-and-wastewater-treatment-engineering-spring-2006/
Error scraping data for: https://ocw.mit.edu/courses/2-00aj-exploring-sea-space-earth-fundamentals-of-engineering-design-spring-2009/
Error scraping data for: https://ocw.mit.edu/courses/2-500-desalination-and-water-purification-spring-2009/
Error scraping data for: https://ocw.mit.edu/courses/2-60j-fundamentals-of-advanced-energy-conversion-spring-202

In [27]:
courses_data_df_1 = pd.DataFrame(courses_data_1)

In [28]:
courses_data_df_1

Unnamed: 0,Title,Difficult,Description,Link
0,Ecologies of Construction,Graduate,Ecologies of Construction examines the resourc...,https://ocw.mit.edu/courses/4-406-ecologies-of...


## Concatinating both DataFrame consist of Title, Difficult, Description, Link

In [29]:
compelte_df = pd.concat([courses_data_df, courses_data_df_1])
compelte_df.head()

Unnamed: 0,Title,Difficult,Description,Link
0,Introduction to Civil and Environmental Engine...,Undergraduate,"In this sophomore design course, you will be c...",https://ocw.mit.edu/courses/1-101-introduction...
1,Public Transportation Systems,Graduate,This course discusses the evolution and role o...,https://ocw.mit.edu/courses/1-258j-public-tran...
2,Urban Energy Systems and Policy,"Undergraduate, Graduate",This class is about figuring out together what...,https://ocw.mit.edu/courses/11-165j-urban-ener...
3,Urban Transportation Planning,Graduate,"This course examines the policy, politics, pla...",https://ocw.mit.edu/courses/1-252j-urban-trans...
4,Water Resource Systems,Graduate,This subject is concerned with quantitative me...,https://ocw.mit.edu/courses/1-731-water-resour...


In [30]:
compelte_df.shape

(27, 4)

## Extracting Course Department

In [31]:
def extract_department_name(url):
    
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        department_element = soup.find('a', class_='course-info-department')
        department_name = department_element.text.strip()
        dep_names.append(department_name)
    except AttributeError:
        dep_names.append(np.nan)

    return dep_names

course_urls = compelte_df['Link'].values
dep_names = []
for url in course_urls:
    department_names = extract_department_name(url)
    print(f"Department for {url}: {department_names}")

Department for https://ocw.mit.edu/courses/1-101-introduction-to-civil-and-environmental-engineering-design-i-fall-2006/: ['Civil and Environmental Engineering']
Department for https://ocw.mit.edu/courses/1-258j-public-transportation-systems-spring-2017/: ['Civil and Environmental Engineering', 'Civil and Environmental Engineering']
Department for https://ocw.mit.edu/courses/11-165j-urban-energy-systems-and-policy-fall-2022/: ['Civil and Environmental Engineering', 'Civil and Environmental Engineering', 'Urban Studies and Planning']
Department for https://ocw.mit.edu/courses/1-252j-urban-transportation-planning-fall-2016/: ['Civil and Environmental Engineering', 'Civil and Environmental Engineering', 'Urban Studies and Planning', 'Civil and Environmental Engineering']
Department for https://ocw.mit.edu/courses/1-731-water-resource-systems-fall-2006/: ['Civil and Environmental Engineering', 'Civil and Environmental Engineering', 'Urban Studies and Planning', 'Civil and Environmental Eng

In [32]:
compelte_df['Departments'] = dep_names

In [33]:
compelte_df.head(3)

Unnamed: 0,Title,Difficult,Description,Link,Departments
0,Introduction to Civil and Environmental Engine...,Undergraduate,"In this sophomore design course, you will be c...",https://ocw.mit.edu/courses/1-101-introduction...,Civil and Environmental Engineering
1,Public Transportation Systems,Graduate,This course discusses the evolution and role o...,https://ocw.mit.edu/courses/1-258j-public-tran...,Civil and Environmental Engineering
2,Urban Energy Systems and Policy,"Undergraduate, Graduate",This class is about figuring out together what...,https://ocw.mit.edu/courses/11-165j-urban-ener...,Urban Studies and Planning


## Extracting Course Topics

In [35]:
def extract_course_topics(url):
    response = requests.get(url=url)
    soup = BeautifulSoup(response.content, 'html.parser')

    topics = []
    
    try:
        topic_element = soup.find_all('span', class_ = "topic-text-wrapper")
        for topic in topic_element:

            course_topic = topic.find('a', class_ = "text-black course-info-topic strip-link-offline").text.strip()
            topics.append(course_topic)
    except:
        topics.append(np.nan)
        print(f"No course topic elements found for URL: {url}")
        
    return topics

def extract_topic_from_multiple_links(urls):
    all_course_topics = []
    for url in urls:
        course_topics = extract_course_topics(url)
        all_course_topics.append(course_topics)
    return all_course_topics

In [36]:
all_course_topics = extract_topic_from_multiple_links(compelte_df['Link'].values)
for i, url in enumerate(compelte_df['Link'].values):
    print(f"Topics for URL {url}: {all_course_topics[i]}")

Topics for URL https://ocw.mit.edu/courses/1-101-introduction-to-civil-and-environmental-engineering-design-i-fall-2006/: ['Engineering', 'Civil Engineering', 'Environmental Engineering', 'Engineering', 'Civil Engineering', 'Environmental Engineering']
Topics for URL https://ocw.mit.edu/courses/1-258j-public-transportation-systems-spring-2017/: ['Engineering', 'Civil Engineering', 'Transportation Engineering', 'Social Science', 'Urban Studies', 'Transportation Planning', 'Engineering', 'Civil Engineering', 'Transportation Engineering', 'Social Science', 'Urban Studies', 'Transportation Planning']
Topics for URL https://ocw.mit.edu/courses/11-165j-urban-energy-systems-and-policy-fall-2022/: ['Energy', 'Social Science', 'Urban Studies', 'Urban Planning', 'Energy', 'Social Science', 'Urban Studies', 'Urban Planning']
Topics for URL https://ocw.mit.edu/courses/1-252j-urban-transportation-planning-fall-2016/: ['Engineering', 'Civil Engineering', 'Transportation Engineering', 'Social Science

## Final Data for Environment & Sustainability: Engineering

In [38]:
compelte_df['Topics'] = all_course_topics

In [41]:
compelte_df.to_csv("Environment & Sustainability: Engineering.csv", index=False)

In [42]:
compelte_df.columns

Index(['Title', 'Difficult', 'Description', 'Link', 'Departments', 'Topics'], dtype='object')