In [6]:
from bs4 import BeautifulSoup
import json
import requests

# URL of the course descriptions page
url = 'https://catalog.northeastern.edu/course-descriptions/info/'

# Fetch the HTML content from the URL.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page as if it
# were the catalog (which would just yield an empty course list).
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find all course blocks
course_blocks = soup.find_all('div', class_='courseblock')

# Extract course titles, descriptions and prerequisites
courses = []
for block in course_blocks:
    title_tag = block.find('p', class_='courseblocktitle')
    if title_tag is None:
        # A block without a title paragraph cannot be identified; skip it
        # instead of crashing the whole scrape with an AttributeError.
        continue
    course_title = title_tag.get_text(strip=True)

    # Some blocks may have no description paragraph; fall back to an empty
    # string so every record keeps the same string-valued schema.
    description_tag = block.find('p', class_='cb_desc')
    course_description = description_tag.get_text(strip=True) if description_tag else ""

    # "None" (a string, not the None object) marks a course with no prerequisites
    prerequisites = "None"

    # A block can carry several 'courseblockextra' paragraphs, so scan all of
    # them rather than only the first one and keep the first that mentions
    # a prerequisite.
    for prereq_tag in block.find_all('p', class_='courseblockextra'):
        if 'Prerequisite' in prereq_tag.get_text():
            prerequisites = prereq_tag.get_text(strip=True)
            break

    courses.append({
        'course': course_title,
        'description': course_description,
        'prerequisites': prerequisites
    })

In [7]:
# Persist the scraped course records as pretty-printed JSON (4-space indent).
with open('courses.json', 'w') as out_file:
    json.dump(courses, fp=out_file, indent=4)

# Preprocess the text

In [27]:
def load_data(filename):
    """Load and return the JSON document stored in ``filename``.

    The file is decoded explicitly as UTF-8 instead of the platform's locale
    encoding, so JSON containing raw non-ASCII characters is read the same way
    on every operating system.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized Python object (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# Single-character substitutions applied by clean_text: typographic
# punctuation and symbols mapped to plain ASCII equivalents.
_CHAR_REPLACEMENTS = str.maketrans({
    '\u2019': "'",   # right single quotation mark
    '\u201c': '"',   # left double quotation mark
    '\u201d': '"',   # right double quotation mark
    '\u00a0': ' ',   # non-breaking space
    '\u2122': 'TM',  # trade mark sign
    '\u2013': '-',   # en dash
})


def clean_text(text):
    """Return ``text`` with a fixed set of typographic characters made ASCII.

    Curly quotes become straight quotes, the non-breaking space becomes a
    regular space, the trade mark sign becomes ``TM`` and the en dash becomes
    a hyphen. All other characters are left untouched.
    """
    # Extend _CHAR_REPLACEMENTS to add more cleaning steps as needed.
    return text.translate(_CHAR_REPLACEMENTS)

def preprocess_course_data(course_list):
    """Run ``clean_text`` over the text fields of every course record.

    Each record's ``'course'``, ``'description'`` and ``'prerequisites'``
    values are replaced with their cleaned versions. The records are modified
    in place and the same list object that was passed in is returned.
    """
    text_fields = ('course', 'description', 'prerequisites')
    for record in course_list:
        for field in text_fields:
            record[field] = clean_text(record[field])
    return course_list

def save_data(data, filename):
    """Write ``data`` to ``filename`` as pretty-printed JSON (4-space indent).

    Args:
        data: Any JSON-serializable Python object.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``data`` contains an object that JSON cannot serialize.
        OSError: If the file cannot be opened for writing.
    """
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization error raised mid-dump would otherwise wipe out or leave a
    # half-written copy of whatever was previously stored there.
    payload = json.dumps(data, indent=4)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(payload)

In [28]:
# File names for the raw scraped data and its cleaned counterpart.
input_filename = 'courses.json'
output_filename = 'cleaned_courses.json'

data = load_data(input_filename)
# preprocess_course_data cleans the records in place and returns the same
# list, so `clean_data` and `data` refer to one object after this call.
clean_data = preprocess_course_data(data)
# Use the configured name rather than a second hard-coded literal so the
# variable and the file actually written cannot drift apart.
save_data(clean_data, output_filename)
