In [1]:
from explorecourses import *
from explorecourses import filters

In [2]:
# Connection object through which every catalog request in this notebook is made
# (CourseConnection presumably comes from the `explorecourses` star import above).
connect = CourseConnection()

In [3]:
# Look up a sample course by free-text query; the result is indexed like a
# sequence in the next cell.
courses = connect.get_courses_by_query(query='CS 109: Introduction to Probability for Computer Scientists')

In [4]:
# Use the first search hit as the sample course
# (assumes the query returned at least one result).
course = courses[0]

In [5]:
# Display the sample course's attribute dictionary to see which fields exist.
vars(course)

{'year': '2023-2024',
 'subject': 'CS',
 'code': '109',
 'title': 'Introduction to Probability for Computer Scientists',
 'description': 'Topics include: counting and combinatorics, random variables, conditional probability, independence, distributions, expectation, point estimation, and limit theorems. Applications of probability in computer science including machine learning and the use of probability in the analysis of algorithms. Prerequisites: 103, 106B or X, multivariate calculus at the level of MATH 51 or CME 100 or equivalent.',
 'gers': ('GER:DB-EngrAppSci', 'WAY-AQR', 'WAY-FR'),
 'repeatable': False,
 'grading_basis': 'Letter or Credit/No Credit',
 'units_min': 3,
 'units_max': 5,
 'objectives': (<explorecourses.classes.LearningObjective at 0x7fcc61ce1df0>,
  <explorecourses.classes.LearningObjective at 0x7fcc61ce1d90>),
 'final_exam': True,
 'sections': (<explorecourses.classes.Section at 0x7fcc61ce1b50>,
  <explorecourses.classes.Section at 0x7fcc61ce1820>,
  <explorecourse

In [6]:
import re

In [7]:
def fix(input_string, subject):
    """Normalize a raw course reference to ``'SUBJECT CODE'`` form.

    The text is upper-cased and surrounding whitespace is dropped. When the
    result does not start with a letter (a bare number such as ``'103'``),
    ``subject`` is prepended. Letters followed by a course number are then
    joined with exactly one space, regardless of the spacing in the input.

    Args:
        input_string: Raw reference, e.g. ``'Math51'``, ``'MATH  51'``,
            ``' 106b'``.
        subject: Subject code to assume when the reference carries none.

    Returns:
        The normalized reference (e.g. ``'MATH 51'``). Text that does not
        look like a course reference is returned cleaned and upper-cased;
        empty or whitespace-only input yields ``''``.
    """
    # Strip first: the caller's pattern can hand over leading whitespace
    # (spaces, newlines), which would otherwise end up inside the id.
    cleaned = input_string.strip().upper()

    # Nothing to normalize; also avoids indexing into an empty string.
    if not cleaned:
        return cleaned

    # A reference that does not start with a letter has no subject of its own.
    if not cleaned[0].isalpha():
        cleaned = subject + cleaned

    # Letters, optional whitespace, then digits with an optional letter suffix.
    match = re.match(r'([a-zA-Z]+)\s*([0-9]+[a-zA-Z]*)', cleaned)
    if match:
        # Re-join with a single space so ids compare equal downstream.
        return ' '.join(match.groups())

    # Not a recognizable course reference: hand back the cleaned text.
    return cleaned

In [8]:
def extract_prerequisites(course):
    """Infer prerequisite course ids from a course's free-text description.

    Everything after the first occurrence of ``'Prerequisite'`` in the
    description is split into fragments, and each fragment is scanned for
    course references such as ``'Math 51'``, ``'MATH 51'`` or a bare ``'103'``.
    A bare number inherits the subject of the most recent explicit reference,
    starting from ``course.subject``.

    Args:
        course: Object exposing ``description`` and ``subject`` attributes.

    Returns:
        List of ``'SUBJECT CODE'`` strings in order of appearance. This is a
        best-effort guess and may contain ids that are not real courses
        (e.g. ``'FORMERLY 188'``), so callers should validate the ids.
    """
    # Fragments containing any of these talk about durations, units or
    # enrollment logistics rather than courses, so they are skipped entirely.
    skip_words = ('minute', 'hour', 'day', 'month', 'year', 'unit', 'Ways',
                  'application', 'enroll', 'register')

    # Optional word prefix, optional whitespace, digits, optional letter suffix.
    # Matches: Math 51, MATH 51, formerly 188, 103
    pattern = r'([A-Z]*[a-z]*\s*\d+[A-Z]*)'

    # Get course description and do some minor cleanup; a missing description
    # is treated as empty so the function returns [] instead of raising.
    description = course.description or ''
    description = re.sub(r'https?://\S+', '', description)
    description = re.sub(r'EarthSys|Earth Systems', 'EARTHSYS', description)

    # Partition course description for prerequisites. The pattern is a raw
    # string because '\.' and '\W' are regex escapes, not string escapes.
    tail = description.partition('Prerequisite')[2]
    fragments = re.split(r'\. |,|[\W]or |[\W]and |/', tail)

    courses = []
    subject = course.subject
    for fragment in fragments:
        if any(word in fragment for word in skip_words):
            continue

        for raw in re.findall(pattern, fragment):
            raw = raw.strip()
            fixed = fix(raw, subject)

            # Only a prefix that starts with a capital letter ("MATH", "Math")
            # is treated as a subject code to carry forward. A lower-case word
            # in front of a number ("formerly 188") is ordinary prose and must
            # not become the subject of the bare numbers that follow it.
            if raw[:1].isupper():
                subject = fixed.split(' ')[0]

            courses.append(fixed)

    return courses

In [9]:
# Sanity-check the extractor on the sample course selected above.
extract_prerequisites(course)

['CS 103', 'CS 106B', 'MATH 51', 'CME 100']

In [10]:
import json

In [11]:
# Academic year whose catalog is turned into the graph.
year = "2022-2023"

# `nodes` is the list written to the output file; `reference` maps each
# node id ('SUBJECT CODE') to its course object for the relationship pass.
nodes = []
reference = {}

# Candidate attributes for a richer per-node "properties" payload (not exported yet):
# keys = ['year', 'subject', 'code', 'title', 'gers', 'repeatable', \
#         'grading_basis', 'units_min', 'units_max', 'final_exam', \
#         'course_id', 'active', 'offer_num', 'academic_group', 'academic_org', \
#         'academic_career', 'max_units_repeat', 'max_times_repeat']
keys = ['academic_group']

for school in connect.get_schools(year):
    for dept in school.departments:
        courses = connect.get_courses_by_department(dept.code, year=year)

        # Create nodes
        for course in courses:
            course_id = f'{course.subject} {course.code}'

            # Node ids must be unique: if the same id comes back more than
            # once, keep the first occurrence so `nodes` and `reference`
            # always describe the same set of courses.
            if course_id in reference:
                continue

            # Create node for the course
            course_node = {
                "type": "node",
                "labels": ["Course"],
                "id": course_id,
                "group": course.academic_group  # <- remove later?
            }

            nodes.append(course_node)
            reference[course_id] = course

In [12]:
# Edges of the prerequisite graph, each pointing from a prerequisite to the
# course that requires it.
relationships = []

# Running count of emitted edges (always equal to len(relationships)).
relationship_id = 0

# Create relationships
for course_id, course in reference.items():

    # Prerequisites already linked to this course, so that a prerequisite
    # mentioned twice in one description yields a single edge.
    linked_prereqs = set()

    # Infer prerequisites from the free-text description
    for prereq_id in extract_prerequisites(course):

        # The extractor is heuristic and can return ids that are not real
        # courses; only ids that exist as nodes may become edge endpoints.
        # TODO: make this procedure more robust
        if prereq_id not in reference:
            continue

        # A course cannot be its own prerequisite; such a match only means
        # the description mentions the course itself.
        if prereq_id == course_id:
            continue

        if prereq_id in linked_prereqs:
            continue
        linked_prereqs.add(prereq_id)

        # Otherwise we "claim" to have a valid prerequisite
        relationship = {
            "source": prereq_id,
            "target": course_id
        }
        relationships.append(relationship)
        relationship_id += 1

In [13]:
# # Superseded by the next cell: write nodes and relationships as line-delimited JSON (one object per line)
# with open("graph.json", "w") as json_file:
#     for node in nodes:
#         json.dump(node, json_file)
#         json_file.write("\n")
    
#     for relationship in relationships:
#         json.dump(relationship, json_file)
#         json_file.write("\n")

In [14]:
# Create a dictionary with nodes and links
# Bundle the node list and the edge list into one node-link document
graph_data = dict(nodes=nodes, links=relationships)

# Serialize the document with two-space indentation and save it as graph.json
with open("graph.json", "w") as json_file:
    json_file.write(json.dumps(graph_data, indent=2))