In [2]:
# %pip install numpy pandas openpyxl networkx

Helper function to combine transfer classes into passing classes.

In [3]:
#Merge trans columns into pass columns
def merge_trans_classes(df):
    """Fold every transfer ("Tran") column into the matching "Pass" column.

    A transfer column and a pass column belong to the same class when the
    text before the first underscore of their names is identical. Missing
    values in the pass column are filled from the transfer column; values
    already present in the pass column are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names contain "Pass" or "Tran" for the class
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transfer data merged in and all transfer
        columns removed.
    """
    pass_classes = [col for col in df.columns if "Pass" in col]
    trans_classes = [col for col in df.columns if "Tran" in col]

    # Work on a copy so the caller's frame is left exactly as it was passed in
    merged = df.copy()

    # Combine the trans into the pass
    for t in trans_classes:
        class_name = t.split("_")[0]
        for p in pass_classes:
            # Compare the class-code prefix rather than using a substring test,
            # so one class code can never match inside a different column name
            if p.split("_")[0] == class_name:
                merged[p] = merged[p].combine_first(merged[t])

    # Drop the trans cols
    return merged.drop(columns=trans_classes)

Create an array of every CS class a student can take in a given semester

In [4]:
import pandas as pd
import numpy as np

# Load the Excel file and fold the transfer columns into the "Pass" columns
df = pd.read_excel('CS Course Enrollment Data 030824.xlsx')
df = merge_trans_classes(df)

# Drop the last column.
df = df.drop(df.columns[-1], axis=1)

# Get all the "Pass" semester columns.
semester_cols = [col for col in df.columns if 'Pass' in col]

# The class code is the part of the column name before the first underscore.
# Build the array in one step: growing it with np.append copies the whole array
# on every iteration and starts from a float64 array that only becomes a string
# array through implicit promotion.
classes = np.array([str(col).split("_")[0] for col in semester_cols])

print("Classes: ")
print(np.array_str(classes))

Classes: 
['CSCI101' 'CSCI128' 'CSCI200' 'CSCI210' 'CSCI220' 'CSCI261' 'CSCI262'
 'CSCI274' 'CSCI306' 'CSCI341' 'CSCI358' 'CSCI370' 'CSCI400' 'CSCI406'
 'CSCI442' 'MATH111' 'MATH112' 'MATH113' 'MATH122' 'MATH213' 'MATH214'
 'MATH223' 'MATH224' 'MATH225' 'MATH235' 'MATH300' 'MATH307' 'MATH332'
 'MATH342' 'PHGN100' 'PHGN200' 'CSCI404' 'CSCI410' 'CSCI422' 'CSCI423'
 'CSCI425' 'CSCI432' 'CSCI436' 'CSCI437' 'CSCI440' 'CSCI441' 'CSCI443'
 'CSCI444' 'CSCI445' 'CSCI446' 'CSCI448' 'CSCI455' 'CSCI470' 'CSCI471'
 'CSCI473' 'CSCI474' 'CSCI475' 'CSCI477' 'CSCI478']


Create directed graph representing the CS flowchart. For the purposes of reading this data into the stochastic matrix, we will want to include cumulative prereqs, so while 403 doesn't require 101 directly, we will still list it as a requirement.

In [5]:
import networkx as nx 

# Directed graph of course requirements: an edge (A, B) means A is listed as a requirement for B.
requirements = nx.DiGraph()

# Start with one node per tracked class; add_edge below also creates nodes for untracked classes.
requirements.add_nodes_from(classes)

# Explicit Prerequisites
# Taken from here: https://catalog.mines.edu/undergraduate/programs/cs/#coursestext
# For completeness, I included edges to/from classes we aren't tracking
# Also note that some prereqs require you to earn a certain grade in another class (usually a C-), to pass the class, here I treat all prereqs as the same
# "X or Y" alternatives are flattened too: every listed alternative gets its own edge.
requirements.add_edge('CSCI101', 'CSCI200')
requirements.add_edge('CSCI128', 'CSCI200')

requirements.add_edge('CSCI200', 'CSCI210')

requirements.add_edge('CSCI200', 'CSCI220')

requirements.add_edge('CSCI128', 'CSCI250')

requirements.add_edge('CSCI101', 'CSCI261')

requirements.add_edge('CSCI261', 'CSCI262')

requirements.add_edge('CSCI200', 'CSCI274')
requirements.add_edge('CSCI261', 'CSCI274')

requirements.add_edge('CSCI220', 'CSCI290')
requirements.add_edge('CSCI262', 'CSCI290')

# For some reason CSCI 303, Introduction to Data Science, lists all of these classes under an "X or Y or Z"-style prerequisite, which I think is odd
requirements.add_edge('CSCI101', 'CSCI303')
requirements.add_edge('CSCI102', 'CSCI303')
requirements.add_edge('CSCI128', 'CSCI303')
requirements.add_edge('CSCI200', 'CSCI303')
requirements.add_edge('CSCI261', 'CSCI303')

requirements.add_edge('CSCI210', 'CSCI306')
requirements.add_edge('CSCI220', 'CSCI306')
requirements.add_edge('CSCI262', 'CSCI306')

# CSCI341 prereqs are listed as: "Prerequisite: CSCI200 or CSCI261, CSCI262.", the phrasing seems odd to me
requirements.add_edge('CSCI200', 'CSCI341')
requirements.add_edge('CSCI261', 'CSCI341')
requirements.add_edge('CSCI262', 'CSCI341')

# 358 requires MATH112 or MATH113 or MATH122
requirements.add_edge('MATH112', 'CSCI358')
requirements.add_edge('MATH113', 'CSCI358')
requirements.add_edge('MATH122', 'CSCI358')

requirements.add_edge('CSCI306', 'CSCI370')

requirements.add_edge('CSCI290', 'CSCI390')

requirements.add_edge('CSCI306', 'CSCI400')
requirements.add_edge('CSCI358', 'CSCI400')

requirements.add_edge('CSCI200', 'CSCI403')
requirements.add_edge('CSCI262', 'CSCI403')

# AI requires Intro to Stats OR Intro to Probability, I don't think most AI students will have taken Intro to Probability
requirements.add_edge('CSCI220', 'CSCI404')
requirements.add_edge('CSCI262', 'CSCI404')
requirements.add_edge('MATH201', 'CSCI404')
requirements.add_edge('MATH334', 'CSCI404')

# "MATH358" is listed on the official course catalog, but the link is broken, I don't think that class exists anymore
requirements.add_edge('CSCI220', 'CSCI406')
requirements.add_edge('CSCI262', 'CSCI406')
requirements.add_edge('MATH213', 'CSCI406')
requirements.add_edge('MATH223', 'CSCI406')
requirements.add_edge('MATH224', 'CSCI406')
requirements.add_edge('MATH300', 'CSCI406')
requirements.add_edge('MATH358', 'CSCI406')
requirements.add_edge('CSCI358', 'CSCI406')

requirements.add_edge('CSCI341', 'CSCI410')
requirements.add_edge('EENG383', 'CSCI410')

requirements.add_edge('CSCI262', 'CSCI422')

# Prerequisites for Computer Simulation are especially odd: (CSCI210 or CSCI274) AND CSCI306 AND (MATH201 or MATH334).
requirements.add_edge('CSCI210', 'CSCI423')
requirements.add_edge('CSCI274', 'CSCI423')
requirements.add_edge('CSCI306', 'CSCI423')
requirements.add_edge('MATH201', 'CSCI423')
requirements.add_edge('MATH334', 'CSCI423')

requirements.add_edge('CSCI274', 'CSCI425')
requirements.add_edge('CSCI306', 'CSCI425')
requirements.add_edge('CSCI341', 'CSCI425')

requirements.add_edge('CSCI220', 'CSCI432')
requirements.add_edge('CSCI262', 'CSCI432')
requirements.add_edge('MATH201', 'CSCI432')

# Note that the prereqs for this class require CSCI200 OR CSCI262, despite those not being equivalent courses
requirements.add_edge('CSCI200', 'CSCI436')
requirements.add_edge('CSCI262', 'CSCI436')
requirements.add_edge('MATH201', 'CSCI436')

# The requirements here are listed as: (MATH201 or MATH334 or EENG311),and,MATH332,and,(CSCI200 or CSCI261).
requirements.add_edge('MATH201', 'CSCI437')
requirements.add_edge('MATH334', 'CSCI437')
requirements.add_edge('EENG311', 'CSCI437')
requirements.add_edge('MATH332', 'CSCI437')
requirements.add_edge('CSCI200', 'CSCI437')
requirements.add_edge('CSCI261', 'CSCI437')

requirements.add_edge('CSCI220', 'CSCI440')
requirements.add_edge('CSCI262', 'CSCI440')
requirements.add_edge('CSCI341', 'CSCI440')

requirements.add_edge('CSCI220', 'CSCI441')
requirements.add_edge('CSCI262', 'CSCI441')
requirements.add_edge('MATH332', 'CSCI441')

requirements.add_edge('CSCI220', 'CSCI442')
requirements.add_edge('CSCI262', 'CSCI442')
requirements.add_edge('CSCI274', 'CSCI442')
requirements.add_edge('CSCI341', 'CSCI442')

requirements.add_edge('CSCI306', 'CSCI443')

requirements.add_edge('CSCI441', 'CSCI444')

requirements.add_edge('CSCI306', 'CSCI445')

requirements.add_edge('CSCI220', 'CSCI446')
requirements.add_edge('CSCI262', 'CSCI446')

requirements.add_edge('CSCI306', 'CSCI448')

requirements.add_edge('CSCI358', 'CSCI455')
requirements.add_edge('CSCI406', 'CSCI455')

# Listed as: CSCI101 or CSCI102 or CSCI200 or CSCI261, MATH201, MATH332.
requirements.add_edge('CSCI101', 'CSCI470')
requirements.add_edge('CSCI102', 'CSCI470')
requirements.add_edge('CSCI200', 'CSCI470')
requirements.add_edge('CSCI261', 'CSCI470')
requirements.add_edge('MATH201', 'CSCI470')
requirements.add_edge('MATH332', 'CSCI470')

# Listed as: (CSCI220 or CSCI262) AND (CSCI210 or CSCI274)
requirements.add_edge('CSCI220', 'CSCI471')
requirements.add_edge('CSCI262', 'CSCI471')
requirements.add_edge('CSCI210', 'CSCI471')
requirements.add_edge('CSCI274', 'CSCI471')

# Listed as: (CSCI220 or CSCI262),and,(MATH201 or MATH334)
requirements.add_edge('CSCI220', 'CSCI473')
requirements.add_edge('CSCI262', 'CSCI473')
requirements.add_edge('MATH201', 'CSCI473')
requirements.add_edge('MATH334', 'CSCI473')

# Listed as: CSCI220 or CSCI262, CSCI358, MATH334 or MATH335 or MATH201
requirements.add_edge('CSCI220', 'CSCI474')
requirements.add_edge('CSCI262', 'CSCI474')
requirements.add_edge('CSCI358', 'CSCI474')
requirements.add_edge('MATH334', 'CSCI474')
requirements.add_edge('MATH335', 'CSCI474')
requirements.add_edge('MATH201', 'CSCI474')

requirements.add_edge('CSCI220', 'CSCI475')
requirements.add_edge('CSCI262', 'CSCI475')
requirements.add_edge('CSCI341', 'CSCI475')
requirements.add_edge('CSCI274', 'CSCI475')

requirements.add_edge('CSCI220', 'CSCI477')
requirements.add_edge('CSCI262', 'CSCI477')

# Listed as: CSCI101 or CSCI102 or CSCI128 or CSCI200 or CSCI261
requirements.add_edge('CSCI101', 'CSCI478')
requirements.add_edge('CSCI102', 'CSCI478')
requirements.add_edge('CSCI128', 'CSCI478')
requirements.add_edge('CSCI200', 'CSCI478')
requirements.add_edge('CSCI261', 'CSCI478')

# Note that this is for an honors thesis
requirements.add_edge('CSCI306', 'CSCI480')

requirements.add_edge('CSCI390', 'CSCI490')


# Note that Special Topics are not currently included since we don't know what the prereqs for those classes will be for now
# Once these are available, they can be put here


# Corequisites
# These are stored as ordinary edges, exactly like the prerequisites above.
requirements.add_edge('MATH213', 'CSCI250')
requirements.add_edge('PHGN200', 'CSCI250')

requirements.add_edge('CSCI358', 'CSCI390')

requirements.add_edge('CSCI403', 'CSCI445')

requirements.add_edge('CSCI403', 'CSCI446')

requirements.add_edge('CSCI406', 'CSCI490')

# Implicit Prerequisites
# For the Markov chain solution, a student who is taking Algorithms should be treated as
# unlikely to take CSCI128 next semester, so every class is linked directly to every class
# reachable from it (i.e. the graph is expanded to its transitive closure).

# The closure is only meaningful if the requirements contain no cycles
assert nx.is_directed_acyclic_graph(requirements)

for course in list(requirements.nodes):
    # nx.descendants returns a finished set, so adding edges while consuming it is safe
    requirements.add_edges_from(
        (course, downstream) for downstream in nx.descendants(requirements, course)
    )

Create the stochastic matrix. 

Note that right now, the stochastic matrix only handles the requirements for a class and the cumulative prerequisite courses for a class. In the future, we will want to edit the stochastic matrix using data from previous semesters to determine what classes students are actually likely to take after completing some class.

In [6]:
num_classes = classes.size

# Start from all 1's: every class-to-class transition is allowed until ruled out
prereq_matrix = np.ones(shape=(num_classes, num_classes))

# If you pass a class, you won't take that class next semester, so set the diagonal to 0
np.fill_diagonal(prereq_matrix, 0)

# Zero the entry at (row = required class, column = class that requires it)
for required in classes:
    # Position(s) of the required class; identical for every outgoing edge, so look it up once
    required_rows = np.where(classes == required)

    # Outgoing edges in the requirements network lead to the classes that need `required`
    for requiring_class in requirements.adj[required]:
        # Classes that are not tracked produce an empty index here, so nothing is assigned
        prereq_matrix[required_rows, np.where(classes == requiring_class)] = 0


# Make the matrix stochastic
# Note that the columns (not the rows) are the ones scaled to sum to 1
def make_stochastic(matrix):
    """Return a copy of `matrix` in which every column is divided by its sum.

    Parameters
    ----------
    matrix : array-like
        Two-dimensional array of non-negative weights. It is not modified.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape. Each column whose sum is non-zero
        sums to 1. A column whose sum is exactly 0 is returned unscaled
        (all zeros for non-negative input) instead of being turned into NaN.
    """
    matrix = np.asarray(matrix, dtype=float)
    column_sums = matrix.sum(axis=0)

    # Dividing by 0 would fill the column with NaN, which then spreads through
    # every later matrix product; dividing by 1 leaves the column as it is
    divisors = np.where(column_sums == 0, 1.0, column_sums)

    return matrix / divisors

# Normalise a copy so prereq_matrix itself keeps its raw 0/1 entries
stochastic = make_stochastic(np.copy(prereq_matrix))

print("Stochastic Matrix: ")
print(stochastic)

Stochastic Matrix: 
[[0.         0.01886792 0.         ... 0.         0.         0.        ]
 [0.01886792 0.         0.         ... 0.         0.         0.        ]
 [0.01886792 0.01886792 0.         ... 0.         0.         0.        ]
 ...
 [0.01886792 0.01886792 0.01960784 ... 0.         0.0212766  0.02040816]
 [0.01886792 0.01886792 0.01960784 ... 0.02222222 0.         0.02040816]
 [0.01886792 0.01886792 0.01960784 ... 0.02222222 0.0212766  0.        ]]


For the sake of demonstration, create a student who is only taking CSCI406. Estimate what their next semester will look like.

In [7]:
# Create a demo student whose only class this semester is CSCI406
demo_student = np.zeros(num_classes)
demo_student[classes == "CSCI406"] = 1

print("Initial Student Array:")
print(demo_student)

# Multiply by the stochastic matrix to guess what their next semester would look like
demo_prediction = stochastic @ demo_student

print("Predicted Schedule: ")
print(demo_prediction)

Initial Student Array:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
Predicted Schedule: 
[0.         0.         0.         0.02564103 0.         0.
 0.         0.02564103 0.02564103 0.02564103 0.         0.02564103
 0.02564103 0.         0.02564103 0.02564103 0.         0.
 0.         0.         0.02564103 0.         0.         0.02564103
 0.02564103 0.         0.02564103 0.02564103 0.02564103 0.02564103
 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103
 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103
 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103
 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103]


Above, since the student just took CSCI 406, we know that next semester they will not retake CSCI 406, and they also won't take any of 406's prerequisite courses.

Define a function to get a single student's schedule for a given semester + year

In [8]:
# Note that CSCI 200 and 261 store their names a bit differently in the excel file
def class_to_pass_sem(class_name):
    """Return the name of the "pass semester" column for `class_name`.

    CSCI200 and CSCI261 use the suffix "_Pass_C_Sem"; every other class
    uses "_Pass_Sem".
    """
    suffix = "_Pass_C_Sem" if class_name in ("CSCI200", "CSCI261") else "_Pass_Sem"
    return class_name + suffix


def get_student_semester(student_id, semester, year):
    """Return a 0/1 vector of the classes one student passed in one term.

    Parameters
    ----------
    student_id : int
        Positional row number of the student in the global `df` (uses `df.iloc`).
    semester : str
        Semester name, e.g. "Fall".
    year : str
        Year as a string, e.g. "2018"; joined to `semester` with a single space.

    Returns
    -------
    numpy.ndarray
        Float array aligned with the global `classes`; an entry is 1 where the
        student's pass-semester cell for that class equals "<semester> <year>".
    """
    schedule = np.zeros(classes.size)
    record = df.iloc[student_id]

    # Printing `record` can be helpful if you aren't sure what the loop below is reading
    # print(record)

    target_term = semester + " " + year

    for class_name in classes:
        # Cell holding the term in which this student passed this class
        term_passed = record.loc[class_to_pass_sem(class_name)]

        # Flag the class only when it was passed in the term being examined
        if term_passed == target_term:
            schedule[classes == class_name] = 1

    return schedule
        
# Sanity check on the first row of df (position 0) for Fall 2018.
# This should print an array with a single 1, representing that the student has passed CSCI 358 with a C in Fall 2018
print(get_student_semester(0, "Fall", "2018"))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]


Define a function that will sum all of the above arrays for a given semester + year

In [9]:
def total_student_semester(semester, year):
    """Count, for every class, how many students passed it in one term.

    Parameters
    ----------
    semester : str
        Semester name, e.g. "Fall".
    year : str
        Year as a string, e.g. "2023"; joined to `semester` with a single space.

    Returns
    -------
    numpy.ndarray
        Float array aligned with the global `classes`; each entry is the number
        of rows in the global `df` whose pass-semester cell for that class
        equals "<semester> <year>".
    """
    semester_year = semester + " " + year

    # Count matches one column at a time instead of building a vector per student.
    # This gives the same totals, avoids a Python loop over every student and class,
    # and does not depend on df.index labels lining up with row positions.
    counts = [
        (df[class_to_pass_sem(class_name)] == semester_year).sum()
        for class_name in classes
    ]

    return np.array(counts, dtype=float)

Get a vector representing all of the classes taken last semester, Fall 2023, and use it to estimate what Spring 2024 will look like

In [10]:
# Observed totals for the most recent semester
fall_2023_semester = total_student_semester("Fall", "2023")

# Each multiplication by the stochastic matrix steps the totals forward one semester
spring_2024_estimate = stochastic @ fall_2023_semester
fall_2024_estimate = stochastic @ spring_2024_estimate

print("Fall 2023 Total: ")
print(fall_2023_semester)

print("Spring 2024 Estimate: ")
print(spring_2024_estimate)

print("Fall 2024 Estimate: ")
print(fall_2024_estimate)


Fall 2023 Total: 
[147. 924. 314.   0. 221.  22.  13. 199. 150. 125. 174. 125. 174. 147.
 117. 815. 236.   0.   0. 509.   0.  57.   0. 416.  33.  26. 303. 258.
   0. 513. 502.   0.   0.   0.  51.   0.   0.   0.  27.   0.  40.   0.
   0.  51.   0.   0.  20. 164.  85.   0.   0. 119.   0.  40.]
Spring 2024 Estimate: 
[ 90.12150943  76.13911096  93.57307322 126.02037654 111.19082032
 103.47195708 112.75175169 128.13059072 127.90926543 132.63562835
 128.46479046 137.70646168 136.24033702 136.1886929  137.88423946
 125.10688097 124.01196027 128.46479046 128.46479046 126.58491931
 140.48423946 135.1132212  136.1886929  132.63518285 139.86159795
 135.69812686 134.76725832 130.76758304 140.48423946 130.80499417
 131.01254134 140.48423946 140.48423946 140.48423946 139.32514855
 140.48423946 140.48423946 140.48423946 139.92173946 140.48423946
 139.61467424 140.48423946 140.48423946 139.35090612 140.48423946
 140.48423946 139.95792367 137.06757279 138.59535057 140.48423946
 140.48423946 137.839795

First attempt at making a more accurate stochastic matrix:

In [11]:
improved_matrix = np.copy(prereq_matrix)

def add_student_data(matrix, student_id, semester1, year1, semester2, year2, data_weight):
    """Add one student's term-to-term transition to `matrix`, in place.

    Nothing is returned and the result is not stochastic; normalise it
    afterwards if a stochastic matrix is needed.

    Parameters
    ----------
    matrix : numpy.ndarray
        Square matrix aligned with `classes` on both axes; modified in place.
    student_id : int
        Student row, passed straight to `get_student_semester`.
    semester1, year1 : str
        The earlier term.
    semester2, year2 : str
        The later term.
    data_weight : number
        Amount added to each (later class, earlier class) entry.
    """
    earlier = get_student_semester(student_id, semester1, year1)
    later = get_student_semester(student_id, semester2, year2)

    # Entry [i, j] of the outer product is later[i] * earlier[j]: it is 1 exactly when the
    # student passed class j in the earlier term and class i in the later term
    matrix += data_weight * np.outer(later, earlier)

# Sanity Check:
# Starting with a uniform matrix of all 1's, try using the above function on a single student, make this matrix stochastic, and then multiply it by the student's first semester
# This operation should return something close to the student's second semester
test_matrix = np.ones(shape=(num_classes, num_classes))

add_student_data(test_matrix, 2, "Fall", "2018", "Spring", "2019", 100)

test_matrix = make_stochastic(test_matrix)

# Look each schedule up once and reuse it below instead of recomputing it for every print
student_2_fall_2018 = get_student_semester(2, "Fall", "2018")
student_2_spring_2019 = get_student_semester(2, "Spring", "2019")

print("Test Matrix: ")
print(test_matrix)

print("\nStudent 2 Fall 2018 Semester: ")
print(student_2_fall_2018)

print("\nStudent 2 Spring 2019 Semester: ")
print(student_2_spring_2019)

print("\nStudent 2 Predicted 2019 Semester: ")

# Note that I round the results below to remove some fuzziness from the data that results from starting with a matrix of all 1's
# The make_stochastic method, as currently written, requires the sum of a column to not be equal to 0
# In the final estimate this shouldn't matter as none of the columns will have a sum of 0
print(np.matmul(test_matrix, student_2_fall_2018).round())

Test Matrix: 
[[0.00180505 0.01851852 0.01851852 ... 0.01851852 0.01851852 0.01851852]
 [0.00180505 0.01851852 0.01851852 ... 0.01851852 0.01851852 0.01851852]
 [0.00180505 0.01851852 0.01851852 ... 0.01851852 0.01851852 0.01851852]
 ...
 [0.00180505 0.01851852 0.01851852 ... 0.01851852 0.01851852 0.01851852]
 [0.00180505 0.01851852 0.01851852 ... 0.01851852 0.01851852 0.01851852]
 [0.00180505 0.01851852 0.01851852 ... 0.01851852 0.01851852 0.01851852]]

Student 2 Fall 2018 Semester: 
[1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]

Student 2 Sprint 2019 Semester: 
[0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]

Student 2 Predicted 2019 Semester: 
[0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 

In [12]:
def add_all_semester_data(matrix, semester1, year1, semester2, year2, data_weight):
    """Add every student's transition between two terms to `matrix`, in place.

    Calls `add_student_data` once per row of the global `df`.

    Parameters
    ----------
    matrix : numpy.ndarray
        Square matrix aligned with `classes` on both axes; modified in place.
    semester1, year1 : str
        The earlier term.
    semester2, year2 : str
        The later term.
    data_weight : number
        Weight forwarded to `add_student_data` for each student.
    """
    # Students are looked up by row position (df.iloc) further down the call chain,
    # so iterate over positions rather than over df.index labels
    for position in range(len(df)):
        add_student_data(matrix, position, semester1, year1, semester2, year2, data_weight)

# Do the same experiment above but across all students
total_test_matrix = np.ones(shape=(num_classes, num_classes))
add_all_semester_data(total_test_matrix, "Fall", "2018", "Spring", "2019", 100)
total_test_matrix = make_stochastic(total_test_matrix)

# Each total scans every student, so compute it once and reuse it below
fall_2018_total = total_student_semester("Fall", "2018")
spring_2019_total = total_student_semester("Spring", "2019")
spring_2019_predicted = np.matmul(total_test_matrix, fall_2018_total).round()

print("Test Matrix: ")
print(total_test_matrix)

print("\nFall 2018 Semester: ")
print(fall_2018_total)

print("\nSpring 2019 Semester: ")
print(spring_2019_total)

print("\nPredicted 2019 Semester: ")

print(spring_2019_predicted)

print("\nFall 2018 Sum: " + str(np.sum(fall_2018_total)))
print("\nSpring 2019 Sum: " + str(np.sum(spring_2019_total)))
print("\nSpring 2019 Estimated Sum: " + str(np.sum(spring_2019_predicted)))


Test Matrix: 
[[1.29779116e-05 1.85185185e-02 1.85185185e-02 ... 6.04594921e-04
  1.85185185e-02 1.85185185e-02]
 [1.29779116e-05 1.85185185e-02 1.85185185e-02 ... 6.04594921e-04
  1.85185185e-02 1.85185185e-02]
 [1.29779116e-05 1.85185185e-02 1.85185185e-02 ... 6.04594921e-04
  1.85185185e-02 1.85185185e-02]
 ...
 [1.29779116e-05 1.85185185e-02 1.85185185e-02 ... 6.04594921e-04
  1.85185185e-02 1.85185185e-02]
 [2.60856023e-03 1.85185185e-02 1.85185185e-02 ... 1.21523579e-01
  1.85185185e-02 1.85185185e-02]
 [1.29779116e-05 1.85185185e-02 1.85185185e-02 ... 6.04594921e-04
  1.85185185e-02 1.85185185e-02]]

Fall 2018 Semester: 
[367.   0.   0.   0.   0. 330. 134.  84.  48.  74.  45.   0.  16.  55.
  25. 688. 153. 159.  18. 243.  11.   0.  50. 225.  34.  28.  50. 115.
   0. 397. 269.   0.   0.   0.   5.   0.   0.   0.   7.   0.  10.   0.
   0.  26.   0.   0.   0.  31.   9.   0.   0.  12.   0.   0.]

Sprint 2019 Semester: 
[ 85.   0.   0.   0.   0. 218. 170. 128.  93.  97. 114.   0.  36.

Try using this method to calculate what the expected schedule will look like for a given semester without using data from that semester

In [14]:
# Build the transition matrix from the previous year's Fall -> Spring data only,
# so that nothing from Spring 2019 is used to predict Spring 2019
test_matrix_prev = np.ones(shape=(num_classes, num_classes))
add_all_semester_data(test_matrix_prev, "Fall", "2017", "Spring", "2018", 100)
test_matrix_prev = make_stochastic(test_matrix_prev)

# The Fall 2018 totals are both printed and used as the prediction input; compute them once
fall_2018_observed = total_student_semester("Fall", "2018")

print("\nFall 2018 Semester: ")
print(fall_2018_observed)

print("\nSpring 2019 Semester: ")
print(total_student_semester("Spring", "2019"))

print("\nPredicted 2019 Semester: ")
print(np.matmul(test_matrix_prev, fall_2018_observed).round())


Fall 2018 Semester: 
[367.   0.   0.   0.   0. 330. 134.  84.  48.  74.  45.   0.  16.  55.
  25. 688. 153. 159.  18. 243.  11.   0.  50. 225.  34.  28.  50. 115.
   0. 397. 269.   0.   0.   0.   5.   0.   0.   0.   7.   0.  10.   0.
   0.  26.   0.   0.   0.  31.   9.   0.   0.  12.   0.   0.]

Sprint 2019 Semester: 
[ 85.   0.   0.   0.   0. 218. 170. 128.  93.  97. 114.   0.  36.  57.
  44.  44. 245.   2.   0. 197.   2.  25.   0. 229.  36.  16.  76. 164.
   0. 275. 218.  38.   0.   0.   0.   0.   0.   0.   0.  18.   0.   0.
   5.   0.   0.  27.   0.   0.   0.  16.  16.   0.  26.   0.]

Predicted 2019 Semester: 
[119.   0.   0.   0.   0. 261. 282. 133. 150. 130. 141.   0.  23.  71.
  83.   7. 201.   2.   0. 248.   0.  73.   0. 379.  71.  24.  73. 228.
   0. 226. 552.  51.   0.   0.   0.   0.   0.   0.   0.  16.   0.   0.
   0.   0.  44.  46.   0.   0.   0.   4.  31.   0.  39.   0.]


Known Issues:
- ~~Currently this Demo runs using an older version of data ("CSCurriculumRevertedData"), the new set ("CS Course Enrollment Data") should include more up-to-date data, specifically on pass rates for Fall 2023~~ EDIT: Modified to use recent data and also modified sanity check.
- ~~Right now the stochastic matrix assumes that if a class is not a prereq for another class, that there is an equal likelihood of someone going from one class to another. This means that someone in CSCI200 will have an equal chance of going to CSCI406, CSCI358, and CSCI220, even though this probably isn't the case in real life. The solution to this issue would be to weigh the values in the stochastic matrix differently. To do this, we could access an individual student X's schedule in a specific semester (i.e. Spring 2022) and X's schedule in the next Semester (Fall 2022) and increment values in the stochastic matrix such that taking the same classes X took in Spring 2022 would increase the probability of taking the classes X took in Fall 2022 next semester.~~ EDIT: Modified to implement a basic version of this, see "add_all_semester_data"
- This procedure estimates the number of students who are likely to pass a class, not register for it. Students who registered for CSCI200 in Spring 2023 but who did not pass the class are not accounted for in the data. Registration numbers would be slightly higher than passing numbers, but our professor said that this wasn't a big deal in the classes he taught at least.
- This method does not take into account students who are graduating or who just started school. Accordingly, this method will underestimate the number of students taking CSCI128 and overestimate the number of students taking several 400 level classes. We can account for new students by increasing the estimate for students who will register for CSCI128 next semester by some reasonable estimate. We can account for graduating students by seeing how much our model tends to overpredict student registration for high level classes when run against previous semesters and adjusting accordingly.
- The make_stochastic method could be improved to handle some additional edge cases (like where a column sums to 0)
- The results from the experiment feeding the matrix data about the transition from Fall 2018 to Spring 2019 and making the matrix predict Spring 2019 were somewhat unexpected, it may be worth revisiting those results to figure out what exactly happened.