In [1]:
# imports 

# pip install --upgrade "tensorflow<2.12,>=2.11.0"
# pip install --upgrade tensorflow-hub

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import json
from collections import defaultdict

2023-07-13 15:44:54.826535: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-13 15:44:55.500684: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-13 15:44:55.500744: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-07-13 15:44:57.739459: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [2]:
# transform and prepare data 

pd.options.display.max_rows = 9999

courseDF = pd.read_csv('data/raw.csv', on_bad_lines='skip')

prereqDF = pd.read_csv('data/raw_prereq.csv', on_bad_lines='skip')
antireqDF = pd.read_csv('data/raw_antireq.csv', on_bad_lines='skip')

courseCodesDF = courseDF.iloc[:,0] + courseDF.iloc[:,1]
descriptionsDF = courseDF.iloc[:,3]

# undergrad data
descriptionsUG = [] 
courseCodesUG = [] 

# grad data
descriptionsG = [] 
courseCodesG = [] 

# pre/anti req maps
prereqMap = defaultdict(list)
antireqMap = defaultdict(list)

# transform course data into ingestable format
for index, row in courseDF.iterrows():
    formattedCourseNum = "".join(re.findall('[0-9]+', row['course_number']))
    fullCourseName = str(row['subject'] + row['course_number'])
    description = str(row['description'])

    if len(formattedCourseNum) > 0 and int(formattedCourseNum) > 499:
        descriptionsG.append(description)
        courseCodesG.append(fullCourseName)
    elif len(formattedCourseNum) > 0 and int(formattedCourseNum) <= 499:
        descriptionsUG.append(description)
        courseCodesUG.append(fullCourseName)

# transform pre req data into ingestable format
for index, row in prereqDF.iterrows():
    course = row["subject"] + str(row["course_number"])
    prereq = row["pre_requisite_subject"] + str(row["pre_requisite_number"])

    prereqMap[course].append(prereq) 

# transform anti req data into ingestable format
for index, row in antireqDF.iterrows():
    course = row["subject"] + str(row["course_number"])
    antireq = row["anti_requisite_subject"] + str(row["anti_requisite_number"])

    antireqMap[antireq].append(course) 


In [3]:
# load Universal Sentence Embedding model from tensorflow hub, this can take over 1 minute to complete

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

2023-07-13 15:46:08.211073: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-07-13 15:46:08.211407: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-07-13 15:46:08.211448: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (mega): /proc/driver/nvidia/version does not exist
2023-07-13 15:46:08.212057: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [16]:
# Extract meaning from data

rankingsMapUG = dict()
rankingsMapG = dict()

def extractMeaning(data, dataLabels, output):
  dataEmbeddings = embed(data)
  populateRankings(dataLabels, dataEmbeddings, output)

def populateRankings(labels, features, output):
    corr = np.inner(features, features)
    for i,row in enumerate(corr):
      currCourse = labels[i] 
      
      ranks = []
      for j, col in enumerate(row):
        relativeCourse = labels[j]
        if col < 0.95 and col > 0.2 and relativeCourse not in antireqMap[currCourse]:
          ranks.append((relativeCourse, col * 1.2 if currCourse in prereqMap[relativeCourse] else col))
      
      top = sorted(ranks, key=lambda x: x[1], reverse=True)

      k = 50
      output[currCourse] = top[:k] # limit storage to top 50 courses 

extractMeaning(descriptionsUG,courseCodesUG, rankingsMapUG)
extractMeaning(descriptionsG,courseCodesG, rankingsMapG)

2023-07-13 16:07:13.984030: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 378748160 exceeds 10% of free system memory.


In [17]:
# store extracted data into persistent format

# transform to JSON storable format
outputUG = dict()
for key in rankingsMapUG:
    outputUG[key] = []
    for courseName, simVal in rankingsMapUG[key]:
        outputUG[key].append([courseName, str(simVal)])

outputG = dict()
for key in rankingsMapG:
    outputG[key] = []
    for courseName, simVal in rankingsMapG[key]:
        outputG[key].append([courseName, str(simVal)])


with open("output/undergrad_ranks.json", "w") as outfile:
    json.dump(outputUG, outfile)


with open("output/grad_ranks.json", "w") as outfile:
    json.dump(outputG, outfile)
    


In [19]:
# example usage of data  

with open("output/undergrad_ranks.json", "r") as infile:
    resultsUG = json.load(infile)    

with open("output/grad_ranks.json", "r") as infile:
    resultsG = json.load(infile)    

sub = "CS"
num = 454
course = sub + str(num)

#results = rankingsMapUG[course]
results = resultsUG[course]

c = 10

for resCourse,val in results:
    if resCourse.startswith(sub) and int("".join(re.findall('[0-9]+', resCourse)))  > (num // 100) * 100:
        print((resCourse,val))
        c -= 1
        if c < 0:
            break




('CS457', '0.6486266')
('CS448', '0.6235086')
('CS451', '0.60912204')
('CS431', '0.60794425')
('CS456', '0.5285964')
('CS450', '0.52484745')
('CS446', '0.5175047')
('CS447', '0.51427215')
('CS452', '0.51314104')
('CS445', '0.49388883')
('CS486', '0.47623995')
