In [None]:
# imports 

import json
import os
import re
from collections import defaultdict

import numpy as np
import pandas as pd
import tensorflow_hub as hub

In [2]:
# transform and prepare data 

# raise the display cap so long frames are shown without truncation
pd.set_option("display.max_rows", 9999)

# course catalogue; lines pandas flags as bad are skipped instead of raising
courseDF = pd.read_csv('data/raw.csv', on_bad_lines='skip')

# prerequisite / antirequisite relation tables, read the same way
prereqDF = pd.read_csv('data/raw_prereq.csv', on_bad_lines='skip')
antireqDF = pd.read_csv('data/raw_antireq.csv', on_bad_lines='skip')

# positional views: columns 0 and 1 concatenated, and column 3 on its own
courseCodesDF = courseDF.iloc[:, 0] + courseDF.iloc[:, 1]
descriptionsDF = courseDF.iloc[:, 3]

# undergrad data: parallel lists (description text, course code)
descriptionsUG, courseCodesUG = [], []

# grad data: same layout as the undergrad lists
descriptionsG, courseCodesG = [], []

# pre/anti req maps: course code -> list of related course codes
prereqMap, antireqMap = defaultdict(list), defaultdict(list)

# transform course data into ingestable format:
# split every course into the undergrad or grad lists based on its numeric course number
for index, row in courseDF.iterrows():
    subject = row['subject']
    courseNumber = row['course_number']

    # a missing subject or number cannot form a course code; skip the row
    # instead of letting re.findall / string concatenation raise TypeError
    if pd.isna(subject) or pd.isna(courseNumber):
        continue

    # coerce to text so a numerically-typed column is handled like the
    # prereq / antireq loops handle it
    courseNumber = str(courseNumber)

    # keep only the digits (all digit runs are joined together)
    formattedCourseNum = "".join(re.findall('[0-9]+', courseNumber))
    if not formattedCourseNum:
        # no digits at all: the course cannot be assigned to a level
        continue

    fullCourseName = str(subject) + courseNumber
    # NOTE(review): a missing description becomes the literal text "nan" here
    # and is embedded as-is downstream - confirm that is acceptable
    description = str(row['description'])

    # numbers above 499 go to the grad lists, everything else to undergrad
    if int(formattedCourseNum) > 499:
        descriptionsG.append(description)
        courseCodesG.append(fullCourseName)
    else:
        descriptionsUG.append(description)
        courseCodesUG.append(fullCourseName)

# fill prereqMap: course code -> list of that course's prerequisite codes
for index, row in prereqDF.iterrows():
    # codes are subject + number; numbers are stringified before joining
    course = row["subject"] + str(row["course_number"])
    prereq = row["pre_requisite_subject"] + str(row["pre_requisite_number"])
    prereqMap[course] += [prereq]

# fill antireqMap. Note the direction: the key is the ANTI-requisite and the
# value lists the courses that declare it, i.e. the reverse of prereqMap.
for index, row in antireqDF.iterrows():
    course = row["subject"] + str(row["course_number"])
    antireq = row["anti_requisite_subject"] + str(row["anti_requisite_number"])
    antireqMap[antireq] += [course]


In [None]:
# load Universal Sentence Embedding model from tensorflow hub, this can take over 1 minute to complete

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print("module %s loaded" % module_url)


def embed(input):
    """Run the loaded hub model on `input` and return its result unchanged."""
    # NOTE: the parameter name shadows the builtin `input`; it is kept so that
    # existing keyword callers keep working
    encoded = model(input)
    return encoded

In [None]:
# Extract meaning from data

# course code -> ranked list of (related course code, score); filled in below
rankingsMapUG = {}
rankingsMapG = {}

# create text embeddings in higher dimension for given text data
def extractMeaning(data, dataLabels, output):
    """Embed `data` and store the rankings derived from the embeddings in `output`.

    data       -- the texts handed to `embed`
    dataLabels -- labels passed to `populateRankings` alongside the embeddings
                  (presumably one per text, in the same order - TODO confirm)
    output     -- mapping that `populateRankings` fills in place
    """
    populateRankings(dataLabels, embed(data), output)

# populate outputted data using several filters
def populateRankings(labels, features, output, k=50, minSim=0.2, maxSim=0.95,
                     prereqBoost=1.2, prereqs=None, antireqs=None):
    """Fill `output[label]` with the top-`k` related labels, best score first.

    The score of a pair is the inner product of their feature rows. A pair is
    kept only when its score lies strictly between `minSim` and `maxSim`, and
    the candidate is not listed under the current course in `antireqs`.
    When the current course appears in `prereqs[candidate]`, the stored score
    is multiplied by `prereqBoost`.

    labels      -- sequence of course codes, indexed like the rows of `features`
    features    -- array-like accepted by np.inner
                   (assumes one row per label - TODO confirm)
    output      -- dict updated in place: label -> list of (label, score)
    k           -- maximum number of entries stored per label
    minSim      -- exclusive lower bound on the raw score
    maxSim      -- exclusive upper bound on the raw score
    prereqBoost -- multiplier applied to the score of prerequisite-linked pairs
    prereqs     -- mapping course -> list of codes; defaults to the notebook's prereqMap
    antireqs    -- mapping course -> list of codes; defaults to the notebook's antireqMap
    """
    if prereqs is None:
        prereqs = prereqMap
    if antireqs is None:
        antireqs = antireqMap

    corr = np.inner(features, features)
    for i, row in enumerate(corr):
        currCourse = labels[i]

        # .get() instead of indexing: indexing a defaultdict would insert an
        # empty list for every label that is looked up, mutating the shared map.
        # Built once per course so the inner loop does a set lookup.
        excluded = set(antireqs.get(currCourse, ()))

        ranks = []
        for j, col in enumerate(row):
            relativeCourse = labels[j]
            if not (minSim < col < maxSim) or relativeCourse in excluded:
                continue
            if currCourse in prereqs.get(relativeCourse, ()):
                col = col * prereqBoost
            ranks.append((relativeCourse, col))

        # stable descending sort, so ties keep their original label order
        ranks.sort(key=lambda pair: pair[1], reverse=True)
        output[currCourse] = ranks[:k]  # limit storage to the top k courses

# rank undergrad and grad courses separately, each into its own map
extractMeaning(data=descriptionsUG, dataLabels=courseCodesUG, output=rankingsMapUG)
extractMeaning(data=descriptionsG, dataLabels=courseCodesG, output=rankingsMapG)

In [17]:
# store extracted data into persistent format

# transform to JSON storable format: every (course, score) tuple becomes a
# [course, str(score)] pair, since the raw score values are stringified for storage
outputUG = {
    key: [[courseName, str(simVal)] for courseName, simVal in ranked]
    for key, ranked in rankingsMapUG.items()
}

outputG = {
    key: [[courseName, str(simVal)] for courseName, simVal in ranked]
    for key, ranked in rankingsMapG.items()
}


# make sure the target directory exists; open(..., "w") does not create it
os.makedirs("output", exist_ok=True)

# undergrad rankings
with open("output/undergrad_ranks.json", "w") as outfile:
    json.dump(outputUG, outfile)

# grad rankings
with open("output/grad_ranks.json", "w") as outfile:
    json.dump(outputG, outfile)
    


In [None]:
# example usage of data

# read the stored rankings back in
with open("output/undergrad_ranks.json", "r") as infile:
    resultsUG = json.load(infile)

with open("output/grad_ranks.json", "r") as infile:
    resultsG = json.load(infile)

# course to look up
sub = "CS"
num = 454
course = sub + str(num)

results = resultsUG[course]

c = 10  # maximum number of recommendations to print

# only courses numbered above the start of the looked-up course's hundred-level
levelFloor = (num // 100) * 100

# NOTE(review): startswith(sub) also matches longer subject codes that share
# the prefix (a subject such as "CSE" would pass for "CS") - confirm intended
for resCourse, val in results:
    # check the budget before printing so exactly `c` rows are shown
    # (checking `c < 0` after the decrement printed one row too many)
    if c <= 0:
        break
    if resCourse.startswith(sub) and int("".join(re.findall('[0-9]+', resCourse))) > levelFloor:
        print((resCourse, val))
        c -= 1


