In [216]:
import os
from pprint import pprint

import mysql.connector
import spacy
# spaCy pipeline shared by every ranking function below; they rely on the
# `vector_norm` and `similarity()` of the documents it produces.
nlp = spacy.load('en_core_web_lg')

# Connection settings are read from the environment so the database password
# is never stored in the notebook. Host, user and database name fall back to
# the values this notebook has always used; the password has no fallback and
# a missing PROJECT_DB_PASSWORD raises KeyError before any connection attempt.
# NOTE(review): the password that used to be hardcoded here is in the file's
# history and should be rotated.
mydb = mysql.connector.connect(
    host=os.environ.get("PROJECT_DB_HOST", "35.226.133.103"),
    user=os.environ.get("PROJECT_DB_USER", "root"),
    passwd=os.environ["PROJECT_DB_PASSWORD"],
    db=os.environ.get("PROJECT_DB_NAME", "project_db")
)

# Single cursor reused by all queries in the notebook.
cur = mydb.cursor()

# Course used by the example ranking cell further down.
courseNo = 126
courseName = 'Software Design Studio'

def tokenize(s_lst):
    """Return the tokens of ``s_lst`` that are neither stop words nor punctuation.

    Args:
        s_lst: Iterable of token-like objects exposing boolean ``is_stop`` and
            ``is_punct`` attributes (for example the tokens of a spaCy ``Doc``).

    Returns:
        list: The tokens that passed the filter, in their original order.
    """
    # Filter the argument itself: the previous body iterated over a name that
    # is not defined at module level, so every call failed with NameError.
    return [tk for tk in s_lst if not tk.is_stop and not tk.is_punct]

In [217]:
def get_prof_rankings_for_course(cur, courseNo, courseName, top_n=5, sim_weight=0.1):
    """Rank instructors for a course by research similarity plus historical GPA.

    Each instructor's raw score is
    ``similarity(course text, research interests) * sim_weight + gpa``, where
    ``gpa`` is the average GPA the instructor gave in this course, or the
    lowest such average among instructors who taught it when the instructor
    has no grade history for the course. Raw scores are then min-max
    normalized to [0, 1].

    Args:
        cur: DB-API cursor (``execute`` / ``fetchone`` / ``fetchall``) on a
            database containing ``csCourse``, ``csInstructor`` and ``csGrade``.
        courseNo: Course number matched against ``csCourse.courseNo``.
        courseName: Course name matched against ``csCourse.courseName``.
        top_n: Maximum number of instructors returned (``None`` returns all).
        sim_weight: Weight applied to the similarity term of the raw score.

    Returns:
        list[tuple]: Up to ``top_n`` ``(instructorId, instructorName, score)``
        tuples, best first. Empty when the course does not exist or there are
        no instructors.
    """

    def normalize_score(min_s, max_s, score):
        # Min-max scaling. When every raw score is identical the span is zero;
        # report 1.0 (everyone ties for first) instead of dividing by zero.
        span = max_s - min_s
        if span == 0:
            return 1.0
        return (score - min_s) / span

    # 1. get course and its text data
    q = '''
        SELECT courseDesc
        FROM csCourse
        WHERE courseNo = %s AND courseName = %s;
    '''

    cur.execute(q, (courseNo, courseName))
    res = cur.fetchone()
    if res is None:
        return []  # unknown course: there is nothing to rank against
    # A NULL description falls back to the course name alone.
    courseDesc = "" if res[0] is None else res[0]

    course_text = courseName + ", " + courseDesc
    course_text_nlp = nlp(course_text)

    # 2. get all instructors' research interests
    q = '''
        SELECT instructorId, instructorName, researchInterests
        FROM csInstructor;
    '''

    cur.execute(q)
    res = cur.fetchall()

    prof_research_dict = {}
    prof_id_dict = {}
    for prof_id, prof_name, interests in res:
        # NULL interests become an empty document, which has a zero vector
        # norm and therefore contributes a similarity of 0 below.
        prof_research_dict[prof_id] = nlp("" if interests is None else interests)
        prof_id_dict[prof_id] = str(prof_name)

    if not prof_research_dict:
        return []  # no instructors at all

    # 3. get average GPA for instructors who have taught the course
    q = '''
        SELECT csInstructor.instructorId, (((SUM(aPlus) * 4) + (SUM(a) * 4) + (SUM(aMinus) * 3.67) + (SUM(bPlus) * 3.33) + (SUM(b) * 3) + (SUM(bMinus) * 2.67) + (SUM(cPlus) * 2.33) + (SUM(c) * 2) + (SUM(cMinus) * 1.67) + (SUM(dPlus) * 1.33) + (SUM(d) * 1) + (SUM(dMinus) * 0.67) + (SUM(f) * 0)) / (SUM(aPlus) + SUM(a) + SUM(aMinus) + SUM(bPlus) + SUM(b) + SUM(bMinus) + SUM(cPlus) + SUM(c) + SUM(cMinus) + SUM(dPlus) + SUM(d) + SUM(dMinus) + SUM(f))) as averageGPA
        FROM csGrade LEFT JOIN csInstructor ON csInstructor.instructorId = csGrade.primaryInstructor
        WHERE csGrade.courseNo = %s AND csGrade.courseName = %s
        GROUP BY csInstructor.instructorId;
    '''

    cur.execute(q, (courseNo, courseName))
    res = cur.fetchall()

    # Rows whose average is NULL (no grades recorded) carry no information.
    prof_avg_dict = {
        prof_id: float(avg_gpa) for prof_id, avg_gpa in res if avg_gpa is not None
    }
    # Lowest observed average is the baseline for instructors without history;
    # with no history at all the ranking degrades to similarity only.
    base_score = min(prof_avg_dict.values(), default=0.0)

    prof_total_scores = {}
    for prof, research_nlp in prof_research_dict.items():
        sim_score = 0
        # similarity() is only meaningful when the document has a non-zero vector
        if research_nlp.vector_norm:
            sim_score = float(course_text_nlp.similarity(research_nlp))

        gpa_score = prof_avg_dict.get(prof, base_score)

        # this is the formula for calculating total score.
        # research_similarity * sim_weight + average_gpa_score
        # (lowest gpa is used as the base score for all professors)
        prof_total_scores[prof] = sim_score * sim_weight + gpa_score

    max_s = max(prof_total_scores.values())
    min_s = min(prof_total_scores.values())
    # Best first; ties on score are broken by the higher instructor id.
    ranked = sorted(prof_total_scores.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    highest_scores = [
        (prof, prof_id_dict[prof], normalize_score(min_s, max_s, score))
        for prof, score in ranked
    ]
    return highest_scores[0:top_n]

In [218]:
def get_course_rankings_for_prof(cur, instructorId, top_n=10):
    """Rank courses for an instructor by similarity to their research interests.

    The score of a course is the similarity between the instructor's research
    interests and the course description; unlike the instructor ranking for a
    course, grade history is not taken into account.

    Args:
        cur: DB-API cursor (``execute`` / ``fetchone`` / ``fetchall``) on a
            database containing ``csInstructor`` and ``csCourse``.
        instructorId: Id matched against ``csInstructor.instructorId``.
        top_n: Maximum number of courses returned (``None`` returns all).

    Returns:
        list[tuple]: Up to ``top_n`` ``((courseNo, courseName), score)`` pairs,
        best first. Empty when the instructor does not exist or has no usable
        research interests.
    """
    # 1. get prof and his/her research interests
    q = '''
        SELECT instructorId, instructorName, researchInterests
        FROM csInstructor
        WHERE instructorId = %s;
    '''

    cur.execute(q, (instructorId,))
    row = cur.fetchone()
    if row is None:
        return []  # unknown instructor
    researchInterests = row[2]
    # NULL interests are treated like empty interests.
    instructor_text_nlp = nlp("" if researchInterests is None else researchInterests)

    if not instructor_text_nlp.vector_norm:
        return []  # can't match prof to courses if research interests are empty

    # 2. get courses and their text data
    q = '''
        SELECT courseNo, courseName, courseDesc
        FROM csCourse;
    '''

    cur.execute(q)
    res = cur.fetchall()

    course_total_scores = {}
    for course_no, course_name, course_desc in res:
        # A NULL description must become an empty document: str(None) would
        # yield the text "None" and give the course a spurious similarity.
        desc_nlp = nlp("" if course_desc is None else str(course_desc))
        score = 0
        if desc_nlp.vector_norm:
            # total score = research_similarity only
            score = instructor_text_nlp.similarity(desc_nlp)
        course_total_scores[(course_no, course_name)] = score

    # Best first; ties on score are broken by the higher (courseNo, courseName).
    ranked = sorted(course_total_scores.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    return ranked[0:top_n]

In [219]:
# Rank instructors for the course selected by the module-level `courseNo` /
# `courseName` and pretty-print the top 5 as
# (instructorId, instructorName, normalized score) tuples, best first.
results = get_prof_rankings_for_course(cur, courseNo, courseName)
pprint(results)

[(115, 'Craig Zilles', 1.0),
 (76, 'Graham Evans', 0.8809724989921174),
 (152, 'David Padua', 0.4487579558835001),
 (60, 'Matthew Caesar', 0.44532047970535704),
 (71, 'Abdussalam Alawini', 0.4446303412193027)]


In [220]:
# Rank courses for one instructor by research-interest similarity and
# pretty-print up to 10 ((courseNo, courseName), score) pairs, best first.
# Note: this rebinds `results`, replacing the instructor ranking computed in
# the previous cell.
instructorId = 20
results = get_course_rankings_for_prof(cur, instructorId)
pprint(results)

20 Nikita Borisov Security and Privacy, Distributed Systems 
[((461, 'Computer Security I'), 0.9258387932675674),
 ((463, 'Computer Security II'), 0.8937231446373293),
 ((460, 'Security Laboratory'), 0.891875421118551),
 ((423, 'Operating Systems Design'), 0.8687742597531047),
 ((523, 'Advanced Operating Systems'), 0.8625780609993267),
 ((538, 'Advanced Computer Networks'), 0.8550994451702643),
 ((425, 'Distributed Systems'), 0.8520427547239174),
 ((241, 'System Programming'), 0.8355563924307718),
 ((438, 'Communication Networks'), 0.8308867177730709),
 ((412, 'Introduction to Data Mining'), 0.8231902048105624)]
