In [1]:
import pandas as pd
from scipy import spatial

In [2]:
data = pd.read_csv('data-final.csv')

In [3]:
data = data[['Subject Number', 'Subject Name', 'Group Assignments', 'Assignment Types', 'Keywords']]

In [4]:
data = data.drop([22, 23, 24])

In [5]:
data['Assignment Types'] = data['Assignment Types'].str.split(',')
data['Keywords'] = data['Keywords'].str.split(',')

In [6]:
assignmentset = set()
for index, row in data.iterrows():
    types = row['Assignment Types']
    for assgntype in types:
        if assgntype not in ['', ' ']:
            assignmentset.add(assgntype.strip())

assignmentset = list(assignmentset)

In [7]:
keywordset = set()
for index, row in data.iterrows():
    keywords = row['Keywords']
    for keyword in keywords:
        if keyword not in ['', ' ']:
            keywordset.add(keyword.strip())
keywordset = list(keywordset)

In [8]:
def onehot_encode(valueset, valuelist):
    return [1 if value in valuelist else 0 for value in valueset]

In [9]:
data['onehot_assignment'] = data['Assignment Types'].apply(lambda x: onehot_encode(assignmentset, x))
data['onehot_keywords'] = data['Keywords'].apply(lambda x: onehot_encode(keywordset, x))

In [10]:
import math
def calc_similarityscore(subject, datapoint):
    
    ga = int(subject['Group Assignments'])
    gdataframe = int(datapoint['Group Assignments'])
    
    gscore = abs(ga - gdataframe)
    ascore  = spatial.distance.cosine(subject['onehot_assignment'], datapoint['onehot_assignment'])
    kwscore  = spatial.distance.cosine(subject['onehot_keywords'], datapoint['onehot_keywords'])
    
    return ((0.01)*(1-gscore) + (0.1)*(1-ascore) + (1-kwscore))/(1.11)

In [11]:
def calc_similaritylist(datapoint):
    
    datapoint['onehot_assignment'] = onehot_encode(assignmentset, datapoint['Assignment Types'])
    datapoint['onehot_keywords'] = onehot_encode(keywordset, datapoint['Keywords'])
    
    print(datapoint['Assignment Types'])
    
    similarity_list = []
    for index, row in data.iterrows():
        similarity_list.append((index, calc_similarityscore(row, datapoint)))
        
    return sorted(similarity_list, reverse=True, key=lambda score: score[1])

In [12]:
def K_nearest(similaritylist, k):
    return [(data['Subject Name'][similaritylist[i][0]], round(similaritylist[i][1], 6)) for i in range(0, k)]

In [13]:
assignmentset

['Portfolio',
 'Project',
 'Journal',
 'Reflection',
 'Presentation',
 'Literature review',
 'Exam',
 'Demonstration',
 'Mid-session examination',
 'Essay',
 'Case study',
 'Exercises',
 'Report',
 'Laboratory/practical',
 'Design/drawing/plan/sketch',
 'Quiz/test',
 'Examination']

In [14]:
keywordset[:10]

['Environmental Law',
 'Urban Stormwater Systems',
 'Nervous System',
 'Engine',
 'Programming',
 'Cyberspace',
 'Site Contaminant Types',
 'Groundwater',
 'Software Architecture Patterns and Styles',
 'Ziegler-Nichols Techniques']

In [15]:
datapoint = {'Group Assignments': 1, 'Assignment Types': ['Quiz/test', 'Examination'], 'Keywords':['Embedded C', 'DNA', 'Scrum','Biomedical Signals/Images', 'Travel Behaviour', 'Data Transformation', 'Linear Optimal Control', 'Aluminium Design', 'Cloud Processing', 'Mathematics', 'Frictional Devices', 'Sequential Circuits', 'Analog Controller Design', 'ad-hoc WLAN networks']}

In [16]:
similaritylist = calc_similaritylist(datapoint)

['Quiz/test', 'Examination']


In [17]:
K_nearest(similaritylist, 10)

[('Wireless Access Network Technologies', 0.202955),
 ('Sensors and Control for Mechatronic Systems', 0.180864),
 ('Bioinformatics', 0.166176),
 ('Biomedical Signal Processing', 0.161732),
 ('Mechanical Design 2', 0.152723),
 ('Control Studio A', 0.135794),
 ('Advanced Robotics', 0.131865),
 ('Introductory Digital Systems', 0.125304),
 ('Accelerating Your Prototyping and Commercialisation Strategies', 0.120388),
 ('Facade Engineering', 0.107678)]