In [1]:
import pandas as pd
from scipy import spatial

In [2]:
data = pd.read_csv('data-final.csv')

In [3]:
data = data[['Subject Number', 'Subject Name', 'Group Assignments', 'Assignment Types', 'Keywords']]

In [4]:
data = data.drop([22, 23, 24])

In [5]:
data['Assignment Types'] = data['Assignment Types'].str.split(',')
data['Keywords'] = data['Keywords'].str.split(',')

In [6]:
assignmentset = set()
for index, row in data.iterrows():
    types = row['Assignment Types']
    for assgntype in types:
        if assgntype not in ['', ' ']:
            assignmentset.add(assgntype.strip())

assignmentset = list(assignmentset)

In [7]:
keywordset = set()
for index, row in data.iterrows():
    keywords = row['Keywords']
    for keyword in keywords:
        if keyword not in ['', ' ']:
            keywordset.add(keyword.strip())
keywordset = list(keywordset)

In [8]:
def onehot_encode(valueset, valuelist):
    return [1 if value in valuelist else 0 for value in valueset]

In [9]:
data['onehot_assignment'] = data['Assignment Types'].apply(lambda x: onehot_encode(assignmentset, x))
data['onehot_keywords'] = data['Keywords'].apply(lambda x: onehot_encode(keywordset, x))

In [10]:
import math
def calc_similarityscore(subject, datapoint):
    
    ga = int(subject['Group Assignments'])
    gdataframe = int(datapoint['Group Assignments'])
    
    gscore = abs(ga - gdataframe)
    ascore  = spatial.distance.cosine(subject['onehot_assignment'], datapoint['onehot_assignment'])
    kwscore  = spatial.distance.cosine(subject['onehot_keywords'], datapoint['onehot_keywords'])
    
    return ((0.01)*(1-gscore) + (0.1)*(1-ascore) + (1-kwscore))/(1.11)

In [11]:
def calc_similaritylist(datapoint):
    
    datapoint['onehot_assignment'] = onehot_encode(assignmentset, datapoint['Assignment Types'])
    datapoint['onehot_keywords'] = onehot_encode(keywordset, datapoint['Keywords'])
    
    print(datapoint['Assignment Types'])
    
    similarity_list = []
    for index, row in data.iterrows():
        similarity_list.append((index, calc_similarityscore(row, datapoint)))
        
    return sorted(similarity_list, reverse=True, key=lambda score: score[1])

In [12]:
def K_nearest(similaritylist, k):
    return [(data['Subject Name'][similaritylist[i][0]], round(similaritylist[i][1], 6)) for i in range(0, k)]

In [13]:
assignmentset

['Demonstration',
 'Quiz/test',
 'Project',
 'Examination',
 'Report',
 'Case study',
 'Journal',
 'Mid-session examination',
 'Portfolio',
 'Presentation',
 'Exercises',
 'Literature review',
 'Essay',
 'Exam',
 'Reflection',
 'Design/drawing/plan/sketch',
 'Laboratory/practical']

In [14]:
keywordset[:10]

['Bioengineering',
 'Mechanical Systems',
 'Engineering Design',
 'Windows',
 'Biomedical Devices',
 'Beams',
 'Risk Evaluation',
 'Objected Orientated Paradigm',
 'Contaminant Transport Processes',
 'Slabs']

In [15]:
datapoint = {'Group Assignments': 1, 'Assignment Types': ['Quiz/test', 'Examination'], 'Keywords':['Embedded C', 'DNA', 'Scrum','Biomedical Signals/Images', 'Travel Behaviour', 'Data Transformation', 'Linear Optimal Control', 'Aluminium Design', 'Cloud Processing', 'Mathematics', 'Frictional Devices', 'Sequential Circuits', 'Analog Controller Design', 'ad-hoc WLAN networks']}

In [16]:
similaritylist = calc_similaritylist(datapoint)

['Quiz/test']


In [17]:
K_nearest(similaritylist, 10)

[('4G/5G Mobile Technologies', 0.363562),
 ('Accelerating Your Prototyping and Commercialisation Strategies', 0.327526),
 ('Environmental Planning and Law', 0.099099),
 ('Dynamics and Control', 0.09009),
 ('Electronics and Circuits', 0.09009),
 ('Engineering Computations', 0.09009),
 ('Aboriginal Sydney Now', 0.072712),
 ('Fundamentals of Mechanical Engineering', 0.072712),
 ('Hydraulics and Hydrology', 0.072712),
 ('Introductory Digital Systems', 0.072712)]