In [1]:
import pandas as pd
from scipy import spatial

In [2]:
data = pd.read_csv('data.csv')

In [3]:
data = data[['Subject Number', 'Subject Name', 'Group Assignments', 'Assignment Types', 'Keywords']]

In [4]:
data = data.drop([22, 23, 24])

In [5]:
data['Assignment Types'] = data['Assignment Types'].str.split(',')
data['Keywords'] = data['Keywords'].str.split(',')

In [6]:
assignmentset = set()
for index, row in data.iterrows():
    types = row['Assignment Types']
    for assgntype in types:
        if assgntype not in ['', ' ']:
            assignmentset.add(assgntype.strip())

assignmentset = list(assignmentset)

In [7]:
keywordset = set()
for index, row in data.iterrows():
    keywords = row['Keywords']
    for keyword in keywords:
        if keyword not in ['', ' ']:
            keywordset.add(keyword.strip())
keywordset = list(keywordset)

In [8]:
def onehot_encode(valueset, valuelist):
    return [1 if value in valuelist else 0 for value in valueset]

In [9]:
data['onehot_assignment'] = data['Assignment Types'].apply(lambda x: onehot_encode(assignmentset, x))
data['onehot_keywords'] = data['Keywords'].apply(lambda x: onehot_encode(keywordset, x))

In [31]:
def calc_similarityscore(subjid, datapoint):
    
    a = data.iloc[subjid]
    datapoint['onehot_assignment'] = onehot_encode(assignmentset, datapoint['Assignment Types'])
    datapoint['onehot_keywords'] = onehot_encode(keywordset, datapoint['Keywords'])
    
    ga = int(a['Group Assignments'])
    gdataframe = int(datapoint['Group Assignments'])
    
    gscore = abs(ga - gdataframe)
    ascore  = spatial.distance.cosine(a['onehot_assignment'], datapoint['onehot_assignment'])
    kwscore  = spatial.distance.cosine(a['onehot_keywords'], datapoint['onehot_keywords'])
    
    return ((0.01)*(1-gscore) + (0.1)*(1-ascore) + (1-kwscore))/(1.11)

In [32]:
def calc_similaritylist(datapoint):
    
    similarity_list = []
    for index, row in data.iterrows():
        similarity_list.append((index, calc_similarityscore(index, datapoint)))
        
    return sorted(similarity_list, reverse=True, key=lambda score: score[1])

In [33]:
def K_nearest(similaritylist, k):
    return [(data['Subject Name'][similaritylist[i][0]], round(similaritylist[i][1], 6)) for i in range(0, k)]

In [34]:
assignmentset

['Presentation',
 'Report',
 'Case Study',
 'Literature Review',
 'Exercises',
 'Quiz/test',
 'Laboratory/practical',
 'Project',
 'Portfolio',
 'Essay',
 'Examination']

In [35]:
keywordset[:10]

['Biomedical Signals/Images',
 'Human Rights',
 'Arguments and Evidence',
 'Signals',
 'Embedded C',
 'Joint Time-Frequency Analysis',
 'DNA',
 'Timber',
 'Water Quality',
 'Mobile Networks']

In [173]:
datapoint = {'Group Assignments': 1, 'Assignment Types': ['Quiz/test', 'Project','Examination'], 'Keywords':['Embedded C', 'DNA', 'Mobile Networks', '5G', 'Biomedical Signals/Images']}

In [174]:
similaritylist = calc_similaritylist(datapoint)

In [175]:
K_nearest(similaritylist, 10)

[('4G/5G Mobile Technologies', 0.463918),
 ('Biomedical Signal Processing', 0.262747),
 ('Bioinformatics', 0.240487),
 ('Advanced Robotics', 0.187296),
 ('Chemistry 1', 0.069069),
 ('Advanced Engineering Computing', 0.061023),
 ('Applications Programming', 0.06006),
 ('Airconditioning', 0.052014),
 ('Advanced Flow Modelling', 0.045788),
 ('Biomedical Instrumentation', 0.045788)]