In [1]:
import pandas as pd
from scipy import spatial

In [2]:
data = pd.read_csv('data.csv')

In [3]:
data = data[['Subject Number', 'Subject Name', 'Group Assignments', 'Assignment Types', 'Keywords']]

In [4]:
data = data.drop([22, 23, 24])

In [5]:
data['Assignment Types'] = data['Assignment Types'].str.split(',')
data['Keywords'] = data['Keywords'].str.split(',')

In [6]:
assignmentset = set()
for index, row in data.iterrows():
    types = row['Assignment Types']
    for assgntype in types:
        if assgntype not in ['', ' ']:
            assignmentset.add(assgntype.strip())

assignmentset = list(assignmentset)

In [7]:
keywordset = set()
for index, row in data.iterrows():
    keywords = row['Keywords']
    for keyword in keywords:
        if keyword not in ['', ' ']:
            keywordset.add(keyword.strip())
keywordset = list(keywordset)

In [8]:
def onehot_encode(valueset, valuelist):
    return [1 if value in valuelist else 0 for value in valueset]

In [9]:
data['onehot_assignment'] = data['Assignment Types'].apply(lambda x: onehot_encode(assignmentset, x))
data['onehot_keywords'] = data['Keywords'].apply(lambda x: onehot_encode(keywordset, x))

In [31]:
def calc_similarityscore(subjid, datapoint):
    
    a = data.iloc[subjid]
    datapoint['onehot_assignment'] = onehot_encode(assignmentset, datapoint['Assignment Types'])
    datapoint['onehot_keywords'] = onehot_encode(keywordset, datapoint['Keywords'])
    
    ga = int(a['Group Assignments'])
    gdataframe = int(datapoint['Group Assignments'])
    
    gscore = abs(ga - gdataframe)
    ascore  = spatial.distance.cosine(a['onehot_assignment'], datapoint['onehot_assignment'])
    kwscore  = spatial.distance.cosine(a['onehot_keywords'], datapoint['onehot_keywords'])
    
    return ((0.25)*(1-gscore) + (1-ascore) + 2*(1-kwscore))/(3.2)

In [32]:
def calc_similaritylist():
    
    similarity_list = []
    for index, row in data.iterrows():
        similarity_list.append((index, calc_similarityscore(index, datapoint)))
        
    return sorted(similarity_list, reverse=True, key=lambda score: score[1])

In [33]:
datapoint = {'Group Assignments': 0, 'Assignment Types': ['Quiz/test'], 'Keywords':['LTE', 'Prototyping']}

In [34]:
calc_similaritylist()

[(0, 0.37722086912079605),
 (1, 0.29909586912079605),
 (2, 0.29909586912079605),
 (12, 0.25854695912175807),
 (21, 0.22097086912079608),
 (15, 0.18042195912175807),
 (20, 0.18042195912175807),
 (7, 0.078125),
 (8, 0.078125),
 (10, 0.078125),
 (13, 0.078125),
 (14, 0.078125),
 (18, 0.078125),
 (19, 0.078125),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (9, 0.0),
 (11, 0.0),
 (16, 0.0),
 (17, 0.0)]

In [35]:
data.iloc[0]

Subject Number                                                 42890.0
Subject Name                                 4G/5G Mobile Technologies
Group Assignments                                                  1.0
Assignment Types     [Quiz/test, Laboratory/practical, Examination,...
Keywords             [LTE, 5G, 4G, Mobile Networks, Mobile Technolo...
onehot_assignment                    [1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1]
onehot_keywords      [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
Name: 0, dtype: object