In [1]:
import pandas as pd
from scipy import spatial

In [2]:
data = pd.read_csv('data.csv')

In [3]:
data = data[['Subject Number', 'Subject Name', 'Group Assignments', 'Assignment Types', 'Keywords']]

In [4]:
data = data.drop([22, 23, 24])

In [5]:
data['Assignment Types'] = data['Assignment Types'].str.split(',')
data['Keywords'] = data['Keywords'].str.split(',')

In [6]:
assignmentset = set()
for index, row in data.iterrows():
    types = row['Assignment Types']
    for assgntype in types:
        if assgntype not in ['', ' ']:
            assignmentset.add(assgntype.strip())

assignmentset = list(assignmentset)

In [7]:
keywordset = set()
for index, row in data.iterrows():
    keywords = row['Keywords']
    for keyword in keywords:
        if keyword not in ['', ' ']:
            keywordset.add(keyword.strip())
keywordset = list(keywordset)

In [8]:
def onehot_encode(valueset, valuelist):
    return [1 if value in valuelist else 0 for value in valueset]

In [9]:
data['onehot_assignment'] = data['Assignment Types'].apply(lambda x: onehot_encode(assignmentset, x))
data['onehot_keywords'] = data['Keywords'].apply(lambda x: onehot_encode(keywordset, x))

In [10]:
def calc_similarityscore(subjid, datapoint):
    
    a = data.iloc[subjid]
    datapoint['onehot_assignment'] = onehot_encode(assignmentset, datapoint['Assignment Types'])
    datapoint['onehot_keywords'] = onehot_encode(keywordset, datapoint['Keywords'])
    
    ga = int(a['Group Assignments'])
    gdataframe = int(datapoint['Group Assignments'])
    
    gscore = abs(ga - gdataframe)
    ascore  = spatial.distance.cosine(a['onehot_assignment'], datapoint['onehot_assignment'])
    kwscore  = spatial.distance.cosine(a['onehot_keywords'], datapoint['onehot_keywords'])
    
    return ((0.3)*(1-gscore) + (1-ascore) + (1-kwscore))/(2.3)

In [14]:
def calc_similaritylist():
    
    similarity_list = []
    for index, row in data.iterrows():
        similarity_list.append((index, calc_similarityscore(index, datapoint)))
        
    return sorted(similarity_list, reverse=True, key=lambda score: score[1])

In [15]:
datapoint = {'Group Assignments': 0, 'Assignment Types': ['Quiz/test'], 'Keywords':['LTE', 'Prototyping']}

In [16]:
calc_similaritylist()

[(1, 0.4378725135593685),
 (12, 0.3814566387780982),
 (0, 0.37111016982316253),
 (21, 0.30743773095067284),
 (2, 0.2841536480840321),
 (15, 0.25102185616940254),
 (20, 0.25102185616940254),
 (7, 0.13043478260869565),
 (8, 0.13043478260869565),
 (10, 0.13043478260869565),
 (13, 0.13043478260869565),
 (14, 0.13043478260869565),
 (18, 0.13043478260869565),
 (19, 0.13043478260869565),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (9, 0.0),
 (11, 0.0),
 (16, 0.0),
 (17, 0.0)]