In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


url_student = 'https://raw.githubusercontent.com/Instantutor/Instantutor-Research/main/Datasets/Data.csv'
student = pd.read_csv(url_student)
student.rename({'Category(Mentor/Mentee/Both)':"Category"},axis='columns',inplace=True)
url_rating = 'https://raw.githubusercontent.com/Instantutor/Instantutor-Research/main/Datasets/Rating.csv'
rating = pd.read_csv(url_rating)
student.head(3)

Unnamed: 0,id_student,gender,Category,Area,Degree,Course,Time zone,Availability time start,Availability time end
0,11391,M,1,IT,Undergraduate,,Asia,9:00,14:00
1,28400,F,3,IT,Graduate,,Pacific,16:00,18:00
2,30268,F,2,IT,PhD,,Europe,12:00,15:00


In [14]:
rating.describe()

Unnamed: 0,Mentor,Mentee,Rating,Comments,Time stamp
count,3999.0,0.0,3999.0,0.0,3999.0
mean,720146.5,,3.644286,,1188795000.0
std,562393.9,,0.999781,,210672200.0
min,25629.0,,0.5,,834753700.0
25%,508807.0,,3.0,,979971600.0
50%,592158.0,,4.0,,1179200000.0
75%,647048.5,,4.5,,1392410000.0
max,2710343.0,,5.0,,1498765000.0


In [15]:
rating.dtypes
student.dtypes

id_student                   int64
gender                      object
Category                     int64
Area                        object
Degree                      object
Course                     float64
Time zone                   object
Availability time start     object
Availability time end       object
dtype: object

In [16]:
rating_agg = rating.groupby('Mentor').Rating.agg(['count','mean']).reset_index()
#rating_agg.sort_values('count',ascending=False).head(5)
#rating_agg[rating_agg['Mentor']==600175]
#rating[rating['Mentor']==600175]

# Merge average rating and count of ratings to student data
mentor_agg = student.merge(rating_agg,left_on='id_student',right_on='Mentor').reset_index()
mentor_agg = mentor_agg.drop(['id_student','gender'],axis=1)

#Filter out only mentor data
#mentor_agg = mentor_agg[mentor_agg['Category'].isin([1,3])]

mentor_agg.sort_values('count',ascending=False).head(5)

Unnamed: 0,index,Category,Area,Degree,Course,Time zone,Availability time start,Availability time end,Mentor,count,mean
1993,1993,2,Materials Science,PhD,,Antarctica,17:00,23:00,600175,4,2.5
1992,1992,3,Chemical Engineering,PhD,,Europe,19:00,23:00,600175,4,2.5
1397,1397,3,Materials Science,Graduate,,Europe,19:00,23:00,598662,4,3.75
1398,1398,3,IT,Graduate,,Africa,17:00,23:00,598662,4,3.75
1417,1417,1,Geology,MS,,Antarctica,11:00,15:00,603791,4,4.625


In [17]:
#Merge duplicate mentor records and create extended list of areas

In [18]:
#Create a new column with skillsets - courses, area, major
mentor_agg['Expertise'] = mentor_agg['Area'] + " " + mentor_agg['Degree'] #+mentor_agg['Course']

mentor_agg.head(5)

Unnamed: 0,index,Category,Area,Degree,Course,Time zone,Availability time start,Availability time end,Mentor,count,mean,Expertise
0,0,1,Math,PhD,,North America,20:00,23:00,45462,1,3.5,Math PhD
1,1,1,IT,MS,,Europe,11:00,13:00,65002,1,3.0,IT MS
2,2,3,Biology,PhD,,Antarctica,9:00,13:00,65002,1,3.0,Biology PhD
3,3,3,IT,Graduate,,Africa,16:00,18:00,74372,1,2.5,IT Graduate
4,4,2,IT,Graduate,,Africa,14:00,19:00,110175,1,3.5,IT Graduate


In [19]:
#Assigning vectors to Expertise text 
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer()

#Fitting TF-IDF on the Expertise column
tfv_matrix = tfv.fit_transform(mentor_agg['Expertise'])
tfv_matrix.shape

(4469, 22)

In [20]:
#Finding similarity between vectors using sigmoid kernel
from sklearn.metrics.pairwise import sigmoid_kernel

sig = sigmoid_kernel(tfv_matrix,tfv_matrix)
sig[0]

array([0.78003289, 0.76159416, 0.7650908 , ..., 0.77387252, 0.78003289,
       0.76807446])

In [23]:
#Index mapping of student in matrix
indices = pd.Series(mentor_agg.index,index=mentor_agg['Mentor']).drop_duplicates()
indices.head(5)


Mentor
45462     0
65002     1
65002     2
74372     3
110175    4
dtype: int64

In [39]:
def find_mentor(student_id,sig = sig):
    id = indices[student_id]

    sig_score = list(enumerate(sig[id]))
    sig_score = sorted(sig_score, key = lambda x:x[1], reverse=True)

    sig_score = sig_score[1:5]

    mentors = [i[0] for i in sig_score]

    return mentor_agg['Mentor'].iloc[mentors]

In [40]:
find_mentor(45462)

8      148993
128    400469
141    465730
188    530065
Name: Mentor, dtype: int64