In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as du
import torch.nn.functional as F

from torch.utils import data
from collections import defaultdict
from torch.utils.data import Dataset
from torchvision import datasets, transforms

In [2]:
# Student table: fetched as CSV from the Instantutor-Research GitHub repository
url_student = 'https://raw.githubusercontent.com/Instantutor/Instantutor-Research/main/Datasets/Data.csv'
student = pd.read_csv(filepath_or_buffer=url_student)

# Preview the first five rows
student.head(5)

Unnamed: 0,id_student,gender,Category(Mentor/Mentee/Both),Area,Degree,Course,Time zone,Availability time start,Availability time end
0,11391,M,1,IT,Undergraduate,,Asia,9:00,14:00
1,28400,F,3,IT,Graduate,,Pacific,16:00,18:00
2,30268,F,2,IT,PhD,,Europe,12:00,15:00
3,31604,F,1,IT,PhD,,Central,14:00,23:00
4,32885,F,3,IT,Undergraduate,,Europe,20:00,23:00


In [3]:
# Drop the columns the recommender does not use yet. 'Course' is dropped for now;
# in the future it could be matched against a tutor's speciality with a dot product.
#
# The frame is rebound (no inplace=True) and errors='ignore' is passed so this
# cell can be re-run safely: a second execution finds the columns already gone
# and is a no-op instead of raising KeyError.
student = student.drop(
    columns=['Category(Mentor/Mentee/Both)', 'Degree', 'Course'],
    errors='ignore',
)
student.head()

Unnamed: 0,id_student,gender,Area,Time zone,Availability time start,Availability time end
0,11391,M,IT,Asia,9:00,14:00
1,28400,F,IT,Pacific,16:00,18:00
2,30268,F,IT,Europe,12:00,15:00
3,31604,F,IT,Central,14:00,23:00
4,32885,F,IT,Europe,20:00,23:00


In [4]:
# Rating table: fetched as CSV from the Instantutor-Research GitHub repository
url_rating = 'https://raw.githubusercontent.com/Instantutor/Instantutor-Research/main/Datasets/Rating.csv'
rating = pd.read_csv(filepath_or_buffer=url_rating)

# Preview the first five rows
rating.head(5)

Unnamed: 0,Mentor,Mentee,Rating,Comments,Time stamp
0,366449,,1.0,,1425941529
1,1677677,,4.5,,1425942435
2,553755,,5.0,,1425941523
3,611182,,5.0,,1425941546
4,646184,,5.0,,1425941556


In [5]:
# Drop the columns the recommender does not use yet. In the future, sentiment
# analysis on 'Comments' could be used to create a +/- adjustment value.
#
# The frame is rebound (no inplace=True) and errors='ignore' is passed so this
# cell can be re-run safely: a second execution finds the columns already gone
# and is a no-op instead of raising KeyError.
rating = rating.drop(
    columns=['Mentee', 'Time stamp', 'Comments'],
    errors='ignore',
)
rating.head()

Unnamed: 0,Mentor,Rating
0,366449,1.0
1,1677677,4.5
2,553755,5.0
3,611182,5.0
4,646184,5.0


In [6]:
# Show how many students fall into each subject area and each time zone
print(
    student['Area'].value_counts(),
    student['Time zone'].value_counts(),
    sep='\n',
)

IT                      3338
Chemistry               3299
Math                    3265
Chemical Engineering    3231
Materials Science       3182
History                 3155
Geology                 3151
Computer Science        3133
Aerospace               3131
Physics                 3114
French                   130
Arabic                   118
Science                  102
English                   90
Biology                   60
Spanish                   50
Quran                     44
Name: Area, dtype: int64
Asia             4162
Europe           4096
Africa           4095
Australia        4082
Pacific          4064
North America    4040
Central          4034
Antarctica       4020
Name: Time zone, dtype: int64


In [7]:
# Collect every label each synthetic tutor column is allowed to take.
# unique() returns labels in order of first appearance in the student table.
possible_areas = student['Area'].unique()
possible_zones = student['Time zone'].unique()
possible_start = student['Availability time start'].unique()
possible_end = student['Availability time end'].unique()
possible_genders = ['F', 'M']

# Row counts of both tables, read from the data rather than hard-coded
student_rows = student['Area'].size
rating_rows = rating['Mentor'].size

# Empirical distribution of 'Area'.
# value_counts() skips missing values, so normalise by the sum of the counts
# (not by the raw row count) to guarantee the probabilities add up to 1.
# NOTE: the result is ordered by descending frequency, which is NOT the order
# of possible_areas; pair these probabilities with distribution.index.
spreads = student['Area'].value_counts()
distribution = spreads.div(spreads.sum())
print(distribution)

IT                      0.102415
Chemistry               0.101218
Math                    0.100175
Chemical Engineering    0.099132
Materials Science       0.097628
History                 0.096800
Geology                 0.096677
Computer Science        0.096125
Aerospace               0.096064
Physics                 0.095542
French                  0.003989
Arabic                  0.003620
Science                 0.003130
English                 0.002761
Biology                 0.001841
Spanish                 0.001534
Quran                   0.001350
Name: Area, dtype: float64


In [8]:
# Generate synthetic tutor features, one value per rating row.
# Only Area generation is weighted; the other columns are sampled uniformly.
# NOTE: no random seed is set, so every run produces different synthetic data.
rating['gender'] = np.random.choice(possible_genders, rating_rows)

# Sample the labels from distribution.index so that every area is paired with
# its own probability. possible_areas is in first-appearance order while
# distribution is in frequency order, so combining those two would assign the
# probabilities to the wrong areas.
rating['Area'] = np.random.choice(distribution.index.to_numpy(), rating_rows, p = distribution.values)

rating['Time zone'] = np.random.choice(possible_zones, rating_rows)

# Start and end are drawn independently, so an end time can fall before the
# start time within the same row.
rating['Availability time start'] = np.random.choice(possible_start, rating_rows)
rating['Availability time end'] = np.random.choice(possible_end, rating_rows)

#view the new ratings data
rating.head()

Unnamed: 0,Mentor,Rating,gender,Area,Time zone,Availability time start,Availability time end
0,366449,1.0,F,History,Australia,13:00,11:00
1,1677677,4.5,F,Quran,North America,17:00,19:00
2,553755,5.0,M,French,Pacific,18:00,19:00
3,611182,5.0,M,Math,North America,15:00,20:00
4,646184,5.0,M,English,Pacific,12:00,13:00


In [9]:
# Since we are only comparing similarity, the rating is kept out of the encoder;
# recommended tutors can simply be sorted by their rating at the end.

# Store mentor -> list of ratings as a dictionary for later lookup.
# The guard keeps the cell re-runnable: once 'Rating' has been dropped, the
# dictionary built on the first run is left untouched instead of raising KeyError.
if 'Rating' in rating.columns:
    Mentor_to_rating = rating.groupby('Mentor')['Rating'].apply(list).to_dict()

# Drop the rating column (rebinding instead of inplace; no-op if already dropped)
rating = rating.drop(columns='Rating', errors='ignore')

# Display the new tutor data
rating.head()

Unnamed: 0,Mentor,gender,Area,Time zone,Availability time start,Availability time end
0,366449,F,History,Australia,13:00,11:00
1,1677677,F,Quran,North America,17:00,19:00
2,553755,M,French,Pacific,18:00,19:00
3,611182,M,Math,North America,15:00,20:00
4,646184,M,English,Pacific,12:00,13:00


In [10]:
# Preview the student table (first five rows)
student.head(5)

Unnamed: 0,id_student,gender,Area,Time zone,Availability time start,Availability time end
0,11391,M,IT,Asia,9:00,14:00
1,28400,F,IT,Pacific,16:00,18:00
2,30268,F,IT,Europe,12:00,15:00
3,31604,F,IT,Central,14:00,23:00
4,32885,F,IT,Europe,20:00,23:00


In [18]:
# Feature count per table: every column except the user/tutor id column
user_features = len(student.columns) - 1
tutor_features = len(rating.columns) - 1

# Number of distinct students and distinct tutors
num_user = len(pd.unique(student['id_student']))
num_tutors = len(pd.unique(rating['Mentor']))

print(user_features, tutor_features, num_user, num_tutors)

5 5 28785 3607


In [None]:
#TODO: convert this information to an embedding and concatenate it with a word2vec embedding of each user's list of tutors.
#We do not need to train embeddings for the current data; we can simply convert it into tensors and append them to the end of our learned word embeddings.

In [11]:
class MLP(nn.Module):
    '''
    A multi layer perceptron with one hidden layer and a dropout layer.

    The forward pass is: Linear(in_dim -> hidden_dim) -> ReLU ->
    Linear(hidden_dim -> out_dim) -> Dropout.

    Args:
        in_dim: size of the last dimension of the input tensor.
        hidden_dim: width of the hidden layer.
        out_dim: size of the last dimension of the output tensor.
        dropout_p: probability of zeroing an output element during training.
            Defaults to 0.12, the value that was previously hard-coded.
    '''
    def __init__(self, in_dim, hidden_dim, out_dim, dropout_p=0.12):
        super().__init__()

        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        '''
        Map a tensor whose last dimension is in_dim to one whose last
        dimension is out_dim.
        '''
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        # NOTE(review): dropout is applied to the final output rather than to the
        # hidden activations; it is only active in training mode (identity after
        # .eval()). Placement kept as-is to preserve training behaviour - confirm
        # this is intended.
        x = self.dropout(x)

        return x

In [12]:
class Encoder(nn.Module):
    '''
    This class creates and learns an embedding representation for
    both tutors and users.

    Each id is looked up in its own embedding table, both embeddings are passed
    through one shared MLP, and the score of a (user, tutor) pair is the dot
    product of the two MLP outputs (a raw logit, no sigmoid applied).

    Args:
        num_user: number of rows in the user embedding table.
        user_features: width of each user embedding.
        num_tutors: number of rows in the tutor embedding table.
        tutor_features: width of each tutor embedding.
        in_dim: input width of the shared MLP; must equal both
            user_features and tutor_features.
        hidden_dim: hidden width of the shared MLP.
        out_dim: output width of the shared MLP.

    Raises:
        ValueError: if user_features, tutor_features and in_dim are not all equal.

    Note:
        nn.Embedding expects integer indices in [0, num_user) and
        [0, num_tutors). Raw ids from the data tables must be remapped to
        contiguous indices before being passed to forward().
    '''
    def __init__(self, num_user, user_features, num_tutors, tutor_features, in_dim, hidden_dim, out_dim):
        super().__init__()

        # The same MLP consumes both embeddings, so a width mismatch can never
        # work; fail here with a clear message instead of deep inside forward().
        if not (user_features == tutor_features == in_dim):
            raise ValueError(
                'user_features, tutor_features and in_dim must all be equal, got '
                f'{user_features}, {tutor_features} and {in_dim}'
            )

        #initialize the embeddings
        self.user_embedding = nn.Embedding(num_user, user_features)
        self.tutor_embedding = nn.Embedding(num_tutors, tutor_features)

        #define an mlp shared by users and tutors
        self.mlp = MLP(in_dim, hidden_dim, out_dim)

    def forward(self, user_id, tutor_id):
        '''
        Score user/tutor pairs.

        Args:
            user_id: integer tensor of user indices.
            tutor_id: integer tensor of tutor indices; the two MLP outputs
                are multiplied element-wise, so the shapes must be
                broadcast-compatible.

        Returns:
            Tensor of logits with the same shape as the (broadcast) id tensors.
        '''
        #obtain embeddings of the user and tutor
        embed_u = self.user_embedding(user_id)
        embed_t = self.tutor_embedding(tutor_id)

        #feed both embeddings to the MLP
        output_u = self.mlp(embed_u)
        output_t = self.mlp(embed_t)

        #Instead of taking sigmoid here, we can just find the loss with logits.
        #Reduce over the last (feature) axis so scalar ids and multi-dimensional
        #id batches work too; for a 1-D batch this equals the former dim = 1.
        output = torch.sum(output_u * output_t, dim = -1)
        return output