In [1]:
import numpy as np
import pandas as pd
import torch
import sklearn
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as du
import torch.nn.functional as F

from tqdm import tqdm
from torch.utils import data
from collections import defaultdict
from torch.utils.data import Dataset
from torchvision import datasets, transforms

In [2]:
#Obtain student data from the github
url_student = 'https://raw.githubusercontent.com/Instantutor/Instantutor-Research/main/Datasets/Data.csv'
student = pd.read_csv(url_student)

#Take a look at the first 5 rows
student.head()

Unnamed: 0,id_student,gender,Category(Mentor/Mentee/Both),Area,Degree,Course,Time zone,Availability time start,Availability time end
0,11391,M,1,IT,Undergraduate,,Asia,9:00,14:00
1,28400,F,3,IT,Graduate,,Pacific,16:00,18:00
2,30268,F,2,IT,PhD,,Europe,12:00,15:00
3,31604,F,1,IT,PhD,,Central,14:00,23:00
4,32885,F,3,IT,Undergraduate,,Europe,20:00,23:00


In [3]:
#Drop unnecessary info, for now, we will drop course. In the future, we can simply do a dot product of course and a tutor's speciality.
student.drop(['Category(Mentor/Mentee/Both)', 'Degree', 'Course'], axis = 1, inplace = True)
student.head()

Unnamed: 0,id_student,gender,Area,Time zone,Availability time start,Availability time end
0,11391,M,IT,Asia,9:00,14:00
1,28400,F,IT,Pacific,16:00,18:00
2,30268,F,IT,Europe,12:00,15:00
3,31604,F,IT,Central,14:00,23:00
4,32885,F,IT,Europe,20:00,23:00


In [4]:
#Obtain rating data from the github
url_rating = 'https://raw.githubusercontent.com/Instantutor/Instantutor-Research/main/Datasets/Rating.csv'
rating = pd.read_csv(url_rating)

#Take a look at the first 5 rows
rating.head()

Unnamed: 0,Mentor,Mentee,Rating,Comments,Time stamp
0,366449,,1.0,,1425941529
1,1677677,,4.5,,1425942435
2,553755,,5.0,,1425941523
3,611182,,5.0,,1425941546
4,646184,,5.0,,1425941556


In [5]:
#Drop Unnecessary Information, in the future we can find do sentiment analysis on comments to create a +- value.
rating.drop(['Mentee', 'Time stamp', 'Comments'], axis = 1, inplace = True)
rating.head()

Unnamed: 0,Mentor,Rating
0,366449,1.0
1,1677677,4.5
2,553755,5.0
3,611182,5.0
4,646184,5.0


In [6]:
#View the Distribution of Data
print(student['Area'].value_counts())
print(student['Time zone'].value_counts())

IT                      3338
Chemistry               3299
Math                    3265
Chemical Engineering    3231
Materials Science       3182
History                 3155
Geology                 3151
Computer Science        3133
Aerospace               3131
Physics                 3114
French                   130
Arabic                   118
Science                  102
English                   90
Biology                   60
Spanish                   50
Quran                     44
Name: Area, dtype: int64
Asia             4162
Europe           4096
Africa           4095
Australia        4082
Pacific          4064
North America    4040
Central          4034
Antarctica       4020
Name: Time zone, dtype: int64


In [7]:
#Obtain all possible labels for each column to be generated
possible_areas = student['Area'].unique()
possible_zones = student['Time zone'].unique()
possible_times = ["0:00", "1:00", "2:00", "3:00", "4:00", "5:00", "6:00", "7:00", "8:00", "9:00", "10:00", "11:00", "12:00", "13:00", "14:00", "15:00", "16:00", "17:00", "18:00", "19:00", "20:00", "21:00", "22:00", "23:00"] 
possible_genders = ['F', 'M']

#3999 rows

student_rows = student['Area'].size
rating_rows = rating['Mentor'].size

#obtain the distribution
spreads = student['Area'].value_counts()
distribution = spreads.div(student_rows)
print(distribution)

IT                      0.102415
Chemistry               0.101218
Math                    0.100175
Chemical Engineering    0.099132
Materials Science       0.097628
History                 0.096800
Geology                 0.096677
Computer Science        0.096125
Aerospace               0.096064
Physics                 0.095542
French                  0.003989
Arabic                  0.003620
Science                 0.003130
English                 0.002761
Biology                 0.001841
Spanish                 0.001534
Quran                   0.001350
Name: Area, dtype: float64


In [8]:
# Only Area generation is weighted. Other data generation will be uniformly distributed.
rating['gender'] = np.random.choice(possible_genders, rating_rows)
rating['Area'] = np.random.choice(possible_areas, rating_rows, p = distribution.values)
rating['Time zone'] = np.random.choice(possible_zones, rating_rows)
rating['Availability time start'] = np.random.choice(possible_times, rating_rows)
rating['Availability time end'] = np.random.choice(possible_times, rating_rows)

#view the new ratings data
rating.head()

Unnamed: 0,Mentor,Rating,gender,Area,Time zone,Availability time start,Availability time end
0,366449,1.0,M,IT,North America,9:00,14:00
1,1677677,4.5,F,Math,Antarctica,5:00,15:00
2,553755,5.0,F,French,Antarctica,21:00,6:00
3,611182,5.0,M,Quran,Australia,0:00,14:00
4,646184,5.0,M,Materials Science,Central,14:00,7:00


In [9]:
#Since we are only comparing similarity, we can exclude rating from the encoder and simply sort recommended tutors by their rating at the end.

#Store the mentor:rating pairs as a dictionary for future access
Mentor_to_rating = rating.groupby('Mentor')['Rating'].apply(list).to_dict()

#Drop the rating column
rating.drop('Rating', axis = 1, inplace = True)

We will use a classification task to train our embeddings. To do so, we will create positive pairs using collaborative filtering
and randomly sampled negative pairs. For each user, we want to create 3 positive samples and 15 negative samples. We can choose to pass these as batches.

In [10]:
#First, we want to convert the features of each student and tutor to numerical values.
gender_to_val = {possible_genders[i]:i for i in range(len(possible_genders))}
area_to_val = {possible_areas[i]:i for i in range(len(possible_areas))}
time_to_val = {possible_zones[i]:i for i in range(len(possible_zones))}
avail_to_val = {possible_times[i]:i  for i in range(len(possible_times))}

print(f"Genders: {gender_to_val}\nAreas: {area_to_val}\nTimes: {time_to_val}\nAvailabilities: {avail_to_val}")

Genders: {'F': 0, 'M': 1}
Areas: {'IT': 0, 'Math': 1, 'Arabic': 2, 'Science': 3, 'English': 4, 'Quran': 5, 'Spanish': 6, 'French': 7, 'History': 8, 'Biology': 9, 'Chemistry': 10, 'Geology': 11, 'Chemical Engineering': 12, 'Computer Science': 13, 'Physics': 14, 'Materials Science': 15, 'Aerospace': 16}
Times: {'Asia': 0, 'Pacific': 1, 'Europe': 2, 'Central': 3, 'North America': 4, 'Australia': 5, 'Antarctica': 6, 'Africa': 7}
Availabilities: {'0:00': 0, '1:00': 1, '2:00': 2, '3:00': 3, '4:00': 4, '5:00': 5, '6:00': 6, '7:00': 7, '8:00': 8, '9:00': 9, '10:00': 10, '11:00': 11, '12:00': 12, '13:00': 13, '14:00': 14, '15:00': 15, '16:00': 16, '17:00': 17, '18:00': 18, '19:00': 19, '20:00': 20, '21:00': 21, '22:00': 22, '23:00': 23}


In [11]:
#Replace any non numerical values according to the dictionary.
student = student.replace(gender_to_val)
student = student.replace(area_to_val)
student = student.replace(time_to_val)
student = student.replace(avail_to_val)

#visualize the user data
student.head()

Unnamed: 0,id_student,gender,Area,Time zone,Availability time start,Availability time end
0,11391,1,0,0,9,14
1,28400,0,0,1,16,18
2,30268,0,0,2,12,15
3,31604,0,0,3,14,23
4,32885,0,0,2,20,23


In [12]:
rating = rating.replace(gender_to_val)
rating = rating.replace(area_to_val)
rating = rating.replace(time_to_val)
rating = rating.replace(avail_to_val)

#visualize the rating data
rating.head()

Unnamed: 0,Mentor,gender,Area,Time zone,Availability time start,Availability time end
0,366449,1,0,4,9,14
1,1677677,0,1,6,5,15
2,553755,0,7,6,21,6
3,611182,1,5,5,0,14
4,646184,1,15,3,14,7


In [13]:
#Make a mapping of users and tutors to indicies. This way, we can keep track of the books. Also create a reverse mapping.
list_tutors = rating['Mentor'].tolist()
list_users = student['id_student'].tolist()

#create mappings for users
idx_to_user = {index: user for (index, user) in enumerate(list_users)}
user_to_idx = {user: index for (index, user) in enumerate(list_users)}

#create mappings for tutors
idx_to_tutor = {index: tutor for (index, tutor) in enumerate(list_tutors)}
tutor_to_idx = {tutor: index for (index, tutor) in enumerate(list_tutors)}

In [14]:
#We do not want repeat tutors or students to be overrepresented in the current data.
#Remove duplicate student ids or tutor ids.
duplicate_users = student[student.duplicated(subset = 'id_student', keep = 'first')]
duplicate_tutors = rating[rating.duplicated(subset = 'Mentor', keep = 'first')]

#drop the duplicate students and users
student.drop(index = duplicate_users.index, inplace = True)
rating.drop(index = duplicate_tutors.index, inplace = True)

#Since we have a mapping now, we can remove the ids from the dataframe, set inplace to false so that a new copy of the df is made.
final_user = student.drop('id_student', axis = 1, inplace = False)
final_tutor = rating.drop('Mentor', axis = 1, inplace = False)

Then, we can generate positive and negative samples through collaborative filtering.
When selecting positive samples, cosine similarity will be weighted based on the following criteria:
* Gender: 5%
* Area: 50%
* Time Zone: 20%
* Availability Time Start: 20%
* Availability Time End: 5%

In [15]:
weights = [0.05, 0.50, 0.20, 0.20, 0.05]

In [16]:
def weighted_Norm(user, tutor, weights):
    '''
    Calculate a weighted euclidean distance given a user and a tutor.
    Since we are using distance, we can simply set student rating to 1 as default and ratings from 0 to 5 as 0 - 1 respectively.
    '''
    dist = user-tutor
    weighted_dist = np.sqrt((weights*dist*dist).sum())
    return weighted_dist

In [17]:
class find_Positives:
    def __init__(self, tutors, pos_samples, neg_samples, weights):
        '''
        weights - The weights for each feature of a tutor when deciding a similarity score. the weight of mentor must be 0.
        pos_samples - the number of positive samples per user
        neg_samples - the number of negative samples per user
        tutors - tutor data and their features
        '''
        self.tutors = tutors
        self.num_pos = pos_samples
        self.num_neg = neg_samples
        self.weights = weights
    
    def get_pairs(self, user):
        ''' given a user, generate positive and negative pairs for that user '''
        
        #use weghted euclidean distance to generate num_pos positive samples.
        distances = [weighted_Norm(user, tutor, self.weights) for tutor in self.tutors]
        #obtain indicies of the lowest num_pos distances in linear time.
        pos_pairs = np.argpartition(distances, self.num_pos)[:self.num_pos]
        
        #generate num_neg negative samples that are not part of the positive samples. We can also randomly sample them from the x highest euclidean distance tutors. 
        t_ind = np.arange(len(tutors))
        t_ind = np.delete(t_ind, pos_pairs)
        neg_pairs = np.random.choice(t_ind, self.num_neg)
        
        #generate an array of labels.
        labels = np.zeros(self.num_pos + self.num_neg)
        labels[0:self.num_pos] = 1
        return pos_pairs, neg_pairs, labels

In [18]:
class UserTutorDataset(torch.utils.data.Dataset):
    def __init__(self, pos_samples, neg_samples, users, tutors, weights):
        '''
        pos_samples - The number of positive samples to create per user.
        neg_samples - The number of negative samples to create per user.
        users - A dataset consisting of users and their attributes(gender, area, timezone, availability times).
        tutors - A dataset consisting of tutors and their attributes.
        weights - An array-like of weights for each attribute of a user/tutor. This gives us more information on which attributes are more important.
        '''
        super(UserTutorDataset, self).__init__()
        
        self.pos_samples = pos_samples
        self.neg_samples = neg_samples
        self.users = users
        self.tutors = tutors
        self.weights = weights
        
        self.create_pairs = find_Positives(self.tutors, self.pos_samples, self.neg_samples, self.weights)
        
    def __len__(self):
        '''
        Return the total number of users.
        '''
        return len(self.users)
    
    def __getitem__(self, idx):
        '''
        Return a tensor of positive and negative tutor samples along with a tensor of targets given a user index.
        '''
        pos_pairs, neg_pairs, labels = self.create_pairs.get_pairs(self.users[idx])
        pos_pairs = torch.tensor(pos_pairs)
        neg_pairs = torch.tensor(neg_pairs)
        total_pairs = torch.cat((pos_pairs, neg_pairs), dim = 0)
        labels = torch.tensor(labels)
        
        #create a tensor of targets(user)
        targets = torch.full(total_pairs.shape, idx)
        return targets, total_pairs, labels

In [19]:
def visualize_embeddings(embeddings):
    '''
    Given a learned embedding, display a tsne-visualization of the embeddings.
    '''

In [20]:
class MLP(nn.Module):
    '''
    A multi layer perceptron with one hidden layer and a dropout layer
    '''
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(MLP, self).__init__()
        
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.dropout = nn.Dropout(0.12)
        
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = self.dropout(x)
        
        return x

In [21]:
class TwoTower(nn.Module):
    '''
    This class creates and learns an embedding representation for
    both tutors and users.
    '''
    def __init__(self, num_user, user_features, num_tutors, tutor_features, in_dim, hidden_dim, out_dim):
        super(TwoTower, self).__init__()
        
        #initialize the embeddings
        self.user_embedding = nn.Embedding(num_user, user_features)
        self.tutor_embedding = nn.Embedding(num_tutors, tutor_features)
        
        #define an mlp, we can optionally feed the embeddings to a mlp first.
        self.mlp = MLP(in_dim, hidden_dim, out_dim)
        
    def forward(self, user_idx, tutor_idx):
        
        #obtain embeddings of the user and tutor
        embed_u = self.user_embedding(user_idx)
        embed_t = self.tutor_embedding(tutor_idx)
        
        #Instead of taking sigmoid here, we can just find the loss with logits, might need to weigh this.
        output = torch.sum(embed_u * embed_t, dim = 2)
        return output

Begin the Classification Task to train the embeddings.
* All duplicate tutors and users have been removed from their respective dataframes.
* We can include the number of times a tutor has helped a student as another feature in the dataframe if necessary.

In [22]:
#Set all parameters
num_pos_samples = 3
num_neg_samples = 12
users = final_user.to_numpy()
tutors = final_tutor.to_numpy()
feat_weights = weights

#All users and tutors are unique. Therefore, we can just use the shape.
num_users = final_user.shape[0]
num_tutors = final_tutor.shape[0]
num_user_features = final_user.shape[1]
num_tutor_features = final_tutor.shape[1]
in_dim = 5
hidden_dim = 64
out_dim = 15

In [23]:
#pick the device
device = f'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f"using device: {device}")

using device: cuda:0


In [24]:
#Initialize the Model and Optimizer.
epochs = 8
learning_rate = 0.041
batch_size = 64

#create the train data and load it into the dataloader
train_data = UserTutorDataset(num_pos_samples, num_neg_samples, 
                              users, tutors, feat_weights)
train_loader = du.DataLoader(dataset = train_data, batch_size = batch_size, shuffle = True)

model = TwoTower(num_users, num_user_features, num_tutors, 
                 num_tutor_features, in_dim, hidden_dim, out_dim)
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

#send the model to the device and set the mode to train.
model = model.to(device)
model.train()

TwoTower(
  (user_embedding): Embedding(28785, 5)
  (tutor_embedding): Embedding(3607, 5)
  (mlp): MLP(
    (fc1): Linear(in_features=5, out_features=64, bias=True)
    (fc2): Linear(in_features=64, out_features=15, bias=True)
    (dropout): Dropout(p=0.12, inplace=False)
  )
)

In [25]:
#define the loss function
loss_func = nn.BCEWithLogitsLoss()

#train the model and learn the embeddings
for epoch in range(1, epochs+1):
    sum_loss = 0.
    for batch_idx, (targets, total_pairs, labels) in enumerate(tqdm(train_loader)):
        targets, total_pairs, labels = targets.to(device), total_pairs.to(device), labels.to(device)
        
        #zero out prevous gradients
        model.zero_grad()
        
        #predict with the model
        prediction = model(targets, total_pairs)

        #calculate loss
        loss = loss_func(prediction, labels.float())
        sum_loss += loss.item()
        
        #backpropagate and step.
        loss.backward()
        optimizer.step()
    
    
    sum_loss /= (len(train_loader)*15) #Divide by the total number of samples.
    print(f"Epoch: {epoch}/{epochs}, Loss: {sum_loss:.6f}")

100%|██████████| 450/450 [11:51<00:00,  1.58s/it]


Epoch: 1/8, Loss: 0.056711


100%|██████████| 450/450 [11:51<00:00,  1.58s/it]


Epoch: 2/8, Loss: 0.049133


100%|██████████| 450/450 [11:52<00:00,  1.58s/it]


Epoch: 3/8, Loss: 0.040236


100%|██████████| 450/450 [11:52<00:00,  1.58s/it]


Epoch: 4/8, Loss: 0.012574


100%|██████████| 450/450 [11:52<00:00,  1.58s/it]


Epoch: 5/8, Loss: 0.009024


100%|██████████| 450/450 [11:52<00:00,  1.58s/it]


Epoch: 6/8, Loss: 0.007635


100%|██████████| 450/450 [11:52<00:00,  1.58s/it]


Epoch: 7/8, Loss: 0.006647


100%|██████████| 450/450 [11:52<00:00,  1.58s/it]

Epoch: 8/8, Loss: 0.006134





In [26]:
#TODO: create a word2vec embedding of each user's list of tutors and concatenate it to the learned embeddings
#and learn both ambeddings together in a two tower model.
#Also convert the dataset to an iterable dataset and pass in blocks.
#Refactor code to increase training speed.

In [27]:
#Iterable Dataset for Generating positive and negative pairs in batches
class tutor2Vec(torch.utils.data.IterableDataset):
    def __init__(self, pos_samples, neg_samples, users, block_size, workers, weights):
        '''
        pos_samples - The number of positive samples to create per user
        neg_samples - The number of negative samples to create per user
        users - A dataset consisting of users and their attributes(gender, area, timezone, availability times)
        block_size - The size of each block to be yielded to the dataloader(total positive and neg samples in a block)
        workers - The number of workers to use while creating blocks.
        '''
        super(self, tutor2Vec).__init__()
        
        self.pos_samples = pos_samples
        self.neg_samples = neg_samples
        self.users = users
        self.block_size = block_size
        self.workers = (abs(workers) + workers)/2 #sets num workers to 1 if negative.
    
    def generate_block(self):
        
    
    def __len__(self):
        '''
        return the approximate number of pairs per block.
        '''
        total_pairs = len(self.users)*18
        pairs_per_block = total_pairs/self.block_size
        return pairs_per_block
        
    def __iter__(self):
        '''
        Return positive and negative pairs for one user in the dataset from the block.
        '''
        current_worker = worker.get_worker_info()
        if worker is None:
            worker_id = 0
            num_workers = 1
        else:
            worker_id = current_worked.id
            num_workers = current_worker.num_workers

IndentationError: expected an indented block (4042133448.py, line 22)