In [2]:
# base imports
import pandas as pd
import numpy as np

In [16]:
# Column met_o had values 1/2. It needs to be changed to 0/1.
# The CSV is rewritten in place, so the shift is applied only while the column
# still exceeds 1; re-running this cell would otherwise push the values to -1/0.
# NOTE(review): assumes the original coding really contains the value 2 - confirm.
df = pd.read_csv('./data_for_matrix.csv')
if df['met_o'].max() > 1:
    df['met_o'] = df['met_o'] - 1
    df.to_csv('data_for_matrix.csv', index=False)

### Below you can see code that prepares data for the basic matrix factorization.
Here in the base matrix we only have information about match. So we need a set of vectors, where each vector describes each date (holds both ids and information about match).

The task will require two base matrices.
1. Matrix where men are "users" and women are "products". It will then be used to recommend women to men because matrix will say what's the predicted rating of a woman in eyes of man. Basically it will answer the question: **"How likely is that a man will like a woman?"**. Let's call this matrix/data frame **"men_like_women"**.
1. Matrix where women are "users" and men are "products". It will then be used to recommend men to women because matrix will say what's the predicted rating of a man in eyes of woman. Basically it will answer the question: **"How likely is that a woman will like a man?"**. Let's call this matrix/data frame **"women_like_men"**.

Why such analogies? They may help to understand how this human-relations task translates into the world of recommender systems.

These matrices will be used to train models (with matrix factorization) which will then be saved into csv files.

In [3]:
# Build the two record lists described above: one record per date, routed by
# the `gender` value of the row.
base_df = pd.read_csv('./data_for_matrix.csv')
men_like_women_data = []
women_like_men_data = []

for _, date_row in base_df.iterrows():
    # gender == 0 feeds women_like_men; every other value feeds men_like_women.
    target = women_like_men_data if date_row['gender'] == 0 else men_like_women_data
    target.append({
        'id': date_row['iid'],
        'pid': date_row['pid'],
        'match': date_row['match'],
    })

men_like_women_df = pd.DataFrame(men_like_women_data)
women_like_men_df = pd.DataFrame(women_like_men_data)

print("men_like_women_df:", men_like_women_df, sep="\n")
print("\nwomen_like_men_df:", women_like_men_df, sep="\n")

men_like_women_df:
         id    pid  match
0      11.0    1.0    0.0
1      11.0    2.0    0.0
2      11.0    3.0    0.0
3      11.0    4.0    0.0
4      11.0    5.0    0.0
...     ...    ...    ...
4179  552.0  526.0    0.0
4180  552.0  527.0    0.0
4181  552.0  528.0    0.0
4182  552.0  529.0    0.0
4183  552.0  530.0    0.0

[4184 rows x 3 columns]

women_like_men_df:
         id    pid  match
0       1.0   11.0    0.0
1       1.0   12.0    0.0
2       1.0   13.0    1.0
3       1.0   14.0    1.0
4       1.0   15.0    1.0
...     ...    ...    ...
4179  530.0  548.0    0.0
4180  530.0  549.0    0.0
4181  530.0  550.0    0.0
4182  530.0  551.0    0.0
4183  530.0  552.0    0.0

[4184 rows x 3 columns]


## Let's prepare train and test data sets

### Make ids contiguous
We need to change the data. As you can see ids are not contiguous and they must be. So let's change that.

In [4]:
def proc_col(column):
    """Encode a pandas column with contiguous integer ids.

    Distinct non-missing values are numbered 0, 1, 2, ... in order of first
    appearance. Missing values (NaN/None) receive the sentinel -1 and do not
    consume an id, so the non-negative codes are always gap-free.

    Args:
        column: pandas Series of raw ids.

    Returns:
        numpy integer array holding one code per element of ``column``.
    """
    # pd.factorize numbers values by first appearance and marks missing
    # values with -1, which is exactly the contract callers filter on.
    codes, _ = pd.factorize(column)
    return codes


def encode_data(df):
    """Return a copy of ``df`` with contiguous person (``id``) and partner (``pid``) ids.

    Rows with a missing ``id`` or ``pid`` are removed before encoding.
    Filtering after encoding (as was done previously) could remove the only
    rows of an already-encoded id and leave a hole in the id range.

    Args:
        df: DataFrame with at least the columns ``id`` and ``pid``.

    Returns:
        A new DataFrame (original index labels kept) whose ``id`` and ``pid``
        columns hold the codes produced by ``proc_col``.
    """
    # Drop incomplete rows first so both columns are encoded on the final row set.
    df = df.dropna(subset=["id", "pid"]).copy()
    for col_name in ["id", "pid"]:
        df[col_name] = proc_col(df[col_name])
    # proc_col marks values it could not map with -1; never let those through.
    return df[(df["id"] >= 0) & (df["pid"] >= 0)]

And now use these functions to make ids contiguous.

In [5]:
print('Before:')
print(men_like_women_df)
men_like_women_df = encode_data(men_like_women_df)
print('\nAfter:')
print(men_like_women_df)

women_like_men_df = encode_data(women_like_men_df)

Before:
         id    pid  match
0      11.0    1.0    0.0
1      11.0    2.0    0.0
2      11.0    3.0    0.0
3      11.0    4.0    0.0
4      11.0    5.0    0.0
...     ...    ...    ...
4179  552.0  526.0    0.0
4180  552.0  527.0    0.0
4181  552.0  528.0    0.0
4182  552.0  529.0    0.0
4183  552.0  530.0    0.0

[4184 rows x 3 columns]

After:
       id  pid  match
0       0    0    0.0
1       0    1    0.0
2       0    2    0.0
3       0    3    0.0
4       0    4    0.0
...   ...  ...    ...
4179  276  269    0.0
4180  276  270    0.0
4181  276  271    0.0
4182  276  272    0.0
4183  276  273    0.0

[4184 rows x 3 columns]


### Let's split into train and test data sets
The standard sklearn function `train_test_split` doesn't do the job here because both the test and the train data sets should include all people. So after using `train_test_split` we need to move some people between the sets to ensure that both sets are correct. The same goes for partners.

In [6]:
from sklearn.model_selection import train_test_split as tts

def _swap_rows(df_train, df_test, row_to_test, row_to_train):
    """Move one row train->test and one row test->train, keeping set sizes and column dtypes."""
    df_train = pd.concat([df_train.drop(row_to_test.index), row_to_train], ignore_index=True)
    df_test = pd.concat([df_test.drop(row_to_train.index), row_to_test], ignore_index=True)
    return df_train, df_test


def train_test_split(df, test_size=0.2, random_state=None):
    """Split ``df`` so that every id and pid found in train also occurs in test.

    sklearn's split is applied first. Then, for every person (``id``) and
    afterwards every partner (``pid``) that is present in the train set but
    absent from the test set, one of their train rows is exchanged with a
    randomly chosen test row, so both sets keep their sizes.

    Only the direction "present in train, missing from test" is repaired;
    ids that end up only in the test set are not moved.

    Args:
        df: DataFrame with the columns ``id`` and ``pid``.
        test_size: forwarded to sklearn's ``train_test_split``.
        random_state: optional seed. When given it seeds both the sklearn
            split and the shuffles used to pick exchange rows. When ``None``
            the global numpy random state is used, as before.

    Returns:
        ``(df_train, df_test)``, each sorted by ``id`` with a fresh index.

    Raises:
        Exception: if no test row can be given up without removing a person
            or partner from the test set.
    """
    df_train, df_test = tts(df, test_size=test_size, random_state=random_state)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # People that are in the train set but missing from the test set.
    for person_id in np.setdiff1d(df_train.id.unique(), df_test.id.unique()):
        # One-row DataFrame slices (iloc[[0]]) keep the integer dtypes; going
        # through a Series would upcast id/pid to float.
        row_to_test = df_train.loc[df_train.id == person_id].iloc[[0]]
        row_to_train = None
        # Pick a random test person who has more than one test row, so the
        # test set does not lose that person.
        candidate_ids = df_test.id.unique()
        rng.shuffle(candidate_ids)
        for test_id in candidate_ids:
            rows = df_test.loc[df_test.id == test_id]
            if len(rows.index) > 1:
                row_to_train = rows.iloc[[0]]
                break
        if row_to_train is None:
            raise Exception("Couldn't find any person from people to send from the test to the train.")
        df_train, df_test = _swap_rows(df_train, df_test, row_to_test, row_to_train)

    # Partners that are in the train set but missing from the test set.
    for partner_id in np.setdiff1d(df_train.pid.unique(), df_test.pid.unique()):
        row_to_test = df_train.loc[df_train.pid == partner_id].iloc[[0]]
        row_to_train = None
        # Pick a random test partner with more than one test row whose first
        # row belongs to a person who also has more than one test row, so
        # neither the partner nor the person disappears from the test set.
        candidate_pids = df_test.pid.unique()
        rng.shuffle(candidate_pids)
        for test_pid in candidate_pids:
            rows = df_test.loc[df_test.pid == test_pid]
            if len(rows.index) > 1:
                owner_id = rows['id'].iloc[0]
                if (df_test.id == owner_id).sum() > 1:
                    row_to_train = rows.iloc[[0]]
                    break
        if row_to_train is None:
            raise Exception("Couldn't find any partner from partners to send from the test to the train.")
        df_train, df_test = _swap_rows(df_train, df_test, row_to_test, row_to_train)

    df_train = df_train.sort_values(by='id').reset_index(drop=True)
    df_test = df_test.sort_values(by='id').reset_index(drop=True)
    return df_train, df_test

### And now just get correct data

In [7]:
# Embedding tables are indexed by the encoded ids, so their sizes must cover
# the largest id of the full encoded frame (max + 1). Counting the unique ids
# of the train split undersizes the table whenever an id is absent from train.
men_like_women_train_df, men_like_women_test_df = train_test_split(men_like_women_df, test_size=0.2)
men_like_women_no_men = int(men_like_women_df.id.max()) + 1
men_like_women_no_women = int(men_like_women_df.pid.max()) + 1

women_like_men_train_df, women_like_men_test_df = train_test_split(women_like_men_df, test_size=0.2)
women_like_men_no_women = int(women_like_men_df.id.max()) + 1
women_like_men_no_men = int(women_like_men_df.pid.max()) + 1

## Let's create matrix factorization models

We will create and train several base MF (matrix factorization) models and for each of them let's do a cross validation to learn the best hyperparameters and parameters. Then we will compare the results and choose the best model.

Some general explanations for models:
* Models are trained on a single batch (the whole training set at once) because our data set is rather small.

Good reading resource: https://towardsdatascience.com/weight-initialization-techniques-in-neural-networks-26c649eb3b78

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
# To force CPU execution, replace the expression below with torch.device('cpu').
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Matrix factorization without bias

In [26]:
class MatrixFactorizationWithoutBiasNoXavier(nn.Module):
    """Dot-product matrix factorization with uniformly initialised embeddings.

    Args:
        num_people: number of rows in the person embedding table.
        num_partners: number of rows in the partner embedding table.
        weights: ``(low, high)`` bounds of the uniform initialisation.
        emb_size: length of every embedding vector.
    """

    def __init__(self, num_people, num_partners, weights=(0, 1), emb_size=100):
        super().__init__()
        low, high = weights[0], weights[1]
        self.person_emb = nn.Embedding(num_people, emb_size)
        self.partner_emb = nn.Embedding(num_partners, emb_size)
        # Overwrite the default embedding initialisation with U(low, high).
        for table in (self.person_emb, self.partner_emb):
            table.weight.data.uniform_(low, high)

    def forward(self, u, v):
        """Return sigmoid(person . partner) for each (u[i], v[i]) index pair."""
        person_vecs = self.person_emb(u)
        partner_vecs = self.partner_emb(v)
        # Element-wise product summed over the embedding axis is the dot product.
        dot = torch.sum(person_vecs * partner_vecs, dim=1)
        return torch.sigmoid(dot)

    
class MatrixFactorizationWithoutBiasXavier(nn.Module):
    """Dot-product matrix factorization with Xavier-uniform initialised embeddings.

    Args:
        num_people: number of rows in the person embedding table.
        num_partners: number of rows in the partner embedding table.
        emb_size: length of every embedding vector.
    """

    def __init__(self, num_people, num_partners, emb_size=100):
        super().__init__()
        self.person_emb = nn.Embedding(num_people, emb_size)
        self.partner_emb = nn.Embedding(num_partners, emb_size)
        # Replace the default embedding initialisation with Xavier uniform.
        for table in (self.person_emb, self.partner_emb):
            nn.init.xavier_uniform_(table.weight)

    def forward(self, u, v):
        """Return sigmoid(person . partner) for each (u[i], v[i]) index pair."""
        person_vecs = self.person_emb(u)
        partner_vecs = self.partner_emb(v)
        # Element-wise product summed over the embedding axis is the dot product.
        dot = torch.sum(person_vecs * partner_vecs, dim=1)
        return torch.sigmoid(dot)

    
# Tiny throwaway models (10 x 10 ids, 3 latent factors) so that the initial
# weights are short enough to read in the output.
example_model_no_xavier = MatrixFactorizationWithoutBiasNoXavier(10, 10, emb_size=3)
example_model_xavier = MatrixFactorizationWithoutBiasXavier(10, 10, emb_size=3)

print("Model with without xavier weights are:\n")
print(*example_model_no_xavier.parameters(), sep='\n')
print('\n\n', '=' * 20)
print("\n\nModel with with xavier weights are:\n")
print(*example_model_xavier.parameters(), sep='\n')

Model with without xavier weights are:

Parameter containing:
tensor([[0.7120, 0.8189, 0.0101],
        [0.0931, 0.0724, 0.7042],
        [0.1305, 0.7589, 0.8507],
        [0.6871, 0.3154, 0.0042],
        [0.2440, 0.8844, 0.3479],
        [0.5350, 0.9071, 0.9786],
        [0.8459, 0.7783, 0.2059],
        [0.3553, 0.8199, 0.7585],
        [0.4458, 0.7538, 0.3314],
        [0.7804, 0.7772, 0.1336]], requires_grad=True)
Parameter containing:
tensor([[0.4934, 0.2227, 0.2833],
        [0.6845, 0.9441, 0.1662],
        [0.6096, 0.7740, 0.2633],
        [0.2765, 0.7729, 0.3081],
        [0.5981, 0.9635, 0.2720],
        [0.3445, 0.7380, 0.8843],
        [0.6935, 0.3796, 0.3887],
        [0.7533, 0.9706, 0.4557],
        [0.1222, 0.9620, 0.9174],
        [0.8961, 0.8109, 0.7335]], requires_grad=True)




Model with with xavier weights are:

Parameter containing:
tensor([[ 0.6036, -0.6227,  0.2255],
        [ 0.4493, -0.6746, -0.2371],
        [ 0.6770, -0.3100, -0.5055],
        [ 0.5352,  0

### Matrix factorization with bias

In [27]:
class MatrixFactorizationWithBiasNoXavier(nn.Module):
    """Matrix factorization with per-person and per-partner bias, uniform init.

    Args:
        num_people: number of rows in the person embedding and bias tables.
        num_partners: number of rows in the partner embedding and bias tables.
        weights: ``(low, high)`` bounds for the uniform embedding initialisation.
        bias: ``(low, high)`` bounds for the uniform bias initialisation.
        emb_size: length of every embedding vector.
    """

    def __init__(self, num_people, num_partners, weights=(0, 1), bias=(-0.01, 0.01), emb_size=100):
        super(MatrixFactorizationWithBiasNoXavier, self).__init__()
        self.person_emb = nn.Embedding(num_people, emb_size)
        self.person_bias = nn.Embedding(num_people, 1)
        self.partner_emb = nn.Embedding(num_partners, emb_size)
        # The attribute name is misspelled ("parnter"); it is kept as-is so
        # parameter names and state-dict keys do not change.
        self.parnter_bias = nn.Embedding(num_partners, 1)
        self.person_emb.weight.data.uniform_(weights[0], weights[1])
        self.partner_emb.weight.data.uniform_(weights[0], weights[1])
        self.person_bias.weight.data.uniform_(bias[0], bias[1])
        self.parnter_bias.weight.data.uniform_(bias[0], bias[1])

    def forward(self, u, v):
        """Return sigmoid(person . partner + person bias + partner bias) per index pair."""
        # The bias tables must be indexed with the ids u and v. The embedding
        # lookups therefore go into separate names instead of overwriting them.
        person_vecs = self.person_emb(u)
        partner_vecs = self.partner_emb(v)
        # Bias lookups have a trailing dimension of size 1; drop exactly that
        # dimension so they line up with the per-pair dot product.
        bias_u = self.person_bias(u).squeeze(-1)
        bias_v = self.parnter_bias(v).squeeze(-1)
        # Element-wise product summed over the embedding axis is the dot product.
        return torch.sigmoid((person_vecs * partner_vecs).sum(1) + bias_u + bias_v)
    
    
class MatrixFactorizationWithBiasXavier(nn.Module):
    """Matrix factorization with per-person and per-partner bias, Xavier-uniform embeddings.

    Args:
        num_people: number of rows in the person embedding and bias tables.
        num_partners: number of rows in the partner embedding and bias tables.
        bias: ``(low, high)`` bounds for the uniform bias initialisation.
        emb_size: length of every embedding vector.
    """

    def __init__(self, num_people, num_partners, bias=(-0.01, 0.01), emb_size=100):
        super(MatrixFactorizationWithBiasXavier, self).__init__()
        self.person_emb = nn.Embedding(num_people, emb_size)
        self.person_bias = nn.Embedding(num_people, 1)
        self.partner_emb = nn.Embedding(num_partners, emb_size)
        # The attribute name is misspelled ("parnter"); it is kept as-is so
        # parameter names and state-dict keys do not change.
        self.parnter_bias = nn.Embedding(num_partners, 1)
        torch.nn.init.xavier_uniform_(self.person_emb.weight)
        torch.nn.init.xavier_uniform_(self.partner_emb.weight)
        self.person_bias.weight.data.uniform_(bias[0], bias[1])
        self.parnter_bias.weight.data.uniform_(bias[0], bias[1])

    def forward(self, u, v):
        """Return sigmoid(person . partner + person bias + partner bias) per index pair."""
        # The bias tables must be indexed with the ids u and v. The embedding
        # lookups therefore go into separate names instead of overwriting them.
        person_vecs = self.person_emb(u)
        partner_vecs = self.partner_emb(v)
        # squeeze(-1) removes only the trailing size-1 dimension, so the
        # result keeps its batch dimension even for a single pair.
        bias_u = self.person_bias(u).squeeze(-1)
        bias_v = self.parnter_bias(v).squeeze(-1)
        # Element-wise product summed over the embedding axis is the dot product.
        return torch.sigmoid((person_vecs * partner_vecs).sum(1) + bias_u + bias_v)
    

# Tiny throwaway models (10 x 10 ids, 3 latent factors) so that the initial
# weights and biases are short enough to read in the output.
example_model_no_xavier = MatrixFactorizationWithBiasNoXavier(10, 10, bias=(0, 0), emb_size=3)
example_model_xavier = MatrixFactorizationWithBiasXavier(10, 10, emb_size=3)

print("Model with without xavier weights are:\n")
print(*example_model_no_xavier.parameters(), sep='\n')
print('\n\n', '=' * 20)
print("\n\nModel with with xavier weights are:\n")
print(*example_model_xavier.parameters(), sep='\n')

Model with without xavier weights are:

Parameter containing:
tensor([[0.1631, 0.9964, 0.3742],
        [0.0420, 0.8915, 0.0653],
        [0.5343, 0.1460, 0.0942],
        [0.8407, 0.7184, 0.4695],
        [0.4109, 0.1924, 0.5849],
        [0.4078, 0.0563, 0.3179],
        [0.8126, 0.4192, 0.5784],
        [0.6659, 0.3131, 0.9779],
        [0.7302, 0.4877, 0.4248],
        [0.8038, 0.3622, 0.3845]], requires_grad=True)
Parameter containing:
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], requires_grad=True)
Parameter containing:
tensor([[0.6894, 0.6275, 0.1707],
        [0.6975, 0.6747, 0.5286],
        [0.2491, 0.1676, 0.5661],
        [0.3966, 0.2961, 0.8607],
        [0.1317, 0.1785, 0.8187],
        [0.5793, 0.7943, 0.0522],
        [0.2403, 0.3177, 0.0309],
        [0.6750, 0.6408, 0.1986],
        [0.7921, 0.4211, 0.3351],
        [0.9851, 0.9208, 0.0483]], requires_grad=True)
Parameter c

### Training and testing functions are below

In [28]:
def test(model, df_test, verbose=False):
    model.eval()
    # .to(dev) puts code on either gpu or cpu.
    people = torch.LongTensor(df_test.id.values).to(dev)
    partners = torch.LongTensor(df_test.pid.values).to(dev)
    attraction = torch.FloatTensor(df_test.match.values).to(dev)
    y_hat = model(people, partners)
    loss = F.mse_loss(y_hat, attraction)
    if verbose:
        print('test loss %.3f ' % loss.item())
    return loss.item()


# Default values assigned below are ones that I found online.
# Cross validation will be done later but it's good to have some defaults.
def train(model, df_train, epochs=100, learning_rate=0.01, weight_decay=1e-5, verbose=False):
    """Train ``model`` in place on the whole of ``df_train`` as a single batch.

    Args:
        model: module called as ``model(people, partners)``.
        df_train: DataFrame with the columns ``id``, ``pid`` and ``match``.
        epochs: number of full-batch optimisation steps.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.
        verbose: when True, print the loss every 100 epochs.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    model.train()

    # The inputs never change between epochs, so build them once, on the
    # device the model lives on. as_tensor with an explicit dtype also accepts
    # id columns stored as floats.
    device = next(model.parameters()).device
    people = torch.as_tensor(df_train.id.values, dtype=torch.long, device=device)
    partners = torch.as_tensor(df_train.pid.values, dtype=torch.long, device=device)
    attraction = torch.as_tensor(df_train.match.values, dtype=torch.float32, device=device)

    for epoch in range(epochs):
        # Calls the forward method of the model.
        y_hat = model(people, partners)
        # Mean squared error loss.
        loss = F.mse_loss(y_hat, attraction)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if verbose and epoch % 100 == 0:
            print(loss.item())

# Let's finally train models and choose the best one

## Let's first train men_like_women

## Cross validation on each model

In [31]:
from itertools import product
from tqdm import tqdm

# Candidate values explored by the grid search below.
epochs_poss = list(range(10, 310, 10))
# 1e-4 and 0.0001 are the same float; each value is listed once so the grid
# search does not repeat identical settings.
weight_decay_poss = [0.0, 1e-7, 1e-6, 1e-5, 1e-4, 0.001, 0.01, 0.1, 1.0, 10]
learning_rate_poss = [1e-7, 1e-6, 1e-5, 1e-4, 0.001, 0.01, 0.1, 1.0, 10]
# Embedding initialisation: (low, high) bounds for a uniform init, or 'xavier'.
weights_poss = [(0, 1), (-1, 1), (0, 0.2), (-0.2, 0.2), 'xavier']
# Bias initialisation: None selects a model without bias terms.
bias_poss = [None, (0, 0), (-0.01, 0.01), (0, 1), (-1, 1)]
emb_size_poss = [10, 50, 100, 150, 200]

def cross_vaild_model(model, train_df, test_df, verbose=False):
    """Grid-search epochs, weight decay and learning rate for one model.

    Every grid point starts from the parameters ``model`` had when this
    function was called, so the losses of different settings are comparable.
    On return ``model`` holds the parameters trained with the best settings,
    when at least one setting produced a comparable loss.

    Args:
        model: the model to tune; its parameters are modified in place.
        train_df: training frame passed to ``train``.
        test_df: evaluation frame passed to ``test``.
        verbose: when True, print the best loss and settings.

    Returns:
        ``(min_loss, best_settings)``. ``best_settings`` is a dict with the
        keys ``'epochs'``, ``'weight_decay'`` and ``'learning_rate_poss'``, or
        ``None`` if no setting produced a loss below infinity.
    """
    min_loss = float('inf')
    best_settings = None
    best_state = None
    # Snapshot of the untrained parameters. Without restoring it, each grid
    # point would continue training whatever the previous one left behind.
    initial_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    grid_size = len(epochs_poss) * len(weight_decay_poss) * len(learning_rate_poss)
    grid = product(epochs_poss, weight_decay_poss, learning_rate_poss)
    for (epochs, wd, lr) in tqdm(grid, total=grid_size):
        model.load_state_dict(initial_state)
        train(model, train_df, epochs=epochs, learning_rate=lr, weight_decay=wd)
        test_loss = test(model, test_df)
        # Prefer a strictly lower loss; on an exact tie prefer fewer epochs.
        if (test_loss < min_loss) or (test_loss == min_loss and best_settings is not None and epochs < best_settings['epochs']):
            min_loss = test_loss
            # NOTE: the key 'learning_rate_poss' (rather than 'learning_rate')
            # is kept unchanged for callers that already read it.
            best_settings = {'epochs': epochs, 'weight_decay': wd, 'learning_rate_poss': lr}
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
    if best_state is not None:
        # Leave the model in the state that achieved min_loss.
        model.load_state_dict(best_state)
    if verbose:
        print('min loss %.3f' % min_loss)
        print('best settings are', best_settings)
    return min_loss, best_settings


def cross_vaild(num_people, num_partners, train_df, test_df, verbose=False):
    """Try every combination of initialisation, bias and embedding size.

    For each combination the matching model class is built, tuned with
    ``cross_vaild_model``, and the combination with the lowest loss is kept
    (on an exact tie the smaller embedding size wins).

    Returns:
        ``(min_loss, best_settings)`` where ``best_settings`` merges the model
        description (``'model'``, ``'weights'``, ``'bias'``, ``'emb_size'``)
        with the settings returned by ``cross_vaild_model``.
    """
    min_loss = float('inf')
    best_settings = None

    for (weights, bias, emb_size) in tqdm(product(weights_poss, bias_poss, emb_size_poss)):
        use_xavier = weights == 'xavier'
        has_bias = bias is not None

        # Pick the model class matching the initialisation / bias combination.
        if use_xavier and has_bias:
            candidate = MatrixFactorizationWithBiasXavier(num_people, num_partners, bias=bias, emb_size=emb_size)
        elif use_xavier:
            candidate = MatrixFactorizationWithoutBiasXavier(num_people, num_partners, emb_size=emb_size)
        elif has_bias:
            candidate = MatrixFactorizationWithBiasNoXavier(num_people, num_partners, weights=weights, bias=bias, emb_size=emb_size)
        else:
            candidate = MatrixFactorizationWithoutBiasNoXavier(num_people, num_partners, weights=weights, emb_size=emb_size)
        model = candidate.to(dev)

        model_min_loss, model_best_settings = cross_vaild_model(model, train_df, test_df, verbose)

        strictly_better = model_min_loss < min_loss
        smaller_on_tie = model_min_loss == min_loss and best_settings and emb_size < best_settings['emb_size']
        if strictly_better or smaller_on_tie:
            min_loss = model_min_loss
            best_settings = {
                'model': model,
                'weights': weights,
                'bias': bias,
                'emb_size': emb_size,
                **model_best_settings,
            }

    return min_loss, best_settings

In [30]:
# Run the full model / hyperparameter search for the men_like_women data.
min_loss, best_settings = cross_vaild(
    men_like_women_no_men,
    men_like_women_no_women,
    men_like_women_train_df,
    men_like_women_test_df,
)
print(min_loss, best_settings, sep='\n')

MatrixFactorizationWithoutBiasXavier(
  (person_emb): Embedding(2, 100)
  (partner_emb): Embedding(2, 100)
)