In [1]:
# base imports
import pandas as pd
import numpy as np
from itertools import product
from tqdm import tqdm
import pickle
from time import time 

In [13]:
# Data preparation: 
# Some data is missing (but we know what it should be) or some data should be changed.
df = pd.read_csv('./speed_dating_data.csv', encoding = 'ISO-8859-1')

def fooCareer(col):
    selected_df = df[col]
    career_c = {
        'lawyer': 1.,
        'law': 1.,
        'Economics': 7.,
        'tech professional': 5.
    }
    
    for x, row in selected_df.iterrows():
            career = df.at[x, col[0]]
            if career in career_c:
                df.at[x, col[1]] = career_c[career]
                

def fooGoal(col):
    selected_df = df[col]
    goal_c = {
        1: 1,
        2: 1,
        3: 1,
        4: 0,
        5: 1,
        6: 1,
    }
    
    for x, row in selected_df.iteritems():
        goal = df.at[x, col]
        if goal in goal_c:
            df.at[x, col] = goal_c[goal]

fooCareer(['career','career_c'])
fooGoal('goal')
df.to_csv('base_dating_data.csv', index=False)

In [14]:
# data is inconsistent:
# Some values (it’s based on waves) are 1-10 and others are 1-100 so
# they should be normalized so that all are 1-100.
df = pd.read_csv('base_dating_data.csv')

def func(lista):
    selected_df = df[lista]
    for x, row in selected_df.iterrows():
        multiplier = 100/row.sum()
        for col in lista:
            df.at[x, col] *= multiplier

lista = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']
lista1 = ['attr_o', 'sinc_o', 'intel_o', 'fun_o', 'amb_o', 'shar_o']
lista3 = ['attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1']
lista4 = ['attr1_2', 'sinc1_2', 'intel1_2', 'fun1_2', 'amb1_2', 'shar1_2']

func(lista)
func(lista1)
func(lista3)
func(lista4)

df.to_csv('base_dating_data.csv', index=False)

  multiplier = 100/row.sum()
  df.at[x, col] *= multiplier


In [15]:
# Column met_o had values 1/2. It needs to be changed to 0/1.
df = pd.read_csv('./base_dating_data.csv')
df['met_o'] = df['met_o'].apply(lambda x: x - 1)
df.to_csv('base_dating_data.csv', index=False)

We removed some rows (by hand) because they didn't have `pid`.

### Below you can see code that prepares data for the basic matrix factorization.
Here in the base matrix we only have information about match. So we need a set of vectors, where each vector describes each date (holds both ids and information about match).

Task will requite two base matrices.
1. Matrix where men are "users" and women are "products". It will then be used to recommend women to men because matrix will say what's the predicted rating of a woman in eyes of man. Basically it will answer the question: **"How likely is that a man will like a woman?"**. Let's call this matrix/data frame **"men_like_women"**.
1. Matrix where women are "users" and men are "products". It will then be used to recommend men to women because matrix will say what's the predicted rating of a man in eyes of woman. Basically it will answer the question: **"How likely is that a woman will like a man?"**. Let's call this matrix/data frame **"women_like_men"**.

Why such analogies? It may help to understand how do this human relations task translates into recommender systems world.

These matrices will be used to train models (with matrix factorization) which will then be saved into csv files.

In [2]:
# Split into vectors, let's have two matrices as describes above.
base_df = pd.read_csv('./data_for_matrix.csv')
men_like_women_data = []
women_like_men_data = []

for _, row in base_df.iterrows():
    vector = {
        'id': row['iid'],
        'pid': row['pid'],
        'decision': row['dec'],
    }
    if row['gender'] == 0:
        # it's a woman
        women_like_men_data.append(vector)
    else:
        men_like_women_data.append(vector)

men_like_women_df = pd.DataFrame(men_like_women_data)
women_like_men_df = pd.DataFrame(women_like_men_data)

print("men_like_women_df:")
print(men_like_women_df)
print("\nwomen_like_men_df:")
print(women_like_men_df)

men_like_women_df:
         id    pid  decision
0      11.0    1.0       0.0
1      11.0    2.0       0.0
2      11.0    3.0       0.0
3      11.0    4.0       0.0
4      11.0    5.0       0.0
...     ...    ...       ...
4179  552.0  526.0       0.0
4180  552.0  527.0       0.0
4181  552.0  528.0       0.0
4182  552.0  529.0       0.0
4183  552.0  530.0       0.0

[4184 rows x 3 columns]

women_like_men_df:
         id    pid  decision
0       1.0   11.0       1.0
1       1.0   12.0       1.0
2       1.0   13.0       1.0
3       1.0   14.0       1.0
4       1.0   15.0       1.0
...     ...    ...       ...
4179  530.0  548.0       0.0
4180  530.0  549.0       1.0
4181  530.0  550.0       0.0
4182  530.0  551.0       0.0
4183  530.0  552.0       1.0

[4184 rows x 3 columns]


## Let's prepare train and test data sets

### Make ids contiguous
We need to change the data. As you can see ids are not contiguous and they must be. So let's change that.

In [3]:
def proc_col(column):
    """Encodes a pandas column with continous ids."""
    unique = column.unique()
    old_to_new = {o: i for i, o in enumerate(unique)}
    return np.array([old_to_new.get(x, -1) for x in column])


def encode_data(df):
    """Encodes data with continous person and partner ids."""
    df = df.copy()
    for col_name in ["id", "pid"]:
        col = proc_col(df[col_name])
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

And now use these functions to make ids contiguous.

In [4]:
print('Before:')
print(men_like_women_df)
men_like_women_df = encode_data(men_like_women_df)
print('\nAfter:')
print(men_like_women_df)

women_like_men_df = encode_data(women_like_men_df)

Before:
         id    pid  decision
0      11.0    1.0       0.0
1      11.0    2.0       0.0
2      11.0    3.0       0.0
3      11.0    4.0       0.0
4      11.0    5.0       0.0
...     ...    ...       ...
4179  552.0  526.0       0.0
4180  552.0  527.0       0.0
4181  552.0  528.0       0.0
4182  552.0  529.0       0.0
4183  552.0  530.0       0.0

[4184 rows x 3 columns]

After:
       id  pid  decision
0       0    0       0.0
1       0    1       0.0
2       0    2       0.0
3       0    3       0.0
4       0    4       0.0
...   ...  ...       ...
4179  276  269       0.0
4180  276  270       0.0
4181  276  271       0.0
4182  276  272       0.0
4183  276  273       0.0

[4184 rows x 3 columns]


### Let's split into train and test data sets
Standard sklearn function `train_test_split` doesn't do a job here because both test and train data sets should include all people. So after using `train_test_split` we need to transition some people between sets to ensure that both sets are correct. The same goes for partners.

In [5]:
from sklearn.model_selection import train_test_split as tts

def train_test_split(df, test_size=0.2):
    # Use standard train_test_split.
    df_train, df_test = tts(df, test_size=test_size)
    no_train_unique = len(df_train.id.unique())
    no_test_unique = len(df_test.id.unique())
    
    # See what people are missing in the test set.
    diff = np.setdiff1d(df_train.id.unique(), df_test.id.unique())
    for id in diff:
        # For every missing person we need to exchange them for a
        # different one so that sets still have the same number of elements.
        person_to_send_to_test = df_train.loc[df_train.id == id].iloc[0]
        person_to_send_to_train = None
        # Choose some person from the test set to send to the train set.
        ids = df_test.id.unique()
        np.random.shuffle(ids)
        for test_id in ids:
            person = df_test.loc[df_test.id == test_id]
            if len(person.index) > 1:
                person_to_send_to_train = person.iloc[0]
                break
        if person_to_send_to_train is not None:
            # Remove people that transit from old sets.
            # .name holds the id of that row in the oryginal df.
            df_train = df_train.drop(person_to_send_to_test.name)
            df_test = df_test.drop(person_to_send_to_train.name)
            # Add new people to sets.
            df_train = pd.concat([df_train, person_to_send_to_train.to_frame().T], ignore_index=True)
            df_test = pd.concat([df_test, person_to_send_to_test.to_frame().T], ignore_index=True)
        else:
            raise Exception("Couldn't find any person from people to send from the test to the train.")
        
    # See what partners are missing in the test set.
    diff = np.setdiff1d(df_train.pid.unique(), df_test.pid.unique())
    for pid in diff:
        # For every missing partner we need to exchange them for a
        # different one so that sets still have the same number of elements.
        partner_to_send_to_test = df_train.loc[df_train.pid == pid].iloc[0]
        partner_to_send_to_train = None
        
        # Choose some partner from the test set to send to the train set.
        # Need to make sure that both sets will still have all the people.
        pids = df_test.pid.unique()
        np.random.shuffle(pids)
        for test_pid in pids:
            partner = df_test.loc[df_test.pid == test_pid]
            if len(partner.index) > 1:
                # Make sure we don't remove a person completely.
                id = partner.iloc[0].id
                person_qty = len(df_test.loc[df_test.id == id].index)
                if person_qty > 1:
                    partner_to_send_to_train = partner.iloc[0]
                    break
        if partner_to_send_to_train is not None:
            # Remove partners that transit from old sets.
            df_train = df_train.drop(partner_to_send_to_test.name)
            df_test = df_test.drop(partner_to_send_to_train.name)
            # Add new people to sets.
            df_train = pd.concat([df_train, partner_to_send_to_train.to_frame().T], ignore_index=True)
            df_test = pd.concat([df_test, partner_to_send_to_test.to_frame().T], ignore_index=True)
        else:
            raise Exception("Couldn't find any partner from partners to send from the test to the train.")
                    
    df_train = df_train.sort_values(by='id')
    df_test = df_test.sort_values(by='id')
    return df_train.reset_index(drop=True), df_test.sort_values(by='id').reset_index(drop=True)

### And now just get correct data

In [6]:
men_like_women_train_df, men_like_women_test_df = train_test_split(men_like_women_df, test_size=0.2)
men_like_women_no_men = len(men_like_women_train_df.id.unique())
men_like_women_no_women = len(men_like_women_train_df.pid.unique())

women_like_men_train_df, women_like_men_test_df = train_test_split(women_like_men_df, test_size=0.2)
women_like_men_no_women = len(women_like_men_train_df.id.unique())
women_like_men_no_men = len(women_like_men_train_df.pid.unique())

## Let's create matrix factorization models

We will create and train several base MF (matrix factorization) models and for each of them let's do a cross validation to learn the best hyperparameters and parameters. Then we will compare the results and choose the best model.

Some general explanations for models:
* Models are train on only one batch because our data set is rather small.

Good reading resource: https://towardsdatascience.com/weight-initialization-techniques-in-neural-networks-26c649eb3b78

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

dev = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
dev

device(type='cuda')

### Matrix factorization

In [8]:
class MatrixFactorizationWithoutXavier(nn.Module):
    def __init__(self, num_people, num_partners, weights=(0, 1), bias=(-0.01, 0.01), emb_size=100):
        super(MatrixFactorizationWithoutXavier, self).__init__()
        self.person_emb = nn.Embedding(num_people, emb_size)
        self.person_bias = nn.Embedding(num_people, 1)
        self.partner_emb = nn.Embedding(num_partners, emb_size)
        self.parnter_bias = nn.Embedding(num_partners, 1)
        self.person_emb.weight.data.uniform_(weights[0], weights[1])
        self.partner_emb.weight.data.uniform_(weights[0], weights[1])
        self.person_bias.weight.data.uniform_(bias[0], bias[1])
        self.parnter_bias.weight.data.uniform_(bias[0], bias[1])
            
    def forward(self, u, v):
        bias_u = self.person_bias(u).squeeze()
        bias_v = self.parnter_bias(v).squeeze()
        u = self.person_emb(u)
        v = self.partner_emb(v)
        # calculate dot product
        # u*v is a element wise vector multiplication
        return torch.sigmoid((u*v).sum(1) + bias_u + bias_v)
    
    
class MatrixFactorizationWithXavier(nn.Module):
    def __init__(self, num_people, num_partners, bias=(-0.01, 0.01), emb_size=100):
        super(MatrixFactorizationWithXavier, self).__init__()
        self.person_emb = nn.Embedding(num_people, emb_size)
        self.person_bias = nn.Embedding(num_people, 1)
        self.partner_emb = nn.Embedding(num_partners, emb_size)
        self.parnter_bias = nn.Embedding(num_partners, 1)
        torch.nn.init.xavier_uniform_(self.person_emb.weight)
        torch.nn.init.xavier_uniform_(self.partner_emb.weight)
        self.person_bias.weight.data.uniform_(bias[0], bias[1])
        self.parnter_bias.weight.data.uniform_(bias[0], bias[1])
            
    def forward(self, u, v):
        bias_u = self.person_bias(u).squeeze()
        bias_v = self.parnter_bias(v).squeeze()
        u = self.person_emb(u)
        v = self.partner_emb(v)
        # calculate dot product
        # u*v is a element wise vector multiplication
        return torch.sigmoid((u*v).sum(1) + bias_u + bias_v)
    

# Example small models demonstrating weights
example_model_no_xavier = MatrixFactorizationWithoutXavier(10, 10, bias=(0, 0), emb_size=3)
example_model_xavier = MatrixFactorizationWithXavier(10, 10, emb_size=3)
print("Model with without xavier weights are:\n")
for p in example_model_no_xavier.parameters():
    print(p)
print('\n\n', '='*20)
print("\n\nModel with with xavier weights are:\n")
for p in example_model_xavier.parameters():
    print(p)

Model with without xavier weights are:

Parameter containing:
tensor([[0.0544, 0.8659, 0.2938],
        [0.8871, 0.8007, 0.6418],
        [0.2806, 0.6298, 0.4886],
        [0.9440, 0.2081, 0.1910],
        [0.2553, 0.1161, 0.2733],
        [0.3250, 0.2360, 0.2192],
        [0.2942, 0.8049, 0.0749],
        [0.5206, 0.3197, 0.7085],
        [0.8762, 0.9432, 0.2356],
        [0.0029, 0.4609, 0.9614]], requires_grad=True)
Parameter containing:
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], requires_grad=True)
Parameter containing:
tensor([[0.0370, 0.6098, 0.5509],
        [0.5549, 0.0904, 0.7126],
        [0.2000, 0.8530, 0.0411],
        [0.6198, 0.2937, 0.5277],
        [0.0707, 0.0107, 0.1260],
        [0.5294, 0.7154, 0.2432],
        [0.2563, 0.2075, 0.3882],
        [0.1797, 0.0315, 0.1131],
        [0.1502, 0.7822, 0.5031],
        [0.3409, 0.2783, 0.8407]], requires_grad=True)
Parameter c

### Training and testing functions are below

In [9]:
def test(model, df_test, verbose=False):
    model.eval()
    # .to(dev) puts code on either gpu or cpu.
    people = torch.LongTensor(df_test.id.values).to(dev)
    partners = torch.LongTensor(df_test.pid.values).to(dev)
    decision = torch.FloatTensor(df_test.decision.values).to(dev)
    y_hat = model(people, partners)
    loss = F.mse_loss(y_hat, decision)
    if verbose:
        print('test loss %.3f ' % loss.item())
    return loss.item()


# Default values assigned below are ones that I found online.
# Cross validadtion will be done later but it's good to have some defaults.
def train(model, df_train, epochs=100, learning_rate=0.01, weight_decay=1e-5, verbose=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    model.train()
    
    for epoch in range(epochs):
        # From numpy to PyTorch tensors.
        # .to(dev) puts code on either gpu or cpu.
        people = torch.LongTensor(df_train.id.values).to(dev)
        partners = torch.LongTensor(df_train.pid.values).to(dev)
        decision = torch.FloatTensor(df_train.decision.values).to(dev)
        
        # calls forward method of the model
        y_hat = model(people, partners)
        # Using mean squared errors loss function
        loss = F.mse_loss(y_hat, decision)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if verbose and epoch % 100 == 0: 
            print(loss.item())

# Let's finally train models and choose the best one

## Cross validation on each model

In [11]:
def cross_valid_model(model, train_df, test_df):
    """Function to choose the best hyperparameters for a model."""
    min_loss = np.inf
    best_settings = None

    for (epochs, wd, lr) in product(epochs_poss, weight_decay_poss, learning_rate_poss):
        train(model, train_df, epochs=epochs, learning_rate=lr, weight_decay=wd)
        test_loss = test(model, test_df)
        if (test_loss < min_loss) or (test_loss == min_loss and best_settings is not None and epochs < best_settings['epochs']):
            min_loss = test_loss
            best_settings = {'epochs': epochs, 'weight_decay': wd, 'learning_rate': lr}
    return min_loss, best_settings


def cross_valid(
    num_people, num_partners, train_df, test_df,
    epochs_poss,
    weight_decay_poss,
    learning_rate_poss,
    weights_poss,
    bias_poss,
    emb_size_poss,
    verbose=False, file=None
):
    """
    Function to choose the best model.
    If arg file is provided (should be a path), statistics will be written to the given file.
    """
    min_loss = np.inf
    best_settings = None
    models = []
    start_time = time()
    
    for (weights, bias, emb_size) in tqdm(product(weights_poss, bias_poss, emb_size_poss)):
        model = None
        if weights == 'xavier':
            model = MatrixFactorizationWithXavier(num_people, num_partners, bias=bias, emb_size=emb_size).to(dev)
        else:
            model = MatrixFactorizationWithoutXavier(num_people, num_partners, weights=weights, bias=bias, emb_size=emb_size).to(dev)
        
        model_min_loss, model_best_settings = cross_valid_model(model, train_df, test_df)
        
        curr_settings = {'model': model, 'min_loss': model_min_loss, 'weights': weights, 'bias': bias, 'emb_size': emb_size}
        model_best_settings = {**curr_settings, **model_best_settings}
        models.append(model_best_settings)
            
        if verbose:
            print(f'best_settings: {model_best_settings}')
            print(f'{"="*50}')
        
        if (model_min_loss < min_loss) or (model_min_loss == min_loss and best_settings and emb_size < best_settings['emb_size']):
            min_loss = model_min_loss
            best_settings = model_best_settings
    
    end_time = time()
        
    if verbose:
        print(f'\n{"?"*50}')
        print('THE WINNER IS:')
        print(f'best min_loss: {min_loss}')
        print(f'best best_settings: {best_settings}')
        print(f'Cross validation took {end_time - start_time}')
    
    if file:
        models.sort(key=lambda x: x['min_loss'])
        with open(file, 'wb') as stats_file:
            pickle.dump(models, stats_file)
        
    return min_loss, best_settings

## Let's first train men_like_women

In [12]:
HUGE_MF_CROSS_VALID_RES_FILE = './men_like_women_MF_cross_vaild_results.txt'

In [13]:
epochs_poss = list(range(60, 350, 10))
weight_decay_poss = [0.0001, 0.001, 0.01, 0.1, 1.0]
learning_rate_poss = [0.0001, 0.001, 0.01, 0.1, 1.0]
weights_poss = [(0, 1), (-1, 1), (0, 0.2), (-0.2, 0.2), 'xavier']
bias_poss = [(0, 0), (-0.01, 0.01), (0, 1), (-1, 1)]
emb_size_poss = list(range(300, 1100, 100))

min_loss, best_settings = cross_valid(
    men_like_women_no_men, men_like_women_no_women,
    men_like_women_train_df, men_like_women_test_df,
    epochs_poss=epochs_poss,
    weight_decay_poss=weight_decay_poss,
    learning_rate_poss=learning_rate_poss,
    weights_poss=weights_poss,
    bias_poss=bias_poss,
    emb_size_poss=emb_size_poss,
    file=HUGE_MF_CROSS_VALID_RES_FILE
)

In [131]:
# model.eval()
# print(model(torch.LongTensor(men_like_women_train_df.id).to(dev), torch.LongTensor(men_like_women_train_df.pid).to(dev)))

### THE WINNER IS:
#### best min_loss: 0.1881617307662964
#### model: MatrixFactorizationWithBiasNoXavier
weights: (-1, 1),
bias: (0, 0),
emb_size: 150,
epochs: 60,
weight_decay: 0.001,
learning_rate: 0.001,

*Cross validation took 65399.79477787018 seconds ~ around 18 hours*

In [222]:
huge_mf_cross_valid_res = []

# with open(HUGE_MF_CROSS_VALID_RES_FILE, 'r') as file:
#     curr_model = {}
#     for line in file:
#         min_loss = findall('min_loss: 0\.\d+', line)
#         model = findall("'model': [a-zA-Z]+", line)
#         if min_loss:
#             curr_model['min_loss'] = float(min_loss[0].split()[1])
#         if model:
#             curr_model['model'] = model[0].split()[1]
#         elif line.startswith("), 'weights':"):    
#             settings = line.split(", '")
#             settings.pop(0)
#             settings = [item.split("': ") for item in settings]
            
#             if settings[0][1] == 'xavier':
#                 curr_model['weights'] = 'xavier'
#             else:
#                 # str to tuple
#                 curr_model['weights'] = eval(settings[0][1])
#             if settings[0][1] == 'None':
#                 curr_model['bias'] = None
#             else:
#                 curr_model['bias'] = eval(settings[1][1])
                
#             curr_model['emb_size'] = float(settings[2][1])
#             curr_model['epochs'] = float(settings[3][1])
#             curr_model['weight_decay'] = float(settings[4][1])
#             curr_model['learning_rate'] = float(settings[5][1].split('}\n')[0])
#         elif line.startswith('='):
#             huge_mf_cross_valid_res.append(curr_model)
#             curr_model = {}
            
# sorting
huge_mf_cross_valid_res.sort(key=lambda x: x['min_loss'])

### Top 10 best models

In [223]:
for i, model in enumerate(huge_mf_cross_valid_res):
    print(f'{i + 1})')
    print(f'min_loss: {model["min_loss"]}')
    print(f'model: {model["model"]}')
    print(f'weights: {model["weights"]}, bias: {model["bias"]}, emb_size: {model["emb_size"]}')
    print(f'weight_decay: {model["weight_decay"]}, learning_rate: {model["learning_rate"]}, epochs: {model["epochs"]}')
    print('\n', '='*30, '\n')
    if i == 9:
        break