In [10]:
# base imports
import pandas as pd
import numpy as np
from itertools import product
from tqdm import tqdm
import pickle
import copy
from time import time
from audtorch.metrics.functional import pearsonr

%config IPCompleter.greedy = True

In [11]:
# Data preparation: 
# Some data is missing (but we know what it should be) or some data should be changed.
base_df = pd.read_csv('./speed_dating_data.csv', encoding = 'ISO-8859-1')

goal_c = {
    1: 1,
    2: 1,
    3: 1,
    4: 0,
    5: 1,
    6: 1,
}

career_c = {
    'lawyer': 1.,
    'law': 1.,
    'Economics': 7.,
    'Economist': 7.,
    'tech professional': 5.
}

for x, row in base_df.iterrows():
    race = base_df.at[x, 'race']
    race_o = base_df.at[x, 'race_o']
    date = base_df.at[x, 'date']
    go_out = base_df.at[x, 'go_out']
    goal = base_df.at[x, 'goal']
    career = base_df.at[x, 'career']
    
    if career in career_c:
        base_df.at[x, 'career_c'] = career_c[career]
    if str(career) == 'nan':
        # if carrer is nan then change its code for 18 which means 'Other'
        base_df.at[x, 'career_c'] = 18.    
    if goal in goal_c:
        base_df.at[x, 'goal'] = goal_c[goal]
    if str(race) == 'nan':
        # if race is nan then change its code for 6 which means 'Other'
        base_df.at[x, 'race'] = 6.
    if str(race_o) == 'nan':
        # if race is nan then change its code for 6 which means 'Other'
        base_df.at[x, 'race_o'] = 6.
    if str(date) == 'nan':
        # if date is nan then change its code for 8 which means 'Other'
        base_df.at[x, 'date'] = 8.
    if str(go_out) == 'nan':
        # if go_out is nan then change its code for 8 which means 'Other'
        base_df.at[x, 'go_out'] = 8.

base_df.drop(['career'], axis=1, inplace=True)
        
# data is inconsistent:
# Some values (it’s based on waves) are 1-10 and others are 1-100 so
# they should be normalized so that all are 1-100.

def normalize(lst):
    selected_df = base_df[lst]
    for x, row in selected_df.iterrows():
        multiplier = 100/row.sum()
        for col in lst:
            base_df.at[x, col] *= multiplier

lst1 = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']
lst2 = ['attr_o', 'sinc_o', 'intel_o', 'fun_o', 'amb_o', 'shar_o']
lst3 = ['attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1']

normalize(lst1)
normalize(lst2)
normalize(lst3)

# Column met_o had values 1/2. It needs to be changed to 0/1.
base_df['met_o'] = base_df['met_o'].apply(lambda x: x - 1)

  multiplier = 100/row.sum()
  base_df.at[x, col] *= multiplier


We removed some rows (by hand) because they didn't have `pid`.

### One hot encoding

In [12]:
# career
base_df = pd.concat([base_df, pd.get_dummies(base_df['career_c'], prefix='career', drop_first=True)], axis=1)
base_df.drop(['career_c'], axis=1, inplace=True)

# race and race_o
race_df = pd.get_dummies(base_df['race'], prefix='race')
race_df.drop(['race_6.0'], axis=1, inplace=True)
base_df = pd.concat([base_df, race_df], axis=1)
base_df.drop(['race'], axis=1, inplace=True)

race_o_df = pd.get_dummies(base_df['race_o'], prefix='race_o')
race_o_df.drop(['race_o_6.0'], axis=1, inplace=True)
base_df = pd.concat([base_df, race_o_df], axis=1)
base_df.drop(['race_o'], axis=1, inplace=True)

# date
date_df = pd.get_dummies(base_df['date'], prefix='date')
date_df.drop(['date_8.0'], axis=1, inplace=True)
base_df = pd.concat([base_df, date_df], axis=1)
base_df.drop(['date'], axis=1, inplace=True)
base_df

# go_out
go_out_df = pd.get_dummies(base_df['go_out'], prefix='go_out')
go_out_df.drop(['go_out_8.0'], axis=1, inplace=True)
base_df = pd.concat([base_df, go_out_df], axis=1)
base_df.drop(['go_out'], axis=1, inplace=True)

base_df

Unnamed: 0,iid,gender,pid,age_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha,...,date_5.0,date_6.0,date_7.0,go_out_1.0,go_out_2.0,go_out_3.0,go_out_4.0,go_out_5.0,go_out_6.0,go_out_7.0
0,1,0,11.0,27.0,35.0,20.0,20.0,20.0,0.0,5.0,...,0,0,1,1,0,0,0,0,0,0
1,1,0,12.0,22.0,60.0,0.0,0.0,40.0,0.0,0.0,...,0,0,1,1,0,0,0,0,0,0
2,1,0,13.0,22.0,19.0,18.0,19.0,18.0,14.0,12.0,...,0,0,1,1,0,0,0,0,0,0
3,1,0,14.0,23.0,30.0,5.0,15.0,40.0,5.0,5.0,...,0,0,1,1,0,0,0,0,0,0
4,1,0,15.0,24.0,30.0,10.0,20.0,10.0,10.0,20.0,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,1,526.0,26.0,10.0,10.0,30.0,20.0,10.0,15.0,...,0,0,0,1,0,0,0,0,0,0
8374,552,1,527.0,24.0,50.0,20.0,10.0,5.0,10.0,5.0,...,0,0,0,1,0,0,0,0,0,0
8375,552,1,528.0,29.0,40.0,10.0,30.0,10.0,10.0,,...,0,0,0,1,0,0,0,0,0,0
8376,552,1,529.0,22.0,10.0,25.0,25.0,10.0,10.0,20.0,...,0,0,0,1,0,0,0,0,0,0


### Below you can see code that prepares data for the basic matrix factorization.
Here in the base matrix we only have information about match. So we need a set of vectors, where each vector describes each date (holds both ids and information about match).

Task will requite two base matrices.
1. Matrix where men are "users" and women are "products". It will then be used to recommend women to men because matrix will say what's the predicted rating of a woman in eyes of man. Basically it will answer the question: **"How likely is that a man will like a woman?"**. Let's call this matrix/data frame **"men_like_women"**.
1. Matrix where women are "users" and men are "products". It will then be used to recommend men to women because matrix will say what's the predicted rating of a man in eyes of woman. Basically it will answer the question: **"How likely is that a woman will like a man?"**. Let's call this matrix/data frame **"women_like_men"**.

Why such analogies? It may help to understand how do this human relations task translates into recommender systems world.

These matrices will be used to train models (with matrix factorization) which will then be saved into csv files.

In [13]:
# Split into vectors, let's have two matrices as describes above.
men_like_women_data = []
women_like_men_data = []

for _, row in base_df.iterrows():
    vector = {
        'id': row['iid'],
        'pid': row['pid'],
        'decision': row['dec'],
    }
    if row['gender'] == 0:
        # it's a woman
        women_like_men_data.append(vector)
    else:
        men_like_women_data.append(vector)

men_like_women_df = pd.DataFrame(men_like_women_data)
women_like_men_df = pd.DataFrame(women_like_men_data)

print("men_like_women_df:")
print(men_like_women_df)
print("\nwomen_like_men_df:")
print(women_like_men_df)

men_like_women_df:
         id    pid  decision
0      11.0    1.0       0.0
1      11.0    2.0       0.0
2      11.0    3.0       0.0
3      11.0    4.0       0.0
4      11.0    5.0       0.0
...     ...    ...       ...
4189  552.0  526.0       0.0
4190  552.0  527.0       0.0
4191  552.0  528.0       0.0
4192  552.0  529.0       0.0
4193  552.0  530.0       0.0

[4194 rows x 3 columns]

women_like_men_df:
         id    pid  decision
0       1.0   11.0       1.0
1       1.0   12.0       1.0
2       1.0   13.0       1.0
3       1.0   14.0       1.0
4       1.0   15.0       1.0
...     ...    ...       ...
4179  530.0  548.0       0.0
4180  530.0  549.0       1.0
4181  530.0  550.0       0.0
4182  530.0  551.0       0.0
4183  530.0  552.0       1.0

[4184 rows x 3 columns]


## Let's prepare train and test data sets

### Make ids contiguous
We need to change the data. As you can see ids are not contiguous and they must be. So let's change that.

In [14]:
def proc_col(column):
    """Encodes a pandas column with continous ids."""
    unique = column.unique()
    old_to_new = {o: i for i, o in enumerate(unique)}
    return np.array([old_to_new.get(x, -1) for x in column])


def encode_data(df):
    """Encodes data with continous person and partner ids."""
    df = df.copy()
    for col_name in ["id", "pid"]:
        col = proc_col(df[col_name])
        df[col_name] = col
        df = df[df[col_name] >= 0]
        
    return df

And now use these functions to make ids contiguous.

In [15]:
print('Before:')
print(men_like_women_df)
men_like_women_df = encode_data(men_like_women_df)
print('\nAfter:')
print(men_like_women_df)

women_like_men_df = encode_data(women_like_men_df)

Before:
         id    pid  decision
0      11.0    1.0       0.0
1      11.0    2.0       0.0
2      11.0    3.0       0.0
3      11.0    4.0       0.0
4      11.0    5.0       0.0
...     ...    ...       ...
4189  552.0  526.0       0.0
4190  552.0  527.0       0.0
4191  552.0  528.0       0.0
4192  552.0  529.0       0.0
4193  552.0  530.0       0.0

[4194 rows x 3 columns]

After:
       id  pid  decision
0       0    0       0.0
1       0    1       0.0
2       0    2       0.0
3       0    3       0.0
4       0    4       0.0
...   ...  ...       ...
4189  276  270       0.0
4190  276  271       0.0
4191  276  272       0.0
4192  276  273       0.0
4193  276  274       0.0

[4184 rows x 3 columns]


### Let's split into train and test data sets
Standard sklearn function `train_test_split` doesn't do a job here because both test and train data sets should include all people. So after using `train_test_split` we need to transition some people between sets to ensure that both sets are correct. The same goes for partners.

In [16]:
from sklearn.model_selection import train_test_split as tts

def train_test_split(df, test_size=0.2):
    # Use standard train_test_split.
    df_train, df_test = tts(df, test_size=test_size)
    no_train_unique = len(df_train.id.unique())
    no_test_unique = len(df_test.id.unique())
    
    # See what people are missing in the test set.
    diff = np.setdiff1d(df_train.id.unique(), df_test.id.unique())
    for id in diff:
        # For every missing person we need to exchange them for a
        # different one so that sets still have the same number of elements.
        person_to_send_to_test = df_train.loc[df_train.id == id].iloc[0]
        person_to_send_to_train = None
        # Choose some person from the test set to send to the train set.
        ids = df_test.id.unique()
        np.random.shuffle(ids)
        for test_id in ids:
            person = df_test.loc[df_test.id == test_id]
            if len(person.index) > 1:
                person_to_send_to_train = person.iloc[0]
                break
        if person_to_send_to_train is not None:
            # Remove people that transit from old sets.
            # .name holds the id of that row in the oryginal df.
            df_train = df_train.drop(person_to_send_to_test.name)
            df_test = df_test.drop(person_to_send_to_train.name)
            # Add new people to sets.
            df_train = pd.concat([df_train, person_to_send_to_train.to_frame().T], ignore_index=True)
            df_test = pd.concat([df_test, person_to_send_to_test.to_frame().T], ignore_index=True)
        else:
            raise Exception("Couldn't find any person from people to send from the test to the train.")
        
    # See what partners are missing in the test set.
    diff = np.setdiff1d(df_train.pid.unique(), df_test.pid.unique())
    for pid in diff:
        # For every missing partner we need to exchange them for a
        # different one so that sets still have the same number of elements.
        partner_to_send_to_test = df_train.loc[df_train.pid == pid].iloc[0]
        partner_to_send_to_train = None
        
        # Choose some partner from the test set to send to the train set.
        # Need to make sure that both sets will still have all the people.
        pids = df_test.pid.unique()
        np.random.shuffle(pids)
        for test_pid in pids:
            partner = df_test.loc[df_test.pid == test_pid]
            if len(partner.index) > 1:
                # Make sure we don't remove a person completely.
                id = partner.iloc[0].id
                person_qty = len(df_test.loc[df_test.id == id].index)
                if person_qty > 1:
                    partner_to_send_to_train = partner.iloc[0]
                    break
        if partner_to_send_to_train is not None:
            # Remove partners that transit from old sets.
            df_train = df_train.drop(partner_to_send_to_test.name)
            df_test = df_test.drop(partner_to_send_to_train.name)
            # Add new people to sets.
            df_train = pd.concat([df_train, partner_to_send_to_train.to_frame().T], ignore_index=True)
            df_test = pd.concat([df_test, partner_to_send_to_test.to_frame().T], ignore_index=True)
        else:
            raise Exception("Couldn't find any partner from partners to send from the test to the train.")
                    
    df_train = df_train.sort_values(by='id')
    df_test = df_test.sort_values(by='id')
    return df_train.reset_index(drop=True), df_test.sort_values(by='id').reset_index(drop=True)

### And now just get correct data

In [17]:
men_like_women_train_df, men_like_women_test_df = train_test_split(men_like_women_df, test_size=0.2)
men_like_women_no_men = len(men_like_women_train_df.id.unique())
men_like_women_no_women = len(men_like_women_train_df.pid.unique())

women_like_men_train_df, women_like_men_test_df = train_test_split(women_like_men_df, test_size=0.2)
women_like_men_no_women = len(women_like_men_train_df.id.unique())
women_like_men_no_men = len(women_like_men_train_df.pid.unique())

## Let's create matrix factorization models

We will create and train several base MF (matrix factorization) models and for each of them let's do a cross validation to learn the best hyperparameters and parameters. Then we will compare the results and choose the best model.

Some general explanations for models:
* Models are train on only one batch because our data set is rather small.

Good reading resource: https://towardsdatascience.com/weight-initialization-techniques-in-neural-networks-26c649eb3b78

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# dev = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
dev = torch.device('cuda')
dev

device(type='cuda')

### Matrix factorization

In [20]:
class MatrixFactorization(nn.Module):
    def __init__(self, num_people, num_partners, weights=(0, 1), bias=(-0.01, 0.01), emb_size=100):
        super(MatrixFactorization, self).__init__()
        self.person_emb = nn.Embedding(num_people, emb_size)
        self.person_bias = nn.Embedding(num_people, 1)
        self.partner_emb = nn.Embedding(num_partners, emb_size)
        self.parnter_bias = nn.Embedding(num_partners, 1)
        self.person_emb.weight.data.uniform_(weights[0], weights[1])
        self.partner_emb.weight.data.uniform_(weights[0], weights[1])
        self.person_bias.weight.data.uniform_(bias[0], bias[1])
        self.parnter_bias.weight.data.uniform_(bias[0], bias[1])
            
    def forward(self, u, v):
        bias_u = self.person_bias(u).squeeze()
        bias_v = self.parnter_bias(v).squeeze()
        u = self.person_emb(u)
        v = self.partner_emb(v)
        # calculate dot product
        # u*v is a element wise vector multiplication
        return torch.sigmoid((u*v).sum(1) + bias_u + bias_v)
    

# Example small models demonstrating weights
example_model = MatrixFactorization(10, 10, bias=(0, 0), emb_size=3)
print("Model weights are:\n")
for p in example_model.parameters():
    print(p)
print('\n\n', '='*20)

Model weights are:

Parameter containing:
tensor([[0.0762, 0.3123, 0.5546],
        [0.1057, 0.1367, 0.5152],
        [0.1282, 0.2517, 0.2468],
        [0.8584, 0.1111, 0.6178],
        [0.8935, 0.5006, 0.5250],
        [0.0151, 0.9112, 0.3072],
        [0.7003, 0.3894, 0.4643],
        [0.4291, 0.0807, 0.4183],
        [0.3528, 0.5130, 0.0663],
        [0.4358, 0.0360, 0.5397]], requires_grad=True)
Parameter containing:
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], requires_grad=True)
Parameter containing:
tensor([[0.0843, 0.8679, 0.9682],
        [0.2266, 0.3234, 0.7096],
        [0.3889, 0.6726, 0.5233],
        [0.5458, 0.2156, 0.0217],
        [0.7662, 0.8410, 0.6444],
        [0.2075, 0.1152, 0.2188],
        [0.9869, 0.1833, 0.8718],
        [0.8725, 0.1490, 0.9325],
        [0.2922, 0.3397, 0.3623],
        [0.1246, 0.9288, 0.3558]], requires_grad=True)
Parameter containing:
tensor([[

### Training and testing functions are below

In [21]:
def MF_test(model, df_test, verbose=False):
    model.eval()
    # .to(dev) puts code on either gpu or cpu.
    people = torch.LongTensor(df_test.id.values).to(dev)
    partners = torch.LongTensor(df_test.pid.values).to(dev)
    decision = torch.FloatTensor(df_test.decision.values).to(dev)
    y_hat = model(people, partners)
    loss = F.mse_loss(y_hat, decision)
    if verbose:
        print('test loss %.3f ' % loss.item())
    return loss.item()


# Default values assigned below are ones that I found online.
# Cross validadtion will be done later but it's good to have some defaults.
def MF_train(model, df_train, epochs=100, learning_rate=0.01, weight_decay=1e-5, verbose=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    model.train()
    
    for epoch in range(epochs):
        # From numpy to PyTorch tensors.
        # .to(dev) puts code on either gpu or cpu.
        people = torch.LongTensor(df_train.id.values).to(dev)
        partners = torch.LongTensor(df_train.pid.values).to(dev)
        decision = torch.FloatTensor(df_train.decision.values).to(dev)
        
        # calls forward method of the model
        y_hat = model(people, partners)
        # Using mean squared errors loss function
        loss = F.mse_loss(y_hat, decision)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if verbose and epoch % 100 == 0: 
            print(loss.item())

# Let's finally train models and choose the best one

## Cross validation on each model

In [35]:
def cross_valid_model(train_df, test_df, num_people, num_partners, bias, weights, emb_size):
    """Function to choose the best hyperparameters for a model."""
    min_loss = np.inf
    best_settings = None

    for (epochs, wd, lr) in product(epochs_poss, weight_decay_poss, learning_rate_poss):
        model = MatrixFactorization(num_people, num_partners, weights=weights, bias=bias, emb_size=emb_size).to(dev)
        MF_train(model, train_df, epochs=epochs, learning_rate=lr, weight_decay=wd)
        test_loss = MF_test(model, test_df)
        if (test_loss < min_loss) or (test_loss == min_loss and best_settings is not None and epochs < best_settings['epochs']):
            min_loss = test_loss
            best_settings = {'epochs': epochs, 'weight_decay': wd, 'learning_rate': lr}
    return min_loss, best_settings


def cross_valid(
    num_people, num_partners, train_df, test_df,
    epochs_poss,
    weight_decay_poss,
    learning_rate_poss,
    weights_poss,
    bias_poss,
    emb_size_poss,
    verbose=False, file=None
):
    """
    Function to choose the best model.
    If arg file is provided (should be a path), statistics will be written to the given file.
    """
    min_loss = np.inf
    best_settings = None
    models = []
    start_time = time()
    
    for (weights, bias, emb_size) in tqdm(product(weights_poss, bias_poss, emb_size_poss)):            
        model_min_loss, model_best_settings = cross_valid_model(train_df, test_df, num_people, num_partners, bias, weights, emb_size)
        
        curr_settings = {'min_loss': model_min_loss, 'weights': weights, 'bias': bias, 'emb_size': emb_size}
        model_best_settings = {**curr_settings, **model_best_settings}
        models.append(model_best_settings)
            
        if verbose:
            print(f'best_settings: {model_best_settings}')
            print(f'{"="*50}')
        
        if (model_min_loss < min_loss) or (model_min_loss == min_loss and best_settings and emb_size < best_settings['emb_size']):
            min_loss = model_min_loss
            best_settings = model_best_settings
    
    end_time = time()
        
    if verbose:
        print(f'\n{"?"*50}')
        print('THE WINNER IS:')
        print(f'best min_loss: {min_loss}')
        print(f'best best_settings: {best_settings}')
        print(f'Cross validation took {end_time - start_time}')
    
    if file:
        models.sort(key=lambda x: x['min_loss'])
        with open(file, 'wb') as stats_file:
            pickle.dump(models, stats_file)
        
    return min_loss, best_settings

## Let's first train women_like_man

In [36]:
MF_CROSS_VALID_RES_FILE = './women_like_men_MF_cross_vaild_results.txt'

In [39]:
# epochs_poss = list(range(1400, 1700, 100))
epochs_poss = [1400]
weight_decay_poss = [1e-7, 1e-6]
learning_rate_poss = [0.001]
weights_poss = [(0, 1)]
bias_poss = [(-1, 1)]
emb_size_poss = list(range(1900, 2400, 100))

min_loss, best_settings = cross_valid(
    women_like_men_no_women, women_like_men_no_men,
    women_like_men_train_df, women_like_men_test_df,
    epochs_poss=epochs_poss,
    weight_decay_poss=weight_decay_poss,
    learning_rate_poss=learning_rate_poss,
    weights_poss=weights_poss,
    bias_poss=bias_poss,
    emb_size_poss=emb_size_poss,
    file=MF_CROSS_VALID_RES_FILE,
)

Below you can see results of the above cross valid. Models are sorted.

In [27]:
with open(MF_CROSS_VALID_RES_FILE, 'rb') as file:
    mf_cross_valid_results = pickle.load(file)
            
# sorting
mf_cross_valid_results.sort(key=lambda x: x['min_loss'])
mf_cv_res_pd = pd.DataFrame(mf_cross_valid_results)

### Top 10 best models

In [42]:
mf_cv_res_pd.head(10)

Unnamed: 0,min_loss,weights,bias,emb_size,epochs,weight_decay,learning_rate
0,0.17977,"(-1, 1)","(0, 0)",1400,1400,0.0001,0.001
1,0.180538,"(0, 1)","(-1, 1)",1000,1200,1e-05,0.001
2,0.180655,"(-1, 1)","(0, 0)",1200,1600,0.0001,0.001
3,0.180667,"(-1, 1)","(0, 0)",1100,1500,0.0001,0.001
4,0.180749,"(-1, 1)","(0, 0)",1000,1400,0.0001,0.001
5,0.180875,"(-1, 1)","(0, 0)",1300,1500,0.0001,0.001
6,0.181202,"(0, 1)","(-1, 1)",1100,1300,1e-05,0.001
7,0.181699,"(0, 1)","(-1, 1)",1200,1400,0.0001,0.001
8,0.181801,"(-1, 1)","(-1, 1)",1100,1500,0.0001,0.001
9,0.182414,"(0, 1)","(-1, 1)",1300,1300,1e-05,0.001


After seeing the results above we found better set of parameters and hyperparameters (by hand because we ran out of time). So the winner is:

* weights: (0, 1)
* bias: (-1, 1)
* emb_size: 2000
* epochs: 1400
* learning_rate: 0.001
* weight_decay: 1e-6

whose loss is 0.1758.

To clarify, we know that this loss is the final correct loss, because it's overfitted, because test set = validation set. We did't have the time to create a custom methods for kfold. But don't worry, final usage of that will be cross validated with kfold which should give us reliable accuracy.

## Let's create logistic regression
Now, to create regression models, both used before and after, we will modify data set in such ways:

1. Modified income column and $..._5_1$ columns - here the missing values are filled with the mean of the columns.
2. With modified income column - here the missing values are filled with the mean/median of the column and removed $..._5_1$ columns.
3. With modified $..._5_1$ columns - here the missing values are filled with the mean/median of the columns and removed income column.
4. Removed income column and $..._5_1$ columns.

#### Later we will aply the logic to combine this with MF.

It's going to be used:
1. Before matrix factorization - output of this model will be used as a input to MF.
2. After matrix factorization - output of MF will be used as input in this model, as one of the features.

### Data preprocessing for logistic regression

In [22]:
def fill_data(df, filling_data):
    """Function fills missing values with mean of the column"""
    for index, row in df.iterrows():
        for column in df.columns:
            if column not in ['met_o', 'goal'] and np.isnan(row[column]):
                df.at[index, column] = filling_data[column]
            elif column == 'goal' and np.isnan(row['goal']):
                df.at[index, column] = 1
            elif column == 'met_o' and (np.isnan(row['met_o']) or row['met_o'] != 0):
                df.at[index, column] = 1
    return df


def fill_with_mean(df):
    col_means = df.mean()
    return fill_data(df, col_means)


def fill_with_median(df):
    col_medians = df.median()
    return fill_data(df, col_medians)


women_like_men_LG_df = base_df.copy()
women_like_men_LG_df = women_like_men_LG_df[women_like_men_LG_df['gender'] == 1]
women_like_men_LG_df.drop(columns=['gender', 'iid', 'pid', 'dec'], inplace=True)
women_like_men_LG_df = women_like_men_LG_df.reset_index(drop=True)

poss_LG_data_sets = []

### Option 1
Modified income column and  $..._5_1$  columns - here the missing values are filled with the mean of the columns.

In [90]:
LG_no1_mean_df = base_df.copy()
LG_no1_mean_df = fill_with_mean(LG_no1_mean_df)
poss_LG_data_sets.append(LG_no1_mean_df)

LG_no1_median_df = base_df.copy()
LG_no1_median_df = fill_with_median(LG_no1_median_df)
poss_LG_data_sets.append(LG_no1_median_df)

### Option 2
With modified income column - here the missing values are filled with the mean/median of the column and removed  $..._5_1$  columns.

In [91]:
cols_to_drop = ['attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1']

LG_no2_mean_df = base_df.copy()
LG_no2_mean_df.drop(columns=cols_to_drop, inplace=True)
LG_no2_mean_df = fill_with_mean(LG_no2_mean_df)
poss_LG_data_sets.append(LG_no2_mean_df)

LG_no2_median_df = base_df.copy()
LG_no2_median_df.drop(columns=cols_to_drop, inplace=True)
LG_no2_median_df = fill_with_median(LG_no2_median_df)
poss_LG_data_sets.append(LG_no2_median_df)

### Option 3
With modified  $..._5_1$  columns - here the missing values are filled with the mean/median of the columns and removed income column.

In [92]:
cols_to_drop = ['income']

LG_no3_mean_df = base_df.copy()
LG_no3_mean_df.drop(columns=cols_to_drop, inplace=True)
LG_no3_mean_df = fill_with_mean(LG_no3_mean_df)
poss_LG_data_sets.append(LG_no3_mean_df)

LG_no3_median_df = base_df.copy()
LG_no3_median_df.drop(columns=cols_to_drop, inplace=True)
LG_no3_median_df = fill_with_median(LG_no3_median_df)
poss_LG_data_sets.append(LG_no3_median_df)

### Option 4
Removed income column and $..._5_1$ columns.

In [93]:
cols_to_drop = ['income', 'attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1']

LG_no4_df = base_df.copy()
LG_no4_df.drop(columns=cols_to_drop, inplace=True)
poss_LG_data_sets.append(LG_no4_df)

### Definition of logistic regression

In [23]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        #(out)input_dim is size of our (out)input data
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        pred_y = torch.sigmoid(self.linear(x))
        return torch.squeeze(pred_y, 1)

In [24]:
def LR_train(model, x, y, epochs=100, learning_rate=0.01, weight_decay=1e-5, verbose=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss()
    model.train()
    
    for epoch in range(epochs):
        # From numpy to PyTorch tensors.
        # .to(dev) puts code on either gpu or cpu.
        X = torch.FloatTensor(x).to(dev)
        Y = torch.FloatTensor(y).to(dev)
        
        # calls forward method of the model
        y_hat = model(X)
        # Using mean squared errors loss function
        loss = criterion(y_hat, Y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if verbose and epoch % 10 == 0: 
            print(loss.item())
            
def LR_test(model, x, y, verbose=False):
    criterion = torch.nn.CrossEntropyLoss()
    model.eval()
    # .to(dev) puts code on either gpu or cpu.
    X = torch.FloatTensor(x).to(dev)
    Y = torch.FloatTensor(y).to(dev)
    y_hat = model(X)
    # Using mean squared errors loss function
    loss = criterion(y_hat, Y)
    if verbose:
        print('test loss %.3f ' % loss.item())
    return loss.item()

### Let's find the best model using Cross validation:

In [25]:
## Preprocessing data before running cross validation to find optimal parameters for training MF for new date decisionepochs_poss = list(range(100, 510, 10))
weight_decay_poss = [0.0, 1e-6, 1e-5, 1e-4, 0.0001, 0.001, 0.01, 0.1]
learning_rate_poss = [1e-6, 1e-5, 1e-4, 0.0001, 0.001, 0.01, 0.1]

def cross_valid_regression(model, train_x, train_y, test_x, test_y, verbose=False):
    """Function to choose the best hyperparameters for a model."""
    min_loss = np.inf
    best_settings = None

    for (epochs, wd, lr) in tqdm(product(epochs_poss, weight_decay_poss, learning_rate_poss)):
        train(model, train_x, train_y, epochs=epochs, learning_rate=lr, weight_decay=wd)
        test_loss = test(model, test_x, test_y)
        if (test_loss < min_loss) or (test_loss == min_loss and best_settings is not None and epochs < best_settings['epochs']):
            min_loss = test_loss
            best_settings = {'epochs': epochs, 'weight_decay': wd, 'learning_rate_poss': lr}
    if verbose:
        print('min loss %.3f' % min_loss)
        print('best settings are', best_settings)
    return min_loss, best_settings


In [42]:
# train_sample defines size of train set
train_sample = 0.8
# our output is only one number
output_dim = 1

for dataset in poss_LG_data_sets:
    cross_valid_regression()

# #simple division into training and testing sets
# msk = np.random.rand(len(regression_df_1)) < train_sample
# train_set = regression_df_1[msk]
# test_set = regression_df_1[~msk]

# #we want to predict 'dec' based on other attributes. We dont want 'iid' and 'pid' columns
# #to have any impact on results
# train_x = train_set.drop(['dec', 'iid', 'pid', 'gender'], axis=1).values
# train_y = train_set['dec'].values 

# test_x = test_set.drop(['dec', 'iid', 'pid'], axis=1).values
# test_y = test_set['dec'].values 

# # input_dim = train_x.shape[1]
# # model = LogisticRegression(input_dim, output_dim).to(dev)
# # print(cross_valid_regression(model, train_x, train_y, test_x, test_y))

[      iid  gender    pid  age_o  pf_o_att  pf_o_sin  pf_o_int  pf_o_fun  \
 0       1       0   11.0   27.0      35.0      20.0      20.0      20.0   
 1       1       0   12.0   22.0      60.0       0.0       0.0      40.0   
 2       1       0   13.0   22.0      19.0      18.0      19.0      18.0   
 3       1       0   14.0   23.0      30.0       5.0      15.0      40.0   
 4       1       0   15.0   24.0      30.0      10.0      20.0      10.0   
 ...   ...     ...    ...    ...       ...       ...       ...       ...   
 8373  552       1  526.0   26.0      10.0      10.0      30.0      20.0   
 8374  552       1  527.0   24.0      50.0      20.0      10.0       5.0   
 8375  552       1  528.0   29.0      40.0      10.0      30.0      10.0   
 8376  552       1  529.0   22.0      10.0      25.0      25.0      10.0   
 8377  552       1  530.0   22.0      20.0      20.0      10.0      15.0   
 
       pf_o_amb  pf_o_sha  ...  date_5.0  date_6.0  date_7.0  go_out_1.0  \
 0        

In [26]:
women_like_men_CV_df = base_df.copy()
women_like_men_CV_df.drop(columns=['income'], inplace=True)
women_like_men_CV_df = fill_with_mean(women_like_men_CV_df)
women_like_men_CV_df.rename(columns={'iid':'id'},inplace=True)
men_like_women_data_dates = []
women_like_men_data_dates = []

for _, row in women_like_men_CV_df.iterrows():
    if row['gender']:
        # it's a woman
        men_like_women_data_dates.append(row)
    else:
        women_like_men_data_dates.append(row)
        
men_like_women_df_dates = encode_data(pd.DataFrame(men_like_women_data_dates))
women_like_men_df_dates = encode_data(pd.DataFrame(women_like_men_data_dates))

nr_of_dates = len(men_like_women_df_dates)

print("men_like_women_df:")
print(men_like_women_df_dates)
print("\nwomen_like_men_df:")
print(women_like_men_df_dates)

men_like_women_df:
       id  gender  pid  age_o  pf_o_att  pf_o_sin  pf_o_int  pf_o_fun  \
100     0     1.0    0   21.0      15.0      20.0      20.0      15.0   
101     0     1.0    1   24.0      45.0       5.0      25.0      20.0   
102     0     1.0    2   25.0      35.0      10.0      35.0      10.0   
103     0     1.0    3   23.0      20.0      20.0      20.0      20.0   
104     0     1.0    4   21.0      20.0       5.0      25.0      25.0   
...   ...     ...  ...    ...       ...       ...       ...       ...   
8373  276     1.0  270   26.0      10.0      10.0      30.0      20.0   
8374  276     1.0  271   24.0      50.0      20.0      10.0       5.0   
8375  276     1.0  272   29.0      40.0      10.0      30.0      10.0   
8376  276     1.0  273   22.0      10.0      25.0      25.0      10.0   
8377  276     1.0  274   22.0      20.0      20.0      10.0      15.0   

      pf_o_amb  pf_o_sha  ...  date_5.0  date_6.0  date_7.0  go_out_1.0  \
100       15.0  15.00000  ...

### Because of the data set being fairly small, we are going to be doing a Leave One Out Cross Validation to get parameters for the Matrix Factorization model training on new date.


**To first train MF and LR models we use parameters computed by cross validation before**

In [41]:
def leave_k_out(MF_df, df_dates_user, df_dates_item, user_id):
    ###Extracts users whith user_id from user_id and then for every user extracts one item for it
    ###----------------------------------------Extracting users-----------------------###
    ###Leave one user(of user_id) and every item we have his decision on out of the MF_df
    train_data_no_users = []
    test_data = []
    
    for _,row in MF_df.iterrows():
        if row["id"] in user_id:
            test_data.append(row)
        else:
            train_data_no_users.append(row)
    
    ###Leave one user(of user_id) and every item we have his decision on out of the LR_df
    train_dates_user = []
    test_dates = []
    
    for _,row in df_dates_user.iterrows():
        if row["id"] in user_id:
            test_dates.append(row)
        else:
            train_dates_user.append(row)
    
    train_dates_items = []
    
    for _,row in df_dates_item.iterrows():
        if row['pid'] not in user_id:
            train_dates_items.append(row)
            
    ###------------------------------------Pair up users with items---------------###
    ###Making test data and set have only one exclusive pair of (user,item)
    item_id = []
    test_data_final = []
    
    for user in user_id:
        for row in test_data:
            if row["id"] == user:
                item_id.append(row["pid"])
                test_data_final.append(row)
                break
            
    
    test_dates_final = []
    for dec_row in test_data_final:
        user,item = dec_row[['id','pid']]
        for row in test_dates:
            if row['id'] == user and row['pid'] == item:
                test_dates_final.append(row)
                break
        
    ###--------------------------------------Extracting items-------------------------###
    ### Training dataframe for base MF without user(of user_id), and his decisions upon which we can test the results.

    train_data_final = []
    for row in train_data_no_users:
        if row["pid"] not in item_id:
            train_data_final.append(row)
    
    train_dates_user_final= []
    for row in train_dates_user:
        if row["pid"] not in item_id:
            train_dates_user_final.append(row)
    
    train_dates_items_final= []
    for row in train_dates_items:
        if row["id"] not in item_id:
            train_dates_items_final.append(row)
    
    
    train_df = pd.DataFrame(train_data_final)
    
    ### Training dataframe for base LR without user(of user_id), and his dates upon which we can predict his decision
    ###     and test them against test_df.
    train_dates_user_final = pd.DataFrame(train_dates_user_final)
    train_dates_items_final = pd.DataFrame(train_dates_items_final)
    return train_df, test_data_final, train_dates_user_final, train_dates_items_final, test_dates_final

def new_date_test(MF_model, dec_Matrix, df_dates_men, df_dates_women, new_date, nr_men, nr_women, dec,sim_fun = get_similarity,
                 epochs=100, learning_rate=0.001, weight_decay=0.001,K=0.1,P=0.1):
    
    Y = torch.FloatTensor([dec]).to(dev)
    y_hat = get_new_date_dec(MF_model, dec_Matrix, df_dates_men, df_dates_women, new_date,nr_men= nr_men,nr_women= nr_women,
                             epochs=epochs, learning_rate=learning_rate, weight_decay=weight_decay,sim_fun =sim_fun,K=K,P=P,CV_mode=True)
    
    loss = F.mse_loss(y_hat, Y)
    return loss.item()

# epochs_poss = list(range(60, 350, 10))
# weight_decay_poss = [0.0001, 0.001, 0.01, 0.1, 1.0]
# learning_rate_poss = [0.0001, 0.001, 0.01, 0.1, 1.0]
K_poss = [0.05,0.1]
P_poss = [0.05,0.1]
sim_funs = [get_similarity, get_similarity_cosine]


### Cross valid for women_like_men
def KFoldCrossValidation(MF_df, df_dates_user , df_dates_item, input_dim = 105, output_dim = 1,K = 10,verbose=False):
    nr_users= 274
    nr_items= 277
    
    min_overall_loss = np.inf
    best_overall_settings = None
    skip_first = True
    
    for (N, P, fun) in tqdm(product(K_poss, P_poss, sim_funs)):
        if skip_first:
            skip_first = False
            continue
        nr_users= 274
        nr_items= 277
        
        min_user_loss = np.inf
        best_settings = None
        
        indices = [idx for idx in range(nr_users)]
        np.random.shuffle(indices)
        user_folds = np.array_split(indices,K)
        
        
        for user_batch in tqdm(user_folds):
            
            MF_train_df, dec_test_df, train_dates_user,train_dates_items, dates_test_df = leave_k_out(MF_df, df_dates_user, df_dates_item, user_batch)
            MF_train_df.rename(columns={'dec':'decision'},inplace=True)
            MF_train_df = encode_data(MF_train_df)
            nr_users = len(MF_train_df.id.unique())
            nr_items = len(MF_train_df.pid.unique())
            
            train_dates_user = encode_data(train_dates_user)
            train_dates_items = encode_data(train_dates_items)

            MF_model = MatrixFactorization(nr_users, nr_items, weights=(0, 1), bias=(-1,1), emb_size=2000).to(dev)            
            
            MF_train(MF_model, MF_train_df, epochs = 1400, learning_rate = 0.001, weight_decay= 1e-6) ## Change train function

            MF_model.eval()
            
            fold_loss = 0.    

            dec_Matrix = np.array([[get_mf_dec(MF_model,user_id,item_id) for user_id in range(nr_users)]
                                   for item_id in range(nr_items)])
            
            for dec_row, new_date_row in zip(dec_test_df,dates_test_df):
                fold_loss += new_date_test(MF_model, dec_Matrix, train_dates_items, train_dates_user, new_date_row,
                                         nr_items, nr_users, dec_row["decision"],sim_fun = fun,
                                          epochs=1400, learning_rate=0.001, weight_decay=1e-6,K=N,P=P)
            fold_loss /= len(user_batch)
            
            if (fold_loss < min_user_loss) or (fold_loss == min_user_loss and best_user_settings is not None):
                min_user_loss = fold_loss
                best_user_settings = {'% of similar': N, '% of decisions': P, 'sim_function': fun.__name__}
                
        if (min_user_loss < min_overall_loss) or (min_user_loss == min_overall_loss and best_overall_settings is not None):
            min_overall_loss = min_user_loss
            best_overall_settings = best_user_settings
            
        if verbose:
            print('min loss %.3f' % min_overall_loss)
            print('best settings are', best_overall_settings)
        
    return min_overall_loss, best_overall_settings

In [42]:
%%time
min_overall_loss, best_overall_settings = KFoldCrossValidation(women_like_men_df, women_like_men_df_dates, men_like_women_df_dates,K=5,verbose=True)

0it [00:00, ?it/s]
  0%|                                                                                                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████████████████████                                                                                                                        | 1/5 [07:16<29:05, 436.31s/it][A
 40%|████████████████████████████████████████████████████████████                                                                                          | 2/5 [14:25<21:37, 432.35s/it][A
 60%|██████████████████████████████████████████████████████████████████████████████████████████                                                            | 3/5 [21:41<14:27, 433.65s/it][A
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4/5 [28:51<07:12, 432.50s/it][A
100%|██████████████████████████

min loss 0.479
best settings are {'% of similar': 0.05, '% of decisions': 0.05, 'sim_function': 'get_similarity_cosine'}



 20%|██████████████████████████████                                                                                                                        | 1/5 [07:54<31:36, 474.16s/it][A
 40%|████████████████████████████████████████████████████████████                                                                                          | 2/5 [15:34<23:17, 465.94s/it][A
 60%|██████████████████████████████████████████████████████████████████████████████████████████                                                            | 3/5 [22:47<15:02, 451.14s/it][A
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4/5 [29:48<07:19, 439.11s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [36:41<00:00, 440.25s/it][A
3it [1:12:40, 1546.94s/it]
  0%|                 

min loss 0.468
best settings are {'% of similar': 0.05, '% of decisions': 0.1, 'sim_function': 'get_similarity'}



 20%|██████████████████████████████                                                                                                                        | 1/5 [06:28<25:55, 388.87s/it][A
 40%|████████████████████████████████████████████████████████████                                                                                          | 2/5 [12:58<19:27, 389.17s/it][A
 60%|██████████████████████████████████████████████████████████████████████████████████████████                                                            | 3/5 [19:23<12:54, 387.37s/it][A
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4/5 [25:53<06:28, 388.39s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [32:17<00:00, 387.40s/it][A
4it [1:44:57, 1692.49s/it]
  0%|                 

min loss 0.463
best settings are {'% of similar': 0.05, '% of decisions': 0.1, 'sim_function': 'get_similarity_cosine'}



 20%|██████████████████████████████                                                                                                                        | 1/5 [06:54<27:36, 414.18s/it][A
 40%|████████████████████████████████████████████████████████████                                                                                          | 2/5 [13:42<20:32, 410.75s/it][A
 60%|██████████████████████████████████████████████████████████████████████████████████████████                                                            | 3/5 [20:39<13:46, 413.47s/it][A
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4/5 [27:42<06:57, 417.30s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [34:31<00:00, 414.26s/it][A
5it [2:19:28, 1824.20s/it]
  0%|                 

min loss 0.463
best settings are {'% of similar': 0.05, '% of decisions': 0.1, 'sim_function': 'get_similarity_cosine'}



 20%|██████████████████████████████                                                                                                                        | 1/5 [06:28<25:53, 388.29s/it][A
 40%|████████████████████████████████████████████████████████████                                                                                          | 2/5 [12:53<19:19, 386.47s/it][A
 60%|██████████████████████████████████████████████████████████████████████████████████████████                                                            | 3/5 [19:23<12:56, 388.06s/it][A
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4/5 [25:44<06:25, 385.46s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [32:06<00:00, 385.40s/it][A
6it [2:51:35, 1858.31s/it]
  0%|                 

min loss 0.463
best settings are {'% of similar': 0.05, '% of decisions': 0.1, 'sim_function': 'get_similarity_cosine'}



 20%|██████████████████████████████                                                                                                                        | 1/5 [06:56<27:46, 416.59s/it][A
 40%|████████████████████████████████████████████████████████████                                                                                          | 2/5 [13:52<20:48, 416.22s/it][A
 60%|██████████████████████████████████████████████████████████████████████████████████████████                                                            | 3/5 [20:49<13:52, 416.48s/it][A
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4/5 [27:43<06:55, 415.63s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [34:28<00:00, 413.69s/it][A
7it [3:26:04, 1925.90s/it]
  0%|                 

min loss 0.463
best settings are {'% of similar': 0.05, '% of decisions': 0.1, 'sim_function': 'get_similarity_cosine'}



 20%|██████████████████████████████                                                                                                                        | 1/5 [06:26<25:47, 386.96s/it][A
 40%|████████████████████████████████████████████████████████████                                                                                          | 2/5 [12:56<19:26, 388.74s/it][A
 60%|██████████████████████████████████████████████████████████████████████████████████████████                                                            | 3/5 [19:25<12:57, 388.75s/it][A
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 4/5 [25:50<06:27, 387.16s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [32:08<00:00, 385.71s/it][A
8it [3:58:12, 1786.59s/it]

min loss 0.463
best settings are {'% of similar': 0.05, '% of decisions': 0.1, 'sim_function': 'get_similarity_cosine'}
Wall time: 3h 58min 12s





### 5 Fold Cross Validation on data set where we treat women as users and men as items:


**Gave best results ie. 46.3%  for :**
* **Choosing similarity rate between users and items using Cosine Similarity between their according date attributes**
* **Then computing average decisions for 5% of most similar users/items**
* **Finally taking randomly selected 10% of the average decisions and using it as a MF baseline.**

***Given two vectors x,y where $x_i$ gives user's x decision on partner i (pid = i), computes similiraty measure i.e. on how many positions does vector x match vector y, excluding "-1", the default value for no logistic regression output provided***

In [29]:
def get_similarity(x,y):
    assert len(x) == len(y) , "x should be same size as y"
    return pearsonr(torch.FloatTensor([x]).to(dev),torch.FloatTensor([y]).to(dev))

In [30]:
def get_similarity_cosine(x,y):
    assert len(x) == len(y) , "x should be same size as y"
    return torch.nn.functional.cosine_similarity(torch.FloatTensor([x]).to(dev),torch.FloatTensor([y]).to(dev))


***Given matrix A(which has results only for one gender) where $a_{ij}$ represents output of logistic regression from a date between person(iid = i) and person(pid=j), creates vector C, where each position "i" represents similiarity between input vector person and other person "i" of the same gender***

In [31]:
def split_date(date_row):
    subject_feats = ["age","race_4.0","race_3.0","race_2.0","race_1.0",
                     "race_o_4.0","race_o_3.0","race_o_2.0","race_o_1.0","imprace","imprelig","goal","date_1.0","date_2.0",
                     "date_3.0","date_4.0","date_5.0","date_6.0","date_7.0","go_out_7.0","go_out_6.0","go_out_5.0","go_out_4.0","go_out_3.0","go_out_2.0",
                     "go_out_1.0","sports","tvsports","exercise","dining","museums","art","hiking","gaming","clubbing","reading","tv",
                     "theater","movies","concerts","music","shopping","yoga","attr1_1","sinc1_1","intel1_1","fun1_1","amb1_1","shar1_1","attr2_1",
                     "sinc2_1","intel2_1","fun2_1","amb2_1","shar2_1","attr3_1","sinc3_1","fun3_1","intel3_1","amb3_1","attr5_1","sinc5_1","intel5_1",
                     "fun5_1","amb5_1","career_2.0","career_3.0","career_4.0","career_5.0","career_6.0","career_7.0","career_8.0","career_9.0","career_10.0",
                     "career_11.0","career_12.0","career_13.0","career_14.0","career_15.0","career_16.0","career_17.0","career_18.0"]
    
    subject = date_row.drop(["id","pid","gender",'dec_o','dec',"age_o","pf_o_att","pf_o_sin","pf_o_int","pf_o_fun","pf_o_amb","pf_o_sha","attr_o",
                             "sinc_o","intel_o","fun_o","amb_o","shar_o","like_o","prob_o","met_o","attr4_1",
                             "sinc4_1","intel4_1","fun4_1","amb4_1","shar4_1"])
    
    partner = date_row.drop(subject_feats + ['gender', 'id','dec_o', 'pid'])
    return (date_row['id'],subject),(date_row['pid'],partner)

def make_date(subject_info, partner_info):
    return torch.FloatTensor([np.hstack((partner_info,subject_info))]).to(dev)

In [32]:
def get_mf_dec(mf_model, user_id, item_id):
    return mf_model(torch.LongTensor([user_id]).to(dev),torch.LongTensor([item_id]).to(dev)).cpu().detach().numpy()[0]

In [33]:
def update_params(MF_model, weights=(0,0.2),bias=(0,1), user_mode=False):
    if user_mode:
        ###Adding new row to user_emb
        new_emb_row = torch.FloatTensor(1,MF_model.person_emb.embedding_dim).to(dev)
        new_emb_row.data.uniform_(weights[0], weights[1])
        MF_model.person_emb.weight = nn.Parameter(torch.cat((MF_model.person_emb.weight, new_emb_row))).to(dev)

        ###Adding new bias row to user_bias
        new_bias_row = torch.FloatTensor(1,MF_model.person_bias.embedding_dim).to(dev)
        new_bias_row.data.uniform_(bias[0], bias[1])
        MF_model.person_bias.weight = nn.Parameter(torch.cat((MF_model.person_bias.weight, new_bias_row))).to(dev)
    else:
        ###Adding new row to item_emb
        new_emb_row = torch.FloatTensor(1,MF_model.partner_emb.embedding_dim).to(dev)
        new_emb_row.data.uniform_(weights[0], weights[1])
        MF_model.partner_emb.weight = nn.Parameter(torch.cat((MF_model.partner_emb.weight, new_emb_row))).to(dev)

        ###Adding new bias row to item_bias
        new_bias_row = torch.FloatTensor(1,MF_model.parnter_bias.embedding_dim).to(dev)
        new_bias_row.data.uniform_(bias[0], bias[1])
        MF_model.parnter_bias.weight = nn.Parameter(torch.cat((MF_model.parnter_bias.weight, new_bias_row))).to(dev)


def train_one_row(MF_model, df_train,
                  epochs=100, learning_rate=0.001, weight_decay=0.001,weights=(0,0.2),bias=(0,1),user_mode=False):
    ###Creating a mask and registering a hook to zero out gradients of every embedding 
    ###    but the newly added ones, so that mf_model retrains only those.
    item_mask_bias = torch.zeros_like(MF_model.parnter_bias.weight)
    item_bias_hook = MF_model.parnter_bias.weight.register_hook(lambda grad: grad*item_mask_bias)
    
    user_mask_bias = torch.zeros_like(MF_model.person_bias.weight)
    user_bias_hook = MF_model.person_bias.weight.register_hook(lambda grad: grad*user_mask_bias)
    
    item_mask_emb = torch.zeros_like(MF_model.partner_emb.weight)
    item_emb_hook = MF_model.partner_emb.weight.register_hook(lambda grad: grad*item_mask_emb)
    
    user_mask_emb = torch.zeros_like(MF_model.person_emb.weight)
    user_emb_hook = MF_model.person_emb.weight.register_hook(lambda grad: grad*user_mask_emb)
    
    if user_mode:
        ### Setting to train only new_user's parameters
        user_mask_emb[-1] = 1.
        user_mask_bias[-1] = 1.
    else:
        ### Setting to train only new_item's parameters
        item_mask_emb[-1] = 1.
        item_mask_bias[-1] = 1.
    
    MF_model.train()
    MF_train(MF_model,df_train,epochs=100, learning_rate=0.001, weight_decay=0.001)
    MF_model.eval()
    
    ###Remove the hooks
    item_bias_hook.remove()
    item_emb_hook.remove()
    user_bias_hook.remove()
    user_emb_hook.remove()

In [34]:
def get_new_date_dec(mf_model,
                     dec_Matrix,
                     df_dates_men,
                     df_dates_women,
                     new_date = pd.Series,
                     K = 0.1,
                     P = 0.1,
                     nr_men = 277, nr_women = 274,
                     epochs=100, learning_rate=0.001, weight_decay=0.001,weights=(0,0.2),bias=(0,1),
                     CV_mode = False,
                     sim_fun = get_similarity
                    ):
    
    MF_model = copy.deepcopy(mf_model).to(dev)
    user,item = split_date(new_date)
    ### Determine number of users and items depending on target's gender
    if new_date["gender"]: 
        #target is a man
        nr_users = nr_men
        nr_items = nr_women
        df_user_dates = df_dates_men
        df_item_dates = df_dates_women
    else:
        #target is a woman
        nr_users = nr_women
        nr_items = nr_men
        df_user_dates = df_dates_women
        df_item_dates = df_dates_men

    

    ###---------------------------------------------------Add new item----------------------------------------------------###
    
    ###First get every user's decision on the new item using logistic regression and -1 when user's date_info not provided
    similarity_vec = np.array([-1.] * nr_items, dtype= np.float64)
    for _,row in df_item_dates.iterrows():
        _,_item = split_date(row)
        similarity_rate = sim_fun(_item[1], item[1])
        similarity_vec[int(_item[0])-1] = similarity_rate
    
    ### Get ids of K most similar items
    most_sim_idx = np.argsort(similarity_vec)[:int(nr_items * K)]
    
    ###Compute the average decisions from K most similar items.
    avg_dec_vec = np.mean(dec_Matrix[most_sim_idx],axis=0)
    
    ###Pick nr_users*P(percantage of users) of these decisions and use them as baseline decisions to retrain new item's decisions using MatrixFactorization
    new_item_dec = np.array([-1.] * nr_users, dtype= np.float64)
    random_idx = np.random.choice(range(nr_users),int(nr_users * P))
    new_item_dec[random_idx] = avg_dec_vec[random_idx]
   
    ###Prepare dataframe for MF_model to train on
    df_train = pd.DataFrame(data=[[user_id,nr_items,dec] for user_id,dec in enumerate(new_item_dec) if dec != -1] ,columns=["id", "pid", "decision"])
    
    ###Update MF_model parameters to train only the new item's row
    update_params(MF_model)
    train_one_row(MF_model,df_train)

    ###Add new item's decisions to the dec_Matrix
    ###New item has been added.
    nr_items += 1
    
    
    ###---------------------------------------------------Add new user----------------------------------------------------###
    
    ###First get every new user's decisions on every item using logistic regression and -1 when item's date_info not provided
    similarity_vec = np.array([-1.] * nr_users, dtype= np.float64)
    for idx,row in df_user_dates.iterrows():
        _user,_ = split_date(row)
        similarity_rate = sim_fun(_user[1],user[1])
        similarity_vec[int(_user[0])-1] = similarity_rate

    ###Then get id's of most similar users to the new one, by comparing their decisions we got 
    ###    using matrix factorization and new items logistic regression ones.
    most_sim_idx = np.argsort(similarity_vec)[:int(nr_items * K)]
    
    ###Compute the average decisions from K most similar users.
    avg_dec_vec = np.mean([[item_dec[idx] for item_dec in dec_Matrix] for idx in most_sim_idx],axis=0)
    
    ###Take nr_items*P(percantage of items number) of these decisions and use them as a baseline for MF.
    new_user_dec = np.array([-1.] * nr_items, dtype= np.float64)
    random_idx = np.random.choice(range(nr_items-1),int(nr_items * P))
    new_user_dec[random_idx] = avg_dec_vec[random_idx]
    
    ###Prepare dataframe for MF_model to train on
    df_train = pd.DataFrame(data=[[nr_users,item_id,dec] for item_id, dec in enumerate(new_user_dec) if dec != -1],columns=["id", "pid", "decision"])

    ###Update MF_model parameters to train only the new user's row
    update_params(MF_model,user_mode=True)
    train_one_row(MF_model,df_train,user_mode=True)
    
    if CV_mode:
        return MF_model(torch.LongTensor([nr_users]).to(dev),torch.LongTensor([nr_items-1]).to(dev))
    
    ###return new user's decision on new item
    return get_mf_dec(MF_model, nr_users, nr_items-1)