# Baseline2: MF_Naive, MF_IPS, CausE (using the data extracted in Baseline1)

We have extracted deterministic data according to the problem description in Baseline 1, which can be used for further mining with more sophisticated models. Here, we introduce three models, hoping to inspire you, especially from the perspective of causality. For more details, you can find the whole project at https://github.com/KID-22/PCIC2021-Baselines. Welcome to watch, star and fork! Note that some new baselines will be added soon.

You can find more detailed information about these models in the following papers:  
[1] Koren et al. 2009. Matrix factorization techniques for recommender systems. In Computer.  
[2] Schnabel et al. 2016. Recommendations as Treatments: Debiasing Learning and Evaluation. In JMLR.  
[3] Bonner et al. 2018. Causal embeddings for recommendation. In RecSys.  

All three models were originally proposed for the rating prediction task. Here, our task is to predict whether a user likes (1) or dislikes (0) a tag, which can be considered a binary rating prediction task. So we first use the deterministic data extracted in Baseline 1 to construct the user-item interaction matrix. Then, we use these rating models to predict the user preferences.  
**Note that we haven't used the rating.txt, and how to use this dataset is left to you participants to explore.**

## Code

### Models
#### MF_Naive.py
```python
import torch
import torch.nn as nn
import numpy as np
from torch.nn.init import normal_


class MF_Naive(nn.Module):
    """Biased matrix factorization trained with a plain MSE loss.

    The score of a (user, item) pair is
    ``user_bias + item_bias + dot(user_embedding, item_embedding)``.

    Args:
        num_users: Number of rows of the user embedding/bias tables.
        num_items: Number of rows of the item embedding/bias tables.
        embedding_size: Dimension of the latent user/item vectors.
        device: Accepted for signature parity with the other models; this
            class does not use it.
    """

    def __init__(self, num_users, num_items, embedding_size, device='cpu'):
        super(MF_Naive, self).__init__()

        self.num_users = num_users
        self.num_items = num_items

        self.user_e = nn.Embedding(self.num_users, embedding_size)
        self.item_e = nn.Embedding(self.num_items, embedding_size)
        self.user_b = nn.Embedding(self.num_users, 1)
        self.item_b = nn.Embedding(self.num_items, 1)

        # Re-initialise every embedding table registered above.
        self.apply(self._init_weights)

        self.loss = nn.MSELoss()

    def _init_weights(self, module):
        """Draw embedding weights from N(0, 0.1^2); other modules are left alone."""
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.1)

    def forward(self, user, item):
        """Return one score per (user, item) pair as a 1-D tensor.

        Args:
            user: 1-D LongTensor of user indices.
            item: 1-D LongTensor of item indices, same length as ``user``.
        """
        interaction = (self.user_e(user) * self.item_e(item)).sum(
            dim=1, keepdim=True)
        preds = self.user_b(user) + self.item_b(item) + interaction

        # Drop only the trailing singleton dimension: a bare squeeze() would
        # also collapse a batch of one into a 0-d tensor, which no longer
        # matches the 1-D label tensor handed to the loss.
        return preds.squeeze(-1)

    def calculate_loss(self, user_list, item_list, label_list):
        """Return the mean squared error between predictions and ``label_list``."""
        return self.loss(self.forward(user_list, item_list), label_list)

    def predict(self, user, item):
        """Return the scores for the given pairs (same as ``forward``)."""
        return self.forward(user, item)

    def get_optimizer(self, lr, weight_decay):
        """Return an Adam optimizer over all parameters of this model."""
        return torch.optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)

    def get_embedding(self):
        """Return the ``(user, item)`` embedding modules."""
        return self.user_e, self.item_e


```
#### MF_IPS.py
```python
import torch
import torch.nn as nn
import numpy as np
from torch.nn.init import normal_
from .loss import IPSLoss


class MF_IPS(nn.Module):
    """Biased matrix factorization trained with an inverse-propensity loss.

    The scoring function is the same as a plain biased MF model:
    ``user_bias + item_bias + dot(user_embedding, item_embedding)``. Only the
    loss differs: it is an ``IPSLoss`` instance that additionally receives
    ``inverse_propensity`` and the item indices of the batch.

    Args:
        num_users: Number of rows of the user embedding/bias tables.
        num_items: Number of rows of the item embedding/bias tables.
        embedding_size: Dimension of the latent user/item vectors.
        inverse_propensity: Inverse propensity weights forwarded unchanged to
            ``IPSLoss`` (presumably indexed by item id; confirm against
            ``IPSLoss``, which is defined in another module).
        device: Device handed to ``IPSLoss`` and stored on the instance.
    """

    def __init__(self, num_users, num_items, embedding_size, inverse_propensity, device):
        super(MF_IPS, self).__init__()
        self.device = device
        self.num_users = num_users
        self.num_items = num_items
        self.inverse_propensity = inverse_propensity

        self.user_e = nn.Embedding(self.num_users, embedding_size)
        self.item_e = nn.Embedding(self.num_items, embedding_size)
        self.user_b = nn.Embedding(self.num_users, 1)
        self.item_b = nn.Embedding(self.num_items, 1)

        # Re-initialise every embedding table registered above.
        self.apply(self._init_weights)

        self.loss = IPSLoss(device)

    def _init_weights(self, module):
        """Draw embedding weights from N(0, 0.1^2); other modules are left alone."""
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.1)

    def forward(self, user, item):
        """Return one score per (user, item) pair as a 1-D tensor.

        Args:
            user: 1-D LongTensor of user indices.
            item: 1-D LongTensor of item indices, same length as ``user``.
        """
        interaction = (self.user_e(user) * self.item_e(item)).sum(
            dim=1, keepdim=True)
        preds = self.user_b(user) + self.item_b(item) + interaction

        # Drop only the trailing singleton dimension: a bare squeeze() would
        # also collapse a batch of one into a 0-d tensor, giving the loss a
        # prediction whose shape differs from the 1-D labels.
        return preds.squeeze(-1)

    def calculate_loss(self, user_list, item_list, label_list):
        """Return the IPS loss of the batch.

        The loss receives the predictions, the labels, the stored inverse
        propensities and the item indices of the batch.
        """
        return self.loss(self.forward(user_list, item_list), label_list, self.inverse_propensity, item_list)

    def predict(self, user, item):
        """Return the scores for the given pairs (same as ``forward``)."""
        return self.forward(user, item)

    def get_optimizer(self, lr, weight_decay):
        """Return an Adam optimizer over all parameters of this model."""
        return torch.optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)

    def get_embedding(self):
        """Return the ``(user, item)`` embedding modules."""
        return self.user_e, self.item_e
```
#### CausE.py
```python
import torch
import torch.nn as nn
import numpy as np
from torch.nn.init import normal_


class CausE(nn.Module):
    """Matrix factorization with two item embedding tables tied by a penalty.

    Users share one embedding table; items have a "control" table
    (``item_e_c``) and a "treatment" table (``item_e_t``). Training fits both
    tables to the labels and penalises the distance between them. Prediction
    uses the control table only.

    Args:
        num_users: Number of rows of the user embedding/bias tables.
        num_items: Number of rows of the item embedding/bias tables.
        embedding_size: Dimension of the latent user/item vectors.
        reg_c, reg_t: Stored on the instance; not used by any method here.
        reg_tc: Weight of the control/treatment discrepancy penalty.
        s_c, s_t: Stored on the instance; not used by any method here.
        device: Accepted for signature parity; not used by this class.
    """

    def __init__(self, num_users, num_items, embedding_size,
                 reg_c, reg_t, reg_tc, s_c, s_t, device='cpu'):
        super(CausE, self).__init__()
        self.user_e = nn.Embedding(num_users, embedding_size)
        self.item_e_c = nn.Embedding(num_items, embedding_size)
        self.item_e_t = nn.Embedding(num_items, embedding_size)
        self.user_b = nn.Embedding(num_users, 1)
        self.item_b = nn.Embedding(num_items, 1)
        self.reg_c = reg_c
        self.reg_t = reg_t
        self.reg_tc = reg_tc
        self.s_c = s_c
        self.s_t = s_t

        # Re-initialise every embedding table registered above.
        self.apply(self._init_weights)

        self.loss_c = nn.MSELoss()
        self.loss_t = nn.MSELoss()

    def _init_weights(self, module):
        """Draw embedding weights from N(0, 0.1^2); other modules are left alone."""
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.1)

    def forward(self, user, item):
        """Return one score per (user, item) pair using the control item table.

        Args:
            user: 1-D LongTensor of user indices.
            item: 1-D LongTensor of item indices, same length as ``user``.
        """
        interaction = (self.user_e(user) * self.item_e_c(item)).sum(
            dim=1, keepdim=True)
        preds = self.user_b(user) + self.item_b(item) + interaction

        # Squeeze only the trailing dimension so a batch of one stays 1-D.
        return preds.squeeze(-1)

    def calculate_loss(self, user_list, item_list, label_list, control):
        """Return control MSE + treatment MSE + discrepancy penalty.

        Args:
            user_list: 1-D LongTensor of user indices.
            item_list: 1-D LongTensor of item indices.
            label_list: 1-D float tensor of targets.
            control: Accepted from the training loop but currently ignored;
                both item tables are fitted on every batch.
                NOTE(review): confirm whether the batch source was meant to
                select which table is trained.
        """
        user_embedding = self.user_e(user_list)

        item_embedding_c = self.item_e_c(item_list)
        item_embedding_t = self.item_e_t(item_list)

        # Both predictions share the same user/item bias terms.
        dot_c = (user_embedding * item_embedding_c).sum(dim=1, keepdim=True)
        pred_c = dot_c + self.user_b(user_list) + self.item_b(item_list)
        dot_t = (user_embedding * item_embedding_t).sum(dim=1, keepdim=True)
        pred_t = dot_t + self.user_b(user_list) + self.item_b(item_list)

        # squeeze(-1) keeps a batch of one 1-D so the predictions have the
        # same shape as label_list inside MSELoss.
        pred_c = pred_c.squeeze(-1)
        pred_t = pred_t.squeeze(-1)

        loss = self.loss_c(pred_c, label_list)
        loss += self.loss_t(pred_t, label_list)
        # 2-norm of the difference between the two item embeddings of the batch.
        loss_reg_tc = self.reg_tc * torch.norm(item_embedding_c - item_embedding_t, 2)
        return loss + loss_reg_tc

    def predict(self, user, item):
        """Return the scores for the given pairs (same as ``forward``)."""
        return self.forward(user, item)

    def get_optimizer(self, lr, weight_decay):
        """Return an Adam optimizer over all parameters of this model."""
        return torch.optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
```

#### main.py
```python
from config import opt
import os
import models
from torch.utils.data import DataLoader
from tqdm import tqdm
from time import time
from metrics import AUC
from utils import MF_DATA, CausE_DATA, evaluate_model
import numpy as np
import argparse
import random
import torch
import copy

seed_num = 2021
print("seed_num:", seed_num)


def setup_seed(seed):
    """Seed the PyTorch (CPU and CUDA), NumPy and Python RNGs with ``seed``.

    Also switches cuDNN to its deterministic mode.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


# Seed everything once at import time, before any model or loader is built.
setup_seed(seed_num)


# propensity estimation for MF_IPS
def cal_propensity_score(user_num=1000, item_num=1720):
    """Estimate a per-label propensity vector and repeat it for every item.

    The label column (index 2) of ``opt.ps_train_data`` and
    ``opt.ps_val_data`` is histogrammed. With ``P_L_TO`` the label
    distribution of the training file, ``P_L_T`` the label distribution of
    the validation file and ``P_O_T`` the fraction of the
    ``user_num * item_num`` matrix covered by the training file, the result
    is ``P_L_TO * P_O_T / P_L_T`` (one value per label).
    NOTE(review): this presumably treats the validation file as the unbiased
    label distribution; confirm against the data description.

    Args:
        user_num: Number of users in the full interaction matrix.
        item_num: Number of items; also the length of the returned list.

    Returns:
        A list of ``item_num`` references to the same per-label array.

    Raises:
        ValueError: If a label never occurs in the validation file, because
            the estimate would then divide by zero.
    """
    # ndmin=2 keeps a single-row file two-dimensional so column 2 is indexable.
    ps_train_data = np.loadtxt(opt.ps_train_data, ndmin=2).astype(int)
    ps_val_data = np.loadtxt(opt.ps_val_data, ndmin=2).astype(int)

    train_label_counts = np.bincount(ps_train_data[:, 2], minlength=2)
    observed_total = train_label_counts.sum()
    P_L_TO = train_label_counts / observed_total

    val_label_counts = np.bincount(ps_val_data[:, 2], minlength=2)
    if not val_label_counts.all():
        raise ValueError(
            "every label must occur at least once in opt.ps_val_data; "
            "label counts were %s" % val_label_counts.tolist())
    P_L_T = val_label_counts / val_label_counts.sum()

    # Fraction of all (user, item) cells that appear in the training file.
    P_O_T = observed_total / (user_num * item_num)
    P = P_L_TO * P_O_T / P_L_T

    # The same estimate is shared by all items.
    propensity_score = [P] * item_num

    return propensity_score


# train for CausE
def train_CausE():
    """Train the CausE model and return the best checkpoint reloaded.

    Each epoch makes one pass over the ``s_c`` loader (``control=True``)
    followed by one pass over the ``s_t`` loader (``control=False``), then
    evaluates on the validation set. Whenever the metric selected by
    ``opt.metric`` improves, the weights are saved under ``./checkpoint``.
    After training, a fresh model is built, the saved weights are loaded into
    it, and its train/validation metrics are printed.

    Returns:
        The freshly built model carrying the best saved weights.
    """
    train_data = CausE_DATA(opt.s_c_data, opt.s_t_data)
    val_data = MF_DATA(opt.cause_val_data)
    train_dataloader_s_c = DataLoader(train_data.s_c,
                                      opt.batch_size,
                                      shuffle=True)
    train_dataloader_s_t = DataLoader(train_data.s_t,
                                      opt.batch_size,
                                      shuffle=True)

    def build_model():
        # NOTE(review): opt.reg_c is passed for both reg_c and reg_t, as in
        # the original call; confirm whether opt.reg_t was intended.
        return getattr(models,
                       opt.model)(train_data.users_num, train_data.items_num,
                                  opt.embedding_size, opt.reg_c, opt.reg_c,
                                  opt.reg_tc, train_data.s_c[:, :2].tolist(),
                                  train_data.s_t[:, :2].tolist())

    model = build_model()
    model.to(opt.device)
    optimizer = model.get_optimizer(opt.lr, opt.weight_decay)

    # torch.save does not create missing directories, so make sure the
    # checkpoint folder exists before the first improvement is recorded.
    os.makedirs("./checkpoint", exist_ok=True)
    checkpoint_path = "./checkpoint/ci-%s-model.pth" % opt.metric

    best_mse = 10000000.
    best_mae = 10000000.
    best_auc = 0
    best_iter = 0

    model.train()
    for epoch in range(opt.max_epoch):
        t1 = time()
        for i, data in tqdm(enumerate(train_dataloader_s_c)):
            # Columns of each batch: user index, item index, label.
            user = data[:, 0].to(opt.device)
            item = data[:, 1].to(opt.device)
            label = data[:, 2].to(opt.device)

            loss = model.calculate_loss(user.long(),
                                        item.long(),
                                        label.float(),
                                        control=True)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if epoch % opt.verbose == 0:
            print('Epoch %d :' % (epoch))
            # Loss of the last s_c batch of this epoch.
            print('s_c Loss = ', loss.item())

        for i, data in tqdm(enumerate(train_dataloader_s_t)):
            user = data[:, 0].to(opt.device)
            item = data[:, 1].to(opt.device)
            label = data[:, 2].to(opt.device)

            loss = model.calculate_loss(user.long(),
                                        item.long(),
                                        label.float(),
                                        control=False)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        (mae, mse, rmse, auc) = evaluate_model(model, val_data, opt)

        # MAE/MSE improve downwards, AUC improves upwards.
        if opt.metric == 'mae':
            improved = mae < best_mae
        elif opt.metric == 'mse':
            improved = mse < best_mse
        elif opt.metric == 'auc':
            improved = auc > best_auc
        else:
            improved = False

        if improved:
            best_mae, best_mse, best_auc, best_iter = mae, mse, auc, epoch
            torch.save(model.state_dict(), checkpoint_path)

        if epoch % opt.verbose == 0:
            # Loss of the last s_t batch of this epoch.
            print('s_t Loss = ', loss.item())
            print(
                'Val MAE = %.4f, MSE = %.4f, RMSE = %.4f, AUC = %.4f [%.1f s]'
                % (mae, mse, rmse, auc, time() - t1))
            print("------------------------------------------")

    print("train end\nBest Epoch %d:  MAE = %.4f, MSE = %.4f, AUC = %.4f" %
          (best_iter, best_mae, best_mse, best_auc))

    best_model = build_model()
    best_model.to(opt.device)

    if opt.metric in ('mae', 'mse', 'auc'):
        best_model.load_state_dict(torch.load(checkpoint_path))

    print("\n========================= best model =========================")
    mae, mse, rmse, auc = evaluate_model(best_model, train_data, opt)
    print('Train MAE = %.4f, MSE = %.4f, RMSE = %.4f, AUC = %.4f' %
          (mae, mse, rmse, auc))
    mae, mse, rmse, auc = evaluate_model(best_model, val_data, opt)
    print('Val MAE = %.4f, MSE = %.4f, RMSE = %.4f, AUC = %.4f' %
          (mae, mse, rmse, auc))
    print("===============================================================\n")

    return best_model


# train for MF_Naive and MF_IPS
def train(propensity_score):
    """Train MF_Naive or MF_IPS and return the best checkpoint reloaded.

    After every epoch the model is evaluated on the validation set; whenever
    the metric selected by ``opt.metric`` improves, the weights are saved
    under ``./checkpoint``. After training, a fresh model is built, the saved
    weights are loaded into it, and its train/validation metrics are printed.

    Args:
        propensity_score: Array of propensities; only read when
            ``opt.model == 'MF_IPS'``, where its element-wise reciprocal is
            handed to the model.

    Returns:
        The freshly built model carrying the best saved weights.
    """
    print('train begin')

    train_all_data = MF_DATA(opt.train_data)
    train_data = copy.deepcopy(train_all_data)
    val_data = MF_DATA(opt.val_all_data)
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True)

    if opt.model == 'MF_IPS':
        inverse_propensity = np.reciprocal(propensity_score)
        model = getattr(models, opt.model)(train_all_data.users_num,
                                           train_all_data.items_num,
                                           opt.embedding_size,
                                           inverse_propensity, opt.device)
    elif opt.model == 'MF_Naive':
        model = getattr(models, opt.model)(train_all_data.users_num,
                                           train_all_data.items_num,
                                           opt.embedding_size, opt.device)

    model.to(opt.device)
    optimizer = model.get_optimizer(opt.lr, opt.weight_decay)

    # torch.save does not create missing directories, so make sure the
    # checkpoint folder exists before the first improvement is recorded.
    os.makedirs("./checkpoint", exist_ok=True)
    checkpoint_path = "./checkpoint/ci-%s-model.pth" % opt.metric

    best_mse = 10000000.
    best_mae = 10000000.
    best_auc = 0
    best_iter = 0

    model.train()
    for epoch in range(opt.max_epoch):
        t1 = time()
        for i, data in tqdm(enumerate(train_dataloader)):
            # Columns of each batch: user index, item index, label.
            user = data[:, 0].to(opt.device)
            item = data[:, 1].to(opt.device)
            label = data[:, 2].to(opt.device)

            loss = model.calculate_loss(user.long(), item.long(),
                                        label.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        t2 = time()

        (mae, mse, rmse, auc) = evaluate_model(model, val_data, opt)

        # MAE/MSE improve downwards, AUC improves upwards.
        if opt.metric == 'mae':
            improved = mae < best_mae
        elif opt.metric == 'mse':
            improved = mse < best_mse
        elif opt.metric == 'auc':
            improved = auc > best_auc
        else:
            improved = False

        if improved:
            best_mae, best_mse, best_auc, best_iter = mae, mse, auc, epoch
            torch.save(model.state_dict(), checkpoint_path)

        if epoch % opt.verbose == 0:
            # Use % formatting: passing the values as extra print arguments
            # would emit the raw format string followed by the numbers.
            print('Epoch %d [%.1f s]:' % (epoch, t2 - t1))
            # Loss of the last training batch of this epoch.
            print('Train Loss = ', loss.item())
            print(
                'Val MAE = %.4f, MSE = %.4f, RMSE = %.4f, AUC = %.4f [%.1f s]'
                % (mae, mse, rmse, auc, time() - t2))
            print("------------------------------------------")

    print("train end\nBest Epoch %d:  MAE = %.4f, MSE = %.4f, AUC = %.4f" %
          (best_iter, best_mae, best_mse, best_auc))

    if opt.model == 'MF_IPS':
        inverse_propensity = np.reciprocal(propensity_score)
        best_model = getattr(models, opt.model)(train_all_data.users_num,
                                                train_all_data.items_num,
                                                opt.embedding_size,
                                                inverse_propensity, opt.device)
    elif opt.model == 'MF_Naive':
        best_model = getattr(models, opt.model)(train_all_data.users_num,
                                                train_all_data.items_num,
                                                opt.embedding_size, opt.device)

    best_model.to(opt.device)

    if opt.metric in ('mae', 'mse', 'auc'):
        best_model.load_state_dict(torch.load(checkpoint_path))

    print("\n========================= best model =========================")
    mae, mse, rmse, auc = evaluate_model(best_model, train_data, opt)
    print('Train MAE = %.4f, MSE = %.4f, RMSE = %.4f, AUC = %.4f' %
          (mae, mse, rmse, auc))
    mae, mse, rmse, auc = evaluate_model(best_model, val_data, opt)
    print('Val MAE = %.4f, MSE = %.4f, RMSE = %.4f, AUC = %.4f' %
          (mae, mse, rmse, auc))
    print("===============================================================\n")

    return best_model


# generate submit file
def generate_submit(model):
    """Score the test pairs with ``model`` and write them to ``submit.csv``.

    The file at ``opt.test_data`` is read as integers; its first two columns
    are used as user and item indices. Predictions are min-max normalised to
    [0, 1] and appended as an extra column, then saved with the format
    ``'%d', '%d', '%f'``.

    Args:
        model: A trained model exposing ``predict(user, item)``.
    """
    # ndmin=2 keeps a single-row test file two-dimensional.
    test_data = np.loadtxt(opt.test_data, dtype=int, ndmin=2)
    user = torch.LongTensor(test_data[:, 0]).to(opt.device)
    item = torch.LongTensor(test_data[:, 1]).to(opt.device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        pred = model.predict(user, item)
    pred = pred.detach().cpu().numpy()

    # normalize
    lowest = np.min(pred)
    spread = np.max(pred) - lowest
    if spread == 0:
        # All scores are identical; avoid 0/0 and emit a constant column.
        pred = np.zeros_like(pred)
    else:
        pred = (pred - lowest) / spread

    pred = pred.reshape(-1, 1)
    submit = np.hstack((test_data, pred))
    np.savetxt("submit.csv", submit, fmt=('%d', '%d', '%f'))


if __name__ == '__main__':
    # Command-line flags override the corresponding fields of `opt`.
    parser = argparse.ArgumentParser(description="Demo of argparse")
    # Restrict --model to the names dispatched below; without this a mistyped
    # name would skip both branches and the script would end without training.
    parser.add_argument('--model',
                        default='MF_Naive',
                        choices=["MF_Naive", "MF_IPS", "CausE"])
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--epoch', type=int, default=50)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--metric',
                        default='auc',
                        choices=["mae", "mse", "auc"])

    args = parser.parse_args()
    opt.model = args.model
    opt.batch_size = args.batch_size
    opt.max_epoch = args.epoch
    opt.lr = args.lr
    opt.metric = args.metric

    # Dump the instance attributes of `opt`, one "name:value" per line.
    print('\n'.join(['%s:%s' % item for item in opt.__dict__.items()]))

    if opt.model == 'MF_IPS' or opt.model == 'MF_Naive':
        # The propensity estimate is computed for both MF variants; train()
        # only reads it when the model is MF_IPS.
        propensity_score = cal_propensity_score()
        propensity_score = np.array(propensity_score).astype(float)
        best_model = train(propensity_score)
        generate_submit(best_model)
    elif opt.model == 'CausE':
        best_model = train_CausE()
        generate_submit(best_model)

    print('end')
```

## Result

Results are as follows: 

|  model |  Validation  |  Test  |
| :----: | :----: | :----: |
|  MF_Naive  | 0.7346  | 0.6615 |
|  MF_IPS  | 0.7451  | 0.6798 |
|  CausE  | 0.7448  | 0.6822 |

We can see that MF with causal methods (MF_IPS and CausE) significantly outperforms MF_Naive. So maybe this problem can be solved even better by using some novel causal algorithms.