<a href="https://colab.research.google.com/github/KaihangZhao/DL_Notebook_Warehouse/blob/main/Recommendation_System/Matrix_Factorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [None]:
# Read the item feature table and the training rows, attach the item features
# to every training row (left join on item_id), and give each row label 1.
item = pd.read_csv('item_feature.csv')
train = pd.read_csv('training.csv')
df = pd.merge(train, item, on='item_id', how='left').assign(label=1)

In [None]:
# Draw random (user, item, context) id triples; a later cell labels them 0.
# The sampler's upper bound is exclusive, so it has to be `max id + 1`:
# with `high=max` the largest id in each column could never be drawn.
# NOTE(review): every column is sampled starting at 0 - confirm that 0 is a
# valid context_feature_id.
n_negatives = int(len(df) * 1.25)
neg_rng = np.random.default_rng(0)  # dedicated, seeded generator: reproducible draws
u = neg_rng.integers(0, df.user_id.max() + 1, size=n_negatives)
i = neg_rng.integers(0, df.item_id.max() + 1, size=n_negatives)
c = neg_rng.integers(0, df.context_feature_id.max() + 1, size=n_negatives)

In [None]:
# Turn the sampled ids into a frame of label-0 rows with item features attached,
# then stack them under the existing rows of df. Only the first occurrence of
# each (user_id, item_id) pair survives; rows already in df come first in the
# concat, so they win over a sampled row with the same pair.
sample = pd.DataFrame({'user_id': u, 'item_id': i, 'context_feature_id': c})
sample = pd.merge(sample, item, on='item_id', how='left')
sample['label'] = 0
df = pd.concat([df, sample]).drop_duplicates(subset=['user_id', 'item_id'])

In [None]:
# Sanity check: number of distinct user ids in the combined frame and in the
# sampled rows, next to max user id + 1 (the user table size passed to MF later).
len(df.user_id.unique()), len(sample.user_id.unique()), df.user_id.max()+1

(200153, 200152, 200153)

In [None]:
# Sanity check: number of distinct item ids in the combined frame and in the
# sampled rows, next to max item id + 1 (the item table size passed to MF later).
len(df.item_id.unique()), len(sample.item_id.unique()), df.item_id.max()+1

(39901, 39900, 39901)

In [None]:
# Shuffle the rows so the label-1 rows and the appended label-0 rows are mixed.
# A fixed random_state makes the resulting order deterministic for a given
# input frame; without it, the seeded train/validation mask built in the split
# cell would be applied to a different row order on every run.
df = df.sample(frac=1, random_state=3).reset_index(drop=True)

### Matrix Factorization

In [None]:
# Keep only the columns the factorization model consumes, then split the rows
# roughly 80/20 into train/validation with a seeded uniform mask.
df1 = df[['user_id', 'item_id', 'label']]
np.random.seed(3)
msk = np.random.rand(df1.shape[0]) < 0.8
train = df1.loc[msk].reset_index(drop=True)
val = df1.loc[~msk].reset_index(drop=True)

In [None]:
class MF(nn.Module):
    """Matrix-factorization scorer with per-user and per-item bias terms.

    Every user id and item id is looked up in an ``emb_size``-wide embedding
    table and in a one-column bias table. The output for a (user, item) pair is

        sigmoid( sum(relu(user_vec) * dropout(item_vec)) + user_bias + item_bias )

    Args:
        num_users: number of rows in the user embedding and bias tables.
        num_items: number of rows in the item embedding and bias tables.
        emb_size: width of the user and item embedding vectors.
        seed: passed to ``torch.manual_seed`` before the tables are created
            (this reseeds torch's global RNG as a side effect).
    """

    def __init__(self, num_users, num_items, emb_size=20, seed=23):
        super().__init__()
        torch.manual_seed(seed)

        # Lookup tables. Creation order is kept fixed because each constructor
        # draws from the seeded RNG.
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)

        # Replace the default initialisation with small uniform values:
        # embeddings in [0, 0.05], biases in [-0.01, 0.01].
        init_ranges = (
            (self.user_emb, 0, 0.05),
            (self.item_emb, 0, 0.05),
            (self.user_bias, -0.01, 0.01),
            (self.item_bias, -0.01, 0.01),
        )
        for table, low, high in init_ranges:
            table.weight.data.uniform_(low, high)

        self.classifier = nn.Sigmoid()
        self.nonlin = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, u, v):
        """Return one sigmoid score per (u[k], v[k]) pair of index tensors."""
        user_vec = self.nonlin(self.user_emb(u))   # ReLU on the user side only
        item_vec = self.dropout(self.item_emb(v))  # dropout on the item side only
        interaction = (user_vec * item_vec).sum(1)
        user_offset = self.user_bias(u).squeeze()
        item_offset = self.item_bias(v).squeeze()
        return self.classifier(interaction + user_offset + item_offset)

In [None]:
def train_one_epoch(model, train_df, optimizer):
    """Run one full-batch optimisation step over ``train_df``.

    The whole frame goes through the model in a single forward pass, so one
    call performs exactly one optimizer step.

    Args:
        model: module called as ``model(u, v)`` with two index tensors; it must
            return probabilities, because the loss is
            ``F.binary_cross_entropy``.
        train_df: frame with ``user_id``, ``item_id`` and ``label`` columns.
        optimizer: torch optimizer holding the model's parameters.

    Returns:
        ``(loss, accuracy)`` as Python floats, both measured on the forward
        pass made before the parameter update (model in train mode).
    """
    model.train()
    y = torch.as_tensor(train_df.label.values, dtype=torch.float32)
    u = torch.as_tensor(train_df.user_id.values, dtype=torch.long)
    v = torch.as_tensor(train_df.item_id.values, dtype=torch.long)

    y_hat = model(u, v)
    train_loss = F.binary_cross_entropy(y_hat, y)

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # Accuracy at a 0.5 threshold, computed directly on the tensors. This
    # avoids handing torch tensors to sklearn and the swapped
    # (y_pred, y_true) argument order of the previous accuracy_score call.
    with torch.no_grad():
        hits = (y_hat > 0.5).to(y.dtype) == y
        train_acc = hits.float().mean().item()
    return train_loss.item(), train_acc

def valid_metrics(model, valid_df):
    """Compute validation loss, accuracy and ROC AUC for ``valid_df``.

    The model is switched to eval mode (and left in it), and the forward pass
    runs under ``torch.no_grad()`` so no autograd graph is built for the
    validation set.

    Args:
        model: module called as ``model(u, v)`` with two index tensors; it must
            return probabilities, because the loss is
            ``F.binary_cross_entropy``.
        valid_df: frame with ``user_id``, ``item_id`` and ``label`` columns.

    Returns:
        ``(loss, accuracy, auc)``: binary cross-entropy, accuracy at a 0.5
        threshold, and ``roc_auc_score`` of the predicted probabilities.

    NOTE(review): ``roc_auc_score`` is documented to raise ``ValueError`` when
    the labels contain a single class - confirm whether callers need a guard.
    """
    model.eval()
    u = torch.as_tensor(valid_df.user_id.values, dtype=torch.long)
    v = torch.as_tensor(valid_df.item_id.values, dtype=torch.long)
    y = torch.as_tensor(valid_df.label.values, dtype=torch.float32)

    with torch.no_grad():
        y_hat = model(u, v)
        valid_loss = F.binary_cross_entropy(y_hat, y)
        # Accuracy on tensors: no implicit tensor -> numpy conversion inside
        # sklearn and no swapped (y_pred, y_true) argument order.
        hits = (y_hat > 0.5).to(y.dtype) == y
        valid_acc = hits.float().mean().item()

    auc = roc_auc_score(y.numpy(), y_hat.numpy())
    return valid_loss.item(), valid_acc, auc

def training(model, train_df, valid_df, epochs=10, lr=0.01, wd=0.0):
    """Fit ``model`` with a new Adam optimizer and log every fifth epoch.

    Each epoch is one ``train_one_epoch`` call followed by one
    ``valid_metrics`` call. Metrics are printed for epochs 0, 5, 10, ...

    Args:
        model: module to optimise in place.
        train_df: frame handed to ``train_one_epoch``.
        valid_df: frame handed to ``valid_metrics``.
        epochs: number of train/validate rounds.
        lr: Adam learning rate.
        wd: Adam weight decay.
    """
    adam = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    report = "train loss %.3f train acc %.3f valid loss %.3f valid acc %.3f roc auc acc %.3f"
    for epoch in range(epochs):
        train_loss, train_acc = train_one_epoch(model, train_df, adam)
        # Validation runs every epoch, not just the logged ones.
        valid_loss, valid_acc, auc = valid_metrics(model, valid_df)
        if epoch % 5 != 0:
            continue
        print(report % (train_loss, train_acc, valid_loss, valid_acc, auc))
        


In [None]:
# Build a model whose tables have (max id + 1) rows for users and items, with
# 75-dimensional embeddings, and train it for 20 epochs at lr=0.1.
# NOTE(review): the next cell rebuilds `model` from scratch, so the weights
# trained here are discarded.
model = MF(df.user_id.max()+1, df.item_id.max()+1, emb_size=75) 
# training() creates its own Adam optimizer, so none is built here.
training(model, train, val, epochs=20, lr=0.1, wd=1e-6)

In [None]:
# Recreate the model (MF reseeds torch with its default seed, so it starts from
# the same initial weights) and train it for 30 epochs at lr=0.1.
model = MF(df.user_id.max()+1, df.item_id.max()+1, emb_size=75) 
# training() creates its own Adam optimizer, so none is built here.
training(model, train, val, epochs=30, lr=0.1, wd=1e-6)

train loss 0.708 train acc 0.185 valid loss 0.625 valid acc 0.870 roc auc acc 0.825
train loss 0.330 train acc 0.915 valid loss 0.287 valid acc 0.912 roc auc acc 0.926
train loss 0.224 train acc 0.923 valid loss 0.270 valid acc 0.915 roc auc acc 0.926
train loss 0.244 train acc 0.932 valid loss 0.275 valid acc 0.917 roc auc acc 0.930
train loss 0.223 train acc 0.944 valid loss 0.265 valid acc 0.922 roc auc acc 0.935
train loss 0.203 train acc 0.948 valid loss 0.243 valid acc 0.926 roc auc acc 0.943


In [None]:
# Keep training the same model for 30 more epochs at a lower learning rate.
# training() builds a new Adam optimizer on every call, so optimizer state
# from the previous run is not carried over.
training(model, train, val, epochs=30, lr=0.05, wd=1e-6)

train loss 0.190 train acc 0.955 valid loss 0.243 valid acc 0.928 roc auc acc 0.946
train loss 0.195 train acc 0.955 valid loss 0.233 valid acc 0.931 roc auc acc 0.949
train loss 0.191 train acc 0.958 valid loss 0.230 valid acc 0.934 roc auc acc 0.952
train loss 0.186 train acc 0.960 valid loss 0.225 valid acc 0.937 roc auc acc 0.954
train loss 0.186 train acc 0.961 valid loss 0.225 valid acc 0.938 roc auc acc 0.954
train loss 0.186 train acc 0.961 valid loss 0.224 valid acc 0.938 roc auc acc 0.954


In [None]:
# Final 30 epochs on the same model at lr=0.001, again with a new Adam
# optimizer created inside training().
training(model, train, val, epochs=30, lr=0.001, wd=1e-6)

train loss 0.186 train acc 0.962 valid loss 0.224 valid acc 0.939 roc auc acc 0.955
train loss 0.186 train acc 0.962 valid loss 0.224 valid acc 0.939 roc auc acc 0.955
train loss 0.186 train acc 0.962 valid loss 0.224 valid acc 0.939 roc auc acc 0.955
train loss 0.186 train acc 0.962 valid loss 0.224 valid acc 0.939 roc auc acc 0.955
train loss 0.186 train acc 0.962 valid loss 0.224 valid acc 0.939 roc auc acc 0.955
train loss 0.186 train acc 0.962 valid loss 0.224 valid acc 0.939 roc auc acc 0.955


In [None]:
# Load test_kaggle.csv and preview its first rows.
test = pd.read_csv('test_kaggle.csv')
test.head()

Unnamed: 0,id,user_id,item_id,context_feature_id
0,0,4,16835,2
1,1,4,22590,3
2,2,4,1978,1
3,3,4,28916,1
4,4,4,14427,2


In [None]:
# Score every (user_id, item_id) pair of the test frame.
u = torch.LongTensor(test.user_id.values)
v = torch.LongTensor(test.item_id.values)

# Inference only: turn dropout off explicitly instead of relying on whatever
# mode the last training/validation call left the model in, and skip autograd
# bookkeeping for the forward pass.
model.eval()
with torch.no_grad():
    y_hat = model(u, v)

# `id` is the row position within `test`.
# NOTE(review): confirm this matches the `id` column of the file for every row.
prob = (
    pd.Series(y_hat.numpy())
    .reset_index()
    .rename(columns={'index': 'id', 0: 'rating'})
)
# TODO: write the predictions out once a path is chosen:
# prob.to_csv('',index=False)

In [None]:
# Fraction of test rows whose predicted rating is above the 0.5 threshold.
sum(prob.rating>0.5)/len(prob)

0.32708942407278735