In [55]:
import tez
import pandas as pd
from sklearn import model_selection 
import plotly.express as px
import torch
import numpy as np
import sklearn 

In [56]:
RANDOM_SEED = 42

In [57]:
cars_data = pd.read_csv("../data/cars_about.csv")
users_data = pd.read_csv("../data/customers.csv")
actions_data = pd.read_csv("../data/said_to_actions.csv")

In [58]:
actions_data

Unnamed: 0,car_id,user_id
0,99,0
1,273,0
2,400,0
3,256,0
4,253,0
...,...,...
6395,5,499
6396,101,499
6397,31,499
6398,9,499


In [72]:
actions_data["interaction"] = 1
actions_pivot_table = pd.pivot_table(actions_data, values='interaction', index='user_id', columns='car_id').fillna(0)
actions_data = pd.melt(actions_pivot_table.reset_index(), id_vars='user_id', value_vars=actions_pivot_table.columns).rename(columns={"value":"interaction"})
actions_data.to_csv("../data/said_to_actions_processed.csv", index=False)

In [60]:
actions_data

Unnamed: 0,user_id,car_id,interaction
0,0,1,0.0
1,1,1,0.0
2,2,1,0.0
3,3,1,0.0
4,4,1,0.0
...,...,...,...
201495,495,403,0.0
201496,496,403,0.0
201497,497,403,0.0
201498,498,403,0.0


In [61]:
px.imshow(
    pd.pivot_table(
        actions_data, 
        values='interaction', 
        index='user_id', 
        columns='car_id'
        ),
    width=800,
    height=800
    )

In [85]:
pd.read_csv("../data/said_to_actions_processed.csv")

Unnamed: 0,user_id,car_id,interaction
0,0,1,1
1,1,1,1
2,2,1,1
3,3,1,1
4,4,1,1
...,...,...,...
201495,495,403,1
201496,496,403,1
201497,497,403,1
201498,498,403,1


In [81]:
class Dataset:
    def __init__(self, users, cars, interactions):
        self.users = users
        self.cars = cars
        self.interactions = interactions

    def __len__(self) -> int:
        return len(self.users)
    
    def __getitem__(self, item):
        return {
            "user_id" : torch.tensor(self.users[item], dtype=torch.long),
            "car_id" : torch.tensor(self.cars[item], dtype=torch.long),
            "interaction" : torch.tensor(self.interactions[item], dtype=torch.float)
        }

In [116]:
class RecSysModel(tez.Model):
    def __init__(self, num_users, num_cars):
        super().__init__()
        self.user_embedding = torch.nn.Embedding(num_users, 32)
        self.cars_embedding = torch.nn.Embedding(num_cars, 32)
        self.out = torch.nn.Linear(64, 1)
        self.mse = torch.nn.MSELoss()
        self.step_scheduler_after = "epoch"

    def monitor_metrics(self, output, interaction):
        output = output.detach().cpu().numpy()
        interaction = interaction.detach().cpu().numpy()
        return dict(
            rmse = np.sqrt(sklearn.metrics.mean_squared_error(interaction, output))
        )

    def fetch_optimizer(self):
        return torch.optim.Adam(
            self.parameters(), 
            lr = 1e-3
        )

    def fetch_scheduler(self):
        return torch.optim.lr_scheduler.StepLR(
            self.optimizer, 
            step_size=3,
            gamma=0.7
        )

    def forward(self, user_id, car_id, interaction):
        user_embeds = self.user_embedding(user_id)
        car_embeds = self.cars_embedding(car_id)
        output = torch.cat([user_embeds, car_embeds], dim=-1)
        output = self.out(output)

        loss = self.mse(output, interaction.view(-1, 1))
        calc_metrics = self.monitor_metrics(output, interaction.view(-1, 1))
        
        return output, loss, calc_metrics
        

In [117]:

actions_data = pd.read_csv("../data/said_to_actions_processed.csv")
lbl_user = sklearn.preprocessing.LabelEncoder()
lbl_car = sklearn.preprocessing.LabelEncoder()

actions_data.user_id = lbl_user.fit_transform(actions_data.user_id)
actions_data.car_id = lbl_car.fit_transform(actions_data.car_id)


df_train, df_valid = model_selection.train_test_split(
    actions_data,
    test_size=0.1,
    random_state=RANDOM_SEED,
    stratify=actions_data.interaction.values
)

train_dataset = Dataset(
    df_train.user_id.values,
    df_train.car_id.values,
    df_train.interaction.values
)

valid_dataset = Dataset(
    df_valid.user_id.values,
    df_valid.car_id.values,
    df_valid.interaction.values
)

model = RecSysModel(
    num_cars=len(lbl_car.classes_),
    num_users=len(lbl_user.classes_)
)

model.fit(
    train_dataset, 
    valid_dataset, 
    train_bs=1024, 
    fp16=True
)

NOTE: This is old Model class and is deprecated. It will no longer be maintained! Please use version > 0.5.1. Its much better and supports multi-gpu training too!


100%|██████████| 178/178 [00:00<00:00, 265.37it/s, loss=0.563, rmse=0.729, stage=train]
100%|██████████| 1260/1260 [00:01<00:00, 764.98it/s, loss=0.227, rmse=0.468, stage=valid]
100%|██████████| 178/178 [00:00<00:00, 288.57it/s, loss=0.125, rmse=0.348, stage=train]
100%|██████████| 1260/1260 [00:01<00:00, 754.70it/s, loss=0.0618, rmse=0.242, stage=valid]
100%|██████████| 178/178 [00:00<00:00, 283.56it/s, loss=0.0351, rmse=0.184, stage=train]
100%|██████████| 1260/1260 [00:01<00:00, 752.99it/s, loss=0.0173, rmse=0.126, stage=valid]
100%|██████████| 178/178 [00:00<00:00, 286.16it/s, loss=0.0113, rmse=0.105, stage=train]
100%|██████████| 1260/1260 [00:01<00:00, 749.56it/s, loss=0.00686, rmse=0.0773, stage=valid]
100%|██████████| 178/178 [00:00<00:00, 284.76it/s, loss=0.00447, rmse=0.0661, stage=train]
100%|██████████| 1260/1260 [00:01<00:00, 732.23it/s, loss=0.00273, rmse=0.048, stage=valid] 
100%|██████████| 178/178 [00:00<00:00, 287.75it/s, loss=0.00184, rmse=0.0425, stage=train]
100%|█

In [118]:
{(k, v.cuda()) for k, v in valid_dataset[0].items()}

{('car_id', tensor(237, device='cuda:0')),
 ('interaction', tensor(1., device='cuda:0')),
 ('user_id', tensor(171, device='cuda:0'))}

In [110]:
valid_dataset[1]

{'user_id': tensor(396), 'car_id': tensor(264), 'interaction': tensor(1.)}

In [119]:
test_sample = valid_dataset[0]
model(**dict(zip(test_sample.keys(), [v.to("cuda") for v in test_sample.values()])))

(tensor([1.0018], device='cuda:0', grad_fn=<AddBackward0>),
 tensor(3.2928e-06, device='cuda:0', grad_fn=<MseLossBackward0>),
 {'rmse': 0.0018146038})

In [155]:
outputs = []

for test_sample in valid_dataset:
    output = model(**dict(zip(test_sample.keys(), [v.to("cuda") for v in test_sample.values()])))
    outputs.append(torch.sigmoid(output[0]).detach().cpu().item())

In [158]:
min(outputs)

0.7118238210678101