In [1]:
import os
import sys

# Make the parent of the kernel's start directory (presumably the repository
# root) importable and the working directory, so that the project imports and
# the relative `data/...` / `logs/...` paths used further down resolve.
#
# The `globals()` guard keeps this cell safe to re-run: an unguarded
# `os.chdir('..')` climbs one more directory level on every execution.
if 'dir1' not in globals():
    dir2 = os.path.abspath('')     # directory the kernel was started in
    dir1 = os.path.dirname(dir2)   # its parent directory
    os.chdir(dir1)                 # same target as chdir('..') on the first run
if dir1 not in sys.path:
    sys.path.append(dir1)

In [23]:
import pickle
from functools import partial

from hydra import compose, initialize

import numpy as np
import pandas as pd

import torch

import pytorch_lightning as pl

from sklearn.model_selection import train_test_split

from ptls.preprocessing import PandasDataPreprocessor
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset, inference_data_loader
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

from src.networks.coles import MyCoLES

# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before code is executed.
# Running this cell again in the same kernel only prints an
# "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Compose the Hydra configuration named 'config' and keep handles to the two
# sections consumed later: the dataset settings and the embedding-model settings.
with initialize(config_path='../config', version_base=None):
    cfg = compose(config_name='config')
cfg_preprop, cfg_model = cfg['dataset'], cfg['embed_model']

In [4]:
# Load the preprocessed dataset (path is relative to the working directory).
orig_df = pd.read_parquet(
    path='data/new_data/preprocessed/preproc_dataset.parquet',
)

In [5]:
# Remove the label columns before the frame is passed on.
# A non-inplace drop with errors='ignore' keeps this cell re-runnable: the
# in-place variant raises KeyError on a second execution because the columns
# are already gone.
orig_df = orig_df.drop(columns=['sample_label', 'target'], errors='ignore')

In [8]:
# Restore the pickled preprocessor object.
# NOTE(review): unpickling executes arbitrary code - only load files from a
# trusted source.
with open('data/new_data/coles/preprocessor.p', mode='rb') as preproc_file:
    preprocessor = pickle.load(preproc_file)

In [9]:
# Run the loaded preprocessor over the frame. The result is iterated as
# records carrying a 'user_id' key in the following cells.
dataset = preprocessor.transform(orig_df)

In [10]:
# Order the records by user id so that downstream outputs follow a
# deterministic per-user order.
dataset = sorted(
    dataset,
    key=lambda record: record['user_id'],
)

In [12]:
# Restore the trained model from its checkpoint; the two config sections are
# forwarded as the extra keyword arguments of load_from_checkpoint.
model_init_kwargs = dict(data_conf=cfg_preprop, coles_conf=cfg_model)
best_model = MyCoLES.load_from_checkpoint(
    'logs/checkpoints/coles/coles_hidden_size_32_0.ckpt',
    **model_init_kwargs,
)

In [13]:
# Wrap the sorted records in an inference data loader.
dl = inference_data_loader(
    dataset,
    batch_size=256,
    num_workers=2,
)

In [14]:
# Trainer used for prediction on a single, explicitly selected GPU.
# NOTE(review): max_epochs presumably has no effect when the trainer is only
# used for predict() - confirm before relying on it.
trainer_kwargs = {
    'max_epochs': 15,
    'accelerator': 'gpu',
    'devices': [2],          # index of the GPU to run on
    'enable_progress_bar': True,
}
trainer = pl.Trainer(**trainer_kwargs)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
# Run prediction over the loader and stack the per-batch outputs row-wise
# into a single tensor.
batch_embeds = trainer.predict(best_model, dl)
embeds = torch.vstack(batch_embeds)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

In [27]:
# Convert the embeddings to a NumPy array.
# torch.as_tensor() passes an existing tensor through and wraps an ndarray,
# so re-running this cell is harmless (a bare `embeds.numpy()` raises
# AttributeError once `embeds` is already an ndarray). detach()/cpu() make the
# conversion work for tensors that track gradients or live on the GPU.
embeds = torch.as_tensor(embeds).detach().cpu().numpy()

In [21]:
# User ids in the same order as the records in `dataset`.
users = [record['user_id'] for record in dataset]

In [24]:
# Hold the user ids as a NumPy array so they can be indexed and stacked.
users = np.asarray(users)

In [31]:
# Sanity check: shape of the user ids once viewed as a single column.
np.expand_dims(users, axis=1).shape

(22533, 1)

In [52]:
# Put the user ids in front of the embedding matrix as column 0.
# column_stack turns the 1-D id vector into a column before joining along axis 1.
embeds_with_users = np.column_stack((users, embeds))

In [53]:
# Wrap the stacked array in a DataFrame (default integer column labels).
embeds_with_users = pd.DataFrame(data=embeds_with_users)

In [54]:
# Label column 0 as 'user_id', store it as int32 and make it the index.
#
# The ids are taken from `users` (same row order as the frame) instead of being
# read back from column 0: that column was stacked together with the
# embeddings and therefore carries the ids in the embeddings' numeric dtype.
user_ids = np.asarray(users)

# Narrowing to int32 would silently wrap out-of-range ids, so fail loudly.
if user_ids.size and user_ids.dtype.kind in 'iu':
    int32_range = np.iinfo(np.int32)
    if user_ids.min() < int32_range.min or user_ids.max() > int32_range.max:
        raise OverflowError('user_id values do not fit into int32')

# Non-inplace chain: the input frame is not mutated half-way if a step fails.
# assign() with an array is positional and raises on a length mismatch.
embeds_with_users = (
    embeds_with_users
    .rename(columns={0: 'user_id'})
    .assign(user_id=user_ids.astype(np.int32))
    .set_index('user_id')
)

In [57]:
# Persist the per-user embeddings (indexed by user_id) next to the
# preprocessed dataset.
embeds_with_users.to_parquet(
    path='data/new_data/preprocessed/user_embeddings.parquet',
)