In [108]:
# Auto-reload imported modules so that edits to the local helper modules
# (data_utils, model_utils, train_utils) are picked up without restarting the kernel.
# Re-running this cell in a live kernel only prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load and preprocess data

In [109]:
# Environment setup: uncomment and run once on a fresh environment.
# `%pip` (rather than `!pip`) installs into the environment the kernel is running in.
# NOTE(review): pytorch-lifestream is not version-pinned; consider pinning it like the others.
# %pip install pytorch-lifestream
# %pip install numpy==1.23
# %pip install pytorch-lightning==1.6

In [110]:
from data_utils import download_data, preprocess_data, split_data, get_loader
from model_utils import embedder_model
from train_utils import train_emb_model, get_embeddings, get_trainer

In [111]:
# Obtain the dataset via the project helper; its return value is the location
# handed to preprocess_data() in the next step.
data_path = download_data()

Data saved in <data> folder


In [119]:
%%time
# Preprocess the downloaded data into the records consumed by split_data(),
# get_loader() and get_embeddings() below. Timed because it is not instant
# (about 51 s wall time in the recorded run).
processed_data = preprocess_data(data_path)

Preprocessor loaded
CPU times: user 46.7 s, sys: 4.66 s, total: 51.3 s
Wall time: 51.1 s


In [120]:
# Hold out 25% of the users for evaluation (22500 train / 7500 test in the recorded run).
# NOTE(review): no seed is passed here; confirm split_data() is deterministic,
# otherwise the split and every downstream number change between runs.
train, test = split_data(processed_data, test_size=0.25)

22500 users in train, 7500 users in test


In [134]:
# One loader per partition: the train split, the held-out split, and the full
# preprocessed set (train + test together).
# NOTE(review): val_loader is not used by any cell visible in this notebook.
train_loader = get_loader(train)
val_loader = get_loader(test)
all_loader = get_loader(processed_data)

## Train embedder model

In [122]:
# Instantiate the embedder. The recorded run logged "Load embedding model from
# checkpoint", so weights may be restored rather than freshly initialised (TODO confirm).
model = embedder_model()
# Number of epochs passed to train_emb_model() below.
n_epochs = 10

Load embedding model from checkpoint


In [141]:
# Train the embedder for n_epochs; the result is bound to `_` so it is not displayed.
# NOTE(review): training uses all_loader, i.e. the full processed_data including the
# held-out `test` users that are later used for evaluation. Confirm this is intended
# (label-free pre-training) rather than an accidental leak.
# NOTE(review): no validation loader is passed; the recorded log warns that the
# validation loop is skipped even though val_loader was built above.
# NOTE(review): in the recorded run this cell is In [141] while the embedding and
# evaluation cells are In [137]-[140], so the saved embeddings and score were produced
# BEFORE this training run. Restart the kernel and run all cells to get consistent outputs.
_ = train_emb_model(model, all_loader, n_epochs=n_epochs)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 79.5 K
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
79.5 K    Trainable params
0         Non-trainable params
79.5 K    Total params
0.318     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

{'loss': tensor(59.0694), 'seq_len': tensor(108.0583)}


## Obtain embeddings

In [137]:
# Embed every user of the train split with the model
# (recorded result shape: 22500 x 128).
train_embeds = get_embeddings(model, train)

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

Getting embedding of shape torch.Size([22500, 128])


In [138]:
# Embed every user of the held-out split with the same model
# (recorded result shape: 7500 x 128).
val_embeds = get_embeddings(model, test)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

Getting embedding of shape torch.Size([7500, 128])


## Evaluate on classification task

In [139]:
import os

# pandas is used throughout this cell but was not imported by any earlier cell,
# so the cell failed with NameError on a fresh kernel.
import pandas as pd

# Join target and embeddings.
# NOTE(review): this rebinds `data_path`, which earlier held the value returned by
# download_data(); kept as-is so any code relying on the rebinding keeps working.
data_path = 'data/'


def _embeddings_to_frame(embeds, records, targets):
    """Build one row per user: embedding columns, `client_id`, and the joined target.

    Args:
        embeds: 2-D embeddings, one row per entry of `records`, in the same order.
        records: iterable of mappings that each expose a 'client_id' key.
        targets: DataFrame of targets addressable by 'client_id'.

    Returns:
        DataFrame with columns `embed_0..embed_{d-1}`, `client_id`, and the
        columns of `targets`, left-joined on `client_id`.

    Raises:
        AssertionError: if the join changed the number of rows, which happens
            when `targets` has several rows for one client_id.
    """
    frame = pd.DataFrame(
        data=embeds,
        columns=[f'embed_{i}' for i in range(embeds.shape[1])],
    )
    frame['client_id'] = [record['client_id'] for record in records]
    merged = frame.merge(targets, how='left', on='client_id')
    # A left join must keep exactly one row per embedded user.
    assert len(merged) == embeds.shape[0], 'target join changed the number of rows'
    return merged


# Targets indexed by client_id; the label column `bins` is exposed as `target`.
df_target = (
    pd.read_csv(os.path.join(data_path, 'train_target.csv'))
    .set_index('client_id')
    .rename(columns={"bins": "target"})
)

train_df = _embeddings_to_frame(train_embeds, train, df_target)
test_df = _embeddings_to_frame(val_embeds, test, df_target)

print(train_df.shape, test_df.shape)

(22500, 130) (7500, 130)


In [140]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest, and therefore the reported score, is reproducible
# across runs (the classifier was previously unseeded).
RANDOM_STATE = 42

# Features are the embedding columns; the label is the joined `target` column.
embed_columns = [col for col in train_df.columns if col.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = RandomForestClassifier(random_state=RANDOM_STATE)
clf.fit(x_train, y_train)
# Last expression of the cell, so the score on the held-out users is displayed.
clf.score(x_test, y_test)

0.588