In [1]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict

import addict
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
import yaml

from utils import NDCG
from data import QueryDS
from LambdaNet import LambdaRank, init_weights, train_step, validation_step

pd.options.display.max_columns = 100

In [3]:
with open("config.yaml", 'r') as f:
    cfg = addict.Dict(yaml.safe_load(f))

In [4]:
cfg

{'data_path': '../train_test.pkl',
 'features': ['avg_w2vec',
  'avg_cosine',
  'sum_w2vec',
  'sum_cosine',
  'len_of_w2vec',
  'len_of_cos',
  'len_of_pred',
  'mean_ovr',
  'sum_ovr',
  'intersection_w2vec',
  'intersection_cosine',
  'mean_score_for_prediction',
  'num_of_cart',
  'num_of_views',
  'is_popular',
  'pred_cart_sim',
  'pred_view_sim',
  'prob_cart',
  'prob_view',
  'max_cart_sim',
  'max_view_sim',
  'cat1_encoded_feature',
  'cat2_encoded_feature'],
 'target': 'target',
 'query_id_col': 'q_id'}

In [5]:
def to_float32(data: pd.DataFrame) -> pd.DataFrame:
    for col in list(data.columns[1:]):
        try:
            data[col] = data[col].astype(np.float32)
        except:
            pass
    return data

In [6]:
#Temporary. Remove in ETL process

data = pd.read_pickle(cfg.data_path)
data = data.rename({'index': 'q_id'}, axis=1)
train = data[data['type'] == 'train']
test = data[data['type'] == 'test']
train.head()

Unnamed: 0,q_id,view,cart,type,avg_w2vec,avg_cosine,sum_w2vec,sum_cosine,len_of_w2vec,len_of_cos,len_of_pred,mean_ovr,sum_ovr,intersection_w2vec,intersection_cosine,prediction,score,target,mean_score_for_prediction,num_of_cart,num_of_views,category_1,category_2,is_popular,pred_cart_sim,pred_view_sim,prob_cart,prob_view,max_cart_sim,max_view_sim,cat1_encoded_feature,cat2_encoded_feature
0,0,"[466358, 363504, 878046, 1427398, 878047, 7355...","[1427401, 855319, 1051815, 65639, 652337]",train,0.99857,0.591565,49.928509,29.578244,50,50,100,0.795068,79.506753,0,0,1208298,0.999253,0,0.95632,2.0,3.0,"Туризм, рыбалка, охота",Рыбалка,True,0.816497,0.0,0.0,0.0,0.816497,0.0,0.004192,0.004192
1,0,"[466358, 363504, 878046, 1427398, 878047, 7355...","[1427401, 855319, 1051815, 65639, 652337]",train,0.99857,0.591565,49.928509,29.578244,50,50,100,0.795068,79.506753,0,0,389284,0.999048,0,0.915601,2.0,10.0,"Туризм, рыбалка, охота",Рыбалка,True,0.408248,0.0,0.0,0.0,0.408248,0.0,0.0,0.0
2,0,"[466358, 363504, 878046, 1427398, 878047, 7355...","[1427401, 855319, 1051815, 65639, 652337]",train,0.99857,0.591565,49.928509,29.578244,50,50,100,0.795068,79.506753,0,0,82739,0.999008,0,0.986127,3.0,0.0,"Туризм, рыбалка, охота",Рыбалка,True,0.333333,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
3,0,"[466358, 363504, 878046, 1427398, 878047, 7355...","[1427401, 855319, 1051815, 65639, 652337]",train,0.99857,0.591565,49.928509,29.578244,50,50,100,0.795068,79.506753,0,0,1491380,0.998987,0,0.997073,3.0,2.0,Хобби и творчество,"Создание картин, фоторамок, открыток",True,0.0,0.0,0.0,0.0,0.0,0.0,0.004192,0.004192
4,0,"[466358, 363504, 878046, 1427398, 878047, 7355...","[1427401, 855319, 1051815, 65639, 652337]",train,0.99857,0.591565,49.928509,29.578244,50,50,100,0.795068,79.506753,0,0,1315601,0.998962,0,0.985984,2.0,0.0,"Туризм, рыбалка, охота",Рыбалка,True,0.408248,0.0,0.0,0.0,0.408248,0.0,0.0,0.0


In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
train[cfg.features] = sc.fit_transform(train[cfg.features].values)
print(test.shape)
for col in ["prob_view", "prob_cart"]:
    test[col] = test[col].apply(lambda x: np.nan
                                        if x in [np.inf, -np.inf]
                                        else x)

test = test.dropna(subset=["prob_view", "prob_cart"])
print(test.shape)
test[cfg.features] = sc.transform(test[cfg.features])
test.head()

test = to_float32(test)
train = to_float32(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[cfg.features] = sc.fit_transform(train[cfg.features].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


(568781, 32)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].apply(lambda x: np.nan


(568702, 32)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].astype(np.float32)


In [8]:
train_dataset = QueryDS(train, cfg)
test_dataset = QueryDS(test, cfg, 'test')

30000 unique sessions before filter.
train shape: (4319767, 32)
4130 unique sessions after filter.
train shape: (123900, 32)
4000 unique sessions before filter.
train shape: (568702, 32)
4000 unique sessions after filter.
train shape: (568702, 32)


In [9]:
train_loader = DataLoader(train_dataset, batch_size=1, num_workers=1, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=1, num_workers=1, shuffle=True, pin_memory=True)

In [10]:
input_size = len(cfg.features)
lambdarank_structure = [input_size, 64, 16]
ndcg_gain_in_train = 'exp2'
sigma = 1.0
device = torch.device("cuda:0")
my_net = LambdaRank(net_structures=lambdarank_structure, sigma=sigma) # , ndcg_gain_in_train=ndcg_gain_in_train)
my_net.to(device)
my_net.apply(init_weights)
print(my_net)


ideal_dcg = NDCG(10, ndcg_gain_in_train)
count = 0
batch_size = 64
grad_batch, y_pred_batch = [], []
lr = 0.0001
optimizer = torch.optim.Adam(my_net.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.75)

LambdaRank(
  (fc1): Linear(in_features=23, out_features=64, bias=True)
  (act1): ReLU()
  (fc2): Linear(in_features=64, out_features=16, bias=True)
  (act2): ReLU()
  (fc3): Linear(in_features=16, out_features=1, bias=True)
  (activation): Sigmoid()
)


In [11]:
from utils import Logger

In [12]:
writer = Logger("lambda_rank_log")

for epoch in range(10):
    train_step(my_net, train_loader, writer, batch_size, optimizer, ideal_dcg, epoch)
    validation_step(my_net, test_loader, writer, epoch)

In [14]:
torch.save(my_net.state_dict(), "10epochs.pth")

----