In [1]:
!pip install implicit

Collecting implicit
  Obtaining dependency information for implicit from https://files.pythonhosted.org/packages/cd/cc/deac70cae8cc32c9885d0cd73bc66e1b3cbea36ae7080b8c83995eaf5322/implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


In [2]:
# Root directory of the input dataset; the parquet locations read below are
# built by appending "/otto_exploded_dataset/<event>/train" to this prefix.
PATH = '/kaggle/input/otto-recsys-short/kaggle/working/'

In [3]:
import numpy as np
import pandas as pd

from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

from tqdm.auto import tqdm
import gc

from implicit.gpu.als import AlternatingLeastSquares



In [4]:
def encode_data(data, entities):
    """Label-encode the requested columns on a copy of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is copied first, so the caller's frame is not modified.
    entities : iterable of str
        Names of the columns to replace with integer codes.

    Returns
    -------
    tuple
        ``(encoded, encoders)``: the copied frame with each listed column
        replaced by its codes, and a dict mapping each column name to the
        ``LabelEncoder`` fitted on it (usable later for ``inverse_transform``).
    """
    encoded = data.copy()
    encoders = {}
    for column in entities:
        column_encoder = LabelEncoder()
        # Fit and encode in one step; the fitted encoder is kept for decoding.
        encoded[column] = column_encoder.fit_transform(encoded[column])
        encoders[column] = column_encoder
    return encoded, encoders

def get_table(n_elements):
    """Create a placeholder recommendations table with ``n_elements`` rows.

    Parameters
    ----------
    n_elements : int
        Number of rows to create.

    Returns
    -------
    pandas.DataFrame
        Frame with a default ``RangeIndex`` and two columns: ``session``
        (``0 .. n_elements - 1``) and ``aid`` (all ``None``). Both columns are
        present even when ``n_elements`` is 0.
    """
    # Build the columns explicitly so the schema does not depend on the row
    # count (an empty dict-of-rows would otherwise produce no 'aid' column).
    return pd.DataFrame({
        'session': range(n_elements),
        'aid': [None] * n_elements,
    })

## ALS model 1

Let's take the default ALS method from the implicit library.

In [5]:
# Load the per-event-type training sets and give every row a constant
# interaction value of 1 in the 'type' column.
df_clicks_train = pd.read_parquet(PATH + "/otto_exploded_dataset/clicks/train").assign(type=1)
df_carts_train = pd.read_parquet(PATH + "/otto_exploded_dataset/carts/train").assign(type=1)
df_orders_train = pd.read_parquet(PATH + "/otto_exploded_dataset/orders/train").assign(type=1)

### Clicks

In [6]:
# Re-index sessions and aids to integer codes usable as matrix coordinates.
entities_clicks = ['session', 'aid']
training_clicks, encoders_clicks = encode_data(df_clicks_train, entities=entities_clicks)

# Matrix dimensions: number of distinct sessions (rows) and aids (columns).
num_session_clicks, num_aid_clicks = (
    training_clicks[column].nunique() for column in entities_clicks
)

# Session x aid interaction matrix from (value, (row, col)) triplets; scipy
# sums the values of repeated (session, aid) pairs.
csr_data_clicks = csr_matrix(
    (
        training_clicks['type'],
        (training_clicks['session'], training_clicks['aid']),
    ),
    shape=(num_session_clicks, num_aid_clicks),
)

csr_data_clicks

<385470x774068 sparse matrix of type '<class 'numpy.int64'>'
	with 4162102 stored elements in Compressed Sparse Row format>

In [7]:
# Only these two options are set explicitly; every other ALS hyper-parameter
# is left at the library default.
config_clicks = dict(calculate_training_loss=True, random_state=59)

als_model_clicks = AlternatingLeastSquares(**config_clicks)
als_model_clicks.fit(csr_data_clicks, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [8]:
!mkdir recs_clicks_als_1
!ls

recs_clicks_als_1


In [9]:
# Produce the top-2000 ALS candidates for every encoded session, one batch of
# sessions at a time, writing each batch to its own parquet part file.
batch_size = 2000
batch_count = (num_session_clicks + batch_size - 1) // batch_size  # ceil division

all_users = np.arange(num_session_clicks)
# Iterating the range directly lets tqdm infer the total number of batches.
for i in tqdm(range(batch_count)):
    # Slicing past the end of the array is safe, so the last (possibly
    # shorter) batch needs no special case.
    temp_users = all_users[i*batch_size : (i+1)*batch_size]

    temp_table = get_table(len(temp_users))
    # Decode the dense session codes back to the original session ids.
    temp_table['session'] = encoders_clicks['session'].inverse_transform(temp_users)

    # user_items must contain one row per user in `temp_users`, in the same
    # order. recommend returns (ids, scores); only the ids are kept.
    idxs = als_model_clicks.recommend(
        temp_users,
        csr_data_clicks[temp_users],
        N=2000,
        filter_already_liked_items=False
    )[0]

    # Decode every row of item codes back to the original aids.
    temp_table['aid'] = list(map(encoders_clicks['aid'].inverse_transform, idxs))
    temp_table.to_parquet(f'recs_clicks_als_1/part_{i}.parquet')

0it [00:00, ?it/s]

In [10]:
!zip -r "recs_clicks_als_1.zip" "recs_clicks_als_1"

  adding: recs_clicks_als_1/ (stored 0%)
  adding: recs_clicks_als_1/part_12.parquet (deflated 5%)
  adding: recs_clicks_als_1/part_33.parquet (deflated 5%)
  adding: recs_clicks_als_1/part_53.parquet (deflated 6%)
  adding: recs_clicks_als_1/part_108.parquet (deflated 5%)
  adding: recs_clicks_als_1/part_11.parquet (deflated 5%)
  adding: recs_clicks_als_1/part_94.parquet (deflated 6%)
  adding: recs_clicks_als_1/part_156.parquet (deflated 6%)
  adding: recs_clicks_als_1/part_162.parquet (deflated 5%)
  adding: recs_clicks_als_1/part_92.parquet (deflated 5%)
  adding: recs_clicks_als_1/part_115.parquet (deflated 6%)
  adding: recs_clicks_als_1/part_177.parquet (deflated 5%)
  adding: recs_clicks_als_1/part_191.parquet (deflated 5%)
  adding: recs_clicks_als_1/part_159.parquet (deflated 5%)
  adding: recs_clicks_als_1/part_89.parquet (deflated 6%)
  adding: recs_clicks_als_1/part_118.parquet (deflated 6%)
  adding: recs_clicks_als_1/part_22.parquet (deflated 5%)
  adding: recs_clicks_a

### Carts

In [11]:
# Re-index sessions and aids to integer codes usable as matrix coordinates.
entities_carts = ['session', 'aid']
training_carts, encoders_carts = encode_data(df_carts_train, entities=entities_carts)

# Matrix dimensions: number of distinct sessions (rows) and aids (columns).
num_session_carts, num_aid_carts = (
    training_carts[column].nunique() for column in entities_carts
)

# Session x aid interaction matrix from (value, (row, col)) triplets; scipy
# sums the values of repeated (session, aid) pairs.
csr_data_carts = csr_matrix(
    (
        training_carts['type'],
        (training_carts['session'], training_carts['aid']),
    ),
    shape=(num_session_carts, num_aid_carts),
)

csr_data_carts

<123891x196951 sparse matrix of type '<class 'numpy.int64'>'
	with 427040 stored elements in Compressed Sparse Row format>

In [12]:
# Only these two options are set explicitly; every other ALS hyper-parameter
# is left at the library default.
config_carts = dict(calculate_training_loss=True, random_state=59)

als_model_carts = AlternatingLeastSquares(**config_carts)
als_model_carts.fit(csr_data_carts, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [13]:
!mkdir recs_carts_als_1
!ls

recs_carts_als_1  recs_clicks_als_1  recs_clicks_als_1.zip


In [14]:
# Produce the top-2000 ALS candidates for every encoded session, one batch of
# sessions at a time, writing each batch to its own parquet part file.
batch_size = 2000
batch_count = (num_session_carts + batch_size - 1) // batch_size  # ceil division

all_users = np.arange(num_session_carts)
# Iterating the range directly lets tqdm infer the total number of batches.
for i in tqdm(range(batch_count)):
    # Slicing past the end of the array is safe, so the last (possibly
    # shorter) batch needs no special case.
    temp_users = all_users[i*batch_size : (i+1)*batch_size]

    temp_table = get_table(len(temp_users))
    # Decode the dense session codes back to the original session ids.
    temp_table['session'] = encoders_carts['session'].inverse_transform(temp_users)

    # user_items must contain one row per user in `temp_users`, in the same
    # order. recommend returns (ids, scores); only the ids are kept.
    idxs = als_model_carts.recommend(
        temp_users,
        csr_data_carts[temp_users],
        N=2000,
        filter_already_liked_items=False
    )[0]

    # Decode every row of item codes back to the original aids.
    temp_table['aid'] = list(map(encoders_carts['aid'].inverse_transform, idxs))
    temp_table.to_parquet(f'recs_carts_als_1/part_{i}.parquet')

0it [00:00, ?it/s]

In [15]:
!zip -r "recs_carts_als_1.zip" "recs_carts_als_1"

  adding: recs_carts_als_1/ (stored 0%)
  adding: recs_carts_als_1/part_12.parquet (deflated 7%)
  adding: recs_carts_als_1/part_33.parquet (deflated 8%)
  adding: recs_carts_als_1/part_53.parquet (deflated 7%)
  adding: recs_carts_als_1/part_11.parquet (deflated 8%)
  adding: recs_carts_als_1/part_22.parquet (deflated 8%)
  adding: recs_carts_als_1/part_27.parquet (deflated 8%)
  adding: recs_carts_als_1/part_40.parquet (deflated 7%)
  adding: recs_carts_als_1/part_19.parquet (deflated 7%)
  adding: recs_carts_als_1/part_25.parquet (deflated 8%)
  adding: recs_carts_als_1/part_45.parquet (deflated 7%)
  adding: recs_carts_als_1/part_21.parquet (deflated 8%)
  adding: recs_carts_als_1/part_55.parquet (deflated 8%)
  adding: recs_carts_als_1/part_14.parquet (deflated 8%)
  adding: recs_carts_als_1/part_13.parquet (deflated 8%)
  adding: recs_carts_als_1/part_51.parquet (deflated 7%)
  adding: recs_carts_als_1/part_46.parquet (deflated 8%)
  adding: recs_carts_als_1/part_15.parquet (defl

### Orders

In [16]:
# Re-index sessions and aids to integer codes usable as matrix coordinates.
entities_orders = ['session', 'aid']
training_orders, encoders_orders = encode_data(df_orders_train, entities=entities_orders)

# Matrix dimensions: number of distinct sessions (rows) and aids (columns).
num_session_orders, num_aid_orders = (
    training_orders[column].nunique() for column in entities_orders
)

# Session x aid interaction matrix from (value, (row, col)) triplets; scipy
# sums the values of repeated (session, aid) pairs.
csr_data_orders = csr_matrix(
    (
        training_orders['type'],
        (training_orders['session'], training_orders['aid']),
    ),
    shape=(num_session_orders, num_aid_orders),
)

csr_data_orders

<55845x79824 sparse matrix of type '<class 'numpy.int64'>'
	with 140831 stored elements in Compressed Sparse Row format>

In [17]:
# Only these two options are set explicitly; every other ALS hyper-parameter
# is left at the library default.
config_orders = dict(calculate_training_loss=True, random_state=59)

als_model_orders = AlternatingLeastSquares(**config_orders)
als_model_orders.fit(csr_data_orders, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [18]:
!mkdir recs_orders_als_1
!ls

recs_carts_als_1      recs_clicks_als_1      recs_orders_als_1
recs_carts_als_1.zip  recs_clicks_als_1.zip


In [19]:
# Produce the top-2000 ALS candidates for every encoded session, one batch of
# sessions at a time, writing each batch to its own parquet part file.
batch_size = 2000
batch_count = (num_session_orders + batch_size - 1) // batch_size  # ceil division

all_users = np.arange(num_session_orders)
# Iterating the range directly lets tqdm infer the total number of batches.
for i in tqdm(range(batch_count)):
    # Slicing past the end of the array is safe, so the last (possibly
    # shorter) batch needs no special case.
    temp_users = all_users[i*batch_size : (i+1)*batch_size]

    temp_table = get_table(len(temp_users))
    # Decode the dense session codes back to the original session ids.
    temp_table['session'] = encoders_orders['session'].inverse_transform(temp_users)

    # user_items must contain one row per user in `temp_users`, in the same
    # order. recommend returns (ids, scores); only the ids are kept.
    idxs = als_model_orders.recommend(
        temp_users,
        csr_data_orders[temp_users],
        N=2000,
        filter_already_liked_items=False
    )[0]

    # Decode every row of item codes back to the original aids.
    temp_table['aid'] = list(map(encoders_orders['aid'].inverse_transform, idxs))
    temp_table.to_parquet(f'recs_orders_als_1/part_{i}.parquet')

0it [00:00, ?it/s]

In [20]:
!zip -r "recs_orders_als_1.zip" "recs_orders_als_1"

  adding: recs_orders_als_1/ (stored 0%)
  adding: recs_orders_als_1/part_12.parquet (deflated 2%)
  adding: recs_orders_als_1/part_11.parquet (deflated 2%)
  adding: recs_orders_als_1/part_22.parquet (deflated 2%)
  adding: recs_orders_als_1/part_27.parquet (deflated 2%)
  adding: recs_orders_als_1/part_19.parquet (deflated 2%)
  adding: recs_orders_als_1/part_25.parquet (deflated 2%)
  adding: recs_orders_als_1/part_21.parquet (deflated 2%)
  adding: recs_orders_als_1/part_14.parquet (deflated 2%)
  adding: recs_orders_als_1/part_13.parquet (deflated 2%)
  adding: recs_orders_als_1/part_15.parquet (deflated 2%)
  adding: recs_orders_als_1/part_17.parquet (deflated 2%)
  adding: recs_orders_als_1/part_20.parquet (deflated 2%)
  adding: recs_orders_als_1/part_7.parquet (deflated 2%)
  adding: recs_orders_als_1/part_10.parquet (deflated 2%)
  adding: recs_orders_als_1/part_0.parquet (deflated 2%)
  adding: recs_orders_als_1/part_26.parquet (deflated 2%)
  adding: recs_orders_als_1/part_

## ALS model 2

Let's first preprocess our data and then use standard ALS.

In [21]:
# Load the per-event-type training sets and give every row a constant
# interaction value of 1 in the 'type' column.
df_clicks_train = pd.read_parquet(PATH + "/otto_exploded_dataset/clicks/train").assign(type=1)
df_carts_train = pd.read_parquet(PATH + "/otto_exploded_dataset/carts/train").assign(type=1)
df_orders_train = pd.read_parquet(PATH + "/otto_exploded_dataset/orders/train").assign(type=1)

### Clicks

In [22]:
# Data preprocess: weight every event by how recent it is.
min_ts, max_ts = df_clicks_train['ts'].agg(['min', 'max'])

# Exponential of half the min-max-normalised timestamp: exp(0) for the
# earliest event up to exp(0.5) for the latest one.
df_clicks_train['weight'] = np.exp(
    0.5 * (df_clicks_train['ts'] - min_ts) / (max_ts - min_ts)
)

# Final interaction value: constant scale x event value x recency weight.
scale_factor = 10
df_clicks_train['weighted_type'] = (
    df_clicks_train['type'].mul(scale_factor).mul(df_clicks_train['weight'])
)

In [23]:
# Re-index sessions and aids to integer codes usable as matrix coordinates.
entities_clicks = ['session', 'aid']
training_clicks, encoders_clicks = encode_data(df_clicks_train, entities=entities_clicks)

# Matrix dimensions: number of distinct sessions (rows) and aids (columns).
num_session_clicks, num_aid_clicks = (
    training_clicks[column].nunique() for column in entities_clicks
)

# Session x aid matrix of recency-weighted interactions; scipy sums the values
# of repeated (session, aid) pairs.
csr_data_clicks = csr_matrix(
    (
        training_clicks['weighted_type'],
        (training_clicks['session'], training_clicks['aid']),
    ),
    shape=(num_session_clicks, num_aid_clicks),
)

csr_data_clicks

<385470x774068 sparse matrix of type '<class 'numpy.float64'>'
	with 4162102 stored elements in Compressed Sparse Row format>

In [24]:
# Only these two options are set explicitly; every other ALS hyper-parameter
# is left at the library default.
config_clicks = dict(calculate_training_loss=True, random_state=59)

als_model_clicks = AlternatingLeastSquares(**config_clicks)
als_model_clicks.fit(csr_data_clicks, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [25]:
!mkdir recs_clicks_als_2
!ls

recs_carts_als_1      recs_clicks_als_1.zip  recs_orders_als_1.zip
recs_carts_als_1.zip  recs_clicks_als_2
recs_clicks_als_1     recs_orders_als_1


In [26]:
# Produce the top-2000 ALS candidates for every encoded session, one batch of
# sessions at a time, writing each batch to its own parquet part file.
batch_size = 2000
batch_count = (num_session_clicks + batch_size - 1) // batch_size  # ceil division

all_users = np.arange(num_session_clicks)
# Iterating the range directly lets tqdm infer the total number of batches.
for i in tqdm(range(batch_count)):
    # Slicing past the end of the array is safe, so the last (possibly
    # shorter) batch needs no special case.
    temp_users = all_users[i*batch_size : (i+1)*batch_size]

    temp_table = get_table(len(temp_users))
    # Decode the dense session codes back to the original session ids.
    temp_table['session'] = encoders_clicks['session'].inverse_transform(temp_users)

    # user_items must contain one row per user in `temp_users`, in the same
    # order. recommend returns (ids, scores); only the ids are kept.
    idxs = als_model_clicks.recommend(
        temp_users,
        csr_data_clicks[temp_users],
        N=2000,
        filter_already_liked_items=False
    )[0]

    # Decode every row of item codes back to the original aids.
    temp_table['aid'] = list(map(encoders_clicks['aid'].inverse_transform, idxs))
    temp_table.to_parquet(f'recs_clicks_als_2/part_{i}.parquet')

0it [00:00, ?it/s]

In [27]:
!zip -r "recs_clicks_als_2.zip" "recs_clicks_als_2"

  adding: recs_clicks_als_2/ (stored 0%)
  adding: recs_clicks_als_2/part_12.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_33.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_53.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_108.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_11.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_94.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_156.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_162.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_92.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_115.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_177.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_191.parquet (deflated 2%)
  adding: recs_clicks_als_2/part_159.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_89.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_118.parquet (deflated 3%)
  adding: recs_clicks_als_2/part_22.parquet (deflated 3%)
  adding: recs_clicks_a

### Carts

In [28]:
# Data preprocess: weight every event by how recent it is.
min_ts, max_ts = df_carts_train['ts'].agg(['min', 'max'])

# Exponential of half the min-max-normalised timestamp: exp(0) for the
# earliest event up to exp(0.5) for the latest one.
df_carts_train['weight'] = np.exp(
    0.5 * (df_carts_train['ts'] - min_ts) / (max_ts - min_ts)
)

# Final interaction value: constant scale x event value x recency weight.
scale_factor = 10
df_carts_train['weighted_type'] = (
    df_carts_train['type'].mul(scale_factor).mul(df_carts_train['weight'])
)

In [29]:
# Re-index sessions and aids to integer codes usable as matrix coordinates.
entities_carts = ['session', 'aid']
training_carts, encoders_carts = encode_data(df_carts_train, entities=entities_carts)

# Matrix dimensions: number of distinct sessions (rows) and aids (columns).
num_session_carts, num_aid_carts = (
    training_carts[column].nunique() for column in entities_carts
)

# Session x aid matrix of recency-weighted interactions; scipy sums the values
# of repeated (session, aid) pairs.
csr_data_carts = csr_matrix(
    (
        training_carts['weighted_type'],
        (training_carts['session'], training_carts['aid']),
    ),
    shape=(num_session_carts, num_aid_carts),
)

csr_data_carts

<123891x196951 sparse matrix of type '<class 'numpy.float64'>'
	with 427040 stored elements in Compressed Sparse Row format>

In [30]:
# Only these two options are set explicitly; every other ALS hyper-parameter
# is left at the library default.
config_carts = dict(calculate_training_loss=True, random_state=59)

als_model_carts = AlternatingLeastSquares(**config_carts)
als_model_carts.fit(csr_data_carts, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [31]:
!mkdir recs_carts_als_2
!ls

recs_carts_als_1      recs_clicks_als_1      recs_clicks_als_2.zip
recs_carts_als_1.zip  recs_clicks_als_1.zip  recs_orders_als_1
recs_carts_als_2      recs_clicks_als_2      recs_orders_als_1.zip


In [32]:
# Produce the top-2000 ALS candidates for every encoded session, one batch of
# sessions at a time, writing each batch to its own parquet part file.
batch_size = 2000
batch_count = (num_session_carts + batch_size - 1) // batch_size  # ceil division

all_users = np.arange(num_session_carts)
# Iterating the range directly lets tqdm infer the total number of batches.
for i in tqdm(range(batch_count)):
    # Slicing past the end of the array is safe, so the last (possibly
    # shorter) batch needs no special case.
    temp_users = all_users[i*batch_size : (i+1)*batch_size]

    temp_table = get_table(len(temp_users))
    # Decode the dense session codes back to the original session ids.
    temp_table['session'] = encoders_carts['session'].inverse_transform(temp_users)

    # user_items must contain one row per user in `temp_users`, in the same
    # order. recommend returns (ids, scores); only the ids are kept.
    idxs = als_model_carts.recommend(
        temp_users,
        csr_data_carts[temp_users],
        N=2000,
        filter_already_liked_items=False
    )[0]

    # Decode every row of item codes back to the original aids.
    temp_table['aid'] = list(map(encoders_carts['aid'].inverse_transform, idxs))
    temp_table.to_parquet(f'recs_carts_als_2/part_{i}.parquet')

0it [00:00, ?it/s]

In [33]:
!zip -r "recs_carts_als_2.zip" "recs_carts_als_2"

  adding: recs_carts_als_2/ (stored 0%)
  adding: recs_carts_als_2/part_12.parquet (deflated 5%)
  adding: recs_carts_als_2/part_33.parquet (deflated 6%)
  adding: recs_carts_als_2/part_53.parquet (deflated 6%)
  adding: recs_carts_als_2/part_11.parquet (deflated 5%)
  adding: recs_carts_als_2/part_22.parquet (deflated 6%)
  adding: recs_carts_als_2/part_27.parquet (deflated 5%)
  adding: recs_carts_als_2/part_40.parquet (deflated 5%)
  adding: recs_carts_als_2/part_19.parquet (deflated 6%)
  adding: recs_carts_als_2/part_25.parquet (deflated 5%)
  adding: recs_carts_als_2/part_45.parquet (deflated 5%)
  adding: recs_carts_als_2/part_21.parquet (deflated 6%)
  adding: recs_carts_als_2/part_55.parquet (deflated 5%)
  adding: recs_carts_als_2/part_14.parquet (deflated 5%)
  adding: recs_carts_als_2/part_13.parquet (deflated 6%)
  adding: recs_carts_als_2/part_51.parquet (deflated 5%)
  adding: recs_carts_als_2/part_46.parquet (deflated 5%)
  adding: recs_carts_als_2/part_15.parquet (defl

### Orders

In [34]:
# Data preprocess: weight every event by how recent it is.
min_ts, max_ts = df_orders_train['ts'].agg(['min', 'max'])

# Exponential of half the min-max-normalised timestamp: exp(0) for the
# earliest event up to exp(0.5) for the latest one.
df_orders_train['weight'] = np.exp(
    0.5 * (df_orders_train['ts'] - min_ts) / (max_ts - min_ts)
)

# Final interaction value: constant scale x event value x recency weight.
scale_factor = 10
df_orders_train['weighted_type'] = (
    df_orders_train['type'].mul(scale_factor).mul(df_orders_train['weight'])
)

In [35]:
# Re-index sessions and aids to integer codes usable as matrix coordinates.
entities_orders = ['session', 'aid']
training_orders, encoders_orders = encode_data(df_orders_train, entities=entities_orders)

# Matrix dimensions: number of distinct sessions (rows) and aids (columns).
num_session_orders, num_aid_orders = (
    training_orders[column].nunique() for column in entities_orders
)

# Session x aid matrix of recency-weighted interactions; scipy sums the values
# of repeated (session, aid) pairs.
csr_data_orders = csr_matrix(
    (
        training_orders['weighted_type'],
        (training_orders['session'], training_orders['aid']),
    ),
    shape=(num_session_orders, num_aid_orders),
)

csr_data_orders

<55845x79824 sparse matrix of type '<class 'numpy.float64'>'
	with 140831 stored elements in Compressed Sparse Row format>

In [36]:
# Only these two options are set explicitly; every other ALS hyper-parameter
# is left at the library default.
config_orders = dict(calculate_training_loss=True, random_state=59)

als_model_orders = AlternatingLeastSquares(**config_orders)
als_model_orders.fit(csr_data_orders, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [37]:
!mkdir recs_orders_als_2
!ls

recs_carts_als_1      recs_clicks_als_1      recs_orders_als_1
recs_carts_als_1.zip  recs_clicks_als_1.zip  recs_orders_als_1.zip
recs_carts_als_2      recs_clicks_als_2      recs_orders_als_2
recs_carts_als_2.zip  recs_clicks_als_2.zip


In [38]:
# Produce the top-2000 ALS candidates for every encoded session, one batch of
# sessions at a time, writing each batch to its own parquet part file.
batch_size = 2000
batch_count = (num_session_orders + batch_size - 1) // batch_size  # ceil division

all_users = np.arange(num_session_orders)
# Iterating the range directly lets tqdm infer the total number of batches.
for i in tqdm(range(batch_count)):
    # Slicing past the end of the array is safe, so the last (possibly
    # shorter) batch needs no special case.
    temp_users = all_users[i*batch_size : (i+1)*batch_size]

    temp_table = get_table(len(temp_users))
    # Decode the dense session codes back to the original session ids.
    temp_table['session'] = encoders_orders['session'].inverse_transform(temp_users)

    # user_items must contain one row per user in `temp_users`, in the same
    # order. recommend returns (ids, scores); only the ids are kept.
    idxs = als_model_orders.recommend(
        temp_users,
        csr_data_orders[temp_users],
        N=2000,
        filter_already_liked_items=False
    )[0]

    # Decode every row of item codes back to the original aids.
    temp_table['aid'] = list(map(encoders_orders['aid'].inverse_transform, idxs))
    temp_table.to_parquet(f'recs_orders_als_2/part_{i}.parquet')

0it [00:00, ?it/s]

In [39]:
!zip -r "recs_orders_als_2.zip" "recs_orders_als_2"

  adding: recs_orders_als_2/ (stored 0%)
  adding: recs_orders_als_2/part_12.parquet (deflated 2%)
  adding: recs_orders_als_2/part_11.parquet (deflated 2%)
  adding: recs_orders_als_2/part_22.parquet (deflated 2%)
  adding: recs_orders_als_2/part_27.parquet (deflated 2%)
  adding: recs_orders_als_2/part_19.parquet (deflated 2%)
  adding: recs_orders_als_2/part_25.parquet (deflated 2%)
  adding: recs_orders_als_2/part_21.parquet (deflated 2%)
  adding: recs_orders_als_2/part_14.parquet (deflated 2%)
  adding: recs_orders_als_2/part_13.parquet (deflated 2%)
  adding: recs_orders_als_2/part_15.parquet (deflated 3%)
  adding: recs_orders_als_2/part_17.parquet (deflated 2%)
  adding: recs_orders_als_2/part_20.parquet (deflated 2%)
  adding: recs_orders_als_2/part_7.parquet (deflated 2%)
  adding: recs_orders_als_2/part_10.parquet (deflated 2%)
  adding: recs_orders_als_2/part_0.parquet (deflated 2%)
  adding: recs_orders_als_2/part_26.parquet (deflated 2%)
  adding: recs_orders_als_2/part_

## ALS model 3

Let's add an additional constraint on the data to the previous model: the interaction weights are clipped to the range [0, 20].

In [40]:
# Load the per-event-type training sets and give every row a constant
# interaction value of 1 in the 'type' column.
df_clicks_train = pd.read_parquet(PATH + "/otto_exploded_dataset/clicks/train").assign(type=1)
df_carts_train = pd.read_parquet(PATH + "/otto_exploded_dataset/carts/train").assign(type=1)
df_orders_train = pd.read_parquet(PATH + "/otto_exploded_dataset/orders/train").assign(type=1)

### Clicks

In [41]:
# Data preprocess: weight every event by how recent it is.
min_ts, max_ts = df_clicks_train['ts'].agg(['min', 'max'])

# Exponential of half the min-max-normalised timestamp: exp(0) for the
# earliest event up to exp(0.5) for the latest one.
df_clicks_train['weight'] = np.exp(
    0.5 * (df_clicks_train['ts'] - min_ts) / (max_ts - min_ts)
)

# Final interaction value: constant scale x event value x recency weight.
scale_factor = 10
df_clicks_train['weighted_type'] = (
    df_clicks_train['type'].mul(scale_factor).mul(df_clicks_train['weight'])
)

In [42]:
# Re-index sessions and aids to integer codes usable as matrix coordinates.
entities_clicks = ['session', 'aid']
training_clicks, encoders_clicks = encode_data(df_clicks_train, entities=entities_clicks)

# Matrix dimensions: number of distinct sessions (rows) and aids (columns).
num_session_clicks, num_aid_clicks = (
    training_clicks[column].nunique() for column in entities_clicks
)

# Session x aid matrix of recency-weighted interactions; scipy sums the values
# of repeated (session, aid) pairs.
csr_data_clicks = csr_matrix(
    (
        training_clicks['weighted_type'],
        (training_clicks['session'], training_clicks['aid']),
    ),
    shape=(num_session_clicks, num_aid_clicks),
)

# Cap the stored interaction values to the range [0, 20].
csr_data_clicks.data = csr_data_clicks.data.clip(0, 20)

csr_data_clicks

<385470x774068 sparse matrix of type '<class 'numpy.float64'>'
	with 4162102 stored elements in Compressed Sparse Row format>

In [43]:
# Only these two options are set explicitly; every other ALS hyper-parameter
# is left at the library default.
config_clicks = dict(calculate_training_loss=True, random_state=59)

als_model_clicks = AlternatingLeastSquares(**config_clicks)
als_model_clicks.fit(csr_data_clicks, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [44]:
!mkdir recs_clicks_als_3
!ls

recs_carts_als_1      recs_clicks_als_1.zip  recs_orders_als_1.zip
recs_carts_als_1.zip  recs_clicks_als_2      recs_orders_als_2
recs_carts_als_2      recs_clicks_als_2.zip  recs_orders_als_2.zip
recs_carts_als_2.zip  recs_clicks_als_3
recs_clicks_als_1     recs_orders_als_1


In [45]:
# Produce the top-2000 ALS candidates for every encoded session, one batch of
# sessions at a time, writing each batch to its own parquet part file.
batch_size = 2000
batch_count = (num_session_clicks + batch_size - 1) // batch_size  # ceil division

all_users = np.arange(num_session_clicks)
# Iterating the range directly lets tqdm infer the total number of batches.
for i in tqdm(range(batch_count)):
    # Slicing past the end of the array is safe, so the last (possibly
    # shorter) batch needs no special case.
    temp_users = all_users[i*batch_size : (i+1)*batch_size]

    temp_table = get_table(len(temp_users))
    # Decode the dense session codes back to the original session ids.
    temp_table['session'] = encoders_clicks['session'].inverse_transform(temp_users)

    # user_items must contain one row per user in `temp_users`, in the same
    # order. recommend returns (ids, scores); only the ids are kept.
    idxs = als_model_clicks.recommend(
        temp_users,
        csr_data_clicks[temp_users],
        N=2000,
        filter_already_liked_items=False
    )[0]

    # Decode every row of item codes back to the original aids.
    temp_table['aid'] = list(map(encoders_clicks['aid'].inverse_transform, idxs))
    temp_table.to_parquet(f'recs_clicks_als_3/part_{i}.parquet')

0it [00:00, ?it/s]

In [46]:
!zip -r "recs_clicks_als_3.zip" "recs_clicks_als_3"

  adding: recs_clicks_als_3/ (stored 0%)
  adding: recs_clicks_als_3/part_12.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_33.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_53.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_108.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_11.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_94.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_156.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_162.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_92.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_115.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_177.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_191.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_159.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_89.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_118.parquet (deflated 4%)
  adding: recs_clicks_als_3/part_22.parquet (deflated 4%)
  adding: recs_clicks_a

### Carts

In [47]:
# Data preprocess: weight every event by how recent it is.
min_ts, max_ts = df_carts_train['ts'].agg(['min', 'max'])

# Exponential of half the min-max-normalised timestamp: exp(0) for the
# earliest event up to exp(0.5) for the latest one.
df_carts_train['weight'] = np.exp(
    0.5 * (df_carts_train['ts'] - min_ts) / (max_ts - min_ts)
)

# Final interaction value: constant scale x event value x recency weight.
scale_factor = 10
df_carts_train['weighted_type'] = (
    df_carts_train['type'].mul(scale_factor).mul(df_carts_train['weight'])
)

In [48]:
# Re-index sessions and aids to integer codes usable as matrix coordinates.
entities_carts = ['session', 'aid']
training_carts, encoders_carts = encode_data(df_carts_train, entities=entities_carts)

# Matrix dimensions: number of distinct sessions (rows) and aids (columns).
num_session_carts, num_aid_carts = (
    training_carts[column].nunique() for column in entities_carts
)

# Session x aid matrix of recency-weighted interactions; scipy sums the values
# of repeated (session, aid) pairs.
csr_data_carts = csr_matrix(
    (
        training_carts['weighted_type'],
        (training_carts['session'], training_carts['aid']),
    ),
    shape=(num_session_carts, num_aid_carts),
)

# Cap the stored interaction values to the range [0, 20].
csr_data_carts.data = csr_data_carts.data.clip(0, 20)

csr_data_carts

<123891x196951 sparse matrix of type '<class 'numpy.float64'>'
	with 427040 stored elements in Compressed Sparse Row format>

In [49]:
# Only these two options are set explicitly; every other ALS hyper-parameter
# is left at the library default.
config_carts = dict(calculate_training_loss=True, random_state=59)

als_model_carts = AlternatingLeastSquares(**config_carts)
als_model_carts.fit(csr_data_carts, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [50]:
!mkdir recs_carts_als_3
!ls

recs_carts_als_1      recs_clicks_als_1      recs_clicks_als_3.zip
recs_carts_als_1.zip  recs_clicks_als_1.zip  recs_orders_als_1
recs_carts_als_2      recs_clicks_als_2      recs_orders_als_1.zip
recs_carts_als_2.zip  recs_clicks_als_2.zip  recs_orders_als_2
recs_carts_als_3      recs_clicks_als_3      recs_orders_als_2.zip


In [51]:
# Produce the top-2000 ALS candidates for every encoded session, one batch of
# sessions at a time, writing each batch to its own parquet part file.
batch_size = 2000
batch_count = (num_session_carts + batch_size - 1) // batch_size  # ceil division

all_users = np.arange(num_session_carts)
# Iterating the range directly lets tqdm infer the total number of batches.
for i in tqdm(range(batch_count)):
    # Slicing past the end of the array is safe, so the last (possibly
    # shorter) batch needs no special case.
    temp_users = all_users[i*batch_size : (i+1)*batch_size]

    temp_table = get_table(len(temp_users))
    # Decode the dense session codes back to the original session ids.
    temp_table['session'] = encoders_carts['session'].inverse_transform(temp_users)

    # user_items must contain one row per user in `temp_users`, in the same
    # order. recommend returns (ids, scores); only the ids are kept.
    idxs = als_model_carts.recommend(
        temp_users,
        csr_data_carts[temp_users],
        N=2000,
        filter_already_liked_items=False
    )[0]

    # Decode every row of item codes back to the original aids.
    temp_table['aid'] = list(map(encoders_carts['aid'].inverse_transform, idxs))
    temp_table.to_parquet(f'recs_carts_als_3/part_{i}.parquet')

0it [00:00, ?it/s]

In [52]:
!zip -r "recs_carts_als_3.zip" "recs_carts_als_3"

  adding: recs_carts_als_3/ (stored 0%)
  adding: recs_carts_als_3/part_12.parquet (deflated 5%)
  adding: recs_carts_als_3/part_33.parquet (deflated 5%)
  adding: recs_carts_als_3/part_53.parquet (deflated 5%)
  adding: recs_carts_als_3/part_11.parquet (deflated 5%)
  adding: recs_carts_als_3/part_22.parquet (deflated 5%)
  adding: recs_carts_als_3/part_27.parquet (deflated 5%)
  adding: recs_carts_als_3/part_40.parquet (deflated 5%)
  adding: recs_carts_als_3/part_19.parquet (deflated 5%)
  adding: recs_carts_als_3/part_25.parquet (deflated 5%)
  adding: recs_carts_als_3/part_45.parquet (deflated 5%)
  adding: recs_carts_als_3/part_21.parquet (deflated 5%)
  adding: recs_carts_als_3/part_55.parquet (deflated 5%)
  adding: recs_carts_als_3/part_14.parquet (deflated 5%)
  adding: recs_carts_als_3/part_13.parquet (deflated 6%)
  adding: recs_carts_als_3/part_51.parquet (deflated 5%)
  adding: recs_carts_als_3/part_46.parquet (deflated 5%)
  adding: recs_carts_als_3/part_15.parquet (defl

### Orders

In [53]:
# Data preprocess: weight every event by how recent it is.
min_ts, max_ts = df_orders_train['ts'].agg(['min', 'max'])

# Exponential of half the min-max-normalised timestamp: exp(0) for the
# earliest event up to exp(0.5) for the latest one.
df_orders_train['weight'] = np.exp(
    0.5 * (df_orders_train['ts'] - min_ts) / (max_ts - min_ts)
)

# Final interaction value: constant scale x event value x recency weight.
scale_factor = 10
df_orders_train['weighted_type'] = (
    df_orders_train['type'].mul(scale_factor).mul(df_orders_train['weight'])
)

In [54]:
# Re-index sessions and aids to integer codes usable as matrix coordinates.
entities_orders = ['session', 'aid']
training_orders, encoders_orders = encode_data(df_orders_train, entities=entities_orders)

# Matrix dimensions: number of distinct sessions (rows) and aids (columns).
num_session_orders, num_aid_orders = (
    training_orders[column].nunique() for column in entities_orders
)

# Session x aid matrix of recency-weighted interactions; scipy sums the values
# of repeated (session, aid) pairs.
csr_data_orders = csr_matrix(
    (
        training_orders['weighted_type'],
        (training_orders['session'], training_orders['aid']),
    ),
    shape=(num_session_orders, num_aid_orders),
)

# Cap the stored interaction values to the range [0, 20].
csr_data_orders.data = csr_data_orders.data.clip(0, 20)

csr_data_orders

<55845x79824 sparse matrix of type '<class 'numpy.float64'>'
	with 140831 stored elements in Compressed Sparse Row format>

In [55]:
# Only these two options are set explicitly; every other ALS hyper-parameter
# is left at the library default.
config_orders = dict(calculate_training_loss=True, random_state=59)

als_model_orders = AlternatingLeastSquares(**config_orders)
als_model_orders.fit(csr_data_orders, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [56]:
!mkdir recs_orders_als_3
!ls

recs_carts_als_1      recs_clicks_als_1      recs_orders_als_1
recs_carts_als_1.zip  recs_clicks_als_1.zip  recs_orders_als_1.zip
recs_carts_als_2      recs_clicks_als_2      recs_orders_als_2
recs_carts_als_2.zip  recs_clicks_als_2.zip  recs_orders_als_2.zip
recs_carts_als_3      recs_clicks_als_3      recs_orders_als_3
recs_carts_als_3.zip  recs_clicks_als_3.zip


In [57]:
# Produce the top-2000 ALS candidates for every encoded session, one batch of
# sessions at a time, writing each batch to its own parquet part file.
batch_size = 2000
batch_count = (num_session_orders + batch_size - 1) // batch_size  # ceil division

all_users = np.arange(num_session_orders)
# Iterating the range directly lets tqdm infer the total number of batches.
for i in tqdm(range(batch_count)):
    # Slicing past the end of the array is safe, so the last (possibly
    # shorter) batch needs no special case.
    temp_users = all_users[i*batch_size : (i+1)*batch_size]

    temp_table = get_table(len(temp_users))
    # Decode the dense session codes back to the original session ids.
    temp_table['session'] = encoders_orders['session'].inverse_transform(temp_users)

    # user_items must contain one row per user in `temp_users`, in the same
    # order. recommend returns (ids, scores); only the ids are kept.
    idxs = als_model_orders.recommend(
        temp_users,
        csr_data_orders[temp_users],
        N=2000,
        filter_already_liked_items=False
    )[0]

    # Decode every row of item codes back to the original aids.
    temp_table['aid'] = list(map(encoders_orders['aid'].inverse_transform, idxs))
    temp_table.to_parquet(f'recs_orders_als_3/part_{i}.parquet')

0it [00:00, ?it/s]

In [58]:
!zip -r "recs_orders_als_3.zip" "recs_orders_als_3"

  adding: recs_orders_als_3/ (stored 0%)
  adding: recs_orders_als_3/part_12.parquet (deflated 4%)
  adding: recs_orders_als_3/part_11.parquet (deflated 4%)
  adding: recs_orders_als_3/part_22.parquet (deflated 4%)
  adding: recs_orders_als_3/part_27.parquet (deflated 4%)
  adding: recs_orders_als_3/part_19.parquet (deflated 4%)
  adding: recs_orders_als_3/part_25.parquet (deflated 4%)
  adding: recs_orders_als_3/part_21.parquet (deflated 4%)
  adding: recs_orders_als_3/part_14.parquet (deflated 4%)
  adding: recs_orders_als_3/part_13.parquet (deflated 4%)
  adding: recs_orders_als_3/part_15.parquet (deflated 4%)
  adding: recs_orders_als_3/part_17.parquet (deflated 4%)
  adding: recs_orders_als_3/part_20.parquet (deflated 4%)
  adding: recs_orders_als_3/part_7.parquet (deflated 4%)
  adding: recs_orders_als_3/part_10.parquet (deflated 4%)
  adding: recs_orders_als_3/part_0.parquet (deflated 3%)
  adding: recs_orders_als_3/part_26.parquet (deflated 4%)
  adding: recs_orders_als_3/part_