In [1]:
import os
import logging
import glob
import json
import sys
import multiprocessing

import pandas as pd
import polars as pl
from gensim.models import Word2Vec
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import joblib
from pandarallel import pandarallel
pandarallel.initialize()

logger = logging.getLogger()

  from .autonotebook import tqdm as notebook_tqdm


INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# 任务描述
## clicks
只预测一个
## carts
预测20个
## orders
预测20个

In [2]:
DATADIR = "/home/search3/lichunyu/dataset/otto-recommender-system"

In [3]:
test_data_path = os.path.join(DATADIR, "test.parquet")
train_data_path = os.path.join(DATADIR, "train.parquet")
type2idx_path = os.path.join(DATADIR, "type2id.pkl")
idx2type_path = os.path.join(DATADIR, "id2type.pkl")
sample_submission_path = os.path.join(DATADIR, "sample_submission.csv")

In [4]:
train_data = pl.read_parquet(train_data_path)
test_data = pl.read_parquet(test_data_path)

train_data

session,aid,ts,type
i32,i32,i32,u8
0,1517085,1659304800,0
0,1563459,1659304904,0
0,1309446,1659367439,0
0,16246,1659367719,0
0,1781822,1659367871,0
0,1152674,1659367885,0
0,1649869,1659369893,1
0,461689,1659369898,1
0,305831,1659370027,2
0,461689,1659370027,2


In [8]:
type2idx = pd.read_pickle(type2idx_path)
idx2type = pd.read_pickle(idx2type_path)
print(f"type2idx is: {chr(10)}    {type2idx}")
print(f"idx2type is: {chr(10)}    {idx2type}")

type2idx is: 
    {'clicks': 0, 'carts': 1, 'orders': 2}
idx2type is: 
    ['clicks', 'carts', 'orders']


In [10]:
df_sentences = pl.concat([train_data, test_data]).groupby('session').agg(
    pl.col('aid').alias('sentence')
)

In [32]:
df_train_group = train_data.groupby("session").agg(
    (pl.col("aid"),
    pl.col("ts"),
    pl.col("type"))
)
df_train_group

session,aid,ts,type
i32,list[i32],list[i32],list[u8]
6368,"[100129, 597326, ... 737942]","[1659305032, 1659336329, ... 1661712952]","[0, 0, ... 0]"
8659776,"[1784774, 362805]","[1660522695, 1660525651]","[0, 0]"
8805280,"[50503, 1366533, ... 191879]","[1660562199, 1660562348, ... 1660563033]","[0, 0, ... 0]"
9254400,"[932022, 932022]","[1660649076, 1660652257]","[0, 0]"
3095424,"[986465, 1697501, ... 1405679]","[1659594816, 1661184436, ... 1661185454]","[0, 0, ... 0]"
11402624,"[362609, 500159, ... 1036074]","[1661231881, 1661255748, ... 1661699006]","[0, 0, ... 0]"
2646976,"[1344441, 1192169]","[1659538479, 1659538695]","[0, 0]"
10147296,"[51031, 874657, ... 228844]","[1660885715, 1660935860, ... 1661026384]","[0, 0, ... 0]"
409440,"[1235356, 1235356, ... 102278]","[1659348578, 1659348607, ... 1661606489]","[0, 0, ... 0]"
2952992,"[995431, 263264, ... 953530]","[1659558902, 1659558917, ... 1661685775]","[0, 0, ... 0]"


In [30]:
train_data

session,aid,ts,type
i32,i32,i32,u8
0,1517085,1659304800,0
0,1563459,1659304904,0
0,1309446,1659367439,0
0,16246,1659367719,0
0,1781822,1659367871,0
0,1152674,1659367885,0
0,1649869,1659369893,1
0,461689,1659369898,1
0,305831,1659370027,2
0,461689,1659370027,2


In [14]:
sentences = df_sentences['sentence'].to_list()

In [18]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

word2vec = Word2Vec(
                 sentences=sentences, 
                 window = 5, # 5: 0.509, 10: 0.508
#                  sg = 0,  # 1 -> skip-gram, 0 -> CBOW    1: 0.509, 0: 0.508
                 vector_size=100,  # 200: 0.509, 150: 0.509, 100: 0.510, 50: 0.509
#                  sample=6e-5,
                 alpha=0.04,       # 0.1: 0.494, 0.05: 0.507, 0.04: 0.509, 0.03: 0.509, 0.01: 0.503 <-200
                                   # 0.04: 0.510, 0.02: 0.509 <-100
                 min_alpha=0.01,  # 0.0001: 0.510, 0.001: 0.509, 0.01: 0.510
                 min_count=1, 
                 negative=5,  # 5: 0.510, 20: 0.513 (alpha=0.02), 30: 0.510 (alpha=0.04)
                 workers=cores-1
                )

In [20]:
joblib.dump(word2vec, "word2vec.m")

['word2vec.m']

In [5]:
df_train_group = train_data.groupby("session").agg(
    (pl.col("aid"),
    pl.col("ts"),
    pl.col("type"))
)
df_train_group

session,aid,ts,type
i32,list[i32],list[i32],list[u8]
11627232,"[265478, 1646989]","[1661282733, 1661283165]","[0, 0]"
2253344,"[1004996, 242290, ... 789356]","[1659506473, 1659510400, ... 1661712927]","[0, 0, ... 0]"
10057088,"[1844457, 523934, ... 1199089]","[1660847707, 1661279790, ... 1661603560]","[0, 0, ... 0]"
10688128,"[357564, 1417794]","[1661020292, 1661259952]","[0, 0]"
11604480,"[1026033, 504955]","[1661278704, 1661278940]","[0, 0]"
4926592,"[1393853, 171073, ... 171073]","[1659863386, 1661720526, ... 1661720580]","[0, 0, ... 0]"
2207552,"[699087, 699087]","[1659496783, 1659496795]","[0, 0]"
1291008,"[1392191, 6517, ... 1698389]","[1659414940, 1659414948, ... 1659415126]","[0, 0, ... 0]"
5500992,"[1092423, 79993]","[1659958720, 1659958789]","[0, 0]"
9054336,"[794192, 197530, ... 1021507]","[1660591288, 1660591294, ... 1660591304]","[0, 0, ... 0]"


In [6]:
df_train, df_dev = train_test_split(df_train_group, test_size=0.2)

In [7]:
df_dev

session,aid,ts,type
i32,list[i32],list[i32],list[u8]
8893062,"[691398, 1283903, ... 1808294]","[1660573358, 1660573579, ... 1660733396]","[0, 0, ... 0]"
7620792,"[1760714, 155954, ... 155954]","[1660370095, 1660370233, ... 1660370266]","[0, 0, ... 0]"
9303204,"[816594, 41762]","[1660657576, 1660668743]","[0, 0]"
6017195,"[890995, 545397]","[1660055612, 1660058109]","[0, 0]"
8561484,"[1577267, 1497545]","[1660504208, 1660504580]","[0, 0]"
9866212,"[385453, 1018338]","[1660809283, 1661048577]","[0, 0]"
853727,"[340679, 21151]","[1659372480, 1659440546]","[0, 0]"
9983765,"[636390, 1334511, ... 1312024]","[1660835245, 1660835321, ... 1660859707]","[0, 0, ... 0]"
9400020,"[868117, 868117, ... 868117]","[1660673547, 1660673556, ... 1660673580]","[0, 1, ... 0]"
2344499,"[1156699, 1144352, ... 1756194]","[1659515963, 1659515974, ... 1659858275]","[0, 0, ... 0]"


In [8]:
# df_dev.to_pandas().to_parquet("dev.parquet")
df_dev = df_dev.to_pandas()

In [9]:
def filter_by_idx(x, type_idx):
    aids, _type = x["aid"], x["type"]
    res = []
    for a, t in zip(aids[int(len(x)*0.7):], _type[int(len(x)*0.7):]):
        if t == type_idx:
            res.append(a)
    return res

# df_dev = df_dev.to_pandas()
df_dev["sentences"] = df_dev["aid"].apply(lambda x: x[:int(len(x)*0.7)])
df_dev["type_ids"] = df_dev["type"].apply(lambda x: x[:int(len(x)*0.7)])
df_dev["clicks"] = df_dev[["aid", "type"]].apply(filter_by_idx, axis=1, args=(0,))
df_dev["carts"] = df_dev[["aid", "type"]].apply(filter_by_idx, axis=1, args=(1,))
df_dev["orders"] = df_dev[["aid", "type"]].apply(filter_by_idx, axis=1, args=(2,))
df_dev

Unnamed: 0,session,aid,ts,type,sentences,type_ids,clicks,carts,orders
0,8893062,"[691398, 1283903, 1808294, 957461, 1392269, 23...","[1660573358, 1660573579, 1660573610, 166057373...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[691398, 1283903, 1808294, 957461, 1392269, 23...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1283903, 1808294, 957461, 1392269, 235289, 12...",[1808294],[1808294]
1,7620792,"[1760714, 155954, 674152, 1189162, 674152, 155...","[1660370095, 1660370233, 1660370249, 166037026...","[0, 0, 0, 0, 0, 0]","[1760714, 155954, 674152, 1189162]","[0, 0, 0, 0]","[155954, 674152, 1189162, 674152, 155954]",[],[]
2,9303204,"[816594, 41762]","[1660657576, 1660668743]","[0, 0]",[816594],[0],[41762],[],[]
3,6017195,"[890995, 545397]","[1660055612, 1660058109]","[0, 0]",[890995],[0],[545397],[],[]
4,8561484,"[1577267, 1497545]","[1660504208, 1660504580]","[0, 0]",[1577267],[0],[1497545],[],[]
...,...,...,...,...,...,...,...,...,...
2579951,9444027,"[1464830, 1539043]","[1660679754, 1660680242]","[0, 0]",[1464830],[0],[1539043],[],[]
2579952,5560629,"[99846, 1485561, 781589, 506879, 781589, 87272...","[1659967919, 1659967926, 1659967933, 165996797...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[99846, 1485561, 781589, 506879, 781589, 87272...","[0, 0, 0, 0, 0, 0, 0]","[1485561, 781589, 506879, 781589, 872725, 1488...",[],[]
2579953,1529115,"[1794432, 1794432, 1510009, 1794432, 1794432, ...","[1659437069, 1659610179, 1659610850, 165967732...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, ...","[1794432, 1794432, 1510009, 1794432, 1794432, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, ...","[1794432, 1510009, 1794432, 1794432, 1212546, ...","[405140, 1762856, 1762856]","[405140, 1762856]"
2579954,9306005,"[1427212, 1289166, 1409416, 1289166, 1289166, ...","[1660658052, 1660658295, 1660658687, 166065888...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1427212, 1289166, 1409416, 1289166, 1289166, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1289166, 1409416, 1289166, 1289166, 1289166, ...",[],[]


In [14]:
df_dev["sample_sentences"] = df_dev["sentences"].apply(lambda x: x[-20:])
df_dev["sample_type_ids"] = df_dev["type_ids"].apply(lambda x: x[-20:])
df_dev["sample_ts"] = df_dev["ts"].apply(lambda x: x[-20:])

In [17]:
df_dev[["sample_ts", "sample_sentences", "sample_type_ids", "clicks", "carts", "orders"]].to_pickle("dev_data_sample.pkl")

In [91]:
df_dev[["ts", "sentences", "type_ids", "clicks", "carts", "orders"]].to_parquet("dev_data_pure.parquet")

In [None]:
def filter_by_idx(x, type_idx):
    aids, _type = x["aid"], x["type"]
    res = []
    for a, t in zip(aids[int(len(x)*0.7):], _type[int(len(x)*0.7):]):
        if t == type_idx:
            res.append(a)
    return res

df_train = df_train.to_pandas()
df_train["sentences"] = df_train["aid"].apply(lambda x: x[:int(len(x)*0.7)])
df_train["type_ids"] = df_train["type"].apply(lambda x: x[:int(len(x)*0.7)])
df_train["clicks"] = df_train[["aid", "type"]].apply(filter_by_idx, axis=1, args=(0,))
df_train["carts"] = df_train[["aid", "type"]].apply(filter_by_idx, axis=1, args=(1,))
df_train["orders"] = df_train[["aid", "type"]].apply(filter_by_idx, axis=1, args=(2,))

df_train["sample_sentences"] = df_train["sentences"].apply(lambda x: x[-20:])
df_train["sample_type_ids"] = df_train["type_ids"].apply(lambda x: x[-20:])
df_train["sample_ts"] = df_train["ts"].apply(lambda x: x[-20:])

df_train[["sample_ts", "sample_sentences", "sample_type_ids", "clicks", "carts", "orders"]].to_pickle("train_data_sample.pkl")
df_train

In [94]:
df_train[["ts", "sentences", "type_ids", "clicks", "carts", "orders"]].to_parquet("train_data_pure.parquet")