In [22]:
import os
import logging
import glob
import json
import sys
import multiprocessing

import pandas as pd
import polars as pl
from gensim.models import Word2Vec
import torch
import torch.nn as nn
import joblib
from pandarallel import pandarallel
# Initialise pandarallel with its default settings; the INFO lines it prints
# (worker count, memory file system) appear in the cell output.
pandarallel.initialize()

# getLogger() with no name returns the root logger.
logger = logging.getLogger()

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# 任务描述
## clicks
真实标签只有一个(下一次点击的 aid),但提交时最多可以给出 20 个候选 aid
## carts
最多预测 20 个 aid(之后加入购物车的商品)
## orders
最多预测 20 个 aid(之后下单的商品)

In [2]:
# Root directory of the OTTO recommender-system dataset.
# Set the OTTO_DATADIR environment variable to run the notebook on another
# machine; when it is unset the original hard-coded location is used.
DATADIR = os.environ.get(
    "OTTO_DATADIR",
    "/home/search3/lichunyu/dataset/otto-recommender-system",
)

In [3]:
# Locations of every dataset artifact used below, each resolved against DATADIR.
# The file names are positional: they line up one-to-one with the targets.
(
    test_data_path,
    train_data_path,
    type2idx_path,
    idx2type_path,
    sample_submission_path,
) = (
    os.path.join(DATADIR, filename)
    for filename in (
        "test.parquet",
        "train.parquet",
        "type2id.pkl",
        "id2type.pkl",
        "sample_submission.csv",
    )
)

In [7]:
# Read the train and test event logs into polars DataFrames (train first,
# then test, matching the order of the paths below).
train_data, test_data = (
    pl.read_parquet(parquet_path)
    for parquet_path in (train_data_path, test_data_path)
)

# Bare last expression so the notebook renders a preview of the training frame.
train_data

session,aid,ts,type
i32,i32,i32,u8
0,1517085,1659304800,0
0,1563459,1659304904,0
0,1309446,1659367439,0
0,16246,1659367719,0
0,1781822,1659367871,0
0,1152674,1659367885,0
0,1649869,1659369893,1
0,461689,1659369898,1
0,305831,1659370027,2
0,461689,1659370027,2


In [8]:
# Load the event-type <-> index lookup tables.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
type2idx, idx2type = map(pd.read_pickle, (type2idx_path, idx2type_path))

# chr(10) is a newline character; it puts each mapping on its own indented line.
print(f"type2idx is: {chr(10)}    {type2idx}")
print(f"idx2type is: {chr(10)}    {idx2type}")

type2idx is: 
    {'clicks': 0, 'carts': 1, 'orders': 2}
idx2type is: 
    ['clicks', 'carts', 'orders']


In [16]:
# Load the sample submission CSV into a pandas DataFrame.
df_sample_submission = pd.read_csv(sample_submission_path)

In [27]:
# Preview the first rows; the rendered output shows a `session_type` key
# ("<session>_<type>") and a space-separated `labels` string per row.
df_sample_submission.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,129004 126836 118524
1,12899779_carts,129004 126836 118524
2,12899779_orders,129004 126836 118524
3,12899780_clicks,129004 126836 118524
4,12899780_carts,129004 126836 118524


In [10]:
# Build one "sentence" per session: the list of that session's item ids (aid),
# gathered from the train and test events stacked together.
aid_sentence = pl.col('aid').alias('sentence')

# The concatenated frame is deliberately left unbound so it can be released
# as soon as the aggregation has finished.
df_sentences = (
    pl.concat([train_data, test_data])
    .groupby('session')
    .agg(aid_sentence)
)

In [14]:
# Convert the per-session aid lists into plain Python lists (one inner list
# per session), the form passed to Word2Vec below.
sentences = df_sentences.get_column('sentence').to_list()

In [18]:
# Train Word2Vec item embeddings, treating every session as a "sentence" of
# item ids. The numbers in the trailing comments are scores the author recorded
# while tuning each hyper-parameter (the metric itself is not shown in this
# notebook).
cores = multiprocessing.cpu_count()  # number of CPUs reported by the OS

# Leave one core free for the rest of the system, but never go below one
# worker: on a single-CPU host `cores - 1` would be 0, and with zero workers
# gensim has no training threads to run.
n_workers = max(1, cores - 1)

word2vec = Word2Vec(
    sentences=sentences,
    window=5,         # 5: 0.509, 10: 0.508
    # sg is left at the library default (previously written as `sg = 0`);
    # 1 -> skip-gram, 0 -> CBOW    1: 0.509, 0: 0.508
    vector_size=100,  # 200: 0.509, 150: 0.509, 100: 0.510, 50: 0.509
    # sample is left at the library default (previously written as `sample=6e-5`)
    alpha=0.04,       # 0.1: 0.494, 0.05: 0.507, 0.04: 0.509, 0.03: 0.509, 0.01: 0.503 <-200
                      # 0.04: 0.510, 0.02: 0.509 <-100
    min_alpha=0.01,   # 0.0001: 0.510, 0.001: 0.509, 0.01: 0.510
    min_count=1,      # keep every item id, even those seen only once
    negative=5,       # 5: 0.510, 20: 0.513 (alpha=0.02), 30: 0.510 (alpha=0.04)
    workers=n_workers,
)

In [20]:
# Serialise the trained model with joblib to "word2vec.m", a relative path
# resolved against the current working directory. The call returns the list of
# file names written, which is what the cell displays.
joblib.dump(word2vec, "word2vec.m")

['word2vec.m']