In [1]:
import torch


In [2]:
# Sanity check before any heavy work: displays True when PyTorch can use a CUDA device.
torch.cuda.is_available()

True

In [3]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Single seed shared by the notebook's reproducibility settings.
GLOBAL_SEED = 42

import os

# Export the interpreter-level settings in one go.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so assigning it here
# presumably only affects child processes, not the running kernel - confirm.
os.environ.update({
    "PYTHONIOENCODING": "utf8",
    'PYTHONHASHSEED': str(GLOBAL_SEED),
})
import sys
from glob import glob

import pandas as pd
import numpy as np
from numpy import random as np_rnd
import random as rnd
import shutil
import gc
import datetime
from collections import defaultdict, Counter
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
import time
import pickle
import sklearn as skl
from sklearn import model_selection

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F
from torch.optim import AdamW, Adam, SparseAdam
from transformers import get_polynomial_decay_schedule_with_warmup

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Display the installed PyTorch version as the cell output.
torch.__version__


'2.1.2+cu118'

In [5]:
from torch_geometric.data import Data
from torch_geometric.utils import coalesce, is_undirected, to_undirected, sort_edge_index
from torch_geometric.sampler import BaseSampler
from torch_geometric.nn import GCNConv

In [6]:
pip install pyarrow

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install fastparquet

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Fraction of distinct training sessions kept for graph construction.
fraction_of_sessions_to_use = 0.5

train = pd.read_parquet('./data/train.parquet')
test = pd.read_parquet('./data/test.parquet')

# Sample whole sessions (not rows) so every kept session retains all of its events.
# Only the `session` column is de-duplicated: the sampled positions are the same as
# when de-duplicating the full frame, but the other columns are never copied.
# The shared GLOBAL_SEED replaces a second hard-coded 42.
lucky_sessions_train = (
    train['session']
    .drop_duplicates()
    .sample(frac=fraction_of_sessions_to_use, random_state=GLOBAL_SEED)
)
subset_of_train = train[train.session.isin(lucky_sessions_train)]

# Index both parts by session so later cells can label-slice ranges of sessions.
subset_of_train.index = pd.MultiIndex.from_frame(subset_of_train[['session']])
# set_axis returns a new frame, so `test` itself keeps its original index
# (a plain alias followed by `.index = ...` would rewrite `test` in place).
subset_of_test = test.set_axis(pd.MultiIndex.from_frame(test[['session']]), axis=0)

# Sampled training sessions followed by every test session.
subsets = pd.concat([subset_of_train, subset_of_test])
# Distinct session ids, in order of first appearance in `subsets`.
sessions = subsets.session.unique()

# Drop the intermediates; `train` and `test` stay available for later cells.
del lucky_sessions_train, subset_of_train, subset_of_test; gc.collect()

0

In [9]:
chunk_size = 16384*2            # number of sessions processed per iteration
recent_aids = 20                # only the last N events of each session are paired
hour_interval_threshold = 6     # maximum gap between two paired events, in hours
adj_chunks = []                 # per-chunk edge frames; concatenated once after the loop

# Walk the unique sessions chunk by chunk so the self-merge below stays bounded in size.
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    # Label-slice every row between the chunk's first and last session id (inclusive).
    # NOTE(review): assumes `subsets` is ordered by session so this label slice is
    # contiguous and valid - confirm.
    last_pos = min(sessions.shape[0] - 1, i + chunk_size - 1)
    current_chunk = subsets.loc[sessions[i]:sessions[last_pos]].reset_index(drop=True)
    # Keep only the most recent `recent_aids` events of each session.
    current_chunk = current_chunk.groupby('session', as_index=False).nth(list(range(-recent_aids, 0))).reset_index(drop=True)
    # Only these columns are needed for pairing; dropping the rest shrinks the merge.
    current_chunk = current_chunk[['session', 'aid', 'ts']]
    # Pair every retained event with every other event of the same session.
    consecutive_AIDs = current_chunk.merge(current_chunk, on='session')
    # NOTE(review): dividing by 3600 assumes `ts` is in seconds - confirm.
    hours_elapsed = (consecutive_AIDs.ts_y - consecutive_AIDs.ts_x) / 3600
    # Keep ordered pairs of different items where y follows x within the time window.
    consecutive_AIDs = consecutive_AIDs.loc[
        (consecutive_AIDs.aid_x != consecutive_AIDs.aid_y)
        & (hours_elapsed > 0)
        & (hours_elapsed < hour_interval_threshold),
        ["aid_x", "aid_y"],
    ]
    adj_chunks.append(consecutive_AIDs)

# One concat at the end instead of re-copying a growing frame on every iteration.
if adj_chunks:
    df_adj = pd.concat(adj_chunks, axis=0, ignore_index=True).astype("int64")
    del hours_elapsed, last_pos
else:
    # No sessions at all: keep the empty, correctly typed edge frame.
    df_adj = pd.DataFrame(columns=["aid_x", "aid_y"], dtype="int64")
del adj_chunks

# Node ids cover every aid from 0 up to the largest one seen.
n_aids = subsets["aid"].max() + 1
nodes = np.arange(n_aids, dtype="int64")

100%|████████████████████████████████████████████████████████████████████████████████| 248/248 [09:44<00:00,  2.36s/it]


In [None]:
# Assemble the graph: node features are the node ids themselves, and the edge list is
# df_adj transposed so that each column holds one (aid_x, aid_y) pair.
data_graph = Data(
    x=torch.tensor(nodes, dtype=torch.int64),
    edge_index=torch.tensor(df_adj.to_numpy().T, dtype=torch.int64),
)
# Remove duplicate edges, then make the edge list undirected.
data_graph.edge_index = to_undirected(coalesce(data_graph.edge_index))

# Persist the finished graph object.
with open("./data_graph.pkl", "wb") as fh:
    pickle.dump(data_graph, fh)

# Release the large intermediates that were only needed to build the graph.
del subsets, sessions, nodes, df_adj, current_chunk, consecutive_AIDs
gc.collect()

In [None]:
# Save the node-id tensor on its own, separately from the full graph object.
with open("./node_feature.pkl", "wb") as fh:
    pickle.dump(data_graph.x, fh)

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook may use.

    Sets ``PYTHONHASHSEED`` and seeds Python's ``random``, NumPy's global
    generator and PyTorch (CPU and all CUDA devices), and switches cuDNN to
    deterministic, non-benchmarking mode. CuPy and TensorFlow are seeded only
    when they have been imported under the aliases ``cp`` and ``tf_rnd``;
    otherwise they are skipped.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.

    Raises:
        Whatever an available back-end raises for an invalid seed; such errors
        are not suppressed.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)

    # Optional back-ends are looked up by alias instead of relying on a swallowed
    # NameError, so a genuine failure inside a seeding call is no longer hidden.
    module_scope = globals()
    # RAPIDS random
    cupy_module = module_scope.get("cp")
    if cupy_module is not None:
        cupy_module.random.seed(seed)
    # tf random
    tf_random = module_scope.get("tf_rnd")
    if tf_random is not None:
        tf_random.set_seed(seed)

    # pytorch random
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (ignored when CUDA is absent).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

In [None]:
# Flatten the graph's edge tensor into a two-column frame with one edge per row.
edge_index = pd.DataFrame(
    data_graph.edge_index.detach().cpu().numpy().T,
    columns=["x", "y"],
    dtype="int64",
)

# Reseed so the shuffle, and therefore the split, is repeatable.
seed_everything()
shuffled_idx = np_rnd.permutation(len(edge_index))

# First 80% of the shuffled rows go to training, the remainder to validation.
# NOTE(review): the edge list was made undirected before this point, so an edge and
# its reverse can land in different splits - confirm this is acceptable for validation.
split_at = int(len(shuffled_idx) * 0.8)
train_rows = shuffled_idx[:split_at]
valid_rows = shuffled_idx[split_at:]
edge_index.iloc[train_rows].reset_index(drop=True).to_parquet("./train_edge.parquet")
edge_index.iloc[valid_rows].reset_index(drop=True).to_parquet("./valid_edge.parquet")

In [None]:
# Shape of the training split: the first 80% of the shuffled edge rows.
train_cut = int(len(shuffled_idx) * 0.8)
edge_index.iloc[shuffled_idx[:train_cut]].shape

In [None]:
# Shape of the validation split: the remaining 20% of the shuffled edge rows.
valid_start = int(len(shuffled_idx) * 0.8)
edge_index.iloc[shuffled_idx[valid_start:]].shape