### EDA# EDA


### Datasets
* Dataset A: a dynamic event graph with entities as nodes and different types of events as edges.
* Dataset B: a user-item graph with users and items as nodes and different types of interactions as edges.
* Train
    * Dataset A
        * edges_train_A.csv
            * src_id -> dst_id, edge_type, timestamp
        * node_features.csv
            * node_id, anonymized categorical features
        * edge_type_features.csv
            * edge_id, anonymized categorical features
    * Dataset B
        * edges_train_B.csv
            * src_id -> dst_id, edge_type, timestamp, feat (anonymized edge features)
* Test
    * Dataset A / Dataset B
        * src_id -> dst_id, edge_type, start_time, end_time


### References
* https://networkx.org/documentation/stable/reference/readwrite/edgelist.html
* https://towardsdatascience.com/pyvis-visualize-interactive-network-graphs-in-python-77e059791f01

### TODOs
- [X] 날짜별 노드 종류 분포 (histogram)
- [ ] 두 노드 사이의 edge 수 확인


In [1]:

"""
2022-01-00
author: Jiho Choi

References
    - https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
"""

import os
import sys
import numpy as np
import pandas as pd
import datetime

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from torch_geometric.data import Data
from torch_geometric.data import download_url, InMemoryDataset
from torch_geometric.loader import GraphSAINTRandomWalkSampler



# from parse_args import params
sys.path.append('../scripts/')
from utils import label_stats
from utils import load_pickle_file
from utils import save_pickle_file

import csv


DATA_DIR = "../data/wsdm-2022"
"""
edge_list = pd.read_csv(
    f"{DATA_DIR}/train/edges_train_A.csv",
    header=None,
    names=['src_id', 'dst_id', 'edge_type', 'timestamp'],
    dtype={'src_id': int, 'dst_id': int, 'edge_type': int, 'timestamp': int},
).sort_values('timestamp')

edge_list['datetime'] = edge_list['timestamp'].copy().apply(
    lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y%m%d_%H%M%S")
)
edge_list['month'] = edge_list['timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y%m")
)
edge_list['date'] = edge_list['timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y%m%d")
)

save_pickle_file(f"{DATA_DIR}/train/edges_train_A_cache.pickle", edge_list)
"""
# edge_list_df = load_pickle_file(f"{DATA_DIR}/train/edges_train_A_cache.pickle")
# grouped_df = edge_list_df.groupby('date')

'\nedge_list = pd.read_csv(\n    f"{DATA_DIR}/train/edges_train_A.csv",\n    header=None,\n    names=[\'src_id\', \'dst_id\', \'edge_type\', \'timestamp\'],\n    dtype={\'src_id\': int, \'dst_id\': int, \'edge_type\': int, \'timestamp\': int},\n).sort_values(\'timestamp\')\n\nedge_list[\'datetime\'] = edge_list[\'timestamp\'].copy().apply(\n    lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y%m%d_%H%M%S")\n)\nedge_list[\'month\'] = edge_list[\'timestamp\'].apply(\n    lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y%m")\n)\nedge_list[\'date\'] = edge_list[\'timestamp\'].apply(\n    lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y%m%d")\n)\n\nsave_pickle_file(f"{DATA_DIR}/train/edges_train_A_cache.pickle", edge_list)\n'

In [2]:
a = torch.rand(100)
print(a.shape)
b = torch.rand(100)
print(b.shape)
# b = b.unsqueeze(0)
print(b.shape)
c = torch.cat([a, b], dim=0)
c = torch.stack([a, b], dim=0)
print(c.shape)

torch.Size([100])
torch.Size([100])
torch.Size([100])
torch.Size([2, 100])


In [5]:
DATA_DIR = "../data/wsdm-2022"

edge_list_df = pd.read_csv(
    f"{DATA_DIR}/raw/train/edges_train_A.csv",
    header=None,
    names=['src_id', 'dst_id', 'edge_type', 'timestamp'],
    dtype={'src_id': int, 'dst_id': int, 'edge_type': int, 'timestamp': int},
).sort_values('timestamp')
edge_list_df['date'] = edge_list_df['timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y%m%d")
)
grouped_df = edge_list_df.groupby('date')


In [6]:
print(list(grouped_df.groups.keys())[0])
print(list(grouped_df.groups.keys())[-1])

20141019
20170514


In [7]:
import pandas as pd
from tqdm import tqdm
from datetime import datetime

date_start = datetime.strptime("20141019", "%Y%m%d")
date_end = datetime.strptime("20170514", "%Y%m%d")

file_name_list = []

for index, date in tqdm(enumerate(pd.date_range(date_start, date_end))):
    # print(index, date)
    file_name_list.append(f'graph_{index}')

# for index, date in enumerate(range(date_start, date_end)):
#     file_name_list.append(f'graph_{index}')

939it [00:00, 1085870.27it/s]


In [24]:
DATA_DIR = "../data/wsdm-2022"
edge_type_features_df = pd.read_csv(
    f"{DATA_DIR}/raw/train/edge_type_features.csv",
    header=None,
    names=['edge_type', 'feature1', 'feature2', 'feature3'],
    dtype={'edge_type': int, 'feature1': int, 'feature2': int, 'feature3': int},
).sort_values('edge_type')
# edge_type_features_df

edge_type_feature_dict = {}
for index, row in enumerate(edge_type_features_df.values.tolist()):
    edge_type, feature1, feature2, feature3 = row
    edge_type_feature_dict[edge_type] = [feature1, feature2, feature3]
edge_type_feature_dict[0]

[56, 16, 32]

In [16]:
edge_type_features_df[edge_type_features_df['edge_type']==0].values.tolist()[0]

[0, 56, 16, 32]

In [None]:
# edge_type_features.values.tolist()
# [0, 56, 16, 32],
#  [1, 179, 15, 70],
#  [2, 65, 6, 62],
#  [3, 104, 12, 86],

In [None]:
# edge_type_features.to_dict('records')
# [{'edge_type': 0, 'feature1': 56, 'feature2': 16, 'feature3': 32},
#  {'edge_type': 1, 'feature1': 179, 'feature2': 15, 'feature3': 70},
#  {'edge_type': 2, 'feature1': 65, 'feature2': 6, 'feature3': 62},
#  {'edge_type': 3, 'feature1': 104, 'feature2': 12, 'feature3': 86},

In [12]:
edge_type_features.to_dict('records')

edge_type_features = edge_type_features.values.tolist()

for edge_type in edge_type_series:
    print(edge_type)
    break


# edge_type_features['edge_type']==edge_type
# edge_type_features[].values.tolist()[0]

96


In [None]:
edge_type_features[edge_type_features['edge_type']==edge_type].values.tolist()[0]

In [45]:
DATA_DIR = "../data/wsdm-2022"

import csv

node_feature_dict = {}
with open(f"{DATA_DIR}/raw/train/node_features.csv", mode='r') as file:
    reader = csv.reader(file)
    for row in reader:
        row = list(map(lambda x: int(x), row))
        node_id, features = row[0], row[1:]
        features = list(map(lambda x: x if x != -1 else 0, features))
        node_feature_dict[node_id] = features
# node_feature_dict

In [30]:
DATA_DIR = "../data/wsdm-2022"

node_features_df = pd.read_csv(
    f"{DATA_DIR}/raw/train/node_features.csv", header=None,
    names=[
        'node_id',
        'feature1', 'feature2', 'feature3', 'feature4',
        'feature5', 'feature6', 'feature7', 'feature8'
    ],
    dtype={
        'node_id': int,
        'feature1': int, 'feature2': int, 'feature3': int, 'feature4': int,
        'feature5': int, 'feature6': int, 'feature7': int, 'feature8': int
    },
)

node_feature_dict = {}
for index, row in enumerate(node_features_df.values.tolist()):
    node_id, feature1, feature2, feature3, feature4, \
               feature5, feature6, feature7, feature8 = row
    # TODO: NaN
    features = [feature1, feature2, feature3, feature4, feature5, feature6, feature7, feature8]
    features = list(map(lambda x: x if x != -1 else 0, features))
    node_feature_dict[node_id] = features
# node_feature_dict

In [37]:
for node_id in source_nodes:
    print(node_id)
    print(node_feature_dict[node_id])
    break

106
[5, 0, 0, 0, 0, 0, 0, 0]


In [48]:
            
edge_type_feature_dict = {}
with open(f"{DATA_DIR}/raw/train/edge_type_features.csv", mode='r') as file:
    reader = csv.reader(file)
    for row in reader:
        row = list(map(lambda x: int(x), row))
        edge_type, features = row[0], row[1:]
        edge_type_feature_dict[edge_type] = features
# edge_type_feature_dict

In [49]:
for index, (date, group) in enumerate(grouped_df):
    group = group.reset_index(drop=True)

    source_nodes = group['src_id']
    target_nodes = group['dst_id']
    timestamp = group['timestamp']
    edge_type_series = group['edge_type']
    break
# source_nodes = group['src_id']
# target_nodes = group['dst_id']

In [52]:
"""
edge_type_features_df = pd.read_csv(
    f"{DATA_DIR}/raw/train/edge_type_features.csv", header=None,
    names=['edge_type', 'feature1', 'feature2', 'feature3'],
    dtype={'edge_type': int, 'feature1': int, 'feature2': int, 'feature3': int},
).sort_values('edge_type')
edge_type_feature_dict = {}
for index, row in enumerate(edge_type_features_df.values.tolist()):
    edge_type, feature1, feature2, feature3 = row
    edge_type_feature_dict[edge_type] = [feature1, feature2, feature3]
"""

'\nedge_type_features_df = pd.read_csv(\n    f"{DATA_DIR}/raw/train/edge_type_features.csv", header=None,\n    names=[\'edge_type\', \'feature1\', \'feature2\', \'feature3\'],\n    dtype={\'edge_type\': int, \'feature1\': int, \'feature2\': int, \'feature3\': int},\n).sort_values(\'edge_type\')\nedge_type_feature_dict = {}\nfor index, row in enumerate(edge_type_features_df.values.tolist()):\n    edge_type, feature1, feature2, feature3 = row\n    edge_type_feature_dict[edge_type] = [feature1, feature2, feature3]\n'

In [51]:
target_nodes

0        28905
1         5776
2         5502
3         5450
4         3784
         ...  
18322    28905
18323    20686
18324    28905
18325    24861
18326    28905
Name: dst_id, Length: 18327, dtype: int64

In [56]:
!ls ../data/wsdm-2022

processed  raw


In [219]:
# ---------------- edges_train_A.csv ----------------

edge_list_df = pd.read_csv(
    f"{DATA_DIR}/raw/train/edges_train_A.csv", header=None,
    names=['src_id', 'dst_id', 'edge_type', 'timestamp'],
    dtype={'src_id': int, 'dst_id': int,
           'edge_type': int, 'timestamp': int},
).sort_values('timestamp')
edge_list_df['date'] = edge_list_df['timestamp'].apply(
    lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y%m%d")
)
grouped_df = edge_list_df.groupby('date')




In [258]:

# ---------------- edge_type_features.csv ----------------

edge_type_feature_dict = {}
with open(f"{DATA_DIR}/raw/train/edge_type_features.csv", mode='r') as file:
    reader = csv.reader(file)
    for row in reader:
        row = list(map(lambda x: int(x), row))
        edge_type, features = row[0], row[1:]
        edge_type_feature_dict[edge_type] = features

# ---------------- node_features.csv ----------------

node_feature_dict = {}
with open(f"{DATA_DIR}/raw/train/node_features.csv", mode='r') as file:
    reader = csv.reader(file)
    for row in reader:
        row = list(map(lambda x: int(x), row))
        node_id, features = row[0], row[1:]
        features = list(map(lambda x: x if x != -1 else 0, features))
        node_feature_dict[node_id] = features


In [264]:
print(edge_type_feature_dict[0])
print(edge_type_feature_dict[1])
# node_feature_dict
# node_feature_dict[1]

[56, 16, 32]
[179, 15, 70]


[1847]

[0, 1, 2, 3]

In [290]:
edge_index_mapper.inverse_transform(list(range(0, max_index + 1))[-5:-1])

array([69908, 69914, 69930, 69931])

In [293]:
node_indexes = edge_index_mapper.inverse_transform(list(range(0, max_index + 1))[0:-1])
node_indexes

array([   30,    72,   106, ..., 69914, 69930, 69931])

In [297]:
node_indexes = edge_index_mapper.inverse_transform(list(range(0, max_index + 1)))
node_indexes

array([   30,    72,   106, ..., 69930, 69931, 69947])

In [295]:
node_feature_dict[30]

[17, 0, 0, 0, 0, 0, 0, 0]

In [296]:
temp = [node_feature_dict[node_index] for node_index in node_indexes]
temp[0:5]

[[17, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 86, 0, 0],
 [5, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 361, 0, 0, 0, 0],
 [11, 0, 0, 0, 0, 0, 0, 11]]

In [285]:
max_index = np.amax((source_nodes, target_nodes))
for index in range(0, max_index + 1):
    pass
#     edge_index_sorter.inverse_transform(
# [1188, 1188, 1188, 1799, 47, 1847, 1098, 967,  895, 1043, 773, 742]
index

1847

In [302]:
torch.tensor(node_features, dtype=torch.long)

tensor([[ 17,   0,   0,  ...,   0,   0,   0],
        [  0,   0,   0,  ...,  86,   0,   0],
        [  5,   0,   0,  ...,   0,   0,   0],
        ...,
        [  0,   0,   0,  ...,  67,   0,   0],
        [  0,   0,   0,  ..., 141,   0,   0],
        [  0,   0,   0,  ...,   0,   0,   0]])

In [301]:
torch.LongTensor(node_features)

tensor([[ 17,   0,   0,  ...,   0,   0,   0],
        [  0,   0,   0,  ...,  86,   0,   0],
        [  5,   0,   0,  ...,   0,   0,   0],
        ...,
        [  0,   0,   0,  ...,  67,   0,   0],
        [  0,   0,   0,  ..., 141,   0,   0],
        [  0,   0,   0,  ...,   0,   0,   0]])

In [307]:
type(group['edge_type'])

pandas.core.series.Series

In [310]:
edge_type_series = group['edge_type']
edge_type_series
edge_features = [edge_type_feature_dict[edge_type] for edge_type in edge_type_series]
# edge_features

In [362]:
edge_index = torch.tensor((source_nodes, target_nodes), dtype=torch.long)
edge_index


tensor([[1188, 1188, 1188,  ..., 1799,   47, 1847],
        [1098,  967,  895,  ..., 1043,  773,  742]])

In [359]:
# from torch_geometric.utils import sort_edge_index
from sklearn.preprocessing import LabelEncoder

file_name_list = []
for index, (datesort_edge_indexroup) in enumerate(grouped_df):
    group = group.reset_index(drop=True)

    source_nodes = group['src_id']
    target_nodes = group['dst_id']
    timestamp_series = group['timestamp']
    edge_type_series = group['edge_type']
    
    node_index_mapper = LabelEncoder()  # sorter: compress edge index
    node_index_mapper.fit(pd.concat([source_nodes, target_nodes], axis=0))
    source_nodes = edge_index_mapper.transform(source_nodes)
    target_nodes = edge_index_mapper.transform(target_nodes)

    # ------------ edge_index ------------
    source_nodes = np.array(source_nodes)
    target_nodes = np.array(target_nodes)
    edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
    
    # ------------ node_features ------------
    max_index = np.amax((source_nodes, target_nodes))
    node_indexes = node_index_mapper.inverse_transform(list(range(0, max_index + 1)))
    node_features = [node_feature_dict[node_index] for node_index in node_indexes]
    # x = torch.LongTensor(node_features)
    x = torch.tensor(node_features, dtype=torch.long)
    
    # ------------ edge_attr ------------
    timestamp = group['timestamp']
    edge_type = group['edge_type']
    edge_features = [edge_type_feature_dict[edge_type] for edge_type in edge_type_series]
    
    
    
    break

In [314]:
edge_type

0        20
1       151
2       237
3        69
4        33
       ... 
7158     65
7159    151
7160    242
7161    242
7162    220
Name: edge_type, Length: 7163, dtype: int64

In [315]:
print(type(source_nodes))
source_nodes

<class 'numpy.ndarray'>


array([1188, 1188, 1188, ..., 1799,   47, 1847])

In [316]:
from torch_geometric.utils import sort_edge_index
from sklearn.preprocessing import LabelEncoder

In [317]:
edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
edge_index

tensor([[1188, 1188, 1188,  ..., 1799,   47, 1847],
        [1098,  967,  895,  ..., 1043,  773,  742]])

In [318]:
# sort_edge_index(edge_index)

In [319]:
node_index_mapper = LabelEncoder()  #
node_index_mapper.fit(pd.concat([source_nodes, target_nodes], axis=0))
source_nodes = node_index_mapper.transform(source_nodes)
target_nodes = node_index_mapper.transform(target_nodes)


# source_nodes = torch.tensor(source_nodes, dtype=torch.long)
# target_nodes = torch.tensor(target_nodes, dtype=torch.long)

# edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
print(edge_index.shape)
edge_index

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

In [320]:
# node_index_mapper.inverse_transform(
#     [1188, 1188, 1188, 1799, 47, 1847, 1098, 967,  895, 1043, 773, 742]
# )

In [225]:
# node_index_mapper.transform(
#     [45285, 45285, 45285, 68389, 1584, 69947, 41603, 35831, 33586, 39520, 30055, 28905]
# )

In [321]:
# max_index = np.sort(np.concatenate((source_nodes, target_nodes), axis=None))[-1]
max_index = np.amax((source_nodes, target_nodes))
for index in range(0, max_index + 1):
    pass
index

1847

In [322]:
edge_list = torch.cat((source_nodes, target_nodes), dim=1, dtype=torch.long)

TypeError: expected Tensor as element 0 in argument 0, but got numpy.ndarray

In [215]:
source_nodes = torch.tensor(source_nodes)
source_nodes

tensor([1188, 1188, 1188,  ..., 1799,   47, 1847])

In [216]:
edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
edge_index

TypeError: only integer tensors of a single element can be converted to an index

In [None]:
# sort_edge_index(edge_index)

In [None]:
node_features = group.loc[group.session_id==session_id,['sess_item_id','item_id']].sort_values('sess_item_id').item_id.drop_duplicates().values

node_features = torch.LongTensor(node_features).unsqueeze(1)
target_nodes = group.sess_item_id.values[1:]
source_nodes = group.sess_item_id.values[:-1]

edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
x = node_features

y = torch.FloatTensor([group.label.values[0]])

data = Data(x=x, edge_index=edge_index, y=y)


        


In [348]:
import datetime




DATA_DIR = "../data/wsdm-2022"
date_set = set()
with open(f"{DATA_DIR}/raw/train/edges_train_A.csv", mode='r') as file:
    reader = csv.reader(file)
    for row in reader:
        # print(datetime.datetime.fromtimestamp(int(row[3])))
        date = datetime.datetime.fromtimestamp(int(row[3]))
        date = date.strftime("%Y%m%d")  # %H:%M:%S
        date_set.add(date)
    
len(date_set)

939

In [354]:
print(sorted(list(date_set))[0])
print(sorted(list(date_set))[-1])
sorted(list(date_set))[-1]

20141019
20170514


'20170514'

In [356]:
for index, date in enumerate(sorted(list(date_set))):
    file_name_list.append(f'graph_{index}')
# file_name_list

['graph_0',
 'graph_1',
 'graph_2',
 'graph_3',
 'graph_4',
 'graph_5',
 'graph_6',
 'graph_7',
 'graph_8',
 'graph_9',
 'graph_10',
 'graph_11',
 'graph_12',
 'graph_13',
 'graph_14',
 'graph_15',
 'graph_16',
 'graph_17',
 'graph_18',
 'graph_19',
 'graph_20',
 'graph_21',
 'graph_22',
 'graph_23',
 'graph_24',
 'graph_25',
 'graph_26',
 'graph_27',
 'graph_28',
 'graph_29',
 'graph_30',
 'graph_31',
 'graph_32',
 'graph_33',
 'graph_34',
 'graph_35',
 'graph_36',
 'graph_37',
 'graph_38',
 'graph_39',
 'graph_40',
 'graph_41',
 'graph_42',
 'graph_43',
 'graph_44',
 'graph_45',
 'graph_46',
 'graph_47',
 'graph_48',
 'graph_49',
 'graph_50',
 'graph_51',
 'graph_52',
 'graph_53',
 'graph_54',
 'graph_55',
 'graph_56',
 'graph_57',
 'graph_58',
 'graph_59',
 'graph_60',
 'graph_61',
 'graph_62',
 'graph_63',
 'graph_64',
 'graph_65',
 'graph_66',
 'graph_67',
 'graph_68',
 'graph_69',
 'graph_70',
 'graph_71',
 'graph_72',
 'graph_73',
 'graph_74',
 'graph_75',
 'graph_76',
 'graph_7

In [335]:
import os
import os.path as osp
import datetime
import csv
from tqdm import tqdm
import pandas as pd

from sklearn.preprocessing import LabelEncoder
# from torch_geometric.utils import sort_edge_index

import torch
from torch_geometric.data import Data
from torch_geometric.data import Dataset, download_url


class LargeGraphDataset(Dataset):
    # https://pytorch-geometric.readthedocs.io/en/latest/notes/create_dataset.html
    def __init__(self, root="../data/wsdm-2022"):
        print("(__init__) root:", root)
        self.root = root  # DATA_DIR
        # self.processed_paths = f"{self.root}/processed"

        super().__init__(root)

        # DATA_DIR = "../data/wsdm-2022"

    @property
    def processed_file_names(self):
        # print("--------------------")
        # print("processed_file_names")
        # print("--------------------")
        # self.processed_paths, self.processed_file_names
        date_start = datetime.datetime.strptime("20141019", "%Y%m%d")
        date_end = datetime.datetime.strptime("20170514", "%Y%m%d")
        file_name_list = []
        for index, date in enumerate(pd.date_range(date_start, date_end)):
            file_name_list.append(f'graph_{index}')
        # print("file_name_list:", len(file_name_list), file_name_list[0])
        return file_name_list

    def process(self):
        print("---------------")
        print("    process    ")
        print("---------------")
        DATA_DIR = self.root

        # ---------------- edge_type_features.csv ----------------

        edge_type_feature_dict = {}
        with open(f"{DATA_DIR}/raw/train/edge_type_features.csv", mode='r') as file:
            reader = csv.reader(file)
            for row in reader:
                row = list(map(lambda x: int(x), row))
                edge_type, features = row[0], row[1:]
                edge_type_feature_dict[edge_type] = features

        # ---------------- node_features.csv ----------------

        node_feature_dict = {}
        with open(f"{DATA_DIR}/raw/train/node_features.csv", mode='r') as file:
            reader = csv.reader(file)
            for row in reader:
                row = list(map(lambda x: int(x), row))
                node_id, features = row[0], row[1:]
                features = list(map(lambda x: x if x != -1 else 0, features))
                node_feature_dict[node_id] = features

        # ---------------- edges_train_A.csv ----------------

        edge_list_df = pd.read_csv(
            f"{DATA_DIR}/raw/train/edges_train_A.csv", header=None,
            names=['src_id', 'dst_id', 'edge_type', 'timestamp'],
            dtype={'src_id': int, 'dst_id': int,
                   'edge_type': int, 'timestamp': int},
        ).sort_values('timestamp')
        edge_list_df['date'] = edge_list_df['timestamp'].apply(
            lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y%m%d")
        )
        grouped_df = edge_list_df.groupby('date')

        for index, (date, group) in enumerate(grouped_df):
            group = group.reset_index(drop=True)

            source_nodes = group['src_id']
            target_nodes = group['dst_id']
            timestamp_series = group['timestamp']
            edge_type_series = group['edge_type']

            node_index_mapper = LabelEncoder()  # sorter: compress edge index
            # print(pd.concat([source_nodes, target_nodes], axis=0))
            node_index_mapper.fit(pd.concat([source_nodes, target_nodes], axis=0))
            source_nodes = node_index_mapper.transform(source_nodes)
            target_nodes = node_index_mapper.transform(target_nodes)

            # ------------ edge_index ------------
            edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

            # ------------ node_features ------------
            max_index = np.amax((source_nodes, target_nodes))
            # print("max_index:", max_index)
            node_indexes = node_index_mapper.inverse_transform(list(range(0, max_index + 1)))
            node_features = [node_feature_dict[node_index] for node_index in node_indexes]
            # x = torch.LongTensor(node_features)
            x = torch.tensor(node_features, dtype=torch.long)

            # ------------ edge_attrs / edge_labels ------------
            edge_attrs = [edge_type_feature_dict[edge_type] for edge_type in edge_type_series]
            edge_labels = torch.tensor([timestamp_series, edge_type_series], dtype=torch.long)
            edge_labels = edge_labels.transpose(0, 1)

            data = Data(x=x, edge_index=edge_index, edge_attrs=edge_attrs, edge_labels=edge_labels)
            
            
            print("dataset:", data) if index == 0 else None

            torch.save(data, f"{self.processed_paths[index]}")

            
    def len(self):
        return len(self.processed_file_names)

    def get(self, index):
        # data = torch.load(f"{self.root}/processed/train/graph_{index}.pt")
        # data = torch.load(osp.join(self.processed_dir, f'data_{index}.pt'))
        data = torch.load(self.processed_file_names[index])
        return data


print("DATASET")
dataset = LargeGraphDataset()
dataset[0]

DATASET
(__init__) root: ../data/wsdm-2022
---------------
    process    
---------------


Processing...


dataset: Data(x=[3255, 8], edge_index=[2, 18327], edge_attrs=[18327], edge_labels=[18327, 2])


Done!


FileNotFoundError: [Errno 2] No such file or directory: 'graph_0'

In [None]:
data_list = []

for index, (date, group) in enumerate(grouped_df):
    group = group.reset_index(drop=True)
    print(date)
    print(group)

    source_nodes = group['src_id']
    target_nodes = group['dst_id']
    edge_type = group['edge_type']
    timestamp = group['timestamp']
    
    edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
    edge_attrs = torch.tensor([edge_type, timestamp], dtype=torch.long)
    edge_attrs = edge_attrs.transpose(0, 1)
    
    print(edge_index.shape)
    print(edge_attrs.shape)

    data = Data(edge_index=edge_index, edge_attrs=edge_attrs)
    data_list.append(data)
    if index > 2:
        break

In [None]:
import sys
import os
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import networkx as nx

%matplotlib inline

In [None]:
DATA_DIR = "../data/wsdm-2022"

print("FILES")
print(sorted(os.listdir(f"{DATA_DIR}/train")))
print(sorted(os.listdir(f"{DATA_DIR}/test")))

In [None]:
edge_list = pd.read_csv(
    f"{DATA_DIR}/train/edges_train_A.csv",
    header=None,
    names=['src_id', 'dst_id', 'edge_type', 'timestamp'],
    dtype={'src_id': int, 'dst_id': int, 'edge_type': int, 'timestamp': int},
).sort_values('timestamp')

In [None]:
edge_list.head()

In [None]:
edge_list.describe()

In [None]:
DG = nx.from_pandas_edgelist(
    edge_list[:200],
    source='src_id', target='dst_id',
    edge_attr=True,
    create_using=nx.DiGraph()
)
DG

In [None]:
from matplotlib.pyplot import figure, text

fig, ax = plt.subplots(figsize=(12, 8))
fig.set_tight_layout(False)

pos = nx.kamada_kawai_layout(DG)
# pos = nx.spring_layout(DG)
# pos = nx.fruchterman_reingold_layout(G)  # TOO SLOW

nx.draw(
    DG, pos,
    node_size=50,
    # with_labels=True,
    
)
for node, (x, y) in pos.items():
    text(x, y, node, fontsize=5, ha='center', va='center')

In [None]:
# sampling
edge_list_org = edge_list.copy()
edge_list = edge_list[:30000]
print(len(edge_list), len(edge_list_org))

In [None]:
graph = nx.DiGraph()
G = nx.from_pandas_edgelist(edge_list, 'src_id', 'dst_id')

In [None]:
from matplotlib.pyplot import figure

figure(figsize=(10, 8))
# nx.draw_shell(G, with_labels=True)
nx.draw(G, with_labels=True)

In [None]:
fig, ax = plt.subplots()
fig.set_tight_layout(False)
pos = nx.kamada_kawai_layout(G)
# pos = nx.fruchterman_reingold_layout(G)
nx.draw(G, pos, with_labels=True)
# nx.draw(G, with_labels=True)

In [None]:
import matplotlib.pyplot as plt
import networkx as nx

def plot_degree_dist(G):
    degrees = [G.degree(n) for n in G.nodes()]
    plt.hist(
        degrees,
        bins=100,
        log=True,
    )
    plt.show()

plot_degree_dist(G)

In [None]:
from pyvis.network import Network

net = Network()
for index, row in edge_list.iterrows():
    net.add_node(row['src_id'], label=row['src_id'])
    net.add_node(row['dst_id'], label=row['dst_id'])
    net.add_edge(row['src_id'], row['dst_id'])


In [None]:
net.show('nodes.html')

<img src="../assets/pyvis_network_2022_0103_01.png">

### EDGE TYPE

In [None]:
timestamps = edge_list['timestamp'].copy()
datetimes = timestamps.apply(
    lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y%m%d_%H%M%S")
)
dates = timestamps.apply(
    lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y%m%d")
)
edge_list['datetime'] = datetimes
edge_list['date'] = dates
edge_list

In [None]:
edge_list['edge_type'].value_counts().plot(
    kind='bar', figsize=(10,6), title="Edge Type Frequency"
)

In [None]:
print(edge_list['edge_type'].value_counts()[:10].sum())
edge_list['edge_type'].value_counts()[:10]

In [None]:
edge_type_count = edge_list.groupby(['date', 'edge_type']).size()
edge_type_count = edge_type_count.reset_index(name='count')
edge_type_count

In [None]:
# Top 10

top_10_edge_type = list(edge_list['edge_type'].value_counts()[:10].index)
top_10_edge_list = edge_list[edge_list['edge_type'].isin(top_10_edge_type)]
top_10_edge_list_count = top_10_edge_list.groupby(['date', 'edge_type'])
top_10_edge_list_count = top_10_edge_list_count.size().reset_index(name='count')
top_10_edge_list_count.head()

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(10, 6))

sns.pointplot(
    x='date',
    y='count',
    hue='edge_type',
    # dashes= False,
    # marker='^',
    # color='pastel',
    # alpha=0.8,
    ax=axes,
    # data=data,
    data=top_10_edge_list_count,
)

start, end = axes.get_xlim()
axes.xaxis.set_ticks(np.arange(start, end, 20))
for tick in axes.get_xticklabels():
    tick.set_rotation(90)

In [None]:
edge_list

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(10, 6))

sns.pointplot(
    x='timestamp',
    y='count',
    hue='edge_type',
    # dashes= False,
    # marker='^',
    # color='pastel',
    # alpha=0.8,
    ax=axes,
    # data=data,
    data=edge_list,
)

start, end = axes.get_xlim()
axes.xaxis.set_ticks(np.arange(start, end, 20))
for tick in axes.get_xticklabels():
    tick.set_rotation(90)