# Load Dataset

In [14]:
import sys
import os
import os.path as osp
from zipfile import ZipFile 
import requests
from scipy.io import loadmat

import pandas as pd
import numpy as np
import torch
import urllib.request
from torch_geometric.data import Data

def _download_and_extract_yelpchi(datadir, log=True):
    """Download ``YelpChi.zip`` into ``datadir`` and unpack it in place."""
    download_dir = 'https://github.com/finint/antifraud/raw/main/data/'
    filename = 'YelpChi.zip'
    # Plain concatenation: os.path.join is for filesystem paths, not URLs.
    url = download_dir + filename
    archive_path = osp.join(datadir, filename)

    if log:
        print(f'Downloading {url}', file=sys.stderr)
    response = requests.get(url, timeout=60)
    # Fail here on an HTTP error instead of saving the error page as a "zip"
    # and crashing later inside ZipFile.
    response.raise_for_status()
    with open(archive_path, 'wb') as f:
        f.write(response.content)

    if log:
        print(f'Extracting {archive_path}', file=sys.stderr)
    with ZipFile(archive_path, 'r') as archive:
        archive.extractall(path=datadir)


def _relation_edges(adjacency, relation):
    """Return one row per non-zero entry of ``adjacency`` with a flag column ``relation`` set to 1."""
    rows, cols = adjacency.nonzero()
    return pd.DataFrame({'index1': rows, 'index2': cols, relation: 1})


def _build_edge_table(yelp):
    """Build the edge table (``index1``, ``index2`` + 12 attribute columns) from the loaded .mat dict."""
    # Kept local, as in the original cell, so sklearn is only needed when the
    # dataset actually has to be rebuilt.
    from sklearn.preprocessing import MinMaxScaler

    # 1) R-U-R: it connects reviews posted by the same user;
    # 2) R-S-R: it connects reviews under the same product with the same star rating (1-5 stars);
    # 3) R-T-R: it connects two reviews under the same product posted in the same month.

    # Relation	 Edges
    # R-U-R	49,315
    # R-T-R	573,616
    # R-S-R	3,402,743
    # All	3,846,979
    keys = ['index1', 'index2']
    relation_cols = ['rtr', 'rsr', 'rur', 'rstr']

    # Outer-join the three relations so every edge appears once, with 0/1
    # flags telling which relation(s) it belongs to.
    edges = (
        _relation_edges(yelp['net_rtr'], 'rtr')
        .merge(_relation_edges(yelp['net_rsr'], 'rsr'), on=keys, how='outer')
        .merge(_relation_edges(yelp['net_rur'], 'rur'), on=keys, how='outer')
        .fillna(0)
    )
    # 1 only when the edge is present in both R-S-R and R-T-R.
    edges['rstr'] = edges['rsr'] * edges['rtr']

    # Per-node edge counts for each flag, log-scaled and then min-max scaled.
    node_feats = edges.groupby('index1')[relation_cols].sum().reset_index()
    node_feats[relation_cols] = MinMaxScaler().fit_transform(
        np.log10(node_feats[relation_cols] + 1)
    )

    # Attach the node features of both endpoints; inner joins, so an edge
    # whose target never appears as a source in `edges` is dropped.
    return (
        edges
        .merge(node_feats, on=['index1'], suffixes=('', '_lhs'))
        .merge(
            node_feats.rename(columns={'index1': 'index2'}),
            on=['index2'],
            suffixes=('', '_rhs'),
        )
    )


def load_YelpChi_dataset(DATADIR = './data/YelpChi', log=True):
    """
    Load the YelpChi (fraud) dataset as a graph with edge attributes.

    If ``DATADIR/YelpChi_data.pt`` exists it is loaded and returned as is.
    Otherwise ``YelpChi.zip`` is downloaded into ``DATADIR``, extracted,
    preprocessed, and the result is saved to ``YelpChi_data.pt``.

    Parameters
    ----------
    DATADIR : str
        Directory used for the download, the extracted ``YelpChi.mat`` and
        the cached ``YelpChi_data.pt``. Created if missing.
    log : bool
        When True, progress messages are written to stderr.

    Returns
    -------
    torch_geometric.data.Data
        ``x``: node features from the ``features`` matrix;
        ``edge_index``: union of the R-T-R, R-S-R and R-U-R edges;
        ``edge_attr``: 12 columns per edge - the flags ``rtr``, ``rsr``,
        ``rur``, ``rstr`` followed by the scaled per-node counts of those
        flags for the source (``*_lhs``) and the target (``*_rhs``) node;
        ``y``: node labels.

    Raises
    ------
    requests.HTTPError
        If the download answers with an HTTP error status.
    """
    cache_path = osp.join(DATADIR, 'YelpChi_data.pt')
    if osp.exists(cache_path):
        if log:
            print(f'Using existing file {cache_path}', file=sys.stderr)
        # The cache holds a pickled Data object, not a bare state dict, so it
        # needs weights_only=False (PyTorch >= 2.6 defaults to True).
        # NOTE: this unpickles arbitrary objects - only point DATADIR at a
        # cache file produced by this function.
        return torch.load(cache_path, weights_only=False)

    os.makedirs(DATADIR, exist_ok=True)
    _download_and_extract_yelpchi(DATADIR, log=log)

    if log:
        print('Preprocessing ', file=sys.stderr)
    yelp = loadmat(osp.join(DATADIR, 'YelpChi.mat'))

    edge_table = _build_edge_table(yelp)

    x = torch.FloatTensor(yelp['features'].toarray())
    y = torch.LongTensor(yelp['label'].flatten())
    edge_index = torch.LongTensor(edge_table[['index1', 'index2']].values.T)
    edge_attr = torch.FloatTensor(
        edge_table.drop(columns=['index1', 'index2']).values
    )

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

    torch.save(data, cache_path)
    return data

In [15]:
%%time
data = load_YelpChi_dataset(DATADIR = './data/YelpChi', log=True)

Using existing file ./data/YelpChi/YelpChi_data.pt


CPU times: user 7.55 ms, sys: 431 ms, total: 439 ms
Wall time: 436 ms


In [16]:
data

Data(x=[45954, 32], edge_index=[2, 7693958], edge_attr=[7693958, 12], y=[45954])

# Run with edge attributes

In [17]:
from cool_graph.runners import Runner

In [18]:
# Runner with edge attributes enabled, fixed seed, and the epoch count
# overridden; 'training.initial_lr=0.01' is another override that was tried.
runner = Runner(
    data,
    use_edge_attr=True,
    overrides=['training.n_epochs=25'],
    seed=42,
    metrics=['roc_auc', 'accuracy', 'f1'],
)

In [19]:
result = runner.run()

Sample data: 100%|██████████| 138/138 [00:27<00:00,  5.10it/s]
Sample data: 100%|██████████| 46/46 [00:08<00:00,  5.73it/s]
2024-06-04 13:42:11.738 | INFO     | cool_graph.train.helpers:eval_epoch:216 - test:
 {'roc_auc': 0.549, 'accuracy': 0.852, 'f1': 0.0, 'calc_time': 0.033, 'main_metric': 0.549}
2024-06-04 13:42:16.521 | INFO     | cool_graph.train.helpers:eval_epoch:216 - train:
 {'roc_auc': 0.554, 'accuracy': 0.856, 'f1': 0.0, 'calc_time': 0.08, 'main_metric': 0.554}
2024-06-04 13:43:20.498 | INFO     | cool_graph.train.helpers:eval_epoch:216 - test:
 {'roc_auc': 0.894, 'accuracy': 0.892, 'f1': 0.563, 'calc_time': 0.03, 'main_metric': 0.894}
2024-06-04 13:43:25.192 | INFO     | cool_graph.train.helpers:eval_epoch:216 - train:
 {'roc_auc': 0.894, 'accuracy': 0.893, 'f1': 0.55, 'calc_time': 0.078, 'main_metric': 0.894}
2024-06-04 13:44:28.071 | INFO     | cool_graph.train.helpers:eval_epoch:216 - test:
 {'roc_auc': 0.902, 'accuracy': 0.898, 'f1': 0.593, 'calc_time': 0.028, 'main_me

In [44]:
result['best_loss']['roc_auc']

0.906

# Run without edge attributes

In [45]:
# Baseline: same data, seed, epoch override and metrics, but with
# use_edge_attr=False.
runner2 = Runner(
    data,
    use_edge_attr=False,
    overrides=['training.n_epochs=25'],
    seed=42,
    metrics=['roc_auc', 'accuracy', 'f1'],
)
result2 = runner2.run()

Sample data: 100%|██████████| 138/138 [00:25<00:00,  5.48it/s]
Sample data: 100%|██████████| 46/46 [00:07<00:00,  6.20it/s]
2024-06-04 13:53:06.545 | INFO     | cool_graph.train.helpers:eval_epoch:216 - test:
 {'roc_auc': 0.767, 'accuracy': 0.852, 'f1': 0.0, 'calc_time': 0.01, 'main_metric': 0.767}
2024-06-04 13:53:08.263 | INFO     | cool_graph.train.helpers:eval_epoch:216 - train:
 {'roc_auc': 0.767, 'accuracy': 0.856, 'f1': 0.0, 'calc_time': 0.029, 'main_metric': 0.767}
2024-06-04 13:53:32.534 | INFO     | cool_graph.train.helpers:eval_epoch:216 - test:
 {'roc_auc': 0.833, 'accuracy': 0.877, 'f1': 0.435, 'calc_time': 0.012, 'main_metric': 0.833}
2024-06-04 13:53:34.322 | INFO     | cool_graph.train.helpers:eval_epoch:216 - train:
 {'roc_auc': 0.837, 'accuracy': 0.878, 'f1': 0.427, 'calc_time': 0.03, 'main_metric': 0.837}
2024-06-04 13:53:58.779 | INFO     | cool_graph.train.helpers:eval_epoch:216 - test:
 {'roc_auc': 0.859, 'accuracy': 0.884, 'f1': 0.485, 'calc_time': 0.012, 'main_m

In [46]:
result2['best_loss']['roc_auc']

0.868

# Conclusion: using edge attributes gives a better ROC AUC
# 0.906 (with edge attributes) vs 0.868 (without)