# Load Dataset

In [21]:
import sys
import os
import os.path as osp
from zipfile import ZipFile 
import requests
from scipy.io import loadmat

import pandas as pd
import numpy as np
import torch
import urllib.request
from torch_geometric.data import Data

def load_YelpChi_dataset(DATADIR = './data/YelpChi', log=True):
    """
    Load the YelpChi (fraud) dataset as a torch_geometric ``Data`` object
    with edge attributes.

    On the first call the archive is downloaded into ``DATADIR``, extracted,
    converted to a graph and cached as ``YelpChi_data.pt``. Later calls return
    the cached object without touching the network.

    Parameters
    ----------
    DATADIR : str
        Directory used for the downloaded archive, the extracted ``.mat`` file
        and the cached result. Created if it does not exist.
    log : bool
        When True, progress messages are written to stderr.

    Returns
    -------
    torch_geometric.data.Data
        x          -- float tensor built from the ``features`` matrix.
        y          -- long tensor built from the ``label`` array.
        edge_index -- union of the R-T-R, R-S-R and R-U-R edges.
        edge_attr  -- 12 float columns per edge, in this order:
                      rtr, rsr, rur, rstr (relation indicators, rstr = rsr * rtr),
                      then the same four names with suffix ``_lhs`` (scaled
                      per-node sums for the ``index1`` endpoint) and with suffix
                      ``_rhs`` (the same for the ``index2`` endpoint).

    Raises
    ------
    requests.HTTPError
        If the download responds with an HTTP error status.
    """
    cache_path = osp.join(DATADIR, 'YelpChi_data.pt')
    if osp.exists(cache_path):
        if log:
            print(f'Using existing file {cache_path}', file=sys.stderr)
        # NOTE(review): torch.load unpickles the cache file. Recent PyTorch
        # releases default to weights_only=True, which may refuse a pickled
        # Data object -- confirm against the installed version.
        return torch.load(cache_path)

    os.makedirs(DATADIR, exist_ok=True)

    download_dir = 'https://github.com/finint/antifraud/raw/main/data/'
    filename = 'YelpChi.zip'
    # Plain concatenation: a URL is not a filesystem path, so os.path.join
    # is the wrong tool for building it.
    url = download_dir + filename
    archive_path = osp.join(DATADIR, filename)

    if log:
        print(f'Downloading {url}', file=sys.stderr)
    r = requests.get(url)
    # Fail loudly instead of saving an HTML error page as the archive.
    r.raise_for_status()
    with open(archive_path, 'wb') as f:
        f.write(r.content)

    # unzip
    if log:
        print(f'Extracting {archive_path}', file=sys.stderr)
    with ZipFile(archive_path, 'r') as archive:
        archive.extractall(path=DATADIR)

    if log:
        print('Preprocessing ', file=sys.stderr)
    yelp = loadmat(osp.join(DATADIR, 'YelpChi.mat'))

    # 1) R-U-R: it connects reviews posted by the same user;
    # 2) R-S-R: it connects reviews under the same product with the same star rating (1-5 stars);
    # 3) R-T-R: it connects two reviews under the same product posted in the same month.

    # Relation	 Edges
    # R-U-R	49,315
    # R-T-R	573,616
    # R-S-R	3,402,743
    # All	3,846,979

    labels = yelp['label'].flatten()
    feat_data = pd.DataFrame(yelp['features'].toarray())

    # One edge table per relation: endpoints plus an indicator column named
    # after the relation. The order (rtr, rsr, rur) fixes the column order of
    # the merged table and therefore of edge_attr.
    edge_tables = []
    for relation in ('rtr', 'rsr', 'rur'):
        src, dst = yelp[f'net_{relation}'].nonzero()
        edge_tables.append(
            pd.DataFrame({'index1': src, 'index2': dst, relation: 1})
        )

    # Outer-merge so an edge present in any relation is kept; relations the
    # edge does not belong to become 0 after fillna.
    df_all_edges = edge_tables[0]
    for table in edge_tables[1:]:
        df_all_edges = df_all_edges.merge(
            table, on=['index1', 'index2'], how='outer'
        )
    df_all_edges = df_all_edges.fillna(0)

    # rstr is 1 only for edges that are both R-S-R and R-T-R.
    df_all_edges['rstr'] = df_all_edges['rsr'] * df_all_edges['rtr']

    # Per-node sums of the relation indicators, grouped by the index1 endpoint,
    # then log-scaled and min-max normalised column by column.
    feat_cols = ['rtr', 'rsr', 'rur', 'rstr']
    extra_feats = df_all_edges.groupby('index1')[feat_cols].sum().reset_index()

    # Imported here so that loading from the cache does not require sklearn.
    from sklearn.preprocessing import MinMaxScaler
    extra_feats[feat_cols] = MinMaxScaler().fit_transform(
        np.log10(extra_feats[feat_cols] + 1)
    )

    # Attach the node-level features of both endpoints to every edge.
    df_all_edges_extra = df_all_edges\
        .merge(
                extra_feats,
                suffixes=('', '_lhs'),
                on=['index1']
        )\
        .merge(
                extra_feats.rename(columns={'index1': 'index2'}),
                suffixes=('', '_rhs'),
                on=['index2']
        )

    x = torch.FloatTensor(feat_data.values)
    edge_index = torch.LongTensor(df_all_edges_extra[['index1', 'index2']].values.T)
    edge_attr = torch.FloatTensor(
        df_all_edges_extra.drop(columns=['index1', 'index2']).values
    )
    y = torch.LongTensor(labels)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

    torch.save(data, cache_path)
    return data

In [38]:
%%time
data = load_YelpChi_dataset(DATADIR = './data/YelpChi', log=True)

Downloading https://github.com/finint/antifraud/raw/main/data/YelpChi.zip
Extracting ./data/YelpChi/YelpChi.zip
Preprocessing 


CPU times: user 45.6 s, sys: 10.2 s, total: 55.7 s
Wall time: 45.2 s


In [39]:
# Show the repr of the loaded graph object (tensor shapes of x, edge_index, edge_attr, y).
data

Data(x=[45954, 32], edge_index=[2, 7693958], edge_attr=[7693958, 12], y=[45954])

# RUN with edge attr

In [29]:
from cool_graph.runners import Runner

In [31]:
# Experiment 1: same graph, edge attributes switched on.
# An extra override, 'training.initial_lr=0.01', was left disabled by the author.
runner = Runner(
    data,
    use_edge_attr=True,
    overrides=['training.n_epochs=25'],
    seed=42,
    metrics=['roc_auc', 'accuracy', 'f1'],
)

In [32]:
# Execute the run with edge attributes; the returned mapping is read below
# via result['best_loss']['roc_auc'].
result = runner.run()

Sample data: 100%|██████████| 138/138 [00:29<00:00,  4.69it/s]
Sample data: 100%|██████████| 46/46 [00:09<00:00,  4.75it/s]
2023-10-27 19:59:38.948 | INFO     | cool_graph.train.trainer:train:230 - 
Epoch 000: 
2023-10-27 19:59:40.457 | INFO     | cool_graph.train.helpers:eval_epoch:176 - test:
 {'roc_auc': 0.61113194562671, 'accuracy': 0.8515971799112194, 'f1': 0.0, 'calc_time': 0.02510395844777425, 'main_metric': 0.61113194562671}
2023-10-27 19:59:40.458 | INFO     | cool_graph.train.trainer:train:257 - Epoch 000: 
2023-10-27 19:59:44.809 | INFO     | cool_graph.train.helpers:eval_epoch:176 - train:
 {'roc_auc': 0.6104388215549136, 'accuracy': 0.8557377049180328, 'f1': 0.0, 'calc_time': 0.07250570853551229, 'main_metric': 0.6104388215549136}
2023-10-27 20:00:46.468 | INFO     | cool_graph.train.trainer:train:230 - 
Epoch 005: 
2023-10-27 20:00:48.392 | INFO     | cool_graph.train.helpers:eval_epoch:176 - test:
 {'roc_auc': 0.8934165062115897, 'accuracy': 0.8911132387501088, 'f1': 0.5

In [33]:
# ROC AUC stored under 'best_loss' for the run with edge attributes.
result['best_loss']['roc_auc']

0.9058595576475329

# RUN without edge attr

In [34]:
# Experiment 2: identical settings (epochs, seed, metrics) but with edge
# attributes switched off, so the two runs differ only in use_edge_attr.
runner2 = Runner(
    data,
    use_edge_attr=False,
    overrides=['training.n_epochs=25'],
    seed=42,
    metrics=['roc_auc', 'accuracy', 'f1'],
)
result2 = runner2.run()

Sample data: 100%|██████████| 138/138 [00:25<00:00,  5.35it/s]
Sample data: 100%|██████████| 46/46 [00:07<00:00,  6.20it/s]
2023-10-27 20:05:46.389 | INFO     | cool_graph.train.trainer:train:230 - 
Epoch 000: 
2023-10-27 20:05:46.999 | INFO     | cool_graph.train.helpers:eval_epoch:176 - test:
 {'roc_auc': 0.7727286514819814, 'accuracy': 0.8569066063190878, 'f1': 0.09868421052631579, 'calc_time': 0.010153059164683025, 'main_metric': 0.7727286514819814}
2023-10-27 20:05:47.000 | INFO     | cool_graph.train.trainer:train:257 - Epoch 000: 
2023-10-27 20:05:49.877 | INFO     | cool_graph.train.helpers:eval_epoch:176 - train:
 {'roc_auc': 0.7718866107258253, 'accuracy': 0.8592485129841868, 'f1': 0.09882964889466841, 'calc_time': 0.047935350735982256, 'main_metric': 0.7718866107258253}
2023-10-27 20:06:12.761 | INFO     | cool_graph.train.trainer:train:230 - 
Epoch 005: 
2023-10-27 20:06:13.562 | INFO     | cool_graph.train.helpers:eval_epoch:176 - test:
 {'roc_auc': 0.840267670240239, 'acc

In [35]:
# ROC AUC stored under 'best_loss' for the run without edge attributes.
result2['best_loss']['roc_auc']

0.8723677774234312

# Conclusion: using edge attributes gives a better ROC AUC
# 0.906 (with edge attributes) vs 0.872 (without)