In [None]:
import os
import sys

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import torch as th


In [None]:
#os.listdir()

## Feature Normalization

In [None]:
# Read the node-level feature table: tab-separated, column names taken from row 0.
feature = pd.read_csv(
    './node_feature.tsv',
    header=0,
    sep='\t',
)

# The feature file contains the sample id (apply_no) and all the features engineered by domain experts.

In [None]:
# Finite stand-in used in place of infinite feature values.
INF = 1e10
# Clamp both signs of infinity. The original only replaced +inf, which left
# -inf in the table; a remaining infinite value is rejected by the
# scikit-learn scaler that is fitted on this table further down.
feature.replace([np.inf, -np.inf], [INF, -INF], inplace=True)
# Missing values are encoded as 0.
feature.replace(np.nan, 0, inplace=True)

In [None]:
# Split the usable columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical. Columns listed in
# `exclude_list` are skipped entirely.
# NOTE(review): `exclude_list` is not defined in any visible cell -- it has to
# exist before this cell runs (presumably id/label columns); confirm.
candidate_cols = [col for col in feature.columns if col not in exclude_list]

# `np.object` was removed in NumPy 1.24; the builtin `object` compares equal
# to the same dtype on every NumPy version.
categorical_ft = [col for col in candidate_cols if feature[col].dtype == object]
numerical_ft = [col for col in candidate_cols if feature[col].dtype != object]

In [None]:
# Column transformer with a single step: min-max scaling of the numerical columns.
numerical_scaling_step = ('numerical', MinMaxScaler(copy=True), numerical_ft)
not_null_norm = ColumnTransformer([numerical_scaling_step])

# Constant imputer that substitutes 0 for NaN entries.
null_norm = SimpleImputer(
    strategy='constant',
    fill_value=0,
    missing_values=np.nan,
)

In [None]:
# Fit the column transformer on the whole feature table and keep the scaled matrix.
norm_feature=not_null_norm.fit_transform(feature)

## GCN

In [None]:
import dgl
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import GraphConv
from scipy.stats import ks_2samp
from cytoolz import pipe

In [None]:
import sklearn.metrics as metrics

In [None]:
import pdb

In [None]:
class Dataset():
    """Wrap a DGL graph together with per-node features, labels and split masks.

    After construction ``self.g`` carries the node data ``'nodeId'``,
    ``'feature'`` and ``'label'``.  ``apply_mask`` adds ``'train_mask'`` and
    ``'test_mask'``; ``apply_selfloop`` adds one self-loop edge per node.
    """

    def __init__(self,
                 feature,
                 sample_driver,  # driver with apply_no/apply_date/label; matching label to feature extracted
                 node_set,
                 edge_set,
                 ft_col, # list of feature name used
                 transformer,
                 v_col='nodeId',
                 y_col='label'):
        """Build the graph and attach node data.

        Parameters
        ----------
        feature : table indexed by column name (e.g. a pandas DataFrame)
            Must contain ``v_col``, ``y_col`` and every column in ``ft_col``,
            with one row per node.
        sample_driver : unused
            Accepted for interface compatibility; it is not read here.
        node_set : iterable of records
            Element ``[1]`` of each record is the node id (see
            2_graph_data_explore).
        edge_set : iterable of records
            Elements ``[1]`` and ``[2]`` of each record are passed to
            ``add_edges`` as source and destination (see 2_graph_data_explore).
        ft_col : list
            Names of the feature columns handed to ``transformer``.
        transformer : object with ``fit_transform``
            Fitted on ``feature[ft_col]``; its output becomes the node feature
            matrix.
        v_col : str, optional
            Column holding the node id (default ``'nodeId'``).
        y_col : str, optional
            Column holding the label (default ``'label'``).

        Raises
        ------
        ValueError
            If ``feature`` does not have exactly one row per entry of
            ``node_set``.
        """
        # The original read the node count from an undefined `node_dataframe`
        # and the id column from an undefined `v_col`; both are now derived
        # from the arguments.
        node_records = list(node_set)
        self.num_nodes = len(node_records)
        if len(feature) != self.num_nodes:
            raise ValueError(
                'feature has {} rows but node_set has {} nodes'.format(
                    len(feature), self.num_nodes))

        # Maps node id -> position in node_set.
        # NOTE(review): this mapping is not applied to the edge endpoints
        # below -- presumably edge_set already holds 0-based node indices;
        # confirm against 2_graph_data_explore.
        self.node_id = {record[1]: idx for idx, record in enumerate(node_records)}

        self.g = dgl.DGLGraph()
        self.g.add_nodes(self.num_nodes)

        # Collect all endpoints first so that the graph is mutated once
        # instead of once per edge.
        src_nodes, dst_nodes = [], []
        for item in edge_set:
            src_nodes.append(item[1])
            dst_nodes.append(item[2])
        if src_nodes:
            self.g.add_edges(src_nodes, dst_nodes)

        self.g.ndata['nodeId'] = feature[v_col].values

        norm_feature = transformer.fit_transform(feature[ft_col])

        self.g.ndata['feature'] = th.FloatTensor(norm_feature)
        self.g.ndata['label'] = th.LongTensor(feature[y_col].values)

    def apply_mask(self, train_ratio=0.8, seed=None):
        """Create a stratified train/test split and store it as node data.

        Nodes labelled 0 and nodes labelled 1 are shuffled separately and the
        first ``train_ratio`` share of each goes to the training set.  Every
        other node goes to the test set, except nodes labelled -1, which are
        in neither set.

        Parameters
        ----------
        train_ratio : float, optional
            Share of each class assigned to training (default 0.8).
        seed : int or None, optional
            Seed for a private ``RandomState``.  With ``None`` (default) the
            global NumPy random state is used, as before.
        """
        labels = self.g.ndata['label']
        if hasattr(labels, 'detach'):
            # Tensor labels: move to NumPy so the index arithmetic is plain NumPy.
            labels = labels.detach().cpu().numpy()
        labels = np.asarray(labels)

        rng = np.random if seed is None else np.random.RandomState(seed)

        neg_idx = np.flatnonzero(labels == 0)
        pos_idx = np.flatnonzero(labels == 1)
        unlabeled_idx = np.flatnonzero(labels == -1)

        rng.shuffle(neg_idx)
        rng.shuffle(pos_idx)

        n_neg_train = int(neg_idx.shape[0] * train_ratio)
        n_pos_train = int(pos_idx.shape[0] * train_ratio)

        # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the same dtype.
        train_mask = np.zeros(labels.shape[0], dtype=bool)
        train_mask[neg_idx[:n_neg_train]] = True
        train_mask[pos_idx[:n_pos_train]] = True

        test_mask = ~train_mask
        test_mask[unlabeled_idx] = False

        self.g.ndata['train_mask'] = train_mask
        self.g.ndata['test_mask'] = test_mask

    def apply_selfloop(self):
        """Add one self-loop edge (v -> v) for every node in the graph.

        Calling this more than once adds the self-loops again.
        """
        node_ids = self.g.nodes().detach().numpy().tolist()
        self.g.add_edges(node_ids, node_ids)

In [None]:
# Keyword arguments that the next cell unpacks into Dataset(...).
# NOTE(review): Dataset.__init__ as defined in this notebook requires
# `sample_driver`, `node_set` and `edge_set`, none of which are supplied
# here, and it has no `big_comm`, `src_col` or `dst_col` parameter -- confirm
# which version of the class this dictionary was written for. `big_comm` is
# also not defined in any visible cell.
args = dict(
    feature=feature,
    big_comm=big_comm,
    v_col='nodeId',
    ft_col=numerical_ft,
    y_col='shangbao',
    src_col='src_nodeId',
    dst_col='dst_nodeId',
    transformer=not_null_norm,
)

In [None]:
# Build the graph dataset from the keyword arguments collected in `args`.
data=Dataset(**args)

In [None]:
# Attach the train/test masks to the graph's node data.
data.apply_mask()

In [None]:
# Add a self-loop edge for every node (run once; a second run adds them again).
data.apply_selfloop()

## Graph Structure Exploration

In [None]:
import networkx as nx

In [None]:
# 2x2 overview of node degrees: top row in-degrees, bottom row out-degrees;
# left panels plot the degree of each node, right panels are histograms.
# NOTE(review): `plt` is not imported in any visible cell -- presumably
# `import matplotlib.pyplot as plt` is expected to have run earlier; confirm.
in_degrees = data.g.in_degrees().detach().numpy()
out_degrees = data.g.out_degrees().detach().numpy()

figg = plt.figure(figsize=(14, 7))
ax1, ax2, ax3, ax4 = (figg.add_subplot(2, 2, position) for position in range(1, 5))

ax1.plot(in_degrees, '.')
ax2.hist(in_degrees)
ax3.plot(out_degrees, '.')
ax4.hist(out_degrees)

figg.show()

In [None]:
# Convert the DGL graph into a NetworkX graph object.
nx_g=data.g.to_networkx()

## GCN Model

In [None]:
import time

In [None]:
class Graph_Model(nn.Module):
    """One graph-convolution layer followed by a two-stage residual MLP head.

    Data flow in ``forward`` (N = number of rows in the input):

    * ``h1 = gcn0(input_)``                       -> (N, n_hidden)
    * ``h2 = cat(input_, h1)``                    -> (N, n_ftr + n_hidden)
    * ``h5 = relu(linear0(bn0(h2)))``             -> (N, n_hidden)
    * ``h6 = cat(h5, h2)``                        -> (N, n_ftr + 2 * n_hidden)
    * ``sigmoid(linear1(dropout(bn1(h6))))``      -> (N, 1)

    The output is a probability per node, not a logit.
    """

    def __init__(self, g, n_ftr, n_hidden, norm=True, dropout=0.3):
        """
        Parameters
        ----------
        g : graph passed to the graph-convolution layer on every forward call.
        n_ftr : int
            Number of input features per node.
        n_hidden : int
            Width of the graph-convolution output and of the hidden layer.
        norm : optional
            Forwarded unchanged to ``GraphConv`` (default ``True``).
        dropout : float, optional
            Dropout probability applied before the output layer (default 0.3).
        """
        super().__init__()
        self.graph = g
        self.gcn0 = GraphConv(n_ftr, n_hidden, activation=F.relu, norm=norm)

        self.linear0 = nn.Linear(n_ftr + n_hidden, n_hidden)
        # `nn.init.xavier_uniform` is deprecated; the in-place
        # `xavier_uniform_` is its replacement and initialises identically.
        nn.init.xavier_uniform_(self.linear0.weight)

        self.bn0 = nn.BatchNorm1d(n_ftr + n_hidden)
        self.bn1 = nn.BatchNorm1d(n_ftr + 2 * n_hidden)
        self.dp0 = nn.Dropout(dropout)

        self.linear1 = nn.Linear(n_ftr + 2 * n_hidden, 1)
        nn.init.xavier_uniform_(self.linear1.weight)

    def forward(self, input_):
        """Return a per-node probability of shape (N, 1) for features ``input_``."""
        # NOTE(review): the argument order of GraphConv's forward differs
        # between DGL releases; (features, graph) is kept as in the original
        # -- confirm against the installed DGL version.
        h1 = self.gcn0(input_, self.graph)  ## add graph convolution
        h2 = torch.cat((input_, h1), dim=1)  ## add residual net

        h5 = F.relu(self.linear0(self.bn0(h2)))

        h6 = torch.cat((h5, h2), dim=1)  ## add the second residual net
        h8 = torch.sigmoid(self.linear1(self.dp0(self.bn1(h6))))
        return h8
def evaluate(model, feature, labels, test_mask):
    """Compute the two-sample KS statistic between predicted scores of the two classes.

    The model is switched to eval mode and run without gradients.  Among the
    nodes selected by ``test_mask``, the scores of nodes with label < 0.5 are
    compared with the scores of nodes with label > 0.5 using
    ``scipy.stats.ks_2samp``.

    Parameters
    ----------
    model : callable module producing one score per node.
    feature : model input.
    labels : per-node labels.
    test_mask : mask selecting the nodes to evaluate on.

    Returns
    -------
    The ``ks_2samp`` result, or a message string when the statistic cannot be
    computed (one of the two groups is empty, or ``ks_2samp`` rejects the
    input).
    """
    fallback = 'The prob pred by GCN remains the same for all verticals'

    model.eval()
    with torch.no_grad():
        # Flatten so that an (N, 1) model output becomes the 1-D sample that
        # ks_2samp expects.
        scores = model(feature)[test_mask].reshape(-1)
        targets = labels[test_mask].reshape(-1)

    neg_scores = scores[targets < 0.5].detach().cpu().numpy()
    pos_scores = scores[targets > 0.5].detach().cpu().numpy()

    # With no samples in one of the groups the statistic is undefined; report
    # that explicitly instead of relying on the library to raise.
    if neg_scores.size == 0 or pos_scores.size == 0:
        return fallback

    try:
        return ks_2samp(neg_scores, pos_scores)
    except ValueError:
        # Only invalid-input errors are turned into the message; anything
        # else (a genuine bug) now propagates instead of being hidden.
        return fallback

In [None]:
# Training hyper-parameters consumed by main(): number of epochs, Adam
# learning rate, and hidden width of the model.
args_ = dict(
    epochs=200,
    lr=1e-3,
    num_hidden=50,
)

In [None]:
import torch.nn.functional as F

In [None]:
def main(args_):
    """Train ``Graph_Model`` on the global ``data`` graph, checkpointing every epoch.

    Parameters
    ----------
    args_ : dict
        Must provide ``'epochs'`` (number of full-graph training steps),
        ``'lr'`` (Adam learning rate) and ``'num_hidden'`` (hidden width).

    Side effects
    ------------
    Prints the model and one progress line per epoch, and writes the model's
    ``state_dict`` to ``gcn_model.epoch<N>`` in the working directory after
    every epoch.
    """
    node_data = data.g.ndata
    # torch.bool masks where available; uint8 on releases that predate it.
    mask_dtype = getattr(torch, 'bool', torch.uint8)

    def _as_mask(values):
        # Accepts either a NumPy array or a CPU tensor stored on the graph.
        as_uint8 = np.asarray(values).astype(np.uint8)
        return torch.from_numpy(as_uint8).to(mask_dtype)

    features = torch.as_tensor(node_data['feature'], dtype=torch.float32)
    train_mask = _as_mask(node_data['train_mask'])
    test_mask = _as_mask(node_data['test_mask'])
    labels = torch.as_tensor(node_data['label'], dtype=torch.long)

    model = Graph_Model(data.g, features.shape[1], args_['num_hidden'])

    optimizer = torch.optim.Adam(model.parameters(), lr=args_['lr'], weight_decay=0.1)
    dur = []
    print(model)

    for epoch in range(args_['epochs']):
        # The first three epochs are not timed.
        if epoch >= 3:
            t0 = time.time()
        model.train()
        probs = model(features)
        # The model ends in a sigmoid and emits one probability per node, so
        # binary cross-entropy is the matching loss.  The original used
        # F.nll_loss, which expects per-class log-probabilities and cannot
        # index class 1 in a single-column output.
        loss = F.binary_cross_entropy(
            probs[train_mask].reshape(-1),
            labels[train_mask].float(),
        )

        if epoch >= 3:
            dur.append(time.time() - t0)

        optimizer.zero_grad()
        loss.backward()
        # The original called zero_grad() a second time here and never
        # stepped, so the weights were never updated.
        optimizer.step()

        ks_value = evaluate(model, features, labels, test_mask)

        # Avoid np.mean([]) (NaN plus a RuntimeWarning) before timing starts.
        mean_dur = np.mean(dur) if dur else float('nan')
        print("Epoch {:05d} | Time(s) {:4f} | Loss {:.4f} | KS_Value {} \n".format(epoch, mean_dur, loss.item(), ks_value))
        with open(f'gcn_model.epoch{epoch}', 'wb') as ckpt_file:
            torch.save(model.state_dict(), ckpt_file)


In [None]:
# Run training with the hyper-parameters defined in `args_`.
main(args_)