### Classification based on 4 GCPS Levels

In [1]:
import networkx as nx
import pandas as pd
import glob 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

In [2]:
# None entries are replaced by 0 so the vector can be used in later numeric computations.
def process_none_value(dic):
    """Replace every ``None`` entry of the dictionary's single value with 0, in place.

    Parameters
    ----------
    dic : dict
        Mapping with exactly one key whose value is an index-assignable
        sequence (e.g. a list).

    Returns
    -------
    None
        The sequence stored in ``dic`` is modified in place.

    Raises
    ------
    AssertionError
        If ``dic`` does not contain exactly one entry.
    """
    assert (len(dic) == 1)
    for values in dic.values():
        for pos, value in enumerate(values):
            # Identity test on purpose: `== None` would also match any object
            # whose __eq__ returns True for None (and is element-wise on arrays).
            if value is None:
                values[pos] = 0

### Clean data: patient characteristics

In [3]:
# Load the cleaned patient-characteristics table. Later cells read its
# 'token' column (used to locate each ego-network file) and its
# 'GCPS_GRADE' column (used as the classification target).
df_char = pd.read_csv("data/char_cleaned.csv")

In [16]:
def get_X_struct(structure_type, num_type=13):
    """Build the structural feature matrix for every row of ``df_char``.

    For each ``token`` in ``df_char`` the edge list ``Ego_networks/<token>.csv``
    is loaded as an undirected graph (third column stored as the string edge
    attribute ``edge_type``) and one feature vector is computed for the node
    named by the token.

    NOTE(review): the ``nx.*`` vector functions called below are not part of
    stock networkx; presumably a patched/extended build is installed — confirm.

    Parameters
    ----------
    structure_type : str
        One of ``"ECCV"``, ``"GDV"``, ``"TyEGDV"``, ``"TyEDegree"``,
        ``"ColoredEGDV"`` or ``"HeteroEGDV"``.
    num_type : int, optional
        Value forwarded as ``num_type`` to the typed-edge vector functions
        (default 13, the value previously hard-coded).

    Returns
    -------
    numpy.ndarray
        One row per row of ``df_char``, in the same order.

    Raises
    ------
    ValueError
        If ``structure_type`` is not one of the supported names. Previously an
        unknown name read every graph file and silently returned an empty array.
    """
    supported = ("ECCV", "GDV", "TyEGDV", "TyEDegree", "ColoredEGDV", "HeteroEGDV")
    if structure_type not in supported:
        raise ValueError(
            "Unknown structure_type {!r}; expected one of {}".format(structure_type, supported)
        )

    X_struct = []
    # Only the token is needed per row, so iterate the column directly.
    for node_id in df_char['token']:
        filename = 'Ego_networks/' + node_id + '.csv'
        G = nx.read_edgelist(filename, delimiter=',', create_using=nx.Graph, data=(('edge_type', str),))

        if structure_type == "ECCV":
            dic = nx.edge_clustering_vector(G, node_id)
            # Missing entries come back as None; turn them into 0 before stacking.
            process_none_value(dic)
            X_struct.append(dic[node_id])
        elif structure_type == "GDV":
            dic = nx.induced_graphlet_degree_vector_ego(G, node_id)
            X_struct.append(dic[node_id])
        elif structure_type in ("TyEGDV", "TyEDegree"):
            dic = nx.typed_edge_induced_graphlet_degree_vector_ego(G, num_type=num_type, nodes=node_id)
            if structure_type == "TyEGDV":
                # Flatten the nested per-node result into a single vector.
                X_struct.append([j for sub in dic[node_id] for j in sub])
            else:
                # "TyEDegree" keeps only the first sub-vector.
                X_struct.append(dic[node_id][0])
        elif structure_type == "ColoredEGDV":
            dic = nx.colored_ego_graphlet_vector_for_typed_edge(G, num_type=num_type, nodes=node_id)
            X_struct.append(dic[node_id])
        else:  # "HeteroEGDV"
            dic = nx.hetero_ego_graphlet_vector_for_typed_edge(G, num_type=num_type, nodes=node_id)
            X_struct.append(dic[node_id])
        # A "ColoredEGDV2" variant (nx.colored_ego_graphlet_vector_for_typed_edge_v2)
        # existed only as commented-out code and is intentionally not supported.
    return np.array(X_struct)

In [17]:
# Structural features: one matrix per ego-network descriptor.
# The assignments stay separate so that matrices already computed remain
# available if a later (slower) descriptor is interrupted.
X_ECCV = get_X_struct("ECCV")
X_GDV = get_X_struct("GDV")
X_TyEGDV = get_X_struct("TyEGDV")
X_TyEDegree = get_X_struct("TyEDegree")
X_ColoredEGDV = get_X_struct("ColoredEGDV")
X_HeteroEGDV = get_X_struct("HeteroEGDV")

# Report the shape of every matrix, one per line.
print(
    *(
        features.shape
        for features in (
            X_ECCV,
            X_GDV,
            X_TyEGDV,
            X_TyEDegree,
            X_ColoredEGDV,
            X_HeteroEGDV,
        )
    ),
    sep="\n",
)

(303, 13)
(303, 7)
(303, 91)
(303, 13)
(303, 12367)
(303, 38870)


In [18]:
# Raw characteristic features: every column of df_char except the
# 'token' column and the 'GCPS_GRADE' target.
X_char = df_char.drop(columns=["token", "GCPS_GRADE"]).to_numpy()
print(X_char.shape)

# Classification target.
y = df_char["GCPS_GRADE"].to_numpy()

(303, 24)


In [19]:
# Append each structural descriptor to the characteristic features, column-wise.
X_combined_ECCV = np.concatenate([X_char, X_ECCV], axis=1)
X_combined_GDV = np.concatenate([X_char, X_GDV], axis=1)
X_combined_TyEGDV = np.concatenate([X_char, X_TyEGDV], axis=1)
X_combined_TyEDegree = np.concatenate([X_char, X_TyEDegree], axis=1)
X_combined_ColoredEGDV = np.concatenate([X_char, X_ColoredEGDV], axis=1)
X_combined_HeteroEGDV = np.concatenate([X_char, X_HeteroEGDV], axis=1)

# Report the shape of every combined matrix, one per line.
print(
    *(
        combined.shape
        for combined in (
            X_combined_ECCV,
            X_combined_GDV,
            X_combined_TyEGDV,
            X_combined_TyEDegree,
            X_combined_ColoredEGDV,
            X_combined_HeteroEGDV,
        )
    ),
    sep="\n",
)

(303, 37)
(303, 31)
(303, 115)
(303, 37)
(303, 12391)
(303, 38894)


In [20]:
# Standardise each feature matrix with its own, freshly created StandardScaler.
# `scaler` is rebound every time, so after this cell it refers to the scaler
# fitted on X_combined_HeteroEGDV.
# NOTE(review): every scaler is fitted on the complete matrix before the
# cross-validation below, so test-fold statistics leak into the scaling;
# consider fitting inside a Pipeline per fold — confirm whether this matters here.
scaler = StandardScaler()
X_char_scaled = scaler.fit_transform(X_char)

scaler = StandardScaler()
X_combined_ECCV_scaled = scaler.fit_transform(X_combined_ECCV)

scaler = StandardScaler()
X_combined_GDV_scaled = scaler.fit_transform(X_combined_GDV)

scaler = StandardScaler()
X_combined_TyEGDV_scaled = scaler.fit_transform(X_combined_TyEGDV)

scaler = StandardScaler()
X_combined_TyEDegree_scaled = scaler.fit_transform(X_combined_TyEDegree)

scaler = StandardScaler()
X_combined_ColoredEGDV_scaled = scaler.fit_transform(X_combined_ColoredEGDV)

scaler = StandardScaler()
X_combined_HeteroEGDV_scaled = scaler.fit_transform(X_combined_HeteroEGDV)

### Average result

In [21]:
def average_score_classification(clf, X, y, cv, repeat=100):
    """Return the macro-F1 cross-validation score averaged over ``repeat`` runs.

    Each run calls ``cross_val_score`` with ``scoring="f1_macro"`` and takes
    the mean over its folds; the per-run means are then averaged.

    Parameters
    ----------
    clf : estimator passed to ``cross_val_score``.
    X, y : features and targets passed to ``cross_val_score``.
    cv : cross-validation strategy passed to ``cross_val_score``.
    repeat : int, optional
        Number of cross-validation runs (default 100).

    Returns
    -------
    float
        Mean of the per-run mean scores.
    """
    total = sum(
        np.mean(cross_val_score(clf, X, y, cv=cv, scoring="f1_macro"))
        for _ in tqdm(range(repeat))
    )
    return total / repeat

In [22]:
def average_score_classification_with_std(clf, X, y, cv, repeat=100):
    """Return mean and standard deviation of the macro-F1 score over ``repeat`` runs.

    Each run calls ``cross_val_score`` with ``scoring="f1_macro"`` and is
    summarised by the mean over its folds; the statistics are taken over
    those per-run means.

    Parameters
    ----------
    clf : estimator passed to ``cross_val_score``.
    X, y : features and targets passed to ``cross_val_score``.
    cv : cross-validation strategy passed to ``cross_val_score``.
    repeat : int, optional
        Number of cross-validation runs (default 100).

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-run mean scores, computed with
        ``np.mean`` and ``np.std``.
    """
    run_means = [
        np.mean(cross_val_score(clf, X, y, cv=cv, scoring="f1_macro"))
        for _ in tqdm(range(repeat))
    ]
    return np.mean(run_means), np.std(run_means)

In [26]:
# Cross-validation splitters. `skf` is the one passed to the scoring runs in
# the cells shown below; `rskf` is defined but not used in those cells.
skf = StratifiedKFold(n_splits=5)
rskf = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=5,
    random_state=0,
)

# Random forest configuration shared by all feature sets.
# No random_state is set, so repeated runs can give different scores.
clf_rf = RandomForestClassifier(
    n_estimators=20,
    max_features=None,
    bootstrap=False,
    min_samples_split=5,
    min_samples_leaf=2,
)

In [29]:
# Score the random forest on each feature set: 500 repetitions of the `skf`
# cross-validation per set. Statements are kept separate so results that are
# already computed survive an interruption of a later, slower run.
char_mean, char_std = average_score_classification_with_std(
    clf_rf, X_char_scaled, y, cv=skf, repeat=500
)
char_TyEDegree_mean, char_TyEDegree_std = average_score_classification_with_std(
    clf_rf, X_combined_TyEDegree_scaled, y, cv=skf, repeat=500
)
char_GDV_mean, char_GDV_std = average_score_classification_with_std(
    clf_rf, X_combined_GDV_scaled, y, cv=skf, repeat=500
)
char_TyEGDV_mean, char_TyEGDV_std = average_score_classification_with_std(
    clf_rf, X_combined_TyEGDV_scaled, y, cv=skf, repeat=500
)
char_ColoredEGDV_mean, char_ColoredEGDV_std = average_score_classification_with_std(
    clf_rf, X_combined_ColoredEGDV_scaled, y, cv=skf, repeat=500
)
char_HeteroEGDV_mean, char_HeteroEGDV_std = average_score_classification_with_std(
    clf_rf, X_combined_HeteroEGDV_scaled, y, cv=skf, repeat=500
)

100%|██████████| 500/500 [01:57<00:00,  4.26it/s]
100%|██████████| 500/500 [02:10<00:00,  3.82it/s]
100%|██████████| 500/500 [02:19<00:00,  3.59it/s]
100%|██████████| 500/500 [04:11<00:00,  1.99it/s]
100%|██████████| 500/500 [34:51<00:00,  4.18s/it]
100%|██████████| 500/500 [2:17:10<00:00, 16.46s/it]  


In [30]:
# Print mean and standard deviation of the macro-F1 score per feature set.
# Raw strings are used because "\p" (in the LaTeX "$\pm$") is not a valid
# escape sequence in a normal string literal and triggers a warning on
# recent Python versions; the printed text is unchanged.
print(r"Char:              {:.3f} $\pm$ {:.4f}".format(char_mean, char_std))
print(r"Char + TyEDegree:  {:.3f} $\pm$ {:.4f}".format(char_TyEDegree_mean, char_TyEDegree_std))
print(r"Char + GDV:        {:.3f} $\pm$ {:.4f}".format(char_GDV_mean, char_GDV_std))
print(r"Char + TyEGDV:     {:.3f} $\pm$ {:.4f}".format(char_TyEGDV_mean, char_TyEGDV_std))
print(r"Char + ColoredEGDV:     {:.3f} $\pm$ {:.4f}".format(char_ColoredEGDV_mean, char_ColoredEGDV_std))
print(r"Char + HeteroEGDV:     {:.3f} $\pm$ {:.4f}".format(char_HeteroEGDV_mean, char_HeteroEGDV_std))

Char:              0.578 $\pm$ 0.0050
Char + TyEDegree:  0.600 $\pm$ 0.0050
Char + GDV:        0.597 $\pm$ 0.0075
Char + TyEGDV:     0.619 $\pm$ 0.0043
Char + ColoredEGDV:     0.608 $\pm$ 0.0061
Char + HeteroEGDV:     0.638 $\pm$ 0.0059


In [None]:
# Baseline: a dummy classifier with the 'stratified' strategy, scored with the
# same procedure (500 repetitions of the `skf` cross-validation) on the
# scaled characteristic features.
clf_dummy = DummyClassifier(strategy='stratified')
dummy_mean, dummy_std = average_score_classification_with_std(clf_dummy, X_char_scaled, y, skf, repeat=500)
# Raw string: "\p" in "$\pm$" is not a valid escape sequence in a normal
# string literal; the printed text is unchanged.
print(r"Dummy:           {:.3f} $\pm$ {:.4f}".format(dummy_mean, dummy_std))