### Classification based on 4 GCPS Levels

In [1]:
import networkx as nx
import pandas as pd
import glob 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from tqdm import tqdm

### Clean data : patient characteristics

In [2]:
# Load the cleaned patient-characteristics table. Later cells read its
# "token" column (ego-network file name / ego node id) and its "GCPS_GRADE"
# column (classification target); the remaining columns are used as features.
df_char = pd.read_csv("data/char_cleaned.csv")

In [3]:
def get_X_struct(structure_type):
    """Build a structural feature matrix from every patient's ego network.

    For each value in the ``token`` column of the module-level ``df_char``,
    the edge list ``Ego_networks/<token>.csv`` is read as an undirected graph
    (with a string ``edge_type`` attribute per edge) and the graphlet degree
    vector of the ego node ``<token>`` is extracted.

    Parameters
    ----------
    structure_type : str
        ``"GDV"`` for the induced graphlet degree vector, or ``"TyEGDV"`` for
        the typed-edge variant (computed with ``num_type=13``), whose nested
        per-node result is flattened into a single 1-D feature row.

    Returns
    -------
    numpy.ndarray
        One row per row of ``df_char``, in the same order.

    Raises
    ------
    ValueError
        If ``structure_type`` is not one of the two supported names.
        (Previously an unknown name silently produced an empty array.)
    """
    # Fail fast, before any file is read, instead of returning an empty array.
    if structure_type not in ("GDV", "TyEGDV"):
        raise ValueError(
            "structure_type must be 'GDV' or 'TyEGDV', got {!r}".format(structure_type)
        )

    X_struct = []
    # Only the token column is needed, so iterate it directly rather than
    # materialising a full row Series per patient with iterrows().
    for node_id in df_char['token']:
        filename = 'Ego_networks/' + node_id + '.csv'
        G = nx.read_edgelist(filename, delimiter=',', create_using=nx.Graph, data=(('edge_type', str),))
        # NOTE(review): the two graphlet functions below do not appear to be
        # part of stock NetworkX releases -- presumably a patched build is
        # required; confirm the environment.
        if structure_type == "GDV":
            dic = nx.induced_graphlet_degree_vector_ego(G, node_id)
            X_struct.append(dic[node_id])
        else:
            dic = nx.typed_edge_induced_graphlet_degree_vector_ego(G, num_type=13, nodes=node_id)
            # Flatten the nested per-node result into one feature row.
            X_struct.append([j for sub in dic[node_id] for j in sub])
    return np.array(X_struct)
    

In [4]:
# Structural features: one matrix per graphlet representation, each with one
# row per row of df_char. The shapes are printed as a sanity check.
X_GDV, X_TyEGDV = get_X_struct("GDV"), get_X_struct("TyEGDV")
print(X_GDV.shape, X_TyEGDV.shape, sep="\n")

(303, 7)
(303, 91)


In [5]:
# Classification target: the GCPS_GRADE column.
y = df_char['GCPS_GRADE'].to_numpy()

# Raw characteristic features: every column except the identifier ("token")
# and the target itself.
X_char = df_char.drop(columns=["token", "GCPS_GRADE"]).to_numpy()
print(X_char.shape)

(303, 24)


In [6]:
# Append each structural representation column-wise to the raw
# characteristics, so every combined matrix keeps one row per patient.
X_combined_GDV = np.concatenate([X_char, X_GDV], axis=1)
X_combined_TyEGDV = np.concatenate([X_char, X_TyEGDV], axis=1)
print(X_combined_GDV.shape, X_combined_TyEGDV.shape, sep="\n")

(303, 31)
(303, 115)


In [7]:
# Standardise each feature matrix with its own scaler.
# NOTE(review): every scaler is fitted on all rows, i.e. before the
# cross-validation split, so held-out folds contribute to the scaling
# statistics -- confirm this is acceptable for the reported scores.
X_char_scaled = StandardScaler().fit(X_char).transform(X_char)
X_combined_GDV_scaled = StandardScaler().fit(X_combined_GDV).transform(X_combined_GDV)

# `scaler` stays bound to the last fitted scaler (the TyEGDV one).
scaler = StandardScaler().fit(X_combined_TyEGDV)
X_combined_TyEGDV_scaled = scaler.transform(X_combined_TyEGDV)

### Average result

In [8]:
def average_score_classification(clf, X, y, cv, repeat=100):
    """Average the macro-F1 cross-validation score over several repetitions.

    Each repetition runs ``cross_val_score`` with ``scoring="f1_macro"`` and
    takes the mean over folds; the function returns the mean of those
    per-repetition means. Repetitions only differ from one another when
    ``clf`` and/or ``cv`` are non-deterministic.

    Parameters
    ----------
    clf : estimator
        Classifier passed unchanged to ``cross_val_score``.
    X, y : array-like
        Feature matrix and target labels.
    cv : cross-validation splitter
        Passed unchanged as ``cv=`` to ``cross_val_score``.
    repeat : int, default 100
        Number of cross-validation runs to average; must be at least 1.

    Returns
    -------
    float
        Mean macro-F1 across all repetitions.

    Raises
    ------
    ValueError
        If ``repeat`` is smaller than 1 (previously ``0`` raised
        ``ZeroDivisionError`` and negative values silently returned ``-0.0``).
    """
    if repeat < 1:
        raise ValueError("repeat must be a positive integer, got {!r}".format(repeat))

    run_means = [
        np.mean(cross_val_score(clf, X, y, cv=cv, scoring="f1_macro"))
        for _ in tqdm(range(repeat))
    ]
    # Left-to-right summation keeps the result numerically identical to the
    # original running-total implementation.
    return sum(run_means) / repeat

In [9]:
# Shared evaluation setup used by every experiment below.
# No random_state is given to either object, so scores can vary between runs.
skf = StratifiedKFold(n_splits=5)
clf_rf = RandomForestClassifier(
    n_estimators=20,
    max_features=None,
    bootstrap=False,
    min_samples_leaf=2,
    min_samples_split=5,
)

In [11]:
# Same classifier, same splitter, same number of repetitions -- only the
# feature matrix changes between the three experiments (evaluated in order).
score_char, score_char_GDV, score_char_TyEGDV = (
    average_score_classification(clf_rf, features, y, skf, repeat=500)
    for features in (X_char_scaled, X_combined_GDV_scaled, X_combined_TyEGDV_scaled)
)

100%|██████████| 500/500 [01:56<00:00,  4.28it/s]
100%|██████████| 500/500 [02:18<00:00,  3.60it/s]
100%|██████████| 500/500 [04:13<00:00,  1.97it/s]


In [12]:
# Report the averaged macro-F1 of each feature set, one per line.
print(
    "Char:           {:.3f}".format(score_char),
    "Char + GDV:     {:.3f}".format(score_char_GDV),
    "Char + TyEGDV:  {:.3f}".format(score_char_TyEGDV),
    sep="\n",
)

Char:           0.578
Char + GDV:     0.597
Char + TyEGDV:  0.618


In [20]:
# dummy classifier as baseline
clf_dummy = DummyClassifier(strategy='uniform')
print(average_score_classification(clf_dummy, X_char_scaled, y, skf))

100%|██████████| 100/100 [00:00<00:00, 151.07it/s]

0.21314513130945978





In [None]:
def average_confusion_matrix(X, y, clf, cv, repeat=100):
    """Average the per-fold confusion matrix over repeated cross-validation.

    For every repetition and every train/test split produced by ``cv``, the
    classifier is refitted on the training part and a confusion matrix is
    computed on the test part. All matrices are summed and divided by the
    number of folds actually evaluated.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed by row with the split indices.
    y : numpy.ndarray
        Target labels, indexed with the same split indices.
    clf : estimator
        Classifier exposing ``fit`` and ``predict``; it is refitted in place
        on every fold.
    cv : cross-validation splitter
        Object exposing ``split(X, y)``.
    repeat : int, default 100
        Number of times the whole cross-validation is repeated; at least 1.

    Returns
    -------
    numpy.ndarray
        Square float matrix with one row/column per distinct label in ``y``
        (sorted), holding the mean count per fold. It is also printed.

    Raises
    ------
    ValueError
        If ``repeat`` is smaller than 1.
    """
    if repeat < 1:
        raise ValueError("repeat must be a positive integer, got {!r}".format(repeat))

    # Fix the label set up front so every fold yields a matrix of the same
    # shape, even when a class is absent from a fold's test labels and
    # predictions (otherwise the in-place sum fails on a shape mismatch).
    labels = np.unique(y)
    matrices_all = np.zeros((len(labels), len(labels)), dtype=int)

    # Count evaluated folds instead of assuming a 5-fold splitter.
    n_folds = 0
    for _ in tqdm(range(repeat)):
        for train_index, test_index in cv.split(X, y):
            clf.fit(X[train_index], y[train_index])
            y_pred = clf.predict(X[test_index])
            matrices_all += confusion_matrix(y[test_index], y_pred, labels=labels)
            n_folds += 1

    matrix_avg = matrices_all / n_folds
    print("Average Confusion Matrix: \n {}\n".format(matrix_avg))
    return matrix_avg

In [None]:
# Print the averaged confusion matrix for each feature set, reusing the same
# classifier and CV splitter as the macro-F1 comparison (default repeat=100).
# Only the last call's return value is shown as the cell's output.
average_confusion_matrix(X_char_scaled, y, clf_rf, skf)
average_confusion_matrix(X_combined_GDV_scaled, y, clf_rf, skf)
average_confusion_matrix(X_combined_TyEGDV_scaled, y, clf_rf, skf)

The performance is only comparable to random guessing.
__TyE-GDV__ has several times more features than __GDV__; the performance on the training set is significantly improved, to almost perfect. (However, on the test set, it performs worse due to overfitting.)