# Verify SHAP in logistic regression

In [None]:
# Python一般
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import json
import shutil
import importlib
import pickle
import joblib
import sklearn
import datetime
import copy
import random
import scipy

from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split, KFold, ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, brier_score_loss, roc_auc_score, roc_curve, precision_recall_fscore_support
import torch
import torchtuples as tt
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import dgl
import dgl.nn.pytorch as dglnn

import shap

import module.utils as utils
import module.models as models

import matplotlib.pyplot as plt
%precision 4

# Project root is the parent of the current working directory; logs are kept
# under <ROOT>/LOG. os.path.dirname honours the platform's path separator and
# returns the filesystem root (not an empty string) for a top-level directory,
# unlike splitting the path on a hard-coded "/".
ROOT = os.path.dirname(os.getcwd())
DIR_LOG = os.path.join(ROOT, "LOG")

In [None]:
# Input files, all located under <ROOT>/data.
_data_dir = os.path.join(ROOT, "data")
_geneset_dir = os.path.join(_data_dir, "GSEA", "geneset")
_gse31312_dir = os.path.join(_data_dir, "GSE31312")

# Gene sets, pathway graph and gene-ID conversion tables.
gmt_path = os.path.join(_geneset_dir, "c2.cp.kegg.v7.3.symbols.gmt")
gml_path = os.path.join(_data_dir, "Graphml", "kegg.graphml")
ensembl_path = os.path.join(_geneset_dir, "Human_ENSEMBL_Gene_ID_MSigDB.v7.3.chip")
entrez_path = os.path.join(_geneset_dir, "Human_NCBI_Entrez_Gene_ID_MSigDB.v7.2.chip")

# Expression matrix and clinical table of the GSE31312 dataset.
expression_path = os.path.join(_gse31312_dir, "rma_expression.pickle")
clinical_df_path = os.path.join(_gse31312_dir, "clinical.csv")

# Experiment settings: fold count, seed, and compute device (GPU when available).
cv = 5
n_seed = 1234
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

# Seed NumPy and PyTorch with the same value.
np.random.seed(n_seed)
_ = torch.manual_seed(n_seed)


In [None]:
# Training dataset (files under data/GSE31312)
from module.models import Graph_from_GSEA, Load_Dataset, Graph_Exp_Data

# Load the pathway graphs.
# NOTE(review): the original comment said "Reactome", but the input paths
# point at KEGG files — confirm which database is intended.
path_graphs = Graph_from_GSEA(
    gmt_path=gmt_path,
    gml_path=gml_path,
    gene_convert_ensembl=ensembl_path,
    gene_convert_entrez=entrez_path,
)

# Load the GSE expression data together with its clinical table.
dataset = Load_Dataset(array_path=expression_path, clinical_path=clinical_df_path)

# Labels: GCB -> 0, ABC -> 1. Entries that map to NaN are dropped.
gep_to_label = {"GCB":0, "ABC":1, np.nan:np.nan}
gep_labels = dataset.clinicalDf["GEP"].map(gep_to_label)
dataset.Y = pd.DataFrame(gep_labels.dropna())

# Keep only the genes that appear in the graphs.
graph_genes = list(path_graphs.pathways.all_genes)
is_graph_gene = np.isin(dataset.x_expression.columns, graph_genes)
dataset.x_expression = dataset.x_expression.loc[:, is_graph_gene]

# Normalisation: z-score computed along axis=1, re-wrapped with the original
# index and columns.
filtered_expression = dataset.x_expression
dataset.x_expression = pd.DataFrame(
    scipy.stats.zscore(filtered_expression, axis=1),
    index=filtered_expression.index,
    columns=filtered_expression.columns,
)

# Only the labelled entries are used for training.
train_idx = dataset.Y.index

dgl_dataset = Graph_Exp_Data(
    x=dataset.x_expression.loc[train_idx, :],
    y=dataset.Y.loc[train_idx],
    graphs=path_graphs,
)

In [None]:
# Load the dataset used for scoring (files under data/GSE10846).
# Note: this rebinds expression_path and clinical_df_path.
_gse10846_dir = os.path.join(ROOT, "data", "GSE10846")
expression_path = os.path.join(_gse10846_dir, "rma_expression.pickle")
clinical_df_path = os.path.join(_gse10846_dir, "clinical.csv")

test_dataset = Load_Dataset(array_path=expression_path, clinical_path=clinical_df_path)

# Labels: GCB -> 0, ABC -> 1. "UC" and missing entries map to NaN and are dropped.
test_gep_to_label = {"GCB":0, "ABC":1, "UC": np.nan, np.nan:np.nan}
test_gep_labels = test_dataset.clinicalDf["GEP"].map(test_gep_to_label)
test_dataset.Y = pd.DataFrame(test_gep_labels.dropna())

# Keep only the genes that appear in the graphs.
test_graph_genes = list(path_graphs.pathways.all_genes)
test_is_graph_gene = np.isin(test_dataset.x_expression.columns, test_graph_genes)
test_dataset.x_expression = test_dataset.x_expression.loc[:, test_is_graph_gene]

# Normalisation: z-score computed along axis=1, re-wrapped with the original
# index and columns.
test_filtered_expression = test_dataset.x_expression
test_dataset.x_expression = pd.DataFrame(
    scipy.stats.zscore(test_filtered_expression, axis=1),
    index=test_filtered_expression.index,
    columns=test_filtered_expression.columns,
)

# Only the labelled entries are scored.
test_idx = test_dataset.Y.index

test_dgl_dataset = Graph_Exp_Data(
    x=test_dataset.x_expression.loc[test_idx, :],
    y=test_dataset.Y.loc[test_idx],
    graphs=path_graphs,
)

# 保存されたデータの読み込み
モデルとSHAPの記録を読み込む

In [None]:
# Take the feature tensor and labels from the graph dataset.
# NOTE(review): mapping_attr() presumably populates dgl_dataset.attr, so it has
# to run before the attribute is read — confirm in module.models.
dgl_dataset.mapping_attr()
X_train = dgl_dataset.attr.float()  # cast the attributes to float
Y_train = dgl_dataset.y

In [None]:
# Reload the project model module so that edits made to it are picked up,
# then re-import the trainer class from it.
importlib.reload(models)
from module.models import GCN_Trainer
# Re-seed PyTorch before the trainer is constructed.
torch.manual_seed(n_seed)
variable_name = "gcn"
# Directory holding this run's artefacts: <DIR_LOG>/<seed>/gcn.
# It is also where the SHAP tables are read from and the score tables written to.
file_path = os.path.join(DIR_LOG, str(n_seed), "gcn")
trainer = GCN_Trainer(dgl_dataset.batched_graph, path=file_path, device=device)
# NOTE(review): presumably restores the best saved model and its training log
# from file_path — confirm against GCN_Trainer.load_bestmodel.
trainer.load_bestmodel(load_log=True)

In [None]:
# Read the pathway-level SHAP table stored next to the model.
kegg_shap_csv = os.path.join(file_path, "kegg_shap.csv")
shap_value_df = pd.read_csv(kegg_shap_csv, index_col=0)

# Columns that share the same name up to the first "." are merged:
# absolute values are summed per group.
shap_cols = [name.split(".")[0] for name in shap_value_df.columns]
abs_shap_by_column = shap_value_df.abs().T
shap_value_df = abs_shap_by_column.groupby(shap_cols).sum().T

# Bar-type summary plot of the merged table.
shap.summary_plot(
    shap_value_df,
    feature_names=shap_value_df.columns,
    plot_type="bar",
)

# Logistic regression classifier for validation


In [None]:
# Logistic regression using the genes of every pathway that has a SHAP value.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_curve, precision_recall_fscore_support, roc_auc_score, roc_curve

n_start = 0
print("=================")
print(f"n_start: {n_start}")
print("-----------------")

# Pathways ordered by the column means of shap_value_df (largest first);
# all of them are used in this cell.
pathway_names = list(shap_value_df.mean().sort_values(ascending=False).index)
gene_names = set()
for p in pathway_names:
    gene_names.update(path_graphs.id_to_symbol[p].values())
print(f"Number of genes: {len(gene_names)}")

# Select the expression columns whose name is one of the pathway genes.
# `Index & set` is not used because recent pandas no longer treats `&` on an
# Index as a set intersection; isin() keeps the intended meaning everywhere.
expression_columns = dataset.x_expression.columns
gene_selected = expression_columns[expression_columns.isin(gene_names)]
X_train = dataset.x_expression.loc[train_idx, gene_selected]
Y_train = dataset.Y.loc[train_idx].values.reshape(-1)

X_test = test_dataset.x_expression.loc[test_idx, gene_selected]
Y_test = test_dataset.Y.loc[test_idx].values.reshape(-1)

# Fit logistic regression classifier
clf = LogisticRegression(penalty="l2", C=0.01, random_state=n_seed)
clf.fit(X_train, Y_train)

print("Training set")
# Y_proba holds decision-function scores (not probabilities).
Y_proba = clf.decision_function(X_train)
fpr, tpr, thres = roc_curve(Y_train, Y_proba)
# Cutoff minimising (1 - TPR) + FPR on the training set.
cutoff = thres[np.argmin(1 - tpr + fpr)]
# roc_curve evaluates each threshold as "score >= threshold", so the same
# comparison must be used here to reproduce the selected operating point.
Y_pred = [1 if x >= cutoff else 0 for x in Y_proba]

# evaluation
acc = accuracy_score(Y_train, Y_pred)
print(f"Accuracy: {acc:.3f}")
scores = precision_recall_fscore_support(Y_train, Y_pred, average="binary")
print(f"Precision: {scores[0]:.3f}")
print(f"Recall: {scores[1]:.3f}")
print(f"F1 score: {scores[2]:.3f}")
print("--------------")

print("Test set")
# The cutoff chosen on the training set is reused unchanged on the test set.
Y_proba = clf.decision_function(X_test)
Y_pred = [1 if x >= cutoff else 0 for x in Y_proba]

acc = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {acc:.3f}")
scores = precision_recall_fscore_support(Y_test, Y_pred, average="binary")
print(f"Precision: {scores[0]:.3f}")
print(f"Recall: {scores[1]:.3f}")
print(f"F1 score: {scores[2]:.3f}")

In [None]:
# Precision-recall and ROC curves of the fitted classifier on the test set.
from sklearn.metrics import PrecisionRecallDisplay, roc_curve, auc

Y_proba = clf.predict_proba(X_test)

# plot_precision_recall_curve was removed from scikit-learn; use its
# replacement when it exists and fall back to the old function otherwise.
if hasattr(PrecisionRecallDisplay, "from_estimator"):
    PrecisionRecallDisplay.from_estimator(clf, X_test, Y_test)
else:
    from sklearn.metrics import plot_precision_recall_curve
    plot_precision_recall_curve(clf, X_test, Y_test)

# ROC
# One curve per probability column; only the entry for column 1 is plotted.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    fpr[i], tpr[i], _ = roc_curve(Y_test, Y_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

plt.figure()
lw = 2
plt.plot(fpr[1], tpr[1], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[1])
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([-0.02, 1.0])
plt.ylim([0.0, 1.02])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Count the distinct genes over every pathway in path_graphs.pathway_list.
pathway_names = path_graphs.pathway_list
# Alternative selection (disabled): the five pathways with the largest mean in shap_value_df.
#pathway_names = list(shap_value_df.mean().sort_values(ascending=False)[:5].index)
gene_names = set()
for p in pathway_names:
    # Add this pathway's gene symbols to the running union.
    gene_names.update(path_graphs.id_to_symbol[p].values())
print(len(gene_names))
#print(gene_names)

In [None]:
# Logistic regression on the genes of successive groups of pathways, taken in
# descending order of the column means of shap_value_df.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_curve, precision_recall_fscore_support, roc_auc_score, roc_curve

# preferences
n_graphs = 5  # number of pathways per group
n_start = 0   # rank of the first pathway in the current group

# One list entry per evaluated group; "label" is the group's starting rank.
score_dict = {"label": [],
              "acc": [],
              "precision": [],
              "recall": [],
              "f1score": [],
              "rocauc": [],
              "acc_test": [],
              "prec_test": [],
              "rec_test": [],
              "f1_test": [],
              "rocauc_test": []}

# Loop-invariant values: the pathway ranking and the label vectors.
ranked_pathways = shap_value_df.mean().sort_values(ascending=False)
expression_columns = dataset.x_expression.columns
Y_train = dataset.Y.loc[train_idx].values.reshape(-1)
Y_test = test_dataset.Y.loc[test_idx].values.reshape(-1)

while n_start < shap_value_df.shape[1]:
    print("=================")
    print(f"n_start: {n_start}")
    print("-----------------")
    # Select genes in the pathways
    pathway_names = list(ranked_pathways[n_start:n_start+n_graphs].index)
    gene_names = set()
    for p in pathway_names:
        gene_names.update(path_graphs.id_to_symbol[p].values())

    # `Index & set` is not used because recent pandas no longer treats `&` on
    # an Index as a set intersection; isin() keeps the intended meaning.
    gene_selected = expression_columns[expression_columns.isin(gene_names)]
    X_train = dataset.x_expression.loc[train_idx, gene_selected]
    X_test = test_dataset.x_expression.loc[test_idx, gene_selected]

    # Fit logistic regression classifier
    clf = LogisticRegression(penalty="l2", C=0.01, random_state=n_seed)
    clf.fit(X_train, Y_train)

    print("Training set")
    score_dict["label"].append(n_start)
    # Y_proba holds decision-function scores (not probabilities).
    Y_proba = clf.decision_function(X_train)
    fpr, tpr, thres = roc_curve(Y_train, Y_proba)
    # Cutoff minimising (1 - TPR) + FPR on the training set.
    cutoff = thres[np.argmin(1 - tpr + fpr)]
    # roc_curve evaluates each threshold as "score >= threshold", so the same
    # comparison must be used here to reproduce the selected operating point.
    Y_pred = [1 if x >= cutoff else 0 for x in Y_proba]

    # evaluation
    acc = accuracy_score(Y_train, Y_pred)
    print(f"Accuracy: {acc:.3f}")
    scores = precision_recall_fscore_support(Y_train, Y_pred, average="binary")

    print(f"Precision: {scores[0]:.3f}")
    print(f"Recall: {scores[1]:.3f}")
    print(f"F1 score: {scores[2]:.3f}")
    score_dict["acc"].append(acc)
    score_dict["precision"].append(scores[0])
    score_dict["recall"].append(scores[1])
    score_dict["f1score"].append(scores[2])

    roc_auc = roc_auc_score(Y_train, Y_proba)
    score_dict["rocauc"].append(roc_auc)
    print(f"ROC AUC: {roc_auc:.3f}")

    print("--------------")

    print("Test set")
    # The cutoff chosen on the training set is reused unchanged on the test set.
    Y_proba = clf.decision_function(X_test)
    Y_pred = [1 if x >= cutoff else 0 for x in Y_proba]

    acc = accuracy_score(Y_test, Y_pred)
    print(f"Accuracy: {acc:.3f}")
    scores = precision_recall_fscore_support(Y_test, Y_pred, average="binary")
    print(f"Precision: {scores[0]:.3f}")
    print(f"Recall: {scores[1]:.3f}")
    print(f"F1 score: {scores[2]:.3f}")
    score_dict["acc_test"].append(acc)
    score_dict["prec_test"].append(scores[0])
    score_dict["rec_test"].append(scores[1])
    score_dict["f1_test"].append(scores[2])
    roc_auc = roc_auc_score(Y_test, Y_proba)
    score_dict["rocauc_test"].append(roc_auc)
    print(f"ROC AUC: {roc_auc:.3f}")

    n_start += n_graphs

# Save data
df = pd.DataFrame(score_dict)
df.to_csv(os.path.join(file_path, "pathway_rank_log_reg_score.csv"))

In [None]:
# Bar chart of the test-set F1 score for each pathway-rank group.
# Load data
df = pd.read_csv(os.path.join(file_path, "pathway_rank_log_reg_score.csv"), index_col=0)
df = df.loc[:, ["label", "acc_test", "prec_test", "rec_test", "f1_test"]]

# Matplotlib renamed its bundled seaborn styles ("seaborn-v0_8-*") and later
# dropped the old names; use the new name when it exists, else the old one.
darkgrid_style = "seaborn-v0_8-darkgrid"
if darkgrid_style not in plt.style.available:
    darkgrid_style = "seaborn-darkgrid"
plt.style.use(darkgrid_style)

fig, ax = plt.subplots(1,1, figsize=(10, 6))
# Labels are converted to strings so that each group gets its own categorical bar.
plt.bar(df["label"].astype("str"), df["f1_test"])
plt.title("F1 score of test set with gene selection by pathway rank")
plt.xticks(rotation=90)
ax.set_xlabel("Pathway rank")
ax.set_ylabel("F1 score")

In [None]:
# Test-set F1 score against pathway rank, with a fitted straight line.
df = pd.read_csv(os.path.join(file_path, "pathway_rank_log_reg_score.csv"), index_col=0)

df = df.loc[:, ["label", "acc_test", "prec_test", "rec_test", "f1_test"]]

# Line fitting: degree-1 polynomial of F1 score as a function of the rank label.
ab = np.polyfit(df["label"],df["f1_test"],1)
y1 = np.poly1d(ab)(df["label"])

# R^2 of the fitted line against the observed F1 scores.
print(sklearn.metrics.r2_score(df["f1_test"], y1))

# Matplotlib renamed its bundled seaborn styles ("seaborn-v0_8-*") and later
# dropped the old names; use the new name when it exists, else the old one.
darkgrid_style = "seaborn-v0_8-darkgrid"
if darkgrid_style not in plt.style.available:
    darkgrid_style = "seaborn-darkgrid"
plt.style.use(darkgrid_style)

fig, ax = plt.subplots(1,1, figsize=(10, 6))
plt.plot(df["label"], df["f1_test"], marker='o', linestyle="None", label="F1 score")
plt.plot(df["label"], y1, marker=None, linestyle="dashed", label="linear regression")
plt.title("F1 score of test set with gene selection by pathway rank")
plt.legend(loc='lower left', fontsize=10, framealpha=1.0, facecolor="w")
#plt.xticks(rotation=90)
ax.set_xlabel("Pathway rank")
ax.set_ylabel("F1 score")
plt.savefig(os.path.join(file_path, "log_reg_f1score_pathway_rank.eps"))

# Gene level SHAP

In [None]:
# Read the gene-level SHAP table; it replaces the pathway-level table
# ("kegg_shap.csv") previously bound to shap_value_df.
gene_shap_csv = os.path.join(file_path, "gene_level_shap.csv")
shap_value_df = pd.read_csv(gene_shap_csv, index_col=0)

# Columns that share the same name up to the first "." are merged:
# absolute values are summed per group.
shap_cols = [name.split(".")[0] for name in shap_value_df.columns]
abs_shap_by_column = shap_value_df.abs().T
shap_value_df = abs_shap_by_column.groupby(shap_cols).sum().T

# Bar-type summary plot of the merged table.
shap.summary_plot(
    shap_value_df,
    feature_names=shap_value_df.columns,
    plot_type="bar",
)

In [None]:
# Logistic regression on successive groups of genes, taken in descending order
# of the column means of shap_value_df.
from sklearn.linear_model import LogisticRegression

n_genes = 100  # number of genes per group
n_start = 0    # rank of the first gene in the current group

# One list entry per evaluated group; "label" is the group's starting rank.
score_dict = {"label": [],
              "acc": [],
              "precision": [],
              "recall": [],
              "f1score": [],
              "rocauc": [],
              "acc_test": [],
              "prec_test": [],
              "rec_test": [],
              "f1_test": [],
              "rocauc_test": []}

# Loop-invariant values: the gene ranking and the label vectors.
ranked_genes = shap_value_df.mean().sort_values(ascending=False)
expression_columns = dataset.x_expression.columns
Y_train = dataset.Y.loc[train_idx].values.reshape(-1)
Y_test = test_dataset.Y.loc[test_idx].values.reshape(-1)

while n_start < shap_value_df.shape[1]:

    print("=================")
    print(f"n_start: {n_start}")
    print("-----------------")
    # Select genes
    gene_names = list(ranked_genes[n_start:n_start+n_genes].index)
    # `Index & list` is not used because recent pandas no longer treats `&` on
    # an Index as a set intersection; isin() keeps the intended meaning.
    gene_selected = expression_columns[expression_columns.isin(gene_names)]
    X_train = dataset.x_expression.loc[train_idx, gene_selected]
    X_test = test_dataset.x_expression.loc[test_idx, gene_selected]

    # Stop as soon as a group has no gene present in the expression matrix.
    if not X_train.shape[1] > 0:
        break

    # Fit logistic regression
    clf = LogisticRegression(penalty="l2", C=0.01, random_state=n_seed)
    clf.fit(X_train, Y_train)

    print("Training set")
    score_dict["label"].append(n_start)
    # Y_proba holds decision-function scores (not probabilities).
    Y_proba = clf.decision_function(X_train)
    fpr, tpr, thres = roc_curve(Y_train, Y_proba)
    # Cutoff minimising (1 - TPR) + FPR on the training set.
    cutoff = thres[np.argmin(1 - tpr + fpr)]
    # roc_curve evaluates each threshold as "score >= threshold", so the same
    # comparison must be used here to reproduce the selected operating point.
    Y_pred = [1 if x >= cutoff else 0 for x in Y_proba]

    # Evaluation
    acc = accuracy_score(Y_train, Y_pred)
    print(f"Accuracy: {acc:.3f}")
    scores = precision_recall_fscore_support(Y_train, Y_pred, average="binary")

    print(f"Precision: {scores[0]:.3f}")
    print(f"Recall: {scores[1]:.3f}")
    print(f"F1 score: {scores[2]:.3f}")
    score_dict["acc"].append(acc)
    score_dict["precision"].append(scores[0])
    score_dict["recall"].append(scores[1])
    score_dict["f1score"].append(scores[2])

    roc_auc = roc_auc_score(Y_train, Y_proba)
    score_dict["rocauc"].append(roc_auc)
    print(f"ROC AUC: {roc_auc:.3f}")

    print("--------------")

    print("Test set")
    # The cutoff chosen on the training set is reused unchanged on the test set.
    Y_proba = clf.decision_function(X_test)
    Y_pred = [1 if x >= cutoff else 0 for x in Y_proba]

    acc = accuracy_score(Y_test, Y_pred)
    print(f"Accuracy: {acc:.3f}")
    scores = precision_recall_fscore_support(Y_test, Y_pred, average="binary")
    print(f"Precision: {scores[0]:.3f}")
    print(f"Recall: {scores[1]:.3f}")
    print(f"F1 score: {scores[2]:.3f}")
    score_dict["acc_test"].append(acc)
    score_dict["prec_test"].append(scores[0])
    score_dict["rec_test"].append(scores[1])
    score_dict["f1_test"].append(scores[2])
    roc_auc = roc_auc_score(Y_test, Y_proba)
    score_dict["rocauc_test"].append(roc_auc)
    print(f"ROC AUC: {roc_auc:.3f}")

    n_start += n_genes

# Save the scores
df = pd.DataFrame(score_dict)
df.to_csv(os.path.join(file_path, "gene_rank_log_reg_score.csv"))

In [None]:
# Bar chart of the test-set F1 score for each gene-rank group.
gene_score_csv = os.path.join(file_path, "gene_rank_log_reg_score.csv")
df = pd.read_csv(gene_score_csv, index_col=0)

# Keep the group label and the test-set metrics only.
test_columns = ["label", "acc_test", "prec_test", "rec_test", "f1_test"]
df = df.loc[:, test_columns]

fig, ax = plt.subplots(1, 1, figsize=(10, 6))
# Labels are converted to strings so that each group gets its own categorical bar.
bar_labels = df["label"].astype("str")
plt.bar(bar_labels, df["f1_test"])
plt.title("F1 score of test set with gene selection by gene rank")
plt.xticks(rotation=90)
ax.set_xlabel("Gene rank")
ax.set_ylabel("F1 score")

In [None]:
# Scatter plot and linear regression: test-set F1 score against gene rank.
# Load data
df = pd.read_csv(os.path.join(file_path, "gene_rank_log_reg_score.csv"), index_col=0)
df = df.loc[:, ["label", "acc_test", "prec_test", "rec_test", "f1_test"]]

# Linear regression: degree-1 polynomial of F1 score as a function of the rank label.
ab = np.polyfit(df["label"],df["f1_test"],1)
y1 = np.poly1d(ab)(df["label"])

# R^2 of the fitted line against the observed F1 scores.
print(sklearn.metrics.r2_score(df["f1_test"], y1))

# Matplotlib renamed its bundled seaborn styles ("seaborn-v0_8-*") and later
# dropped the old names; use the new name when it exists, else the old one.
darkgrid_style = "seaborn-v0_8-darkgrid"
if darkgrid_style not in plt.style.available:
    darkgrid_style = "seaborn-darkgrid"
plt.style.use(darkgrid_style)

fig, ax = plt.subplots(1,1, figsize=(10, 6))
plt.plot(df["label"], df["f1_test"], marker='o', linestyle="None", label="F1 score")
plt.plot(df["label"], y1, marker=None, linestyle="dashed", label="linear regression")
plt.title("F1 score of test set with gene selection by gene rank")
plt.legend(loc='lower left', fontsize=10, framealpha=1.0, facecolor="w")
ax.set_xlabel("Gene rank")
ax.set_ylabel("F1 score")
plt.savefig(os.path.join(file_path, "log_reg_f1score_gene_rank.eps"))