In [1]:
import sys
# Add the project checkout to the module search path; presumably this is what
# makes the `pyTVDN` import in the next cell resolvable — TODO confirm.
sys.path.append("/home/huaqingj/MyResearch/TVDN-AD")

In [12]:
from pyTVDN import TVDNDetect
from pathlib import Path
import numpy as np
from easydict import EasyDict as edict
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import seaborn as sns
import scipy
import pandas as pd
import numbers
%matplotlib inline

In [3]:
# Switch to the project root first so that every relative path used below
# (results, data, and the CSV files) is resolved against it.
os.chdir("/home/huaqingj/MyResearch/TVDN-AD/")
resDir, dataDir = Path("./results"), Path("./data")

In [4]:
# Baseline table for all subjects, followed by the SGM/demographics tables of
# the AD group and of the control group (read in that order).
allDataOrd = pd.read_csv("./OtherFils/AllDataBaselineOrdered.csv")
SGMAD, SGMCtrl = (
    pd.read_csv(csvPath)
    for csvPath in (
        "./OtherFils/fmegallAD_reducedspeed_globalSGM_demographics.csv",
        "./OtherFils/fmegallCONT_reducedspeed_globalSGM_demographics.csv",
    )
)

In [5]:
# Split the baseline table by group, index it by RID and attach the matching
# SGM table (indexed by RADID); afterwards keep only rows whose KeepIt flag is 1.
isADRow = allDataOrd["Grp"] == "AD"
isCtrlRow = allDataOrd["Grp"] == "Ctrl"

SGMAD = allDataOrd.loc[isADRow].set_index("RID").join(SGMAD.set_index("RADID"))
SGMCtrl = allDataOrd.loc[isCtrlRow].set_index("RID").join(SGMCtrl.set_index("RADID"))

SGMAD = SGMAD.loc[SGMAD["KeepIt"] == 1]
SGMCtrl = SGMCtrl.loc[SGMCtrl["KeepIt"] == 1]

### Load data

In [6]:
# Names of the SGM parameter columns that are copied from each subject's row
# into the per-subject feature records; the full list is also used as one of
# the feature sets in the regression cell.
SGMfnames = ["taue", "taui", "alpha", "speed", "gei", "gii", "tauG"]

In [7]:
SGMDats = SGMAD

# One edict per AD subject (row order preserved), holding only the columns
# named in SGMfnames.
subjRows = (SGMDats.iloc[rowIdx] for rowIdx in range(len(SGMDats)))
fsAll = [
    edict({fname: subjRow[fname] for fname in SGMfnames})
    for subjRow in subjRows
]

fsAD = fsAll

In [8]:
SGMDats = SGMCtrl

# One edict per control subject (row order preserved), holding only the
# columns named in SGMfnames.
subjRows = (SGMDats.iloc[rowIdx] for rowIdx in range(len(SGMDats)))
fsAll = [
    edict({fname: subjRow[fname] for fname in SGMfnames})
    for subjRow in subjRows
]

fsCtrl = fsAll

### Regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import roc_auc_score, roc_curve

from prettytable import PrettyTable as pTB
from sklearn.metrics import f1_score
from collections import defaultdict as ddict

import warnings
# Suppress every warning for the remainder of the session; note that this also
# hides any warnings emitted by the model fits in the cells below.
warnings.filterwarnings('ignore')

In [15]:
# This function tunes the penalty parameter C for the logistic regression.
def TuningCFn(inpX, inpY, Cs=None, penalty="l2"):
    """Pick the logistic-regression parameter C with the best leave-one-out AUC.

    For every candidate C, a LogisticRegression is refit on each leave-one-out
    training split and the predicted probabilities of the held-out sample are
    collected.  The candidate is then scored with the ROC AUC of the held-out
    probabilities (second column of predict_proba) against inpY.

    Parameters
    ----------
    inpX : array indexed as inpX[rows, :], one row per sample.
    inpY : array of labels, indexed by the same row indices as inpX.
    Cs : indexable sequence of candidate C values.  None (the default) selects
        the grid [0.1, 0.2, 0.4, 0.8, 1, 1.6, 3.2, 6.4].
    penalty : penalty name forwarded to LogisticRegression.

    Returns
    -------
    edict with keys
        "optC"  : the candidate with the highest AUC (first one on ties),
        "Cs"    : the candidates that were tried,
        "aucCs" : the AUC obtained for each candidate, in the order of Cs.

    Raises
    ------
    ValueError
        If Cs is empty.
    """
    if Cs is None:
        # Built on every call so that no mutable default is shared between calls.
        Cs = [0.1, 0.2, 0.4, 0.8, 1, 1.6, 3.2, 6.4]
    if len(Cs) == 0:
        raise ValueError("Cs must contain at least one candidate value.")

    # The splitter is stateless, so a single instance serves every candidate.
    loo = LeaveOneOut()
    aucCs = []
    for C in Cs:
        eProbs = []
        for trIdxs, testIdxs in loo.split(inpX):
            clf = LogisticRegression(penalty=penalty, random_state=0, C=C)
            clf.fit(inpX[trIdxs, :], inpY[trIdxs])
            eProbs.append(clf.predict_proba(inpX[testIdxs, :]))
        # Each entry holds the probabilities of one held-out sample; squeeze
        # drops the singleton sample axis so rows line up with inpY.
        eProbs = np.array(eProbs).squeeze()
        aucCs.append(roc_auc_score(inpY, eProbs[:, 1]))

    res = edict()
    res["optC"] = Cs[np.argmax(aucCs)]
    res["Cs"] = Cs
    res["aucCs"] = aucCs
    return res

In [16]:
penalty = "l2"
fsNameSet = [
             ["tauG"],
             SGMfnames,
           ]
Cs = [0.1, 0.2, 0.4, 0.8, 1, 1.6, 3.2, 6.4]
# Dedicated seeded generator: the repeated random splits below, and therefore
# the printed mean/std of the AUC, are reproducible after a kernel restart.
rng = np.random.RandomState(0)
# The labels are the same for every feature set: 1 for AD subjects, 0 for controls.
Ys = np.concatenate([np.ones(len(fsAD)), np.zeros(len(fsCtrl))])

print("="*100)
print(f"The penalty is {penalty}.")
for fsName in tqdm(fsNameSet):
    # Build the design matrix: AD subjects first, then controls, with one
    # block of columns per feature name.
    XsAD = []
    XsCtrl = []
    for fName in fsName:
        if isinstance(fsAD[0][fName], numbers.Number):
            # Scalar feature -> a single column.
            cfAD = np.array([fs[fName] for fs in fsAD]).reshape(-1, 1)
            cfCtrl = np.array([fs[fName] for fs in fsCtrl]).reshape(-1, 1)
        else:
            # Non-scalar feature -> stacked as is (assumed to give one row per
            # subject so it can be concatenated along axis 1 — TODO confirm).
            cfAD = np.array([fs[fName] for fs in fsAD])
            cfCtrl = np.array([fs[fName] for fs in fsCtrl])

        XsAD.append(cfAD)
        XsCtrl.append(cfCtrl)
    XsAD = np.concatenate(XsAD, axis=1)
    XsCtrl = np.concatenate(XsCtrl, axis=1)
    Xs = np.concatenate([XsAD, XsCtrl], axis=0)
    # NOTE(review): the standardization uses all samples, including the one
    # that is held out in each split below.
    stdXs = (Xs - Xs.mean(axis=0))/Xs.std(axis=0)

    # --- Leave-one-out evaluation with C re-tuned inside every training split ---
    eProbs = []
    loo = LeaveOneOut()
    parass = []
    for trIdxs, testIdxs in loo.split(stdXs):
        curStdXs, curYs = stdXs[trIdxs, :], Ys[trIdxs]

        # Tune C on the training split only, so the held-out sample never
        # takes part in the choice of the penalty strength.
        curOptC = TuningCFn(curStdXs, curYs, Cs=Cs, penalty=penalty)["optC"]
        clf = LogisticRegression(penalty=penalty, random_state=0, C=curOptC)
        clf.fit(curStdXs, curYs)
        # Fitted intercept followed by the coefficients of this split.
        paras = np.concatenate([clf.intercept_, clf.coef_.reshape(-1)])
        parass.append(paras)
        eProbs.append(clf.predict_proba(stdXs[testIdxs, :]))
    eProbs = np.array(eProbs).squeeze()
    auc = roc_auc_score(Ys, eProbs[:, 1])
    # The ROC curve points and the per-split parameters are not used further in
    # this cell; they are kept as notebook variables for later inspection.
    fpr, tpr, thresholds = roc_curve(Ys, eProbs[:, 1], pos_label=1)
    parass = np.array(parass)

    # --- Repeated random 80/20 splits with C tuned once on the full data ---
    # The full-data tuning is an expensive leave-one-out grid search, so it is
    # run exactly once per feature set.
    optC = TuningCFn(stdXs, Ys, Cs=Cs, penalty=penalty)["optC"]
    nobs = stdXs.shape[0]
    Aucss = []
    for j in range(1000):
        # NOTE(review): the splits are not stratified — confirm that every
        # test split contains both classes, otherwise the AUC is undefined.
        testIdx = rng.choice(nobs, int(nobs/5), replace=False)
        trainIdx = np.delete(np.arange(nobs), testIdx)
        clf = LogisticRegression(penalty=penalty, random_state=0, C=optC)
        clf.fit(stdXs[trainIdx], Ys[trainIdx])
        curEprobs = clf.predict_proba(stdXs[testIdx, :])
        curAuc = roc_auc_score(Ys[testIdx], curEprobs[:, 1])
        Aucss.append(curAuc)
    mAUC = np.mean(Aucss)
    stdAUC = np.std(Aucss)
    print(f"Features are {fsName}.")
    print(f"The AUC under optimal C is {auc:.3f}.")
    print(f"The mean of AUC under 1000 repetitions is {mAUC:.3f} and the standard deviation is {stdAUC:.3f}.")
    print("-"*100)

  0%|          | 0/2 [00:00<?, ?it/s]

The penalty is l2.


 50%|█████     | 1/2 [02:02<02:02, 122.43s/it]

Features are ['tauG'].
The AUC under optimal C is 0.893.
The mean of AUC under 1000 repetitions is 0.889 and the standard deviation is 0.054.
----------------------------------------------------------------------------------------------------


100%|██████████| 2/2 [04:21<00:00, 130.64s/it]

Features are ['taue', 'taui', 'alpha', 'speed', 'gei', 'gii', 'tauG'].
The AUC under optimal C is 0.879.
The mean of AUC under 1000 repetitions is 0.886 and the standard deviation is 0.054.
----------------------------------------------------------------------------------------------------



