In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error
import itertools
import warnings
warnings.filterwarnings('ignore')


#### Clinical data preprocessing

In [2]:
#Generating Random Forest for data
test_size = 0.2
random=30
df = pd.read_csv('oasis_cross-sectional.csv',index_col='ID')
#df.head()

df = df[df['CDR']!=2]
#Unmark the following line to take off in the binary classification
#df = df[df['CDR']!=0.5]

#RETURN TO GROUP 0.5 AND 1
df['CDR'] = df['CDR'].apply(lambda x: 1 if x==0.5 else x)
y = df['CDR'].apply(int)
X = df.drop('Hand',axis=1) #all subjects use right hand


#binaring gender
X = pd.concat([X, pd.get_dummies(X['M/F'])['M']],axis=1).drop('M/F',axis=1)

#converting nWBV and ASF to one-measure scale
def conversor_1000(toBeConverted):
    if toBeConverted < 10:
        return toBeConverted*(10**3)
    else:
        return toBeConverted
    
X['nWBV'] = X['nWBV'].apply(conversor_1000)
X['ASF'] = X['ASF'].apply(conversor_1000)

def progressive_sampling(xLabel,yLabel):
    lre = LinearRegression()
    best_combination = []
    r2 = 10000
    valid = []


    for a in xLabel:
        valid.append(a)
    
    for i in range(len(xLabel)-1):
        entrou = False
        for j in valid:                              
            best_combination.append(j)
            lre.fit(xr_train[best_combination],yr_train)
            pre = lre.predict(xr_test[best_combination])
            best_combination.remove(j)
            if mean_squared_error(yr_test,pre) < r2:
                r2 = mean_squared_error(yr_test,pre)
                value = j
                entrou=True
        if entrou:        
            best_combination.append(value)
            valid.remove(value)

    return best_combination

yLabel = 'SES'
Xra = X[~X['SES'].isnull()]    
xr_train, xr_test, yr_train, yr_test = train_test_split(Xra.drop(yLabel,axis=1),Xra[yLabel], test_size=test_size,random_state=random)

teste=np.array(X.drop('SES',axis=1).columns).ravel()
best_predictors = progressive_sampling(teste,'SES')

# Setting missing data from SES using linear regression

# RETURN BECAUSE ITS WORKING
lr = LinearRegression()
lr.fit(Xra[best_predictors],Xra[yLabel])
predicted_r = lr.predict(X[X['SES'].isnull()][best_predictors])
updatex = pd.Series(predicted_r,index=X[X['SES'].isnull()].index)
X['SES'].update(updatex)

# Using z-score
ss = StandardScaler()
scaled = ss.fit_transform(X.drop(['CDR'],axis=1))
X_zscore = pd.DataFrame(scaled,columns=X.drop(['CDR'],axis=1).columns)
X_zscore['ID'] = X.index.values; X_zscore['CDR'] = X.CDR.values
X_zscore.set_index('ID')

Unnamed: 0_level_0,Age,Educ,SES,MMSE,eTIV,nWBV,ASF,M,CDR
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
OAS1_0072_MR1,-2.016200,1.480887,-1.512282,0.887664,-0.369847,2.066653,0.287294,-0.712525,0.0
OAS1_0200_MR1,-2.016200,-0.798594,1.241284,0.887664,-0.593332,1.679310,0.542613,-0.712525,0.0
OAS1_0109_MR1,-1.892398,0.721060,0.323428,0.887664,-0.922353,1.824563,0.944935,-0.712525,0.0
OAS1_0455_MR1,-1.892398,-0.798594,1.241284,0.343970,-0.667828,2.115071,0.635457,-0.712525,0.0
OAS1_0456_MR1,-1.892398,1.480887,-0.594427,0.887664,1.089017,1.025668,-1.105359,1.403459,0.0
OAS1_0114_MR1,-1.768596,-0.798594,1.241284,0.887664,-0.518837,1.606683,0.457507,1.403459,0.0
OAS1_0457_MR1,-1.768596,-0.038767,0.323428,-0.199724,-0.556085,0.686743,0.496192,-0.712525,0.0
OAS1_0070_MR1,-1.644794,-0.038767,-0.594427,0.887664,-0.835442,1.534056,0.836618,-0.712525,0.0
OAS1_0078_MR1,-1.520993,-0.038767,-0.594427,0.887664,-0.413302,1.727728,0.333716,-0.712525,0.0
OAS1_0135_MR1,-1.520993,-0.798594,1.241284,0.615817,0.617214,1.534056,-0.703037,1.403459,0.0


#### k-fold Classifiers Values

In [3]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score,confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def meanMetrics(predList, yList):
    PR, RC, F1, ACC, SP = [],[],[],[],[]
    for p, y in zip(predList, yList):
        pr,rl,f1,sp = precision_recall_fscore_support(p, y)
        acc = accuracy_score(p, y)
        tn, fp, fn, tp = confusion_matrix(p, y).ravel()
        specificity = 1.0*tn / (tn+fp+1e-16)
        PR.append(pr); RC.append(rl); F1.append(f1);
        ACC.append(acc); SP.append(specificity)
    PR, RC, F1, ACC, SP = np.array(PR),np.array(RC),np.array(F1),np.array(ACC),np.array(SP)
    print('Mean\t| Precision |\t| Recall |\t| F-score |\t| Accuracy |\t| Specificity |')
    exit = '| {} || {} || {} || {} || {} |'
    print(exit.format(np.mean(PR),np.mean(RC),np.mean(F1),np.mean(ACC),np.mean(SP)))

AXIALsrc = 'CV_AXIAL/'
CORONALsrc = 'CV_CORONAL/'
SARGITALsrc = 'CV_SARGITAL/'

sargitalPred, axialPred, coronalPred, y_labels = [],[],[],[]

for i in range(10):
    FOLD = 'fold_{}/'.format(i)
    test_id = AXIALsrc+FOLD+'test/test.csv'
    test_id = pd.read_csv(test_id,header=None,index_col=0)
    
    x_test = X_zscore.loc[X_zscore['ID'].isin(test_id.index.values)]
    y_test = x_test.CDR.values;
    
    sargitalPred.append(np.load(SARGITALsrc+FOLD+'fold_{}_pred_labels.npy'.format(i)))
    axialPred.append(np.load(AXIALsrc+FOLD+'fold_{}_pred_labels.npy'.format(i)))
    coronalPred.append(np.load(CORONALsrc+FOLD+'fold_{}_pred_labels.npy'.format(i)))
    y_labels.append(y_test)

print('CNN Sargital')
meanMetrics(sargitalPred,y_labels)
print('CNN Axial')
meanMetrics(axialPred,y_labels)
print('CNN Coronal')
meanMetrics(coronalPred,y_labels)

CNN Sargital
Mean	| Precision |	| Recall |	| F-score |	| Accuracy |	| Specificity |
| 0.6938181263181263 || 0.6881475468975469 || 0.6741793934479692 || 0.6789473684210525 || 0.6258585858585859 |
CNN Axial
Mean	| Precision |	| Recall |	| F-score |	| Accuracy |	| Specificity |
| 0.7055211455211454 || 0.7009586247086248 || 0.6983063168939463 || 0.7105263157894737 || 0.6833838383838383 |
CNN Coronal
Mean	| Precision |	| Recall |	| F-score |	| Accuracy |	| Specificity |
| 0.6377171439671439 || 0.6329365079365079 || 0.6269942344526742 || 0.6368421052631579 || 0.597449494949495 |


#### k-fold Majority Vote  |  FP priori

In [4]:
def mjVote(l1,l2,l3):
    from scipy.stats import mode
    final_label = np.vstack([l1,l2,l3])
    return np.reshape(mode(final_label)[0],(l1.shape[1],1))

def FPpriori(l1,l2,l3):
    fp = np.zeros_like(l1)
    for i in range(l1.shape[0]):
        tmp_tuple = (l1[i],l2[i],l3[i])
        val = 0
        if 1 in tmp_tuple:
            val = 1
        fp[i] = val
    return fp

mjPred, fppPred = [],[]
for r,a,c in zip(sargitalPred,axialPred,coronalPred):
    fppPred.append(FPpriori(r,a,c))
    reshape = lambda list_: [np.reshape(array,(1,len(array))) for array in list_]
    r,a,c = reshape([r,a,c])
    mjPred.append(mjVote(r,a,c))

print('Emsemble Majority Vote')
meanMetrics(mjPred,y_labels)
print('Emsemble FP Priori')
meanMetrics(fppPred,y_labels)

Emsemble Majority Vote
Mean	| Precision |	| Recall |	| F-score |	| Accuracy |	| Specificity |
| 0.6759698634698634 || 0.6688908313908314 || 0.6647368808970302 || 0.6736842105263158 || 0.636969696969697 |
Emsemble FP Priori
Mean	| Precision |	| Recall |	| F-score |	| Accuracy |	| Specificity |
| 0.709623432123432 || 0.724728465978466 || 0.7020770895770896 || 0.7210526315789474 || 0.7538492063492063 |


In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

def super_learner(axial_f,coronal_f,sargital_f,f_i):
    y_f = []
    for i in range(10):
        test_id = axial_f+'fold_{}/'.format(i)+'test/test.csv'
        test_id = pd.read_csv(test_id,header=None,index_col=0)

        x_test = X_zscore.loc[X_zscore['ID'].isin(test_id.index.values)]
        y_test = x_test.CDR.values;
        y_f.append(y_test)

    sg_f = np.load(sargital_f+'features_fold_{}.npy'.format(f_i))
    ax_f = np.load(axial_f+'features_fold_{}.npy'.format(f_i))
    cr_f = np.load(coronal_f+'features_fold_{}.npy'.format(f_i))

    features = [cr_f,ax_f,sg_f]
    level_one_l = []

    [level_one_l.append(np.array(f).ravel()) for f in features]
    f_len = len(level_one_l[0])
    level_one = np.hstack([np.reshape(l,(f_len,1)) for l in level_one_l])
    level_one = np.hstack([level_one,np.reshape(np.array(y_f),(f_len,1))]).astype(float)

    l2 = RandomForestClassifier(bootstrap=True, class_weight=None,n_estimators=500, criterion='gini')
    l2.fit(level_one[:,:3],level_one[:,3])
    
    test_f = [coronalPred[f_i],axialPred[f_i],sargitalPred[f_i]]
    t_len = len(test_f[0])
    level_two = np.hstack([np.reshape(t,(t_len,1)) for t in test_f]).astype(float)

    prediction = l2.predict(level_two)
    
    return prediction

ax_str = 'CV_AXIAL/'
cr_str = 'CV_CORONAL/'
sg_str = 'CV_SARGITAL/'

slPred = []
for i in range(10):
    tmp_f = 'fold_{}/train/'.format(i)
    f_ax_str,f_cr_str,f_sg_str = ax_str+tmp_f, cr_str+tmp_f, sg_str+tmp_f
    slPred.append(super_learner(f_ax_str,f_cr_str,f_sg_str,i))

print('Super Learner')
print(meanMetrics(slPred,y_labels))

Super Learner
Mean	| Precision |	| Recall |	| F-score |	| Accuracy |	| Specificity |
| 0.6757192807192808 || 0.6721808746808746 || 0.6590462168288928 || 0.6736842105263158 || 0.6797871572871573 |
None
