In [65]:
import os, sys
import numpy as np
import pandas as pd
import math
from scipy.stats import norm
import subprocess
import random
import statistics
import warnings
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.tree import DecisionTreeClassifier 
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

In [66]:
def getData():
    folder = '/Users/guglielmo/Desktop/Entropy/DataSet/PPMI'
    
    SnpToGene = pd.read_csv(os.path.join(folder, "SnpToGene_RSID.txt"), sep="\t", 
                            usecols = ['Genes','SNPsNamesConverter'])
    coreGenes = pd.read_csv(os.path.join(folder,"EmpiricPvalue_3_classes_1000/Intragenic_log_0.5.txt"), 
                            header=None, sep=" ")
    coreGenes['class'] = coreGenes[0]+"_"+coreGenes[1]
    coreGenes = coreGenes[coreGenes['class'] != 'PD_SWEDD'] # 2.5 class hills
    coreGenes = list(set(coreGenes[2]))
    SnpToGene = SnpToGene[SnpToGene['Genes'].isin(coreGenes)]
    snps = SnpToGene['SNPsNamesConverter']
    snps = snps.str.split(",").values
    snps = np.concatenate(snps)
    snps = np.char.lstrip(snps)
    
    res_folder = folder+'/Classification/Classification_2.5/'
    
    !mkdir {res_folder}
    
    path = os.path.join(res_folder, "snps.txt")
    file = open(path, "w+")
    for i in snps.tolist():
        file.write(i+"\n")
    file.close()
    
    !plink --bfile {os.path.join(res_folder,"PPMI")} --extract {path} --make-bed --out {os.path.join(res_folder, "Core")}
    !plink --bfile {os.path.join(res_folder, "Core")} --recodeA --out {os.path.join(res_folder, "Core_recoded")}
    
    recoded = pd.read_csv(os.path.join(folder,"Classification/Classification_2.5/Core_recoded.raw"),sep=" ")
    recoded = recoded.drop(["FID", "PAT","MAT","SEX","PHENOTYPE"], axis = 1)

    
    pheno = pd.read_csv(os.path.join(folder,"Subjects/PPMI_WHITE_INFO.csv"),header=None)
    pheno = pheno.drop([1,2], axis = 1)
    pheno.columns = ['IID', 'Group']
    for i in range(pheno.shape[0]):
        if(pheno.at[i,'Group'] == 'SWEDD'):
            pheno.at[i,'Group'] = 'PD'
    merged = recoded.merge(pheno, on="IID")    
    data = merged.drop(["IID","Group"], axis = 1)
    data.fillna(data.median(), inplace=True)
    remove = data.columns[data.isna().any()].tolist()
    data.drop(remove, axis = 1, inplace=True)
    data = data.replace(2,1)
   
    return data, merged

In [67]:
def algo(s, X_train, X_test, y_train, y_test, k):
    if(s == 'SVM_LINEAR'): clf = svm.SVC(kernel='linear')
    elif(s == 'SVM_POLY'): clf = svm.SVC(kernel='poly')
    elif(s == 'LDA'):      clf = LDA(n_components=1)
    elif(s == 'DT'):       clf = DecisionTreeClassifier()
    elif(s == 'KNN'):      clf = KNeighborsClassifier(n_neighbors=k)
    else: print('ERROR !!!!')
        
    return clf

In [68]:
def bestk(x_train, x_test, y_train, y_test):
    k=10
    acc_array=np.zeros(k)

    for k in np.arange(1,k+1,1): # here k will take values from 1 to 10
        classifier = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train) # k changes after each iteration
        y_pred = classifier.predict(x_test)
        acc_tmp = metrics.accuracy_score(y_test, y_pred)
        acc_array[k-1]=acc_tmp # store correctly the results

    max_acc=np.amax(acc_array)
    acc_list=list(acc_array)
    k = acc_list.index(max_acc)
    
    return (k+1)

In [69]:
def classification_results(d_test, d_pred, pos="PD", neg="HC"):
    d_test = d_test.tolist()
    d_pred = d_pred.tolist()
    
    tp = tn = fp = fn = 0 
    
    
    for i in range(len(d_test)):
        if d_test[i] == pos:
            if d_test[i] == d_pred[i]:
                tp += 1
            else:
                fn += 1
        else:
            if d_test[i] == d_pred[i]:
                tn += 1
            else:
                fp += 1
    
    if(tp+tn+fp+fn == 0): 
        accuracy=0
    else:
        accuracy = (tp + tn) / (tp+tn+fp+fn)
        
    if(tp + fp == 0):
        precision = 0
    else:
        precision = tp / (tp + fp)
        
    if(tp + fn == 0):
        recall = 0
    else:
         recall = tp / (tp + fn)
        
    if(precision == recall == 0):
        f1score = 0
    else:
        f1score = 2 * ( (precision*recall) / (precision + recall) )
            
    
    return tp, tn, fp, fn, accuracy, precision, recall, f1score

In [70]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

# Classification

In [71]:
warnings.filterwarnings('ignore')

In [72]:
[data, merged] = getData()
column_names   = ["Method", "Accuracy", "F1", "Precision", "Recall", "TrainSize"]
df = pd.DataFrame(columns = column_names)
for method in ['SVM_LINEAR','SVM_POLY','LDA','DT','KNN']: 
    print(bcolors.BOLD + bcolors.OKGREEN + '#' * 47 + "  METHOD: " + method + '  ' + '#' * 47 + bcolors.ENDC)
    for s in [0.3, 0.1]:
        acc = list()
        f1  = list()
        precision = list()
        recall    = list()

        v_acc = list()
        v_f1  = list()
        v_precision = list()
        v_recall    = list()

        v_acc_cn = list()
        v_f1_cn  = list()
        v_precision_cn = list()
        v_recall_cn    = list()

        conf_mat = dict()

        flag = True 
        k = 5

        for i in range(1, 101):
            rand = i 
            X_train, X_test, y_train, y_test = train_test_split(data, merged['Group'], test_size=s, 
                                                                random_state=rand, stratify=merged['Group'])
            if(method=='KNN' and flag):
                    k = bestk(X_train, X_test, y_train, y_test)
                    flag = False

            clf = algo(method, X_train, X_test, y_train, y_test, k)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            dd_pred = [ 0 if x == 'HC' else 1 for x in y_pred.tolist()]
            dd_test = [ 0 if x == 'HC' else 1 for x in y_test.tolist()]

            acc.append(metrics.accuracy_score(dd_test, dd_pred))
            f1.append(metrics.f1_score(dd_test, dd_pred))
            precision.append(metrics.precision_score(dd_test, dd_pred))
            recall.append(metrics.recall_score(dd_test, dd_pred))


            v_tp, v_tn, v_fp, v_fn, v_a, v_p, v_r, v_f = classification_results(y_test, y_pred)
            v_acc.append(v_a)
            v_f1.append(v_f)
            v_precision.append(v_p)
            v_recall.append(v_r)

            v_tp, v_tn, v_fp, v_fn, v_a, v_p, v_r, v_f = classification_results(y_test,y_pred, pos='HC', neg='PD')
            v_acc_cn.append(v_a)
            v_f1_cn.append(v_f)
            v_precision_cn.append(v_p)
            v_recall_cn.append(v_r)

            df_confusion = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
            for c in df_confusion.columns:
                for r in df_confusion.index:
                    cr = (c,r)
                    if cr not in conf_mat:
                        conf_mat[cr] = list()
                    conf_mat[cr].append( int(df_confusion.at[r,c]) )

        print("****HC****")
        print(bcolors.BOLD + bcolors.WARNING +"Train "+ str((1-s)*100) + "%" +"- Test " + str(s*100) + "%" +bcolors.ENDC +
              " - Accuracy: "  + str(round(statistics.mean(v_acc_cn), 4)) +" "+str(round(statistics.stdev(v_acc_cn), 4))+
              " - F1: "        + str(round(statistics.mean(v_f1_cn), 4)) + " "+str(round(statistics.stdev(v_f1_cn), 4))+
              " - Precision: " + str(round(statistics.mean(v_precision_cn), 4)) + " "+str(round(statistics.stdev(v_precision_cn), 4))+
              " - Recall: "    + str(round(statistics.mean(v_recall_cn), 4)) +" "+str(round(statistics.stdev(v_recall_cn), 4)))
        print()

        print("****PD****")
        print(bcolors.BOLD + bcolors.WARNING +"Train "+ str((1-s)*100) + "%" +"- Test " + str(s*100) + "%" +bcolors.ENDC +
              " - Accuracy: "  + str(round(statistics.mean(v_acc), 4)) + " "+str(round(statistics.stdev(v_acc), 4))+
              " - F1: "        + str(round(statistics.mean(v_f1), 4)) + " "+str(round(statistics.stdev(v_f1), 4))+
              " - Precision: " + str(round(statistics.mean(v_precision), 4)) + " "+str(round(statistics.stdev(v_precision), 4))+
              " - Recall: "    + str(round(statistics.mean(v_recall), 4)) +" "+str(round(statistics.stdev(v_recall), 4)))
        print()

        '''
        print("****MEAN(AD,CN)****")
        print("Train "+ str((1-s)*100) + "%" +" - Test " + str(s*100) + "%" + 
              " - V Accuracy: "  + str(round(statistics.mean([statistics.mean(v_acc),statistics.mean(v_acc_cn)]), 4)) +
              " - V F1: "        + str(round(statistics.mean([statistics.mean(v_f1),statistics.mean(v_f1_cn)]), 4)) +
              " - V Precision: " + str(round(statistics.mean([statistics.mean(v_precision),statistics.mean(v_precision_cn)]), 4)) +
              " - V Recall: "    + str(round(statistics.mean([statistics.mean(v_recall),statistics.mean(v_recall_cn)]), 4)))
        print()
        '''

        print("****Confusion Matrix****")
        print("# "+ " ".join(list(df_confusion.columns)))

        for r in df_confusion.index:
            print(r+" ",end="")
            for c in df_confusion.columns:
                print( str(round(statistics.mean(conf_mat[(c,r)]),3)) + 
                      "(" + str(round(statistics.stdev(conf_mat[(c,r)]),3)) + ") " , end="" )
            print()
        print('-' * 115)
        

        acc_plot = round(statistics.mean([statistics.mean(v_acc),statistics.mean(v_acc_cn)]), 4)
        f1_plot  = round(statistics.mean([statistics.mean(v_f1),statistics.mean(v_f1_cn)]), 4)
        precision_plot = round(statistics.mean([statistics.mean(v_precision),statistics.mean(v_precision_cn)]), 4)
        recall_plot = round(statistics.mean([statistics.mean(v_recall),statistics.mean(v_recall_cn)]), 4)

        df = df.append({"Method":method, "Accuracy":acc_plot, "F1":f1_plot, "Precision":precision_plot, 
                        "Recall":recall_plot, "TrainSize":int((1-s)*100)}, ignore_index=True)

df.to_csv("../PLOT/PPMI_SWEDD_res.csv", header=True, index=False)


mkdir: /Users/guglielmo/Desktop/Entropy/DataSet/PPMI/Classification/Classification_2.5/: File exists
PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /Users/guglielmo/Desktop/Entropy/DataSet/PPMI/Classification/Classification_2.5/Core.log.
Options in effect:
  --bfile /Users/guglielmo/Desktop/Entropy/DataSet/PPMI/Classification/Classification_2.5/PPMI
  --extract /Users/guglielmo/Desktop/Entropy/DataSet/PPMI/Classification/Classification_2.5/snps.txt
  --make-bed
  --out /Users/guglielmo/Desktop/Entropy/DataSet/PPMI/Classification/Classification_2.5/Core

32768 MB RAM detected; reserving 16384 MB for main workspace.
457171 variants loaded from .bim file.
520 people (341 males, 179 females) loaded from .fam.
--extract: 6271 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 520 founders and 0 nonfounders present.
Ca

****HC****
[1m[93mTrain 90.0%- Test 10.0%[0m - Accuracy: 0.6206 0.0644 - F1: 0.3288 0.1006 - Precision: 0.3455 0.1076 - Recall: 0.3214 0.1091

****PD****
[1m[93mTrain 90.0%- Test 10.0%[0m - Accuracy: 0.6206 0.0644 - F1: 0.7338 0.0525 - Precision: 0.7267 0.0394 - Recall: 0.7438 0.0776

****Confusion Matrix****
# HC PD All
HC 4.5(1.528) 9.5(1.528) 14(0.0) 
PD 8.71(2.637) 25.29(2.637) 34(0.0) 
All 13.21(3.006) 34.79(3.006) 48(0.0) 
-------------------------------------------------------------------------------------------------------------------
[1m[92m###############################################  METHOD: KNN  ###############################################[0m
****HC****
[1m[93mTrain 70.0%- Test 30.0%[0m - Accuracy: 0.7449 0.0334 - F1: 0.5062 0.0667 - Precision: 0.5667 0.0741 - Recall: 0.4617 0.0755

****PD****
[1m[93mTrain 70.0%- Test 30.0%[0m - Accuracy: 0.7449 0.0334 - F1: 0.8277 0.0237 - Precision: 0.8005 0.0235 - Recall: 0.8577 0.0359

****Confusion Matrix****
# HC 