# **Simple Graph Creation and GNN Model Testing**


# Work Environment


#### File Prerequisites

Run the notebook preferably on google colab.


Upload the following csvs in the path: '/thesis/Data_Preprocessing/':
* ADNI_dataset_ROIs_and_SNPs.csv
* ROIs.csv


#### Setting Google Colab as Filesystem

In [1]:
from google.colab import drive
path = "/gdrive/My Drive/thesis/Data_Preprocessing/"
drive.mount('/gdrive')
import os

Mounted at /gdrive


#### Libraries and Installations

In [2]:
import copy
import random
import time

import numpy as np
import pandas as pd

import itertools

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import trange, tqdm

In [3]:
import scipy.sparse as sp

from sklearn import metrics
from sklearn import decomposition
from sklearn import manifold
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import networkx as nx

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision.transforms as transforms

In [4]:
def format_pytorch_version(version):
  return version.split('+')[0]

TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  return 'cu' + version.replace('.', '')

CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

!pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric

Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/torch_scatter-2.1.2%2Bpt21cu121-cp310-cp310-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt21cu121
Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/torch_sparse-0.6.18%2Bpt21cu121-cp310-cp310-linux_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt21cu121
Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html
Collecting torch-cluster
 

#### Versions

Put the following in a txt file and run the following command to instantly download the version requirements for py projects:  
pip install -r /path/to/requirements.txt

In [5]:
!python --version # python v
print("pandas ", pd.__version__) # pandas v
print("numpy ", np.__version__) # numpy v
print("torch ", torch.__version__) # torch v
# print("torchvision ", torchvision.__version__) # torchvision v
# print("pytorch lightning ", pl.__version__) # pytorch lightning v
# print("torch geometric ", pyg.__version__) # torch geometric v

Python 3.10.12
pandas  1.5.3
numpy  1.25.2
torch  2.1.0+cu121


In [6]:
!pip list

Package                          Version
-------------------------------- ---------------------
absl-py                          1.4.0
aiohttp                          3.9.3
aiosignal                        1.3.1
alabaster                        0.7.16
albumentations                   1.3.1
altair                           4.2.2
annotated-types                  0.6.0
anyio                            3.7.1
appdirs                          1.4.4
argon2-cffi                      23.1.0
argon2-cffi-bindings             21.2.0
array-record                     0.5.0
arviz                            0.15.1
astropy                          5.3.4
astunparse                       1.6.3
async-timeout                    4.0.3
atpublic                         4.0
attrs                            23.2.0
audioread                        3.0.1
autograd                         1.6.2
Babel                            2.14.0
backcall                         0.2.0
beautifulsoup4                   4.12.3
bi

#### Setting Up Device

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### Seeds

In [8]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Datasets

#### ADNI

In [9]:
data = pd.read_csv(path + 'ADNI_dataset_ROIs_and_SNPs.csv', encoding='ISO-8859-1')

# Data Preparation


#### Studying the Data




In [10]:
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,PTID,Age,Date,Sex,DLICV_baseline,diagnosis,MUSE_Volume_4,MUSE_Volume_11,MUSE_Volume_23,MUSE_Volume_30,MUSE_Volume_31,MUSE_Volume_32,MUSE_Volume_35,MUSE_Volume_36,MUSE_Volume_37,MUSE_Volume_38,MUSE_Volume_39,MUSE_Volume_40,MUSE_Volume_41,MUSE_Volume_47,MUSE_Volume_48,MUSE_Volume_49,MUSE_Volume_50,MUSE_Volume_51,MUSE_Volume_52,MUSE_Volume_55,MUSE_Volume_56,MUSE_Volume_57,MUSE_Volume_58,MUSE_Volume_59,MUSE_Volume_60,MUSE_Volume_61,MUSE_Volume_62,MUSE_Volume_71,MUSE_Volume_72,MUSE_Volume_73,MUSE_Volume_75,MUSE_Volume_76,MUSE_Volume_81,MUSE_Volume_82,MUSE_Volume_83,MUSE_Volume_84,MUSE_Volume_85,MUSE_Volume_86,MUSE_Volume_87,MUSE_Volume_88,MUSE_Volume_89,MUSE_Volume_90,MUSE_Volume_91,MUSE_Volume_92,MUSE_Volume_93,MUSE_Volume_94,MUSE_Volume_95,MUSE_Volume_100,MUSE_Volume_101,MUSE_Volume_102,MUSE_Volume_103,MUSE_Volume_104,MUSE_Volume_105,MUSE_Volume_106,MUSE_Volume_107,MUSE_Volume_108,MUSE_Volume_109,MUSE_Volume_112,MUSE_Volume_113,MUSE_Volume_114,MUSE_Volume_115,MUSE_Volume_116,MUSE_Volume_117,MUSE_Volume_118,MUSE_Volume_119,MUSE_Volume_120,MUSE_Volume_121,MUSE_Volume_122,MUSE_Volume_123,MUSE_Volume_124,MUSE_Volume_125,MUSE_Volume_128,MUSE_Volume_129,MUSE_Volume_132,MUSE_Volume_133,MUSE_Volume_134,MUSE_Volume_135,MUSE_Volume_136,MUSE_Volume_137,MUSE_Volume_138,MUSE_Volume_139,MUSE_Volume_140,MUSE_Volume_141,MUSE_Volume_142,MUSE_Volume_143,MUSE_Volume_144,MUSE_Volume_145,MUSE_Volume_146,MUSE_Volume_147,MUSE_Volume_148,MUSE_Volume_149,MUSE_Volume_150,MUSE_Volume_151,MUSE_Volume_152,MUSE_Volume_153,MUSE_Volume_154,MUSE_Volume_155,MUSE_Volume_156,MUSE_Volume_157,MUSE_Volume_160,MUSE_Volume_161,MUSE_Volume_162,MUSE_Volume_163,MUSE_Volume_164,MUSE_Volume_165,MUSE_Volume_166,MUSE_Volume_167,MUSE_Volume_168,MUSE_Volume_169,MUSE_Volume_170,MUSE_Volume_171,MUSE_Volume_172,MUSE_Volume_173,MUSE_Volume_174,MUSE_Volume_175,MUSE_Volume_176,MUSE_Volume_177,MUSE_Volume_178,MUSE_Volume_179,MUSE_Volume_180,MUSE_Volume_181,MUSE_Volume_182,MUSE_Volume_183,MUSE_Volume_184,MUSE_Volume_185,MUSE_Volume_186,MUSE_Volume_187,MUSE_Volume_190,MUSE_Volume_191,MUSE_Volume_192,MUSE_Volume_193,MUSE_Volume_194,MUSE_Volume_195,MUSE_Volume_196,MUSE_Volume_197,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207,rs4575098,rs6656401,rs2093760,rs4844610,rs4663105,rs6733839,rs10933431,rs35349669,rs6448453,rs190982,rs9271058,rs9473117,rs9381563,rs10948363,rs2718058,rs4723711,rs1859788,rs1476679,rs12539172,rs10808026,rs7810606,rs11771145,rs28834970,rs73223431,rs4236673,rs9331896,rs11257238,rs7920721,rs3740688,rs10838725,rs983392,rs7933202,rs2081545,rs867611,rs10792832,rs3851179,rs17125924,rs17125944,rs10498633,rs12881735,rs12590654,rs442495,rs59735493,rs113260531,rs28394864,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358
0,002_S_0295,84.742466,2006-04-18,M,1485405.375,CN,1873.124153,1586.249283,302.695176,352.265466,1062.069832,1159.101038,20657.100036,3254.764153,3118.709527,52564.546547,52086.773326,14018.899912,14294.173225,3600.701497,3368.670352,750.93716,587.460672,16514.289409,23626.044006,1544.061802,1339.452519,4182.888734,4105.896581,7365.93417,8007.18388,4747.146291,4789.333772,4638.513528,2017.616275,2812.850291,352.265466,380.742015,107813.271569,102646.359842,26682.526999,30497.329962,51500.367341,52266.07012,63530.127527,63595.518123,426.093557,543.163817,3778.943604,3226.287604,2316.092703,2299.21771,11240.854293,4464.490169,4446.56049,3624.959299,3474.139054,1558.82742,741.444977,9202.144277,10142.925102,3385.545344,3549.021833,2759.061253,2699.998779,5983.239482,4772.45878,2428.944214,2103.045924,954.491756,976.640183,2716.873772,2089.334993,8379.488399,7090.660857,1878.397588,1371.09313,6936.676551,6667.73136,11534.057285,11793.510293,9326.597346,7320.582628,2008.124092,1507.147756,3968.787268,4167.068429,1769.764825,1258.241619,14127.532675,13099.212828,4775.622841,5759.645833,3586.990566,3536.365589,942.890199,769.921527,2916.209619,1978.592855,4195.544978,4115.388764,11762.924369,11668.002537,3131.365772,4021.521619,3495.232795,5277.653864,2280.233344,2930.975237,1168.593222,1541.952428,3406.639085,3815.85765,11444.408888,11739.721255,3415.076581,3348.631299,2000.741283,2098.827176,1443.866535,1758.163268,8868.863178,8873.081926,2589.256642,2578.709772,1332.06971,1373.202504,10701.909224,10142.925102,1171.757283,1339.452519,1123.24168,1319.413466,9906.675209,12091.986721,3378.162535,3411.91252,6568.59078,7252.027971,2965.779909,2489.061375,9220.073956,7986.090139,4336.873039,4313.669925,8816.128827,7512.535666,2559.725405,2955.233039,994.569863,1110.585435,0,0,0,0,1,1,1,0,0,1,0,0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,0,1
1,002_S_0413,76.283562,2006-05-02,F,1364116.0,CN,2131.516933,1505.034469,384.959763,340.663023,988.239172,1051.520229,18405.295394,3021.670466,3151.396633,45240.682248,43280.024169,12993.710346,13640.231811,3586.981241,3350.731962,475.662611,761.482051,16498.426213,13491.521327,1096.871653,1092.652915,2941.514461,3243.154165,6479.980225,6874.432146,4181.823176,4367.447609,3893.894367,1782.416435,2235.930676,503.084402,371.248867,103124.919662,101385.745282,29939.322695,27712.884177,48141.064022,54393.232442,54684.325304,53309.017001,504.139086,497.810981,3042.764152,2457.414376,2021.829767,1968.040869,11103.716114,2839.210085,3769.441622,3323.310171,3544.79387,1840.424071,1424.878464,9169.425142,8470.169464,2715.812024,3414.013019,2874.014667,2941.514461,5370.452361,6264.824631,2298.157049,2118.860721,1387.964514,967.145486,2358.274053,2307.649208,7267.829383,8130.561125,2144.173144,1907.923865,6878.650883,7117.009531,12912.499657,11882.073114,8530.286468,8347.826087,2611.398281,2317.141366,4837.836799,4199.752808,1187.574501,1671.674586,15983.740284,14967.024637,4672.251366,4851.547694,4239.830811,4672.251366,1018.825016,606.443462,3551.121975,3038.545415,6675.096817,4862.094537,12907.226235,11194.418962,3109.209261,3177.76374,3142.959158,4005.690901,2556.554698,2594.523332,1206.558818,1193.902606,3702.996512,3976.159741,10143.953418,11154.34096,3077.568733,3113.427999,1596.792002,1522.964102,1555.659315,1731.79159,9037.589607,8741.223324,2840.26477,2546.007855,1505.034469,1198.121344,12067.697547,11828.284215,902.809745,1252.964926,948.161169,971.364223,10644.928452,12009.689912,3580.653135,4822.016534,8888.879123,7140.212585,4647.993628,3295.888379,9925.633772,11152.231591,7421.813288,6059.161196,6676.151501,7780.405944,2469.015903,2239.094729,1097.926337,744.607103,1,0,0,0,0,0,0,1,0,0,0,0,0,0,2,2,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0
2,002_S_0559,79.223288,2006-05-23,M,1570479.625,CN,2366.71768,3157.732947,512.577893,552.656,1172.81197,1141.171359,23368.700372,3041.717375,3198.865741,49984.782089,49158.96215,17229.36721,17536.281134,4728.161925,4206.091848,764.648092,640.195023,16674.601836,16115.617714,1304.647848,1335.233771,4119.607512,4195.544978,7287.88733,7510.426292,5413.70849,5620.427146,3802.146719,1700.155481,2524.920733,563.20287,591.67942,115548.346197,110692.567142,30553.228374,27892.253015,54122.319281,61104.347374,63260.127649,63511.143161,543.163817,640.195023,3323.31881,3096.5611,2498.553558,2377.26455,12633.041163,3755.740489,4386.443329,3620.740551,3068.08455,2433.162962,2104.100611,11164.916827,10359.135942,3451.990627,2741.131573,3804.256093,3905.506047,4038.396612,3980.388825,2408.905161,2620.897253,1344.725955,1242.421313,3342.303176,2947.85023,8735.972613,10286.362537,1908.983512,2000.741283,5694.255238,4487.693284,12286.049133,15746.477256,9619.800338,9030.230292,2045.038138,2290.780214,3946.638841,4054.216917,1891.053833,1156.991664,16048.117744,14235.110752,4362.185528,5427.419421,4695.466627,4353.748032,653.905954,1110.585435,1854.139787,2276.014596,6314.411208,5783.903635,13022.220675,12608.783362,3445.662505,2907.772123,5668.94275,4525.662016,2641.990993,2326.639573,1201.288519,1131.679176,3535.310902,3904.45136,10796.831056,11263.00272,3952.966963,4306.287116,1615.780519,1374.257191,1595.741466,1631.600825,8051.480735,12965.267576,3138.748581,3280.076642,1138.007298,1420.66342,11015.15127,12643.588034,1885.780397,1861.522596,932.343328,1025.155787,11981.244583,11908.471179,4343.201161,5510.739696,7549.449712,5636.247452,2599.803512,2494.33481,7430.270078,6746.832887,7983.980765,7081.168674,10475.151514,11083.705926,3069.139237,2872.967451,1051.522962,1274.061924,0,0,0,0,1,0,1,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1,0,0,2,0,0,0,0,1,0,0,0,0
3,002_S_0619,77.447945,2006-06-01,M,1859348.25,Dementia,5124.734093,2981.605944,280.547287,356.484899,876.4466,908.087272,21112.765397,2883.519862,2848.715123,56650.512928,55319.495347,13810.098417,14307.911649,3485.747309,3231.567248,1810.901098,1584.142953,47125.61612,60669.932896,1393.244235,1401.681747,3791.607134,3956.138626,7059.03381,7534.698571,5230.202998,5155.320075,4848.405562,2089.339007,3279.028256,557.930507,343.82863,105178.865483,105828.553938,30496.333873,30318.091423,52001.443596,58340.124785,64032.281587,63620.952857,546.328928,608.555582,3098.676428,2852.933879,2209.573559,2206.409492,12681.581134,2698.949278,4427.584631,3850.669721,3989.888675,2088.284318,2110.432788,10630.210932,9738.998685,2335.081556,3525.825493,3081.801403,3971.958961,4032.076237,5876.727385,1759.221335,1517.697543,1132.736039,1098.98599,2337.190934,1737.072865,9832.866011,8968.02099,2304.495574,2317.151842,8209.699563,8048.332138,12752.2453,13784.78588,9587.123462,11270.407185,3097.621739,2240.159542,4577.350476,4796.725798,1399.572369,1617.893002,17917.057577,16677.797943,6813.291261,7985.050795,5135.280983,4916.96035,831.094971,594.844624,2534.417786,2599.808508,5705.867759,4306.29539,11084.781913,13730.996738,5032.976145,4884.264989,4352.701708,5000.280785,2023.948286,2122.034368,1509.26003,1475.509981,5866.180495,5736.453741,11578.376388,11748.181325,3241.05945,2811.801007,1759.221335,2247.542365,2254.925188,2166.331308,9747.436198,10617.554664,2975.277809,3174.61404,1750.783823,1521.916299,12143.689718,12557.127826,1405.900503,1814.065166,1921.643449,1717.033773,12618.299791,13427.246292,3545.864585,3870.708812,7061.143188,8703.294038,3338.090842,2569.222525,10113.413298,11186.032061,7032.666584,6352.392146,7272.080998,7162.393337,2489.066157,2506.995871,1155.939198,1574.650751,0,1,1,1,0,0,0,2,1,1,0,0,0,0,1,1,1,1,1,0,1,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,2
4,002_S_0685,89.561644,2006-07-06,F,1372862.125,CN,2941.520445,1693.826402,278.437217,328.007479,910.194387,966.092767,19718.417441,3275.856043,3304.332576,48017.763654,46338.702862,15370.999989,15619.905986,3663.980648,3558.512005,850.07726,378.632427,19563.378537,20767.830436,1211.834705,1276.170577,3940.308492,3830.621103,6907.141411,7299.484762,4610.034373,4816.752913,4007.808423,1973.318305,3543.746395,275.273157,261.562234,100127.710645,97369.705638,28625.244318,26733.136868,50063.855322,54914.358201,58502.401426,55236.037561,503.085426,562.147866,3130.309316,2807.575269,2454.255316,2260.193013,10907.567029,2037.654177,3700.894673,3134.528061,3218.902976,1961.716754,1551.443734,6058.118837,6199.446819,3072.301562,3788.433646,2181.091531,2566.052077,4183.941056,4580.503153,1784.529435,1827.771578,930.233429,866.952243,2598.747356,1822.498146,5713.236376,5468.549125,1692.771716,1665.349868,6355.54041,6073.939134,10347.528536,10602.762652,7501.984556,7832.101408,2016.560449,1691.717029,3852.769518,4266.206598,996.678674,1531.404692,13870.181203,12898.815004,3552.183887,3546.910454,3434.059007,3966.675652,834.256964,632.811856,2536.520857,2079.841634,6290.149851,3814.800807,10999.324748,9911.943042,3577.496361,3192.535815,3371.832507,3824.292985,1453.357897,1714.92013,1076.834842,984.022436,3021.676614,3154.567104,7552.609505,7804.679561,3350.738779,3221.012348,1459.686015,1302.537737,855.350692,1380.584533,7132.844307,8861.475361,2633.552009,2252.810208,986.131809,1033.592699,9263.310889,9459.482565,714.022711,719.296143,1048.358309,1097.928571,10256.825504,9957.294558,3674.527512,3801.089883,7668.625012,5915.73617,2531.247425,2325.583572,7989.249685,8070.46054,4039.449016,3574.332301,6934.563258,6778.469667,2066.130711,2608.239534,434.530808,710.858652,0,0,0,0,0,0,1,1,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,0,0,1,2,2,0,0,0,0,0,2,0,0,2,1,1,1,0,0,0,0,0,0


In [11]:
data.describe()

Unnamed: 0,Age,DLICV_baseline,MUSE_Volume_4,MUSE_Volume_11,MUSE_Volume_23,MUSE_Volume_30,MUSE_Volume_31,MUSE_Volume_32,MUSE_Volume_35,MUSE_Volume_36,MUSE_Volume_37,MUSE_Volume_38,MUSE_Volume_39,MUSE_Volume_40,MUSE_Volume_41,MUSE_Volume_47,MUSE_Volume_48,MUSE_Volume_49,MUSE_Volume_50,MUSE_Volume_51,MUSE_Volume_52,MUSE_Volume_55,MUSE_Volume_56,MUSE_Volume_57,MUSE_Volume_58,MUSE_Volume_59,MUSE_Volume_60,MUSE_Volume_61,MUSE_Volume_62,MUSE_Volume_71,MUSE_Volume_72,MUSE_Volume_73,MUSE_Volume_75,MUSE_Volume_76,MUSE_Volume_81,MUSE_Volume_82,MUSE_Volume_83,MUSE_Volume_84,MUSE_Volume_85,MUSE_Volume_86,MUSE_Volume_87,MUSE_Volume_88,MUSE_Volume_89,MUSE_Volume_90,MUSE_Volume_91,MUSE_Volume_92,MUSE_Volume_93,MUSE_Volume_94,MUSE_Volume_95,MUSE_Volume_100,MUSE_Volume_101,MUSE_Volume_102,MUSE_Volume_103,MUSE_Volume_104,MUSE_Volume_105,MUSE_Volume_106,MUSE_Volume_107,MUSE_Volume_108,MUSE_Volume_109,MUSE_Volume_112,MUSE_Volume_113,MUSE_Volume_114,MUSE_Volume_115,MUSE_Volume_116,MUSE_Volume_117,MUSE_Volume_118,MUSE_Volume_119,MUSE_Volume_120,MUSE_Volume_121,MUSE_Volume_122,MUSE_Volume_123,MUSE_Volume_124,MUSE_Volume_125,MUSE_Volume_128,MUSE_Volume_129,MUSE_Volume_132,MUSE_Volume_133,MUSE_Volume_134,MUSE_Volume_135,MUSE_Volume_136,MUSE_Volume_137,MUSE_Volume_138,MUSE_Volume_139,MUSE_Volume_140,MUSE_Volume_141,MUSE_Volume_142,MUSE_Volume_143,MUSE_Volume_144,MUSE_Volume_145,MUSE_Volume_146,MUSE_Volume_147,MUSE_Volume_148,MUSE_Volume_149,MUSE_Volume_150,MUSE_Volume_151,MUSE_Volume_152,MUSE_Volume_153,MUSE_Volume_154,MUSE_Volume_155,MUSE_Volume_156,MUSE_Volume_157,MUSE_Volume_160,MUSE_Volume_161,MUSE_Volume_162,MUSE_Volume_163,MUSE_Volume_164,MUSE_Volume_165,MUSE_Volume_166,MUSE_Volume_167,MUSE_Volume_168,MUSE_Volume_169,MUSE_Volume_170,MUSE_Volume_171,MUSE_Volume_172,MUSE_Volume_173,MUSE_Volume_174,MUSE_Volume_175,MUSE_Volume_176,MUSE_Volume_177,MUSE_Volume_178,MUSE_Volume_179,MUSE_Volume_180,MUSE_Volume_181,MUSE_Volume_182,MUSE_Volume_183,MUSE_Volume_184,MUSE_Volume_185,MUSE_Volume_186,MUSE_Volume_187,MUSE_Volume_190,MUSE_Volume_191,MUSE_Volume_192,MUSE_Volume_193,MUSE_Volume_194,MUSE_Volume_195,MUSE_Volume_196,MUSE_Volume_197,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207,rs4575098,rs6656401,rs2093760,rs4844610,rs4663105,rs6733839,rs10933431,rs35349669,rs6448453,rs190982,rs9271058,rs9473117,rs9381563,rs10948363,rs2718058,rs4723711,rs1859788,rs1476679,rs12539172,rs10808026,rs7810606,rs11771145,rs28834970,rs73223431,rs4236673,rs9331896,rs11257238,rs7920721,rs3740688,rs10838725,rs983392,rs7933202,rs2081545,rs867611,rs10792832,rs3851179,rs17125924,rs17125944,rs10498633,rs12881735,rs12590654,rs442495,rs59735493,rs113260531,rs28394864,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358
count,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0
mean,73.81393,1428763.0,1760.170636,1747.376593,354.51935,390.800399,934.421522,982.750502,20204.546261,3176.595664,3073.943628,48502.767251,47544.487474,13477.859096,13760.769017,3558.694047,3263.864868,789.089259,745.75441,20072.757205,22245.285413,1357.105899,1375.6513,3946.335931,3985.483514,6942.422294,7132.958883,4462.614985,4633.40334,4207.894972,2122.038855,2863.019536,342.218076,316.213274,93517.13819,91003.370739,23704.286179,23757.460942,45064.038634,47594.723132,54939.594856,54196.217571,468.51749,513.857501,3058.296978,2746.387878,2103.066347,2056.08248,11296.94778,3133.491107,4067.248729,3657.448684,3649.463509,1536.637447,1425.383938,7690.903778,7393.917601,2897.849903,2993.546567,3002.885283,3149.397027,4497.613369,4400.981967,2129.288967,2022.024455,1410.440053,1316.011305,2765.047791,2371.482681,7484.910847,7112.47125,1877.960776,1831.652737,6845.064304,6559.150427,11411.27218,11106.140979,8425.185999,7683.311672,2169.168064,1885.165281,4192.520679,3978.033455,1482.773103,1545.337928,15593.306724,15546.883475,4457.669816,4919.097513,3933.589762,4376.999607,807.345012,568.688612,2528.324933,2388.297442,5984.481527,4675.459143,12776.632068,12275.818579,3140.713031,3230.752465,3871.49267,3879.486379,2489.984487,2312.472013,1227.131184,1482.934727,3590.328637,3894.922829,9753.021226,9578.031103,3148.077039,3262.949261,1901.655197,1730.543287,1759.374385,1948.75505,8385.291181,9363.44434,2932.532682,2754.70679,1340.007647,1443.574321,11016.685765,11075.781712,1325.970363,1589.365488,1426.429219,1501.694075,11751.565902,12097.737733,4034.722059,4402.293778,7521.862398,7315.938942,3409.253657,2976.968274,8704.286336,8605.153094,5984.63252,6052.626847,8006.932423,7733.883457,2697.332343,2807.495753,1171.346482,1299.51021,0.454371,0.377154,0.394384,0.376516,0.848756,0.789407,0.441608,0.938098,0.587109,0.782387,0.53478,0.577537,0.765795,0.572431,0.751755,0.713465,0.632419,0.555839,0.575622,0.364391,0.959796,0.68411,0.733886,0.736439,0.768347,0.776643,0.797064,0.824505,0.88896,0.643906,0.790045,0.758137,0.775367,0.609445,0.703255,0.703255,0.1806,0.181238,0.444161,0.449266,0.664965,0.649011,0.57626,0.264837,0.950223,0.343331,0.355456,0.360562,0.167837,0.590938,0.176133,0.181876,0.178685,0.569241
std,7.149737,143870.6,777.161321,579.871305,65.326396,70.733256,162.643803,167.684822,2243.884699,489.53855,441.699929,5120.497417,5020.592049,1596.151189,1612.511285,558.676487,518.260944,659.717463,628.743527,10932.51682,12187.913957,167.722216,169.702897,513.177597,512.382842,698.46162,699.304528,468.937838,491.600299,575.009714,298.281983,412.232589,113.936015,117.329251,12341.786604,11942.339011,3414.91706,3624.056625,5779.393875,6056.089689,6784.429505,6889.202109,72.482932,75.253639,397.740369,361.328293,287.241275,270.482462,1723.699442,723.395345,817.928383,565.9253,574.682677,303.696144,289.250441,1482.041218,1552.176925,603.259404,633.180136,569.18226,576.179401,823.90223,823.070353,426.024133,425.138293,297.174011,301.909503,491.996451,452.719165,1228.9528,1132.737766,297.079053,310.391373,1155.142359,1084.372462,1730.931697,1668.070613,1254.502,1158.825576,405.91545,399.729942,647.64952,622.18836,371.07867,354.417481,2367.168831,2409.018703,857.899488,945.782745,554.783199,607.243284,212.361112,186.858918,560.284508,517.459049,1112.149648,792.038558,2083.562888,2082.830997,611.571923,714.194427,702.847957,677.818067,616.539627,592.915181,341.850572,379.27755,607.495133,626.082804,1700.707796,1655.518129,505.260849,524.021379,348.819524,333.493714,431.702909,454.178137,1244.329236,1343.44604,478.38745,442.05741,241.350931,270.463025,1610.663249,1595.176639,331.069016,396.432575,304.985202,318.9956,1652.725931,1711.69288,768.647817,783.578504,1193.36644,1234.212899,658.471423,649.234262,1319.72009,1371.76467,1007.989909,1100.527491,1359.762514,1366.511667,550.951148,589.229934,311.89336,380.021734,0.595045,0.546732,0.559528,0.545418,0.707177,0.699619,0.589612,0.722294,0.637213,0.668959,0.614768,0.636446,0.683245,0.637048,0.689472,0.677284,0.657151,0.639716,0.64564,0.550707,0.703243,0.679303,0.701226,0.704007,0.680372,0.679862,0.692722,0.695148,0.704247,0.64688,0.701178,0.692661,0.6989,0.646723,0.671921,0.671921,0.407379,0.407879,0.586602,0.588153,0.666239,0.674738,0.632575,0.486794,0.711212,0.533242,0.546094,0.550909,0.380612,0.642667,0.406985,0.411493,0.40744,0.66678
min,54.273973,1057343.0,279.599289,649.198349,188.787377,219.599442,428.399974,465.599972,14147.216048,1785.595459,1909.195145,33671.25,32319.375,8848.799473,9111.599457,1739.177918,1655.858626,34.799911,2.475464,2818.799832,3604.799785,913.199946,903.866268,2608.241008,2521.249264,4339.488159,4380.115207,3168.75,3248.4375,2570.267046,1265.625,1640.625,2.4,0.0,63106.796239,61430.396338,15850.759689,14431.253177,29985.714386,30093.598206,38335.102509,36892.797801,151.199991,224.648336,1925.625,1840.795319,1420.799915,1371.599918,7014.73687,1244.528155,1628.848322,1678.364502,1702.264855,639.140625,542.398621,3305.625,2831.999831,1336.875,1305.708143,1291.988975,1553.550824,2206.875,2169.375,933.75,836.39995,575.545349,521.999969,1183.35884,1076.835451,3627.599784,3786.326413,958.710504,820.799951,3543.599789,2583.988172,6137.999634,5839.18515,4623.738503,4325.262682,612.677307,616.390503,1891.875,2371.989775,0.0,1.199997,8025.0,7935.46875,2213.792315,2062.5,1734.960153,1858.358535,0.0,9.375,1005.0,924.960519,2716.873772,2355.116123,6550.657399,5364.132137,1314.137357,1425.939595,1412.225924,1730.625,750.0,870.116794,0.0,0.0,1953.75,2213.788062,4640.625,4296.149335,1775.625,1678.125,881.718351,741.443469,614.883716,681.328125,5220.731172,5583.385091,1638.983634,1535.999908,778.358583,747.590088,6710.973528,4726.052551,542.108272,556.875,586.405985,454.570107,5368.356948,5285.036673,1730.737884,1789.752791,3833.787329,3688.79978,1214.999451,1129.567504,4633.125,4793.573109,2890.799828,1906.875,3224.291687,2693.670657,1095.0,1081.875,416.399975,423.984375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,69.256164,1323742.0,1181.114188,1327.802492,310.077739,345.937011,832.798916,869.998868,18651.099886,2827.918127,2771.884766,44934.739444,44147.342547,12397.848759,12617.234438,3179.376055,2916.101102,344.69999,304.802856,12306.509566,13577.359251,1239.599926,1263.662114,3611.630461,3638.534576,6454.217291,6641.024802,4119.385503,4283.924872,3805.263448,1919.625004,2571.599847,263.999984,245.999985,84630.9375,82494.990204,21172.5,21082.753998,40779.107855,43278.068912,50009.02351,49106.90411,418.799975,461.999972,2770.661557,2484.609375,1901.024943,1860.937445,10027.642013,2641.875,3493.762396,3290.170815,3268.640585,1334.69996,1232.137463,6715.773102,6368.15354,2472.186522,2551.151324,2600.130998,2757.014426,3922.372172,3845.017759,1853.904695,1736.390272,1204.199928,1098.75,2417.665795,2071.307043,6642.778318,6368.88731,1681.53745,1633.984265,6028.269592,5844.199219,10249.312195,9961.868487,7510.836873,6909.224789,1885.421254,1607.579947,3758.913473,3551.138833,1229.477187,1299.609226,13970.080285,13939.192639,3887.253059,4273.610414,3566.399787,3959.133244,659.235279,438.599974,2130.599936,2016.350886,5236.821749,4137.862377,11371.86343,10906.204601,2722.799838,2726.243596,3410.626116,3401.853076,2067.599877,1867.676626,1015.955768,1231.3716,3167.684233,3449.0625,8527.465834,8446.789008,2804.411842,2906.48996,1657.949951,1499.882768,1450.3125,1632.128168,7520.399552,8459.999748,2606.12913,2449.34042,1166.481277,1250.625,9923.087024,9985.259549,1077.888858,1307.897461,1223.439539,1279.812108,10634.477204,10936.416527,3478.390076,3865.19977,6660.071564,6456.909543,2942.491137,2515.773302,7766.25,7652.283426,5342.899782,5347.799681,7160.454479,6845.999592,2324.401478,2417.867951,952.909455,1022.999939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,73.967123,1423724.0,1636.88129,1643.708008,352.265346,388.125,939.599944,984.375,20012.663645,3114.120348,3028.811855,48298.797121,47384.397176,13397.999201,13660.847351,3592.5,3281.25,588.515625,554.399967,17745.073062,19592.398832,1347.890016,1369.196518,3911.999767,3959.296875,6903.599589,7123.199575,4443.457642,4595.27136,4185.589355,2103.044736,2839.199831,325.199981,301.640625,92283.75,89994.303797,23521.198598,23578.125,44751.435791,47198.397187,54347.047472,53805.596793,466.875,515.741663,3028.799819,2724.25227,2070.350626,2029.207741,11217.564514,3086.009346,4004.399761,3635.509867,3631.199784,1520.859375,1414.799916,7666.511597,7368.043544,2853.59983,2941.199825,2966.843445,3127.148438,4466.599543,4348.125,2130.0,2047.148438,1395.0,1293.040883,2743.125,2330.625,7481.947205,7107.517372,1852.79989,1799.295451,6791.178916,6510.469971,11388.510476,11119.199337,8364.375,7625.390625,2149.940369,1867.5,4138.125,3930.0,1468.124336,1507.19991,15485.264221,15430.79908,4424.412062,4885.315261,3907.617188,4357.5,788.435242,539.999968,2487.599852,2336.25,5886.20495,4621.199725,12758.486704,12263.999269,3087.599816,3166.799811,3845.990219,3832.799772,2433.75,2260.80009,1217.999927,1466.399913,3519.505713,3856.790192,9723.119004,9524.878506,3133.199813,3251.600092,1897.199887,1713.75,1741.875,1911.092886,8335.191544,9253.199448,2923.125,2725.199838,1320.0,1420.084746,10848.75,10990.93819,1285.664062,1545.599908,1408.799916,1486.516052,11699.042236,12007.169464,3976.875,4348.125,7456.875,7265.738903,3368.412849,2945.742188,8654.739213,8546.399491,5963.984833,6040.190534,8018.027466,7719.375,2645.033142,2764.799835,1147.961323,1271.999924,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,78.953425,1527860.0,2230.3125,2039.2882,394.799976,432.211664,1041.29025,1093.455436,21697.919751,3441.866151,3311.257966,51966.086067,50885.2862,14513.078394,14822.02137,3952.361832,3624.328481,1012.194883,991.902871,25057.945899,28160.675369,1466.889835,1479.54961,4235.159212,4254.375,7420.799558,7610.524435,4771.228581,4964.931701,4575.417051,2317.796983,3115.195946,408.30652,387.419498,101146.121986,98605.696952,25918.536728,26148.949825,48869.720837,51710.178185,59564.979757,58827.780823,514.38126,561.599967,3319.099874,2984.585846,2278.125,2227.207031,12391.783877,3580.79534,4579.366209,4014.142667,4008.431435,1719.597762,1595.810471,8641.84436,8372.779884,3253.457584,3378.037399,3373.263371,3512.634932,5001.449851,4905.169682,2403.086004,2318.271912,1597.596946,1521.5625,3064.259634,2624.76356,8302.039444,7852.60555,2045.970886,1997.9974,7614.941955,7235.803299,12555.575993,12203.805615,9193.703663,8375.859375,2421.553878,2134.359355,4631.71875,4360.973738,1719.898091,1775.520782,17051.432014,17059.97115,4975.219593,5489.141792,4279.922117,4772.474858,932.899191,661.244175,2870.996263,2705.036027,6636.656195,5177.130022,14087.321027,13635.595335,3527.92889,3685.738953,4301.516018,4318.623229,2861.63623,2693.996493,1445.398118,1707.599898,3967.995534,4307.070773,10880.420428,10700.345398,3482.399792,3602.144371,2122.344465,1952.997458,2028.97887,2236.195868,9180.429108,10228.144512,3216.997614,3024.077057,1488.577453,1601.810524,12070.905419,12104.699639,1540.886169,1842.267186,1616.442036,1707.603206,12812.999236,13225.724606,4518.854645,4896.268424,8302.498123,8114.635061,3845.830536,3380.995599,9571.799429,9463.5241,6612.589433,6742.799598,8845.946033,8618.514329,3033.680969,3169.199811,1358.399919,1554.902359,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
max,91.306849,1905572.0,5464.820847,4804.855004,621.341431,661.875,1616.396692,1691.996537,31839.946542,5561.985855,5312.38649,67704.578765,66360.907497,23236.864494,23956.161044,5383.188984,5128.789504,5745.935552,4957.5,70383.340608,84133.438525,1959.599883,2030.625,7181.981735,6802.799595,9519.585703,9660.913477,6308.083086,6631.872002,6821.744966,3135.0,4216.637289,974.095032,756.254211,141230.625,136239.591879,37100.725414,36256.875,66583.453995,75606.288414,82662.315331,84714.571074,779.767814,787.19751,4855.779055,4260.935574,3480.473864,3112.387386,18686.039001,5829.599653,7468.784716,5692.143934,5672.104887,2945.802002,2601.599845,13671.921789,12558.75,5812.822174,5480.164302,5255.986633,5609.999666,7508.399552,7674.029274,3439.657043,3332.802329,2581.197795,2399.999857,4903.893921,5514.095764,11909.406196,10935.788185,3954.553528,6711.181083,10529.999372,11524.541655,18003.598927,16764.216126,13418.3992,11733.75,3612.304688,3490.540202,6809.999594,7790.384058,2794.802887,3046.058289,26441.25,26130.0,7886.39953,8400.0,6853.300212,7898.37825,1577.999906,1653.674365,5092.799696,4209.375,12530.374357,8068.351167,20258.387125,19915.61454,6283.199625,5856.702194,6285.599625,6643.194324,5121.599695,4709.570007,2489.999852,2900.00592,6249.599627,6295.187117,15816.976379,14477.75095,5035.093506,5474.388797,3553.199788,2864.099557,4300.799744,4163.904367,13021.875,14338.799145,5376.1226,4664.399722,2519.645874,2568.282878,19388.398844,17305.198969,2951.999824,3166.118286,2754.84157,2673.599841,18711.561708,18617.297456,7352.399562,6998.900712,12270.0,12236.399271,5843.985138,5578.434185,13815.599177,13048.125,9442.5,11927.972096,16396.234924,16002.636169,4601.887329,6470.399614,2335.600159,2987.922258,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [12]:
categorical_features = [column_name for column_name in data.columns if data[column_name].dtype == 'O']
print("Number of Categorical Features: {}".format(len(categorical_features)))
print("Categorical Features: ",categorical_features)

Number of Categorical Features: 4
Categorical Features:  ['PTID', 'Date', 'Sex', 'diagnosis']


In [13]:
numerical_features = [column_name for column_name in data.columns if data[column_name].dtype != 'O']
print("Number of Numerical Features: {}".format(len(numerical_features)))
print("Numerical Features: ",numerical_features)

Number of Numerical Features: 201
Numerical Features:  ['Age', 'DLICV_baseline', 'MUSE_Volume_4', 'MUSE_Volume_11', 'MUSE_Volume_23', 'MUSE_Volume_30', 'MUSE_Volume_31', 'MUSE_Volume_32', 'MUSE_Volume_35', 'MUSE_Volume_36', 'MUSE_Volume_37', 'MUSE_Volume_38', 'MUSE_Volume_39', 'MUSE_Volume_40', 'MUSE_Volume_41', 'MUSE_Volume_47', 'MUSE_Volume_48', 'MUSE_Volume_49', 'MUSE_Volume_50', 'MUSE_Volume_51', 'MUSE_Volume_52', 'MUSE_Volume_55', 'MUSE_Volume_56', 'MUSE_Volume_57', 'MUSE_Volume_58', 'MUSE_Volume_59', 'MUSE_Volume_60', 'MUSE_Volume_61', 'MUSE_Volume_62', 'MUSE_Volume_71', 'MUSE_Volume_72', 'MUSE_Volume_73', 'MUSE_Volume_75', 'MUSE_Volume_76', 'MUSE_Volume_81', 'MUSE_Volume_82', 'MUSE_Volume_83', 'MUSE_Volume_84', 'MUSE_Volume_85', 'MUSE_Volume_86', 'MUSE_Volume_87', 'MUSE_Volume_88', 'MUSE_Volume_89', 'MUSE_Volume_90', 'MUSE_Volume_91', 'MUSE_Volume_92', 'MUSE_Volume_93', 'MUSE_Volume_94', 'MUSE_Volume_95', 'MUSE_Volume_100', 'MUSE_Volume_101', 'MUSE_Volume_102', 'MUSE_Volume_103'

Turning categorical data to numerical.

In [14]:
data['diagnosis'] = data['diagnosis'].map({'CN': 0, 'MCI': 1, 'Dementia': 2}) #multi-class or binary?

In [15]:
data['Sex'] = data['Sex'].map({'M': 0, 'F': 1})

In [16]:
data_new= data.drop(['Date'], axis=1)
data=data_new

#### Data Preparation

In [17]:
data_controls_age = data.loc[(data['diagnosis'] == 0)]
data_controls_age_60 = data_controls_age.loc[(data_controls_age['Age'] > 60)]
data_controls = data_controls_age_60.loc[(data_controls_age_60['Age'] < 86)]
print(data_controls.shape) #controls ages 60-86

data_age_60 = data.loc[(data['Age'] > 60)]
data_age_filtered = data_age_60.loc[(data_age_60['Age'] < 86)]
print(data_age_filtered.shape) #ages 60-86

(449, 204)
(1463, 204)


In [18]:
roi_features = numerical_features[2:147]
snip_features = numerical_features[147:]
first_features = ['PTID','Sex','Age','DLICV_baseline','diagnosis']
all_but_roi_features = ['PTID','Sex','Age','DLICV_baseline','diagnosis']
all_but_roi_features.extend(snip_features)

In [19]:
print(all_but_roi_features)

['PTID', 'Sex', 'Age', 'DLICV_baseline', 'diagnosis', 'rs4575098', 'rs6656401', 'rs2093760', 'rs4844610', 'rs4663105', 'rs6733839', 'rs10933431', 'rs35349669', 'rs6448453', 'rs190982', 'rs9271058', 'rs9473117', 'rs9381563', 'rs10948363', 'rs2718058', 'rs4723711', 'rs1859788', 'rs1476679', 'rs12539172', 'rs10808026', 'rs7810606', 'rs11771145', 'rs28834970', 'rs73223431', 'rs4236673', 'rs9331896', 'rs11257238', 'rs7920721', 'rs3740688', 'rs10838725', 'rs983392', 'rs7933202', 'rs2081545', 'rs867611', 'rs10792832', 'rs3851179', 'rs17125924', 'rs17125944', 'rs10498633', 'rs12881735', 'rs12590654', 'rs442495', 'rs59735493', 'rs113260531', 'rs28394864', 'rs111278892', 'rs3752246', 'rs4147929', 'rs41289512', 'rs3865444', 'rs6024870', 'rs6014724', 'rs7274581', 'rs429358']


In [20]:
roi_mapping = pd.read_csv(path + 'ROIs.csv')

In [21]:
roi = roi_mapping.values.tolist()
roi_names = []
for i in roi:
  roi_names.append(i[0])

In [22]:
roi_names.remove('42,Right Cerebral Exterior')
roi_names.remove('43,Left Cerebral Exterior')
roi_names.remove('44,Cerebral')
roi_names.remove('45,Cerebral')
roi_names.remove('46,CSF')
roi_names.remove('63,Right vessel')
roi_names.remove('64,Left vessel')
roi_names.remove('69,Optic Chiasm')

In [23]:
ch = ','
roi_name=[]
# Remove all characters before the character ',' from string
for roi in roi_names:
  listOfWords = roi.split(ch, 1)
  if len(listOfWords) > 0:
      strValue = listOfWords[1]
      roi_name.append(strValue)

In [24]:
import warnings
warnings.filterwarnings('ignore')

# Data preprocessing & Model

In [25]:
data_age_filtered

Unnamed: 0,PTID,Age,Sex,DLICV_baseline,diagnosis,MUSE_Volume_4,MUSE_Volume_11,MUSE_Volume_23,MUSE_Volume_30,MUSE_Volume_31,MUSE_Volume_32,MUSE_Volume_35,MUSE_Volume_36,MUSE_Volume_37,MUSE_Volume_38,MUSE_Volume_39,MUSE_Volume_40,MUSE_Volume_41,MUSE_Volume_47,MUSE_Volume_48,MUSE_Volume_49,MUSE_Volume_50,MUSE_Volume_51,MUSE_Volume_52,MUSE_Volume_55,MUSE_Volume_56,MUSE_Volume_57,MUSE_Volume_58,MUSE_Volume_59,MUSE_Volume_60,MUSE_Volume_61,MUSE_Volume_62,MUSE_Volume_71,MUSE_Volume_72,MUSE_Volume_73,MUSE_Volume_75,MUSE_Volume_76,MUSE_Volume_81,MUSE_Volume_82,MUSE_Volume_83,MUSE_Volume_84,MUSE_Volume_85,MUSE_Volume_86,MUSE_Volume_87,MUSE_Volume_88,MUSE_Volume_89,MUSE_Volume_90,MUSE_Volume_91,MUSE_Volume_92,MUSE_Volume_93,MUSE_Volume_94,MUSE_Volume_95,MUSE_Volume_100,MUSE_Volume_101,MUSE_Volume_102,MUSE_Volume_103,MUSE_Volume_104,MUSE_Volume_105,MUSE_Volume_106,MUSE_Volume_107,MUSE_Volume_108,MUSE_Volume_109,MUSE_Volume_112,MUSE_Volume_113,MUSE_Volume_114,MUSE_Volume_115,MUSE_Volume_116,MUSE_Volume_117,MUSE_Volume_118,MUSE_Volume_119,MUSE_Volume_120,MUSE_Volume_121,MUSE_Volume_122,MUSE_Volume_123,MUSE_Volume_124,MUSE_Volume_125,MUSE_Volume_128,MUSE_Volume_129,MUSE_Volume_132,MUSE_Volume_133,MUSE_Volume_134,MUSE_Volume_135,MUSE_Volume_136,MUSE_Volume_137,MUSE_Volume_138,MUSE_Volume_139,MUSE_Volume_140,MUSE_Volume_141,MUSE_Volume_142,MUSE_Volume_143,MUSE_Volume_144,MUSE_Volume_145,MUSE_Volume_146,MUSE_Volume_147,MUSE_Volume_148,MUSE_Volume_149,MUSE_Volume_150,MUSE_Volume_151,MUSE_Volume_152,MUSE_Volume_153,MUSE_Volume_154,MUSE_Volume_155,MUSE_Volume_156,MUSE_Volume_157,MUSE_Volume_160,MUSE_Volume_161,MUSE_Volume_162,MUSE_Volume_163,MUSE_Volume_164,MUSE_Volume_165,MUSE_Volume_166,MUSE_Volume_167,MUSE_Volume_168,MUSE_Volume_169,MUSE_Volume_170,MUSE_Volume_171,MUSE_Volume_172,MUSE_Volume_173,MUSE_Volume_174,MUSE_Volume_175,MUSE_Volume_176,MUSE_Volume_177,MUSE_Volume_178,MUSE_Volume_179,MUSE_Volume_180,MUSE_Volume_181,MUSE_Volume_182,MUSE_Volume_183,MUSE_Volume_184,MUSE_Volume_185,MUSE_Volume_186,MUSE_Volume_187,MUSE_Volume_190,MUSE_Volume_191,MUSE_Volume_192,MUSE_Volume_193,MUSE_Volume_194,MUSE_Volume_195,MUSE_Volume_196,MUSE_Volume_197,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207,rs4575098,rs6656401,rs2093760,rs4844610,rs4663105,rs6733839,rs10933431,rs35349669,rs6448453,rs190982,rs9271058,rs9473117,rs9381563,rs10948363,rs2718058,rs4723711,rs1859788,rs1476679,rs12539172,rs10808026,rs7810606,rs11771145,rs28834970,rs73223431,rs4236673,rs9331896,rs11257238,rs7920721,rs3740688,rs10838725,rs983392,rs7933202,rs2081545,rs867611,rs10792832,rs3851179,rs17125924,rs17125944,rs10498633,rs12881735,rs12590654,rs442495,rs59735493,rs113260531,rs28394864,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358
0,002_S_0295,84.742466,0,1485405.375,0,1873.124153,1586.249283,302.695176,352.265466,1062.069832,1159.101038,20657.100036,3254.764153,3118.709527,52564.546547,52086.773326,14018.899912,14294.173225,3600.701497,3368.670352,750.937160,587.460672,16514.289409,23626.044006,1544.061802,1339.452519,4182.888734,4105.896581,7365.934170,8007.183880,4747.146291,4789.333772,4638.513528,2017.616275,2812.850291,352.265466,380.742015,107813.271569,102646.359842,26682.526999,30497.329962,51500.367341,52266.070120,63530.127527,63595.518123,426.093557,543.163817,3778.943604,3226.287604,2316.092703,2299.217710,11240.854293,4464.490169,4446.560490,3624.959299,3474.139054,1558.827420,741.444977,9202.144277,10142.925102,3385.545344,3549.021833,2759.061253,2699.998779,5983.239482,4772.458780,2428.944214,2103.045924,954.491756,976.640183,2716.873772,2089.334993,8379.488399,7090.660857,1878.397588,1371.093130,6936.676551,6667.731360,11534.057285,11793.510293,9326.597346,7320.582628,2008.124092,1507.147756,3968.787268,4167.068429,1769.764825,1258.241619,14127.532675,13099.212828,4775.622841,5759.645833,3586.990566,3536.365589,942.890199,769.921527,2916.209619,1978.592855,4195.544978,4115.388764,11762.924369,11668.002537,3131.365772,4021.521619,3495.232795,5277.653864,2280.233344,2930.975237,1168.593222,1541.952428,3406.639085,3815.857650,11444.408888,11739.721255,3415.076581,3348.631299,2000.741283,2098.827176,1443.866535,1758.163268,8868.863178,8873.081926,2589.256642,2578.709772,1332.069710,1373.202504,10701.909224,10142.925102,1171.757283,1339.452519,1123.241680,1319.413466,9906.675209,12091.986721,3378.162535,3411.912520,6568.590780,7252.027971,2965.779909,2489.061375,9220.073956,7986.090139,4336.873039,4313.669925,8816.128827,7512.535666,2559.725405,2955.233039,994.569863,1110.585435,0,0,0,0,1,1,1,0,0,1,0,0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,0,1
1,002_S_0413,76.283562,1,1364116.000,0,2131.516933,1505.034469,384.959763,340.663023,988.239172,1051.520229,18405.295394,3021.670466,3151.396633,45240.682248,43280.024169,12993.710346,13640.231811,3586.981241,3350.731962,475.662611,761.482051,16498.426213,13491.521327,1096.871653,1092.652915,2941.514461,3243.154165,6479.980225,6874.432146,4181.823176,4367.447609,3893.894367,1782.416435,2235.930676,503.084402,371.248867,103124.919662,101385.745282,29939.322695,27712.884177,48141.064022,54393.232442,54684.325304,53309.017001,504.139086,497.810981,3042.764152,2457.414376,2021.829767,1968.040869,11103.716114,2839.210085,3769.441622,3323.310171,3544.793870,1840.424071,1424.878464,9169.425142,8470.169464,2715.812024,3414.013019,2874.014667,2941.514461,5370.452361,6264.824631,2298.157049,2118.860721,1387.964514,967.145486,2358.274053,2307.649208,7267.829383,8130.561125,2144.173144,1907.923865,6878.650883,7117.009531,12912.499657,11882.073114,8530.286468,8347.826087,2611.398281,2317.141366,4837.836799,4199.752808,1187.574501,1671.674586,15983.740284,14967.024637,4672.251366,4851.547694,4239.830811,4672.251366,1018.825016,606.443462,3551.121975,3038.545415,6675.096817,4862.094537,12907.226235,11194.418962,3109.209261,3177.763740,3142.959158,4005.690901,2556.554698,2594.523332,1206.558818,1193.902606,3702.996512,3976.159741,10143.953418,11154.340960,3077.568733,3113.427999,1596.792002,1522.964102,1555.659315,1731.791590,9037.589607,8741.223324,2840.264770,2546.007855,1505.034469,1198.121344,12067.697547,11828.284215,902.809745,1252.964926,948.161169,971.364223,10644.928452,12009.689912,3580.653135,4822.016534,8888.879123,7140.212585,4647.993628,3295.888379,9925.633772,11152.231591,7421.813288,6059.161196,6676.151501,7780.405944,2469.015903,2239.094729,1097.926337,744.607103,1,0,0,0,0,0,0,1,0,0,0,0,0,0,2,2,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0
2,002_S_0559,79.223288,0,1570479.625,0,2366.717680,3157.732947,512.577893,552.656000,1172.811970,1141.171359,23368.700372,3041.717375,3198.865741,49984.782089,49158.962150,17229.367210,17536.281134,4728.161925,4206.091848,764.648092,640.195023,16674.601836,16115.617714,1304.647848,1335.233771,4119.607512,4195.544978,7287.887330,7510.426292,5413.708490,5620.427146,3802.146719,1700.155481,2524.920733,563.202870,591.679420,115548.346197,110692.567142,30553.228374,27892.253015,54122.319281,61104.347374,63260.127649,63511.143161,543.163817,640.195023,3323.318810,3096.561100,2498.553558,2377.264550,12633.041163,3755.740489,4386.443329,3620.740551,3068.084550,2433.162962,2104.100611,11164.916827,10359.135942,3451.990627,2741.131573,3804.256093,3905.506047,4038.396612,3980.388825,2408.905161,2620.897253,1344.725955,1242.421313,3342.303176,2947.850230,8735.972613,10286.362537,1908.983512,2000.741283,5694.255238,4487.693284,12286.049133,15746.477256,9619.800338,9030.230292,2045.038138,2290.780214,3946.638841,4054.216917,1891.053833,1156.991664,16048.117744,14235.110752,4362.185528,5427.419421,4695.466627,4353.748032,653.905954,1110.585435,1854.139787,2276.014596,6314.411208,5783.903635,13022.220675,12608.783362,3445.662505,2907.772123,5668.942750,4525.662016,2641.990993,2326.639573,1201.288519,1131.679176,3535.310902,3904.451360,10796.831056,11263.002720,3952.966963,4306.287116,1615.780519,1374.257191,1595.741466,1631.600825,8051.480735,12965.267576,3138.748581,3280.076642,1138.007298,1420.663420,11015.151270,12643.588034,1885.780397,1861.522596,932.343328,1025.155787,11981.244583,11908.471179,4343.201161,5510.739696,7549.449712,5636.247452,2599.803512,2494.334810,7430.270078,6746.832887,7983.980765,7081.168674,10475.151514,11083.705926,3069.139237,2872.967451,1051.522962,1274.061924,0,0,0,0,1,0,1,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1,0,0,2,0,0,0,0,1,0,0,0,0
3,002_S_0619,77.447945,0,1859348.250,2,5124.734093,2981.605944,280.547287,356.484899,876.446600,908.087272,21112.765397,2883.519862,2848.715123,56650.512928,55319.495347,13810.098417,14307.911649,3485.747309,3231.567248,1810.901098,1584.142953,47125.616120,60669.932896,1393.244235,1401.681747,3791.607134,3956.138626,7059.033810,7534.698571,5230.202998,5155.320075,4848.405562,2089.339007,3279.028256,557.930507,343.828630,105178.865483,105828.553938,30496.333873,30318.091423,52001.443596,58340.124785,64032.281587,63620.952857,546.328928,608.555582,3098.676428,2852.933879,2209.573559,2206.409492,12681.581134,2698.949278,4427.584631,3850.669721,3989.888675,2088.284318,2110.432788,10630.210932,9738.998685,2335.081556,3525.825493,3081.801403,3971.958961,4032.076237,5876.727385,1759.221335,1517.697543,1132.736039,1098.985990,2337.190934,1737.072865,9832.866011,8968.020990,2304.495574,2317.151842,8209.699563,8048.332138,12752.245300,13784.785880,9587.123462,11270.407185,3097.621739,2240.159542,4577.350476,4796.725798,1399.572369,1617.893002,17917.057577,16677.797943,6813.291261,7985.050795,5135.280983,4916.960350,831.094971,594.844624,2534.417786,2599.808508,5705.867759,4306.295390,11084.781913,13730.996738,5032.976145,4884.264989,4352.701708,5000.280785,2023.948286,2122.034368,1509.260030,1475.509981,5866.180495,5736.453741,11578.376388,11748.181325,3241.059450,2811.801007,1759.221335,2247.542365,2254.925188,2166.331308,9747.436198,10617.554664,2975.277809,3174.614040,1750.783823,1521.916299,12143.689718,12557.127826,1405.900503,1814.065166,1921.643449,1717.033773,12618.299791,13427.246292,3545.864585,3870.708812,7061.143188,8703.294038,3338.090842,2569.222525,10113.413298,11186.032061,7032.666584,6352.392146,7272.080998,7162.393337,2489.066157,2506.995871,1155.939198,1574.650751,0,1,1,1,0,0,0,2,1,1,0,0,0,0,1,1,1,1,1,0,1,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,2
5,002_S_0729,65.056164,1,1166961.750,1,966.095170,1921.643449,356.484899,415.547486,761.485494,853.243441,18195.495486,3485.747309,2921.488668,44029.049070,42879.438005,11626.892084,12026.619234,2887.738618,2592.425684,392.344326,283.711354,8403.762348,9388.841921,1287.775330,1255.079969,3543.755207,3659.771003,6422.001624,6685.673886,3798.989957,4033.130926,3400.317496,1551.447592,2534.417786,268.945708,158.203357,87173.214027,83566.177477,21149.679514,18514.011579,40216.348155,41546.311047,49603.080697,48494.602506,435.586578,433.477199,2722.152437,2497.503670,2012.346707,1917.424692,9808.608162,2805.472872,3514.223914,3042.777908,3393.989362,1351.056673,769.923006,9316.068376,7022.119693,2155.784418,2744.300907,2465.862998,2690.511766,3676.646027,3777.896176,1416.447394,1371.095765,1420.666150,1110.587569,1976.487279,1818.283922,7236.221570,7037.940029,1790.862006,1707.541571,5660.516130,4246.178114,10451.968483,10478.335709,6965.166484,7380.713970,1843.596459,1515.588164,3769.458664,3579.614635,1319.416001,1700.158748,15817.171679,13515.840172,4775.632017,4807.272689,3063.871689,3464.653528,1248.751835,534.727348,2865.590148,2599.808508,5004.499541,3889.693215,10419.273122,10018.491283,2405.745722,1425.939595,4143.873276,4313.678213,2103.049965,2000.745127,988.243640,1255.079969,2139.964082,3043.832597,8480.754649,6326.024920,2684.183632,2847.660434,1367.931697,1286.720641,1517.697543,1421.720839,6882.900738,9298.138662,2307.659641,2541.800610,1081.056276,1029.376513,8876.263042,9259.115167,980.860816,1336.291026,1059.962495,1116.915704,9093.528987,11257.750917,4128.052941,4293.639121,6690.947331,7654.933123,2551.292811,2318.206531,7825.792749,7452.432825,5885.164897,6838.603798,6198.407545,5983.250979,2492.230224,2634.613246,907.032583,965.040480,0,0,0,0,1,1,0,1,0,0,2,0,1,0,2,1,2,1,1,0,1,1,0,0,2,2,1,1,0,1,1,1,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,941_S_4377,69.187671,1,1213846.750,1,881.999947,1006.799940,291.599983,368.399978,848.399949,842.399950,16942.798990,2813.999832,2732.399837,38367.597713,37732.797751,10575.599370,10729.199360,3331.199801,3163.199811,277.199983,122.399993,10655.999365,10243.199389,1113.599934,1213.199928,3841.199771,3997.199762,6476.399614,6640.799604,3844.799771,3826.799772,3287.999804,2023.199879,2435.999855,275.999984,219.599987,70166.395818,70043.995825,18130.798919,17141.998978,34400.397950,34846.797923,39651.597637,39454.797648,511.199970,561.599967,2305.199863,2029.199879,1618.799904,1685.999900,9392.399440,2234.399867,2710.799838,3969.599763,3922.799766,1229.999927,1067.999936,4531.199730,4151.999753,3071.999817,3301.199803,2642.399843,2969.999823,3874.799769,3181.199810,1715.999898,1789.199893,1654.799901,1523.999909,2671.199841,2146.799872,6149.999633,5678.399662,1540.799908,1766.399895,6223.199629,5893.199649,9580.799429,9399.599440,6956.399585,6032.399640,1623.599903,1963.199883,3746.399777,3551.999788,673.199960,1451.999913,14074.799161,13335.599205,3883.199769,3945.599765,3530.399790,3260.399806,664.799960,518.399969,2203.199869,2095.199875,6124.799635,4739.999717,11431.199319,11432.399319,2571.599847,2919.599826,3221.999808,2474.399853,3680.399781,2359.199859,1059.599937,1625.999903,3466.799793,3680.399781,7255.199568,7193.999571,3002.399821,2816.399832,1857.599889,1628.399903,2129.999873,1987.199882,7388.399560,9505.199433,2341.199860,2143.199872,1143.599932,1201.199928,9118.799456,10125.599396,1430.399915,2164.799871,1171.199930,1341.599920,11763.599299,14017.199165,3299.999803,4031.999760,6173.999632,6542.399610,3784.799774,3429.599796,7781.999536,6890.399589,5427.599676,5349.599681,6283.199625,6148.799634,2323.199862,2611.199844,1282.799924,1429.199915,1,0,0,0,1,1,2,0,2,0,1,1,1,1,0,0,0,0,0,0,1,2,0,0,1,1,1,1,0,2,0,0,0,2,2,2,0,0,0,0,1,2,1,0,2,0,1,1,0,1,0,0,0,1
1563,941_S_4420,81.383562,0,1536545.875,1,1079.999936,1298.399923,431.999974,445.199973,1088.399935,985.199941,19977.598809,3214.799808,3257.999806,47859.597147,48692.397098,12937.199229,13166.399215,4255.199746,3687.599780,751.199955,795.599953,19527.598836,24886.798517,1491.599911,1505.999910,4939.199706,5123.999695,7327.199563,7389.599560,4948.799705,4453.199735,3733.199777,1881.599888,2569.199847,214.799987,379.199977,104933.993745,101272.793964,24487.198540,25827.598461,48183.597128,51826.796911,62361.596283,61167.596354,466.799972,519.599969,3023.999820,2663.999841,2312.399862,2177.999870,11644.799306,4216.799749,3838.799771,4249.199747,4172.399751,1487.999911,1598.399905,8042.399521,9107.999457,2579.999846,2102.399875,3128.399814,3187.199810,5668.799662,5121.599695,2408.399856,2120.399874,1171.199930,1093.199935,3076.799817,2522.399850,8249.999508,6717.599600,1777.199894,1523.999909,7329.599563,7115.999576,13821.599176,13209.599213,8264.399507,7203.599571,2499.599851,1689.599899,4246.799747,4184.399751,1691.999899,1109.999934,17227.198973,17866.798935,4737.599718,5257.199687,3725.999778,3925.199766,743.999956,706.799958,2959.199824,2786.399834,4959.599704,5493.599673,11245.199330,13256.399210,3637.199783,4102.799755,3964.799764,2979.599822,2301.599863,1179.599930,1507.199910,1688.399899,3853.199770,3777.599775,11365.199323,10028.399402,4081.199757,3471.599793,1966.799883,1672.799900,2194.799869,2072.399876,6458.399615,8288.399506,3418.799796,3122.399814,1657.199901,1630.799903,12458.399257,13774.799179,1852.799890,2234.399867,1373.999918,1565.999907,11845.199294,13184.399214,3931.199766,4689.599720,6287.999625,5626.799665,3124.799814,2771.999835,8386.799500,8369.999501,6985.199584,7204.799571,9337.199443,7413.599558,3394.799798,2974.799823,1454.399913,1228.799927,1,1,1,1,0,0,1,1,0,1,0,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,1,1,2,0,0,0,0,2,2,2,0,0,2,2,2,1,1,2,2,0,0,0,0,1,0,0,0,0
1564,941_S_4764,82.672603,1,1438682.375,1,2911.199826,2806.799833,203.999988,257.999985,872.399948,879.599948,20135.998800,2771.999835,2692.799839,49766.397034,49005.597079,14546.399133,14725.199122,3502.799791,3417.599796,1269.599924,898.799946,34168.797963,24159.598560,1229.999927,1301.999922,3470.399793,3406.799797,6903.599589,7033.199581,4495.199732,4612.799725,3977.999763,2102.399875,2887.199828,340.799980,275.999984,77746.795366,74926.795534,21007.198748,18071.998923,36950.397798,39129.597668,46949.997202,45447.597291,604.799964,455.999973,2845.199830,2633.999843,1772.399894,1849.199890,8560.799490,2911.199826,2881.199828,3688.799780,3614.399785,1567.199907,973.199942,7833.599533,5200.799690,2761.199835,2303.999863,2530.799849,2887.199828,3716.399778,3257.999806,2343.599860,2126.399873,1429.199915,1045.199938,2788.799834,2362.799859,7852.799532,7195.199571,1484.399912,1898.399887,6519.599611,4475.999733,10491.599375,9629.999426,8755.199478,7741.199539,1605.599904,1519.199909,3568.799787,3154.799812,1018.799939,1431.599915,14619.599129,14492.399136,4258.799746,4612.799725,2867.999829,3763.199776,862.799949,463.199972,2491.199852,1895.999887,5236.799688,4847.999711,10013.999403,11163.599335,3194.399810,2228.399867,3721.199778,3015.599820,1940.399884,1718.399898,970.799942,1102.799934,2851.199830,3519.599790,8074.799519,7475.999554,3034.799819,3064.799817,1535.999908,1312.799922,1397.999917,1745.999896,6812.399594,9147.599455,2774.399835,2467.199853,1150.799931,1268.399924,10103.999398,10629.599366,1148.399932,1419.599915,1233.599926,1661.999901,11665.199305,10906.799350,3868.799769,4814.399713,5631.599664,6375.599620,3123.599814,3779.999775,8471.999495,6921.599587,5782.799655,6337.199622,7234.799569,7052.399580,1897.199887,1976.399882,826.799951,1083.599935,0,0,0,0,2,2,0,2,0,1,1,2,2,2,1,1,1,0,0,0,0,1,2,2,1,1,0,0,1,0,2,2,2,1,1,1,1,1,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0
1565,941_S_5124,76.664384,1,1353772.750,0,1496.399911,1142.399932,226.799986,238.799986,814.799951,986.399941,17121.598979,2571.599847,2613.599844,40799.997568,37919.997740,11252.399329,11056.799341,2611.199844,2401.199857,1173.599930,903.599946,39554.397642,49455.597052,1089.599935,1198.799929,3371.999799,3441.599795,6752.399598,7288.799566,3669.599781,4016.399761,4837.199712,2242.799866,2045.999878,368.399978,356.399979,72938.395653,72790.795661,16736.399002,19359.598846,37213.197782,37405.197770,52543.196868,48931.197083,151.199991,237.599986,2701.199839,2535.599849,1742.399896,1747.199896,9619.199427,3761.999776,4827.599712,3044.399819,3215.999808,1461.599913,1097.999935,5925.599647,8337.599503,3215.999808,2239.199867,2836.799831,2810.399832,5439.599676,4383.599739,2066.399877,2001.599881,1113.599934,1605.599904,3344.399801,2876.399829,5933.999646,6645.599604,1389.599917,1534.799909,7534.799551,6320.399623,11071.199340,9958.799406,7286.399566,8525.999492,2161.199871,2290.799863,4663.199722,4142.399753,1066.799936,1808.399892,16085.999041,15931.199050,2968.799823,5188.799691,3347.999800,3628.799784,843.599950,481.199971,1402.799916,1694.399899,5456.399675,3436.799795,15909.599052,13811.999177,3209.999809,4238.399747,3220.799808,3424.799796,2506.799851,2048.399878,1117.199933,1532.399909,3375.599799,3566.399787,8815.199475,7204.799571,3658.799782,3085.199816,1631.999903,1789.199893,1604.399904,1658.399901,8210.399511,9099.599458,2953.199824,2977.199823,1442.399914,1610.399904,9107.999457,10970.399346,1537.199908,1352.399919,1473.599912,1432.799915,10263.599388,11079.599340,3147.599812,3183.599810,7949.999526,7853.999532,3065.999817,4151.999753,9046.799461,7985.999524,5089.199697,5537.999670,6945.599586,7079.999578,2033.999879,1708.799898,739.199956,791.999953,1,0,0,0,1,1,0,1,0,0,2,1,1,1,0,0,0,0,1,1,2,2,2,2,1,1,1,1,0,1,0,0,0,1,1,1,0,0,1,1,1,2,2,1,1,2,0,0,0,0,1,1,1,0


10-fold stratified cross validation

In [26]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier


# Splitting dataset into train and test
X = data_age_filtered.drop(['diagnosis', 'PTID'], axis=1)
y = data_age_filtered['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the cross-validator
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


# Define the parameter grid for manual hyperparameter tuning
# param_grid = {
#     'n_neighbors': [3, 5, 7, 10, 15],  # Example: trying different numbers of neighbors
#     'weights': ['uniform', 'distance'],  # Example: uniform weights or distance-based
#     'metric': ['euclidean', 'manhattan']  # Example: different distance metrics
# }
# Define the KNN classifier
knn_model = KNeighborsClassifier(n_neighbors=30)

i=0

# Define a function for visualization
def visualize_data(X, y, title):
    # Separate the data based on diagnosis
    data_age_cn = X[y == 0]['Age']
    data_vol_cn = X[y == 0]['MUSE_Volume_48']

    data_age_mci = X[y == 1]['Age']
    data_vol_mci = X[y == 1]['MUSE_Volume_48']

    data_age_dem = X[y == 2]['Age']
    data_vol_dem = X[y == 2]['MUSE_Volume_48']

    # Scatter plot
    plt.scatter(data_age_cn, data_vol_cn, s=10, c='blue')
    plt.scatter(data_age_mci, data_vol_mci, s=10, c='green')
    plt.scatter(data_age_dem, data_vol_dem, s=10, c='red')

    plt.xlabel("Age (years)")
    plt.ylabel("Volume (mm^3)")
    plt.legend(["CN", "MCI", "DEM"])
    plt.title(title)
    plt.show()

# Iterate over each fold
for train_index, val_index in skf.split(X_train, y_train):
    # Split the data into the current fold's training and validation partitions
    fold_X_train, fold_X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    fold_y_train, fold_y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # Visualize initial data
    # visualize_data(fold_X_train, fold_y_train, "Initial Data - Fold")

    # Linear correction
    data_controls_train = fold_X_train[fold_y_train == 0]  # Control group for the fold
    for roi_feature in roi_features:
        if roi_feature in fold_X_train.columns:
            regr = LinearRegression()
            regr.fit(data_controls_train[['Sex', 'Age', 'DLICV_baseline']], data_controls_train[roi_feature])
            # Apply correction to the training set
            correction_train = regr.predict(fold_X_train[['Sex', 'Age', 'DLICV_baseline']])
            fold_X_train[roi_feature] -= correction_train

            # Apply the same correction to the validation set
            correction_val = regr.predict(fold_X_val[['Sex', 'Age', 'DLICV_baseline']])
            fold_X_val[roi_feature] -= correction_val
    # visualize_data(fold_X_train, fold_y_train, "Linearly Corrected Data - Fold")

    # Z-normalization using control group in fold_X_train
    scaler = MinMaxScaler().fit(data_controls_train[roi_features])
    fold_X_train[roi_features] = scaler.transform(fold_X_train[roi_features])
    fold_X_val[roi_features] = scaler.transform(fold_X_val[roi_features])
    # visualize_data(fold_X_train, fold_y_train, "Z-Normalized Data - Fold")

    fold_X_train = fold_X_train.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
    fold_X_val = fold_X_val.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

    # Train the KNN model
    knn_model.fit(fold_X_train, fold_y_train)

    # Make predictions on the validation set
    y_pred_val = knn_model.predict(fold_X_val)

    # Calculate metrics (weighted because of dataset's imbalance)
    accuracy = knn_model.score(fold_X_val, fold_y_val)
    precision = precision_score(fold_y_val, y_pred_val, average='weighted')
    recall = recall_score(fold_y_val, y_pred_val, average='weighted')
    f1 = f1_score(fold_y_val, y_pred_val, average='weighted')
    auc = roc_auc_score(fold_y_val, knn_model.predict_proba(fold_X_val), multi_class='ovr', average='weighted')

    # Print the validation metrics
    print(f"Validation Metrics, Fold {i}: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, AUC: {auc}")
    i+=1

Validation Metrics, Fold 0: Accuracy: 0.5384615384615384, Precision: 0.5253592561284869, Recall: 0.5384615384615384, F1 Score: 0.47830073755786395, AUC: 0.6326356087869247
Validation Metrics, Fold 1: Accuracy: 0.5299145299145299, Precision: 0.4316897325636402, Recall: 0.5299145299145299, F1 Score: 0.4655057242197316, AUC: 0.5345316334404909
Validation Metrics, Fold 2: Accuracy: 0.5384615384615384, Precision: 0.45158371040723977, Recall: 0.5384615384615384, F1 Score: 0.44686771029482025, AUC: 0.6274648221781379
Validation Metrics, Fold 3: Accuracy: 0.5641025641025641, Precision: 0.6403954468470597, Recall: 0.5641025641025641, F1 Score: 0.5130769230769231, AUC: 0.6394112539538366
Validation Metrics, Fold 4: Accuracy: 0.41025641025641024, Precision: 0.32136382136382136, Recall: 0.41025641025641024, F1 Score: 0.3571916749174274, AUC: 0.5112972815836296
Validation Metrics, Fold 5: Accuracy: 0.4700854700854701, Precision: 0.3547008547008547, Recall: 0.4700854700854701, F1 Score: 0.3878977841

Training on the whole train dataset and fitting on both the train and the test data

In [27]:
from sklearn.metrics import balanced_accuracy_score

# Define the control group from the training set
data_controls_train = X_train[y_train == 0]

# Linear Correction
for roi_feature in roi_features:
    if roi_feature in X_train.columns:
        regr = LinearRegression()
        regr.fit(data_controls_train[['Sex', 'Age', 'DLICV_baseline']], data_controls_train[roi_feature])
        # Apply correction to the training set
        correction_train = regr.predict(X_train[['Sex', 'Age', 'DLICV_baseline']])
        X_train[roi_feature] -= correction_train

        # Apply the same correction to the test set
        correction_test = regr.predict(X_test[['Sex', 'Age', 'DLICV_baseline']])
        X_test[roi_feature] -= correction_test

X_train = X_train.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
X_test = X_test.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

# Z-Normalization
scaler = MinMaxScaler().fit(data_controls_train[roi_features])
X_train[roi_features] = scaler.transform(X_train[roi_features])
X_test[roi_features] = scaler.transform(X_test[roi_features])

# After cross-validation, train the model on the entire training set
knn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_test = knn_model.predict(X_test)

# Calculate and print standard accuracy for the test set
test_accuracy = knn_model.score(X_test, y_test)
print(f"Test set accuracy: {test_accuracy}")

# Calculate and print balanced accuracy for the test set
test_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_test)
print(f"Test set Balanced Accuracy: {test_balanced_accuracy}")

# Calculate and print metrics for the test set
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1 = f1_score(y_test, y_pred_test, average='weighted')
test_auc = roc_auc_score(y_test, knn_model.predict_proba(X_test), multi_class='ovr', average='weighted')

print(f"Test set Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}, AUC: {test_auc}")


Test set accuracy: 0.5085324232081911
Test set Balanced Accuracy: 0.38910273910273907
Test set Precision: 0.5115855976355176, Recall: 0.5085324232081911, F1 Score: 0.4528850214848092, AUC: 0.6309088823242608


In [28]:
X_train

Unnamed: 0,MUSE_Volume_4,MUSE_Volume_11,MUSE_Volume_23,MUSE_Volume_30,MUSE_Volume_31,MUSE_Volume_32,MUSE_Volume_35,MUSE_Volume_36,MUSE_Volume_37,MUSE_Volume_38,MUSE_Volume_39,MUSE_Volume_40,MUSE_Volume_41,MUSE_Volume_47,MUSE_Volume_48,MUSE_Volume_49,MUSE_Volume_50,MUSE_Volume_51,MUSE_Volume_52,MUSE_Volume_55,MUSE_Volume_56,MUSE_Volume_57,MUSE_Volume_58,MUSE_Volume_59,MUSE_Volume_60,MUSE_Volume_61,MUSE_Volume_62,MUSE_Volume_71,MUSE_Volume_72,MUSE_Volume_73,MUSE_Volume_75,MUSE_Volume_76,MUSE_Volume_81,MUSE_Volume_82,MUSE_Volume_83,MUSE_Volume_84,MUSE_Volume_85,MUSE_Volume_86,MUSE_Volume_87,MUSE_Volume_88,MUSE_Volume_89,MUSE_Volume_90,MUSE_Volume_91,MUSE_Volume_92,MUSE_Volume_93,MUSE_Volume_94,MUSE_Volume_95,MUSE_Volume_100,MUSE_Volume_101,MUSE_Volume_102,MUSE_Volume_103,MUSE_Volume_104,MUSE_Volume_105,MUSE_Volume_106,MUSE_Volume_107,MUSE_Volume_108,MUSE_Volume_109,MUSE_Volume_112,MUSE_Volume_113,MUSE_Volume_114,MUSE_Volume_115,MUSE_Volume_116,MUSE_Volume_117,MUSE_Volume_118,MUSE_Volume_119,MUSE_Volume_120,MUSE_Volume_121,MUSE_Volume_122,MUSE_Volume_123,MUSE_Volume_124,MUSE_Volume_125,MUSE_Volume_128,MUSE_Volume_129,MUSE_Volume_132,MUSE_Volume_133,MUSE_Volume_134,MUSE_Volume_135,MUSE_Volume_136,MUSE_Volume_137,MUSE_Volume_138,MUSE_Volume_139,MUSE_Volume_140,MUSE_Volume_141,MUSE_Volume_142,MUSE_Volume_143,MUSE_Volume_144,MUSE_Volume_145,MUSE_Volume_146,MUSE_Volume_147,MUSE_Volume_148,MUSE_Volume_149,MUSE_Volume_150,MUSE_Volume_151,MUSE_Volume_152,MUSE_Volume_153,MUSE_Volume_154,MUSE_Volume_155,MUSE_Volume_156,MUSE_Volume_157,MUSE_Volume_160,MUSE_Volume_161,MUSE_Volume_162,MUSE_Volume_163,MUSE_Volume_164,MUSE_Volume_165,MUSE_Volume_166,MUSE_Volume_167,MUSE_Volume_168,MUSE_Volume_169,MUSE_Volume_170,MUSE_Volume_171,MUSE_Volume_172,MUSE_Volume_173,MUSE_Volume_174,MUSE_Volume_175,MUSE_Volume_176,MUSE_Volume_177,MUSE_Volume_178,MUSE_Volume_179,MUSE_Volume_180,MUSE_Volume_181,MUSE_Volume_182,MUSE_Volume_183,MUSE_Volume_184,MUSE_Volume_185,MUSE_Volume_186,MUSE_Volume_187,MUSE_Volume_190,MUSE_Volume_191,MUSE_Volume_192,MUSE_Volume_193,MUSE_Volume_194,MUSE_Volume_195,MUSE_Volume_196,MUSE_Volume_197,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207,rs4575098,rs6656401,rs2093760,rs4844610,rs4663105,rs6733839,rs10933431,rs35349669,rs6448453,rs190982,rs9271058,rs9473117,rs9381563,rs10948363,rs2718058,rs4723711,rs1859788,rs1476679,rs12539172,rs10808026,rs7810606,rs11771145,rs28834970,rs73223431,rs4236673,rs9331896,rs11257238,rs7920721,rs3740688,rs10838725,rs983392,rs7933202,rs2081545,rs867611,rs10792832,rs3851179,rs17125924,rs17125944,rs10498633,rs12881735,rs12590654,rs442495,rs59735493,rs113260531,rs28394864,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358
1230,0.137941,-0.195808,-0.613759,-0.633286,-0.796038,-1.205358,-1.180002,-0.330287,-0.698389,-1.236774,-1.420976,-1.044045,-1.175569,-1.123318,-1.458726,0.325858,0.225101,-0.042512,0.025778,-1.025236,-1.208970,-0.620806,-0.738945,-1.481101,-1.202700,-1.301405,-1.383525,-0.844676,-0.811316,-1.266096,0.023435,0.123870,-0.936388,-1.016048,-0.805140,-0.732615,-0.888166,-0.674637,-0.865895,-0.882391,-0.223742,-0.492368,-1.097672,-1.095278,-0.654459,-0.869222,-0.893495,-0.456229,-0.647782,-0.884255,-1.268541,-0.407646,-0.636675,-0.392848,-0.263001,-0.347972,-0.340212,-0.554296,-0.654066,-0.541860,-0.598043,-0.476227,-0.662651,-0.809929,-0.513947,-0.346344,-0.051247,-0.750558,-1.069520,-0.232653,-0.314718,-0.811352,-0.801978,-0.853823,-0.877184,-0.770901,-0.614088,-0.007459,-0.152658,-0.596291,-0.826728,-0.276720,-0.334389,-0.786105,-0.685731,-0.181801,-0.232862,-0.574635,-0.849875,0.109704,-0.028420,-0.303942,-0.370400,-0.450481,-0.675179,-0.946408,-0.840091,-0.179985,-0.251656,-0.539619,-0.804556,-0.592686,-0.556713,0.111613,0.208835,-0.793858,-1.119812,-1.018955,-0.866141,-0.679737,-0.904641,-0.670278,-0.867298,-0.636695,-0.595237,-0.546702,-0.797692,-0.879306,-0.713993,-0.810611,-0.679401,-0.642413,-0.849574,-0.626558,-0.824554,-0.688630,-0.644318,-0.771726,-0.658348,-0.610297,-0.444373,-0.767045,-0.724745,-0.472770,-0.549345,-0.911505,-0.816018,-0.402815,-0.638190,-0.449331,-0.539745,-0.591007,-0.493150,-0.734923,-0.550304,1,0,0,0,0,0,1,1,1,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,1,0,0,2,0,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1
707,0.001459,-0.124386,-0.742582,-0.496298,-0.731694,-1.094506,-1.287583,-0.538710,-0.721528,-1.447740,-1.551025,-1.166832,-1.270349,-1.655513,-1.719959,0.324804,0.191469,-0.107484,-0.151007,-0.852242,-0.954965,-0.910552,-0.739751,-1.654023,-1.279033,-1.580371,-1.511829,-0.767573,-0.834257,-1.109042,0.198063,0.333839,-0.914801,-0.973088,-0.972867,-0.928296,-0.784365,-0.604937,-1.029665,-0.975821,-0.617290,-0.854334,-1.200153,-1.094286,-0.911706,-0.941347,-1.159967,-0.441827,-0.541294,-0.684537,-0.917644,-0.835875,-0.669361,-0.655393,-0.700200,-0.390777,-0.567153,-0.553312,-0.716929,-0.724745,-0.804365,-0.419756,-0.722309,-0.420888,-0.231325,-0.467963,-0.312976,-0.895273,-1.097382,-0.441605,-0.273509,-1.051015,-0.643163,-0.936975,-1.061462,-0.811218,-0.822357,-0.308066,-0.277372,-0.907858,-0.641102,-0.495419,-0.048438,-0.766268,-0.705627,-0.579470,-0.847468,-0.701394,-0.780379,-0.013356,-0.346329,-0.540054,-0.534879,-0.398727,-0.639822,-0.855767,-0.879026,-0.507542,-0.503589,-0.369046,-0.849094,-0.129210,-0.394303,-0.025579,0.101223,-0.706542,-0.873669,-0.825200,-0.794486,-1.101163,-1.028678,-0.433751,-0.645560,-0.574428,-0.429178,-0.657838,-0.694993,-0.506899,-0.443668,-0.462225,-0.710026,-0.858466,-1.103342,-0.697956,-0.394365,-0.382295,-0.525270,-1.144205,-0.961701,-0.591205,-0.680462,-0.675867,-0.533110,-0.465692,-0.604109,-0.883483,-0.747142,-0.853208,-0.597790,-0.521713,-0.559686,-0.803128,-0.531813,-0.469053,-0.488534,1,0,0,0,1,1,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,2,1,1,1,2,2,2,0,0,1,1,1,0,0,0,2,0,0,0,0,0,0,0,0,0
776,-0.096425,-0.130586,-0.324484,-0.456285,-0.582846,-0.793494,-1.164804,-0.580054,-0.955826,-1.239037,-1.406009,-1.004363,-1.078197,-1.068449,-1.150641,-0.036170,-0.059258,-0.254188,-0.245977,-0.816836,-0.841700,-0.842323,-0.837876,-1.456823,-1.185598,-1.397866,-1.381644,-0.593566,-0.888330,-0.648524,-0.127509,-0.201212,-0.863218,-0.875625,-0.908304,-0.645382,-0.706381,-0.589335,-0.788344,-0.767633,-0.331220,-0.491752,-1.136278,-1.000308,-0.907787,-0.984138,-0.618499,-0.296002,-0.912375,-1.029428,-1.378167,-0.576468,-0.732103,-0.570355,-0.357706,-0.362504,-0.374950,-0.666956,-0.416696,-0.524859,-0.506372,-0.409237,-0.393777,-0.520409,-0.319114,-0.643896,-0.399601,-0.809139,-1.136430,-0.566952,-0.443428,-0.892172,-0.559944,-0.823921,-0.945561,-0.851112,-0.876204,-0.345482,-0.465505,-0.613578,-0.802193,-0.626869,-0.345172,-0.932244,-0.617247,-0.710868,-0.442432,-0.840491,-0.974260,-0.136950,-0.189874,-0.371127,-0.275104,-0.708059,-0.988375,-0.724013,-0.858819,-0.316616,-0.374102,-0.266757,-0.787222,-0.423855,-0.454446,-0.171274,-0.226987,-0.485446,-0.700783,-0.683006,-0.910455,-0.897209,-1.017190,-0.634772,-0.671881,-0.562770,-0.531723,-0.553145,-0.658383,-0.932998,-0.983097,-0.669370,-0.681059,-1.020044,-0.764612,-0.669036,-0.586949,-0.557904,-0.482955,-1.054041,-0.930709,-0.510942,-0.655885,-0.522996,-0.600777,-0.387759,-0.554464,-0.999468,-0.996330,-0.880897,-0.524240,-0.396484,-0.503632,-0.577095,-0.652312,-0.565671,-0.286804,0,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,1,1,1,1,1,2,1,1,2,2,1,1,2,1,1,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1
556,-0.277082,-0.381370,-0.611449,-0.442314,-0.559224,-0.712323,-1.222449,-0.528569,-0.845774,-1.433455,-1.483083,-1.203863,-1.236980,-1.082334,-1.161197,-0.001517,-0.100097,-0.284588,-0.283946,-0.947432,-1.043064,-0.985082,-0.859191,-1.480323,-1.247154,-1.443085,-1.413199,-0.794265,-0.724570,-1.009369,0.094393,0.080090,-0.860212,-0.875104,-0.901054,-0.706429,-0.789711,-0.613442,-0.860348,-0.765221,-0.275056,-0.464891,-1.070217,-0.889442,-0.886353,-0.949481,-0.615809,-0.528527,-0.902785,-0.694233,-0.942927,-0.310752,-0.533395,-0.232796,-0.076930,-0.223631,-0.319706,-0.524827,-0.507381,-0.556418,-0.476218,-0.479527,-0.553791,-0.374734,-0.356926,-0.411271,-0.252215,-0.763132,-0.982276,-0.366836,-0.404809,-0.650755,-0.554641,-0.967418,-0.718642,-0.370994,-0.559609,-0.088342,-0.175906,-0.639657,-0.664147,-0.277413,-0.382825,-0.712324,-0.537243,-0.352361,-0.715145,-0.735207,-0.879803,-0.129465,-0.283878,-0.427336,-0.408012,-0.454989,-0.658550,-0.770046,-0.705697,-0.076981,-0.251589,-0.435326,-0.890902,-0.386539,0.198781,0.062477,-0.041550,-0.470721,-0.724655,-0.533419,-0.563499,-0.674317,-0.657056,-0.347898,-0.292615,-0.135596,-0.294028,-0.726713,-0.608299,-0.697911,-0.707764,-0.815164,-0.769033,-0.891771,-0.815088,-0.001584,0.022879,-0.636655,-0.772355,-0.918011,-0.706972,-0.686229,-0.749219,-0.795784,-0.501583,-0.480395,-0.323864,-0.826742,-0.818553,-0.324721,-0.479780,-0.482795,-0.477709,-0.416209,-0.573957,-0.265847,-0.138787,1,0,0,0,0,0,0,2,0,0,0,1,1,1,1,1,1,1,1,0,0,0,1,1,0,0,2,2,0,1,0,0,0,0,0,0,0,0,1,1,1,2,2,0,0,0,0,0,0,1,1,1,1,1
910,-0.190552,-0.197355,-0.815742,-0.681914,-0.508880,-0.796188,-1.291474,-0.643473,-0.738167,-1.183863,-1.314423,-1.002059,-1.155113,-0.800060,-1.034633,-0.101346,-0.093838,-0.236322,-0.227349,-1.023383,-1.084155,-0.921016,-0.860598,-1.508374,-1.207716,-1.535611,-1.528243,-0.865218,-0.484594,-0.825214,-0.045882,0.017449,-0.968781,-1.056859,-1.019055,-0.884071,-0.967110,-0.740804,-1.043361,-1.001688,-0.349086,-0.500316,-1.242785,-1.172915,-1.026397,-1.045867,-0.851076,-0.415041,-0.685809,-0.695557,-1.086745,-0.262130,-0.077836,-0.281177,-0.414741,-0.329520,-0.286896,-0.479495,-0.475962,-0.435143,-0.610342,-0.316908,-0.453994,-0.251750,-0.428073,-0.667365,-0.323160,-0.736137,-1.005859,-0.520344,-0.353956,-0.827288,-0.625291,-0.885116,-0.978793,-0.540973,-0.816099,-0.191311,-0.355673,-0.708374,-0.935745,-0.363240,-0.093542,-0.739571,-0.624159,-0.344587,-0.435397,-0.487214,-0.606523,0.009554,-0.299515,-0.336638,-0.490112,-0.566660,-0.790368,-0.844711,-0.716265,-0.224194,-0.489724,-0.518422,-0.260317,-0.147429,-0.289015,0.171777,0.057712,-0.487539,-0.564843,-0.813912,-0.946999,-0.615893,-0.777854,-0.393047,-0.422420,-0.184007,-0.178340,-0.581152,-0.635858,-0.502686,-0.715688,-0.324565,-0.545398,-0.802625,-0.875793,-0.249431,-0.075079,-0.462805,-0.534422,-0.918071,-0.607537,-0.667617,-0.759068,-0.615704,-0.443792,-0.219372,-0.230471,-0.730693,-0.658493,-0.440731,-0.373103,-0.419993,-0.486473,-0.414934,-0.582011,-0.056935,-0.263283,1,1,1,1,2,2,0,1,1,0,1,0,1,0,1,1,1,1,1,1,2,2,1,1,1,1,1,1,0,1,2,2,2,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,0.233381,-0.090092,-0.844276,-0.778787,-0.656480,-0.814827,-0.970274,-0.635000,-0.908099,-1.247123,-1.368847,-0.975450,-1.095355,-1.270786,-1.418230,0.365439,0.709126,0.118933,0.342200,-0.952939,-0.988945,-1.038057,-0.866293,-1.602998,-1.335232,-1.319065,-1.205246,-1.044360,-0.858265,-0.909026,-0.086753,-0.187558,-0.881765,-0.920668,-0.851513,-0.630512,-0.772790,-0.641816,-0.825264,-0.781935,-0.277184,-0.414649,-1.071531,-0.934277,-0.951460,-1.063531,-0.933765,-0.319848,-0.978021,-1.014885,-1.444543,-0.629945,-0.444162,-0.586030,-0.646782,-0.416103,-0.553382,-0.425906,-0.737095,-0.494149,-0.433790,-0.695012,-0.544827,-0.782605,-0.677227,-0.555305,-0.427005,-0.786240,-1.201330,-0.540783,-0.509674,-0.932791,-0.835158,-1.090340,-1.552741,-0.813045,-0.529066,-0.370435,-0.360565,-0.848232,-0.684796,-0.485669,-0.389780,-0.958152,-0.821642,-0.533341,-0.657130,-0.844581,-1.037718,0.004528,-0.130156,-0.190848,-0.236939,-0.751179,-0.842154,-1.341647,-1.380008,-0.398301,-0.627957,-0.297931,-0.736439,-0.611638,-0.735919,-0.119735,-0.029615,-0.690354,-1.016634,-0.815893,-1.074297,-0.971489,-1.057526,-0.762873,-1.049984,-0.355422,-0.286131,-0.520670,-0.803389,-0.651499,-0.722791,-0.710044,-1.156878,-0.889831,-0.965921,-0.529167,-0.434461,-0.411020,-0.400428,-0.998151,-0.907738,-0.507237,-0.716751,-0.965242,-0.711818,-0.323183,-0.181489,-0.981712,-0.970964,-0.524651,-0.871049,-0.595667,-0.740645,-0.656561,-0.647592,-0.050801,-0.412031,0,1,1,1,1,1,0,1,0,1,1,2,2,2,1,1,0,0,0,1,1,1,0,0,1,1,2,2,2,0,1,0,0,1,1,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0
693,-0.161797,-0.322143,-0.587661,-0.540203,-0.541821,-0.784716,-1.299120,-0.717072,-0.981868,-1.418439,-1.575280,-1.050169,-1.161686,-1.001258,-1.092568,-0.058708,-0.030613,-0.092213,-0.123704,-1.076639,-1.150083,-0.978433,-0.922179,-1.455730,-1.160906,-1.484251,-1.496352,-0.987517,-0.711555,-1.178222,-0.059928,-0.029011,-0.857910,-0.908093,-0.768942,-0.597792,-0.685670,-0.594651,-0.724220,-0.713301,-0.311484,-0.416339,-0.993540,-1.084719,-0.924300,-1.013721,-0.809915,-0.518707,-0.864948,-0.774635,-1.028276,-0.611883,-0.316524,-0.584428,-0.560175,-0.333848,-0.402427,-0.495815,-0.615301,-0.505320,-0.672152,-0.508143,-0.471197,-0.636805,-0.296117,-0.524764,-0.386914,-0.538992,-0.796596,-0.489494,-0.456503,-0.877005,-0.607575,-0.804383,-0.927538,-0.591802,-0.738535,-0.273075,-0.412266,-0.771279,-0.934078,-0.524842,-0.295917,-1.109862,-0.726923,-0.537260,-0.528270,-0.735013,-0.977157,-0.068559,-0.338385,-0.422082,-0.500770,-0.614230,-0.862683,-0.833536,-0.824198,-0.505622,-0.217833,-0.285246,-0.632228,-0.544097,-0.264599,0.050188,-0.139094,-0.653608,-0.636303,-0.621484,-0.831767,-0.778404,-0.931365,-0.513421,-0.599920,-0.269978,-0.284157,-0.572837,-0.845879,-0.641371,-0.687737,-0.660043,-0.585157,-1.010722,-1.118709,-0.392036,-0.238307,-0.442697,-0.452071,-1.005118,-1.005949,-0.655490,-0.738146,-0.780429,-0.541150,-0.205254,-0.605979,-0.778526,-0.726029,-0.424574,-0.419612,-0.452464,-0.544838,-0.584398,-0.328698,-0.106303,-0.092790,2,0,0,0,0,0,1,1,1,0,1,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,1,1,1,1,1,1,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,1
91,0.018301,-0.384421,-0.756423,-0.709344,-0.862190,-1.032222,-1.294672,-0.693335,-1.035539,-1.282053,-1.724014,-1.035563,-1.306274,-1.271089,-1.531640,0.211003,0.196189,-0.070039,0.114132,-0.975729,-1.129767,-0.823347,-0.727806,-1.659970,-1.324611,-1.575458,-1.484174,-0.773504,-0.588247,-1.095392,-0.256646,0.003923,-0.834543,-0.870452,-0.935087,-0.824054,-0.797224,-0.617088,-0.837464,-0.826687,-0.242183,-0.291716,-1.091034,-0.952697,-0.819489,-0.798198,-0.788956,-0.508780,-0.643326,-0.880857,-1.309232,-0.589393,-0.277777,-0.531650,-0.601325,-0.420361,-0.586971,-0.609146,-0.395008,-0.465804,-0.429065,-0.655441,-0.927988,-0.744642,-0.551099,-0.649337,-0.350348,-0.913118,-1.159844,-0.542652,-0.510447,-0.846563,-0.724921,-0.894715,-0.991401,-0.553707,-0.850816,-0.371792,-0.235547,-0.652741,-0.633227,-0.539381,-0.482693,-1.087768,-0.800150,-0.669780,-0.455740,-0.665119,-0.994114,-0.040547,-0.205568,-0.328053,-0.246305,-0.446982,-0.715848,-1.125893,-0.979273,-0.365480,-0.368971,-0.195703,-0.524904,-0.399913,-0.151930,-0.053693,0.039832,-0.613449,-0.898945,-0.725219,-0.760500,-0.886196,-1.176785,-0.589655,-0.750460,-0.305727,-0.245514,-0.694775,-0.616975,-0.775598,-0.545348,-0.817946,-0.943392,-0.865334,-0.720941,-0.591071,-0.420514,-0.459582,-0.422200,-0.807500,-0.812736,-0.644225,-0.892900,-0.597352,-0.639191,-0.375316,-0.513074,-1.033123,-0.952505,-0.725905,-0.999231,-0.539559,-0.715237,-0.106266,-0.339221,-0.230594,-0.102016,0,0,0,0,0,0,0,1,2,1,1,1,1,1,1,1,0,0,0,1,2,2,0,0,1,1,1,1,0,2,0,0,0,1,1,1,0,0,1,1,2,1,0,0,0,1,0,0,0,0,0,0,0,2
1055,0.029604,-0.283786,-0.673792,-0.599772,-0.575312,-0.851025,-1.159875,-0.628763,-1.057697,-1.069200,-1.198382,-0.905542,-1.009248,-1.114099,-1.264249,0.133274,0.102648,0.052849,0.089549,-1.004522,-1.105452,-1.034931,-0.965764,-1.522435,-1.229527,-1.336998,-1.343547,-0.786417,-0.775784,-1.018543,0.044476,0.169669,-0.881156,-0.890687,-0.924952,-0.727284,-0.894879,-0.737878,-0.799373,-0.819349,-0.013359,-0.326518,-1.112135,-1.007633,-0.791874,-0.976132,-1.050013,-0.435016,-0.864268,-0.766768,-1.181383,-0.554770,-0.608610,-0.269602,-0.569541,-0.395140,-0.489071,-0.722469,-0.910458,-0.521633,-0.600003,-0.579075,-0.596446,-0.964289,-0.568888,-0.487906,-0.268065,-1.155385,-1.177499,-0.450193,-0.302086,-0.936237,-0.704462,-0.876889,-1.123023,-0.776580,-0.724005,-0.014724,-0.206164,-0.816273,-0.905442,-0.636029,-0.357046,-1.033115,-0.690735,-0.632846,-0.401339,-0.765314,-0.925512,0.115122,0.044968,-0.332473,-0.419985,-0.593124,-0.935661,-0.925847,-1.024967,-0.319138,-0.068026,-0.623337,-0.850428,-0.700255,-0.539063,-0.037025,-0.102772,-0.894480,-1.011272,-0.944910,-1.050792,-0.736486,-0.761407,-0.699561,-0.911122,-0.695437,-0.456557,-0.657944,-0.940307,-0.532930,-0.872088,-0.669345,-0.618892,-0.735975,-0.770528,-0.504461,-0.475581,-0.561642,-0.475610,-0.798543,-0.768692,-0.644309,-0.724154,-0.885910,-0.784157,-0.583728,-0.808990,-1.085631,-1.066701,-0.744303,-0.671752,-0.529437,-0.538454,-0.434688,-0.458311,-0.569697,-0.449439,0,0,0,0,1,1,0,2,2,1,2,0,0,0,1,1,2,2,2,1,1,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,1,1,0,1,1,1,1,0,0,0,0,1


In [29]:
X_test

Unnamed: 0,MUSE_Volume_4,MUSE_Volume_11,MUSE_Volume_23,MUSE_Volume_30,MUSE_Volume_31,MUSE_Volume_32,MUSE_Volume_35,MUSE_Volume_36,MUSE_Volume_37,MUSE_Volume_38,MUSE_Volume_39,MUSE_Volume_40,MUSE_Volume_41,MUSE_Volume_47,MUSE_Volume_48,MUSE_Volume_49,MUSE_Volume_50,MUSE_Volume_51,MUSE_Volume_52,MUSE_Volume_55,MUSE_Volume_56,MUSE_Volume_57,MUSE_Volume_58,MUSE_Volume_59,MUSE_Volume_60,MUSE_Volume_61,MUSE_Volume_62,MUSE_Volume_71,MUSE_Volume_72,MUSE_Volume_73,MUSE_Volume_75,MUSE_Volume_76,MUSE_Volume_81,MUSE_Volume_82,MUSE_Volume_83,MUSE_Volume_84,MUSE_Volume_85,MUSE_Volume_86,MUSE_Volume_87,MUSE_Volume_88,MUSE_Volume_89,MUSE_Volume_90,MUSE_Volume_91,MUSE_Volume_92,MUSE_Volume_93,MUSE_Volume_94,MUSE_Volume_95,MUSE_Volume_100,MUSE_Volume_101,MUSE_Volume_102,MUSE_Volume_103,MUSE_Volume_104,MUSE_Volume_105,MUSE_Volume_106,MUSE_Volume_107,MUSE_Volume_108,MUSE_Volume_109,MUSE_Volume_112,MUSE_Volume_113,MUSE_Volume_114,MUSE_Volume_115,MUSE_Volume_116,MUSE_Volume_117,MUSE_Volume_118,MUSE_Volume_119,MUSE_Volume_120,MUSE_Volume_121,MUSE_Volume_122,MUSE_Volume_123,MUSE_Volume_124,MUSE_Volume_125,MUSE_Volume_128,MUSE_Volume_129,MUSE_Volume_132,MUSE_Volume_133,MUSE_Volume_134,MUSE_Volume_135,MUSE_Volume_136,MUSE_Volume_137,MUSE_Volume_138,MUSE_Volume_139,MUSE_Volume_140,MUSE_Volume_141,MUSE_Volume_142,MUSE_Volume_143,MUSE_Volume_144,MUSE_Volume_145,MUSE_Volume_146,MUSE_Volume_147,MUSE_Volume_148,MUSE_Volume_149,MUSE_Volume_150,MUSE_Volume_151,MUSE_Volume_152,MUSE_Volume_153,MUSE_Volume_154,MUSE_Volume_155,MUSE_Volume_156,MUSE_Volume_157,MUSE_Volume_160,MUSE_Volume_161,MUSE_Volume_162,MUSE_Volume_163,MUSE_Volume_164,MUSE_Volume_165,MUSE_Volume_166,MUSE_Volume_167,MUSE_Volume_168,MUSE_Volume_169,MUSE_Volume_170,MUSE_Volume_171,MUSE_Volume_172,MUSE_Volume_173,MUSE_Volume_174,MUSE_Volume_175,MUSE_Volume_176,MUSE_Volume_177,MUSE_Volume_178,MUSE_Volume_179,MUSE_Volume_180,MUSE_Volume_181,MUSE_Volume_182,MUSE_Volume_183,MUSE_Volume_184,MUSE_Volume_185,MUSE_Volume_186,MUSE_Volume_187,MUSE_Volume_190,MUSE_Volume_191,MUSE_Volume_192,MUSE_Volume_193,MUSE_Volume_194,MUSE_Volume_195,MUSE_Volume_196,MUSE_Volume_197,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207,rs4575098,rs6656401,rs2093760,rs4844610,rs4663105,rs6733839,rs10933431,rs35349669,rs6448453,rs190982,rs9271058,rs9473117,rs9381563,rs10948363,rs2718058,rs4723711,rs1859788,rs1476679,rs12539172,rs10808026,rs7810606,rs11771145,rs28834970,rs73223431,rs4236673,rs9331896,rs11257238,rs7920721,rs3740688,rs10838725,rs983392,rs7933202,rs2081545,rs867611,rs10792832,rs3851179,rs17125924,rs17125944,rs10498633,rs12881735,rs12590654,rs442495,rs59735493,rs113260531,rs28394864,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358
1153,-0.193360,-0.204314,-0.682414,-0.555665,-0.515549,-0.856120,-1.122425,-0.596741,-0.874333,-1.522294,-1.627408,-1.011713,-1.072978,-0.848112,-0.898967,0.038123,-0.027254,-0.173231,-0.168984,-0.935629,-1.122963,-0.838422,-0.762591,-1.381210,-1.056951,-1.278181,-1.328223,-0.761230,-0.672456,-1.128345,0.126476,0.273669,-0.718311,-0.759826,-0.678118,-0.415543,-0.642039,-0.461743,-0.700045,-0.621537,-0.368792,-0.495868,-0.795876,-0.700872,-0.761099,-0.815392,-0.773855,-0.441211,-0.634010,-0.709082,-1.078290,-0.552896,-0.481977,-0.455824,-0.527816,-0.369706,-0.367956,-0.506875,-0.404132,-0.360146,-0.487436,-0.455896,-0.123364,-0.385703,-0.207311,-0.432699,-0.264312,-0.906527,-1.147696,-0.384415,-0.302903,-0.873594,-0.499532,-0.996777,-1.104498,-0.734497,-0.688310,-0.212778,-0.328804,-0.671388,-0.802009,-0.305866,-0.354842,-0.887832,-0.708138,-0.722386,-0.710829,-0.661183,-0.818863,0.065738,-0.302059,-0.339406,-0.300231,-0.392798,-0.715348,-0.867657,-0.710477,-0.103043,-0.478850,-0.400819,-0.683539,-0.049087,-0.276958,0.024095,-0.075966,-0.551790,-0.827012,-0.822982,-0.830855,-0.890570,-0.890627,-0.388979,-0.570412,-0.065272,-0.378123,-0.634214,-0.664239,-0.684775,-0.763329,-0.273493,-0.477227,-0.850770,-0.911624,-0.480464,-0.583134,-0.214778,-0.373830,-0.917422,-0.681698,-0.454221,-0.418572,-0.605629,-0.632241,-0.377481,-0.424120,-1.096035,-0.805804,-0.557872,-0.586613,-0.426401,-0.494730,-0.327536,-0.402647,-0.017365,-0.035780,0,0,0,0,1,1,0,2,0,0,0,1,1,1,2,0,1,1,1,0,1,0,2,2,1,1,1,1,1,0,0,0,0,1,2,2,2,2,1,1,2,0,0,0,0,1,0,0,0,1,1,1,1,0
1061,-0.089630,-0.210487,-0.488502,-0.516644,-0.715727,-0.961720,-1.371937,-0.654663,-0.824504,-1.539226,-1.568819,-1.201564,-1.301092,-1.429771,-1.542473,0.142240,0.102772,0.077992,-0.003888,-0.895951,-0.943645,-0.905187,-0.727301,-1.694218,-1.278956,-1.610166,-1.553833,-1.025592,-0.899968,-1.163781,-0.114368,-0.194016,-0.816325,-0.819942,-0.922824,-0.639180,-0.712991,-0.545165,-0.860449,-0.803391,-0.281321,-0.582386,-1.167325,-1.104217,-0.929110,-0.958615,-0.873229,-0.201412,-0.691824,-0.986593,-1.169946,-0.373719,-0.478486,-0.532849,-0.641485,-0.608205,-0.417895,-0.478606,-0.484527,-0.934646,-0.784074,-0.805323,-0.717744,-0.580935,-0.435554,-0.773621,-0.444071,-1.079368,-1.410639,-0.549594,-0.456647,-1.320162,-0.914817,-1.140819,-1.029274,-1.158816,-1.113459,-0.447113,-0.580089,-0.950995,-1.084381,-0.579727,-0.423457,-1.092746,-0.716484,-0.772476,-0.792857,-0.847179,-0.914300,-0.074320,-0.421034,-0.467455,-0.578907,-0.820484,-0.729263,-1.223145,-0.944829,-0.630128,-0.572952,-0.887591,-0.766442,-0.349606,-0.336492,0.002145,-0.035455,-0.615715,-0.919196,-1.068999,-1.180248,-1.151289,-1.219836,-0.590868,-0.626297,-0.678749,-0.436008,-0.762648,-0.719879,-0.290003,-0.313829,-0.784298,-0.704483,-1.004170,-1.076679,-0.861912,-0.439127,-0.496431,-0.329628,-1.072114,-0.978420,-0.750832,-0.710359,-0.967870,-0.435487,-0.895912,-0.954498,-1.353881,-1.194352,-0.779269,-0.611461,-0.538157,-0.538542,-0.629689,-0.765662,-0.010220,-0.127964,1,0,0,0,1,1,1,1,0,1,0,2,2,2,2,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,2,0,0,0,0,0,1,1,1,0,1,0,0,0,1,1,1,0,0,0,0,1
1014,-0.167441,-0.178359,-0.874266,-0.671917,-0.569426,-0.811811,-1.309168,-0.707040,-0.989299,-1.110985,-1.284196,-0.942772,-1.100389,-1.158047,-1.181189,-0.044187,-0.050007,-0.157578,-0.127711,-0.809603,-0.972048,-0.871129,-0.817346,-1.382869,-1.136440,-1.532718,-1.472825,-0.900422,-0.535420,-0.851497,0.035234,-0.026238,-0.870075,-0.916877,-0.886672,-0.703298,-0.735796,-0.551988,-0.803732,-0.838902,-0.322232,-0.503527,-0.837584,-0.864078,-0.862900,-0.906578,-1.081138,-0.095101,-0.587672,-0.971265,-1.266947,-0.511333,-0.471049,-0.446329,-0.369004,-0.438115,-0.685059,-0.447603,-0.294595,-0.666130,-0.746117,-0.466915,-0.422418,-0.657564,-0.297854,-0.455927,-0.364810,-0.478133,-0.914451,-0.545002,-0.479715,-0.911290,-0.667246,-0.737480,-1.104493,-0.627036,-0.520648,-0.371134,-0.209231,-0.470277,-0.883646,-0.172219,-0.186415,-0.715986,-0.722950,-0.547709,-0.662975,-0.762071,-0.936364,0.070734,-0.327051,-0.255182,-0.109443,-0.639027,-0.712569,-0.877895,-0.816281,-0.504885,-0.481547,-0.501272,-0.792478,-0.489265,-0.231222,0.106551,-0.038053,-0.542484,-0.770938,-0.600711,-0.724213,-0.865780,-0.970622,-0.451078,-0.646601,-0.517400,-0.449549,-0.548031,-0.498266,-0.352946,-0.585910,-0.666625,-0.665275,-0.759046,-0.848563,-0.275752,-0.400146,-0.529048,-0.399597,-0.979364,-0.551450,-0.490415,-0.264231,-0.741735,-0.548160,-0.478369,-0.448409,-0.827984,-0.746462,-0.624844,-0.598677,-0.466151,-0.524081,-0.676089,-0.310506,-0.465130,-0.274327,1,0,0,0,0,0,1,0,0,1,2,0,1,0,2,2,1,1,1,0,1,1,1,0,0,0,1,1,0,1,1,1,1,1,1,1,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
456,-0.071585,-0.240196,-0.631520,-0.469354,-0.544535,-0.823874,-1.343770,-0.609763,-0.737803,-1.244338,-1.325820,-1.160610,-1.213974,-1.127774,-1.116007,-0.112968,-0.100713,-0.269783,-0.284814,-0.925174,-1.027990,-1.024848,-0.883403,-1.334494,-1.056321,-1.487150,-1.435537,-0.883248,-0.626575,-1.132019,-0.076297,-0.039430,-1.013970,-1.010195,-0.868152,-0.621158,-0.821016,-0.707834,-0.980619,-0.949457,-0.351553,-0.582681,-1.112773,-1.047790,-0.894843,-0.979863,-0.938722,-0.281877,-0.659082,-0.697181,-0.986471,-0.503430,-0.391672,-0.473225,-0.651914,-0.077833,-0.074537,-0.667669,-0.702683,-0.236048,-0.191129,-0.267264,-0.375048,-0.189091,-0.532582,-0.426172,-0.260861,-1.028940,-1.046700,-0.484115,-0.492602,-0.641414,-0.524309,-0.940571,-0.985312,-0.578734,-0.445776,-0.317103,-0.235817,-0.821381,-0.989247,-0.473125,-0.365952,-0.949030,-0.671803,-0.319261,-0.305381,-0.629809,-0.852490,0.184271,-0.148857,-0.511084,-0.498391,-0.425713,-0.711987,-0.902912,-0.872961,-0.355332,-0.304549,-0.377891,-0.699346,-0.194858,-0.281703,-0.046726,0.158054,-0.484912,-0.591110,-0.664493,-0.747345,-0.517936,-0.568406,-0.576802,-0.855912,-0.506922,-0.238534,-0.518866,-0.691758,-0.675126,-0.701601,-0.713304,-0.778797,-1.023030,-0.983237,-0.706827,-0.245072,-0.509961,-0.596192,-0.836588,-0.630281,-0.640259,-0.777512,-0.645670,-0.573495,-0.307575,-0.373506,-1.090557,-0.976831,-0.669667,-0.775834,-0.351333,-0.470009,-0.327262,-0.452860,-0.430965,-0.188258,1,2,2,2,0,0,0,2,0,1,1,0,0,0,0,0,0,0,0,1,1,1,1,1,2,2,0,0,1,1,0,0,0,0,1,1,0,0,2,2,2,1,1,0,1,1,0,0,1,0,0,0,0,1
1047,-0.080921,-0.306472,-0.551267,-0.344604,-0.767422,-1.046869,-1.013621,-0.654537,-0.964556,-1.145585,-1.161844,-0.746441,-0.819224,-1.160512,-1.350050,0.139141,0.217154,-0.067166,0.075308,-0.840714,-1.030404,-1.004886,-0.848222,-1.532081,-1.239094,-1.194968,-1.116588,-0.565629,-0.631580,-0.975786,0.041500,0.070456,-1.045953,-1.047572,-0.841355,-0.595393,-0.857649,-0.582694,-0.809731,-0.823613,-0.265177,-0.279413,-0.862582,-0.620076,-0.675178,-0.596280,-0.870203,-0.192328,-0.666046,-0.830043,-1.175847,-0.707002,-0.438186,-0.405892,-0.290950,-0.275041,-0.427969,-0.305422,-0.361417,-0.488488,-0.600030,-0.633565,-0.573856,-0.664452,-0.420565,-0.189032,0.264042,-0.592859,-0.913532,-0.381445,-0.364069,-1.019210,-0.696500,-0.809628,-1.120694,-0.486826,-0.432815,-0.347684,-0.211208,-0.621495,-0.841731,-0.435698,-0.461529,-0.811956,-0.613287,-0.455579,-0.641034,-0.403508,-0.822773,-0.067394,0.153592,-0.269318,-0.480775,-0.796565,-0.839320,-0.753815,-0.787215,-0.641604,-0.333900,-0.209619,-0.602988,-0.242637,-0.110972,0.043908,-0.081903,-0.566753,-0.788855,-0.653341,-0.642806,-0.613385,-0.797427,-0.640427,-0.864522,-0.569532,-0.327592,-0.482411,-0.546327,-0.628361,-0.667719,-0.769951,-0.671145,-0.750760,-0.795392,-0.297621,-0.356350,-0.719890,-0.789045,-0.744505,-0.573243,-0.836621,-0.679913,-0.685174,-0.699376,-0.481290,-0.637253,-1.061875,-0.917473,-0.672691,-0.384098,-0.271194,-0.409519,-0.258054,-0.633751,-0.494374,-0.390938,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,1,1,2,1,2,2,1,2,1,1,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,1,2,0,0,0,0,1,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,-0.090134,-0.297055,-0.799891,-0.577783,-0.616755,-0.871025,-1.126512,-0.729233,-1.084560,-1.476783,-1.560515,-1.026963,-1.091669,-1.120543,-1.284505,-0.027242,-0.013754,-0.052089,-0.083546,-1.149384,-1.234489,-1.085294,-1.025506,-1.472698,-1.142741,-1.433562,-1.380185,-0.724408,-0.772518,-0.894635,0.070152,0.078535,-0.840795,-0.879799,-0.940140,-0.845877,-0.746767,-0.608776,-0.815312,-0.802497,-0.157833,-0.465911,-1.016684,-0.967248,-0.920286,-0.961602,-0.881714,-0.242435,-0.732398,-0.831339,-1.271762,-0.433138,-0.417634,-0.399148,-0.513911,-0.528595,-0.606779,-0.475721,-0.567117,-0.549824,-0.529944,-0.464738,-0.479899,-0.523914,-0.433465,-0.659120,-0.339017,-0.754530,-0.934107,-0.469031,-0.398345,-0.818566,-0.872001,-0.913521,-1.089334,-0.814456,-0.752363,-0.312081,-0.310347,-0.809261,-0.819323,-0.371965,-0.136835,-0.882517,-0.629211,-0.676692,-0.812849,-0.725927,-0.899706,-0.106945,0.027264,-0.508441,-0.495525,-0.723357,-0.686144,-0.954120,-1.016536,-0.401851,-0.597863,-0.434152,-0.999197,-0.421119,-0.294115,-0.008625,0.022419,-0.725243,-0.922298,-0.732583,-0.862434,-0.943565,-0.680899,-0.392145,-0.651423,-0.436522,-0.326239,-0.777648,-0.997391,-0.683693,-0.495882,-0.647779,-0.884486,-0.978234,-1.049714,-0.397135,-0.314297,-0.424012,-0.401500,-0.979140,-0.911531,-0.646036,-0.725882,-0.999836,-0.500233,-0.647308,-0.468798,-1.082152,-0.945995,-0.676251,-0.767164,-0.592304,-0.511433,-0.444092,-0.522462,-0.242425,-0.093570,1,0,0,0,1,1,2,0,1,0,0,0,1,0,0,0,1,1,1,1,1,0,1,1,0,0,1,1,1,1,1,1,1,2,2,2,0,0,1,1,1,1,0,0,0,1,2,2,0,2,0,0,0,2
246,0.165525,0.039348,-0.872256,-0.570575,-0.262357,-0.409921,-1.357411,-0.720286,-1.015224,-1.543883,-1.670268,-1.141870,-1.136736,-0.962517,-1.072139,-0.036261,-0.055297,-0.031127,-0.063462,-0.996681,-1.112351,-1.030257,-0.929808,-1.606574,-1.252453,-1.486548,-1.371622,-1.074123,-0.876270,-1.121995,-0.127496,-0.119557,-0.784004,-0.845849,-0.586181,-0.670364,-0.813535,-0.637451,-0.753529,-0.743970,-0.164549,-0.427052,-0.978129,-0.849662,-0.957005,-0.936318,-0.897460,-0.372650,-0.884200,-0.580805,-0.773634,-0.634408,-0.538389,-0.495312,-0.584453,-0.132905,-0.380451,-0.857842,-0.558292,-0.668174,-0.818728,-0.327868,-0.278729,-0.602076,-0.505762,-0.685055,-0.441897,-0.825662,-1.301298,-0.490578,-0.320223,-0.808282,-0.604738,-0.909338,-0.982743,-0.793589,-0.834574,-0.342968,-0.121997,-0.851951,-0.917709,-0.453500,-0.266213,-0.973372,-0.670038,-0.521148,-0.651070,-0.711222,-0.942655,-0.247231,-0.261336,-0.491805,-0.458667,-0.728756,-0.893804,-0.916379,-0.730272,-0.412582,-0.581686,-0.500036,-0.590483,-0.533712,-0.413334,-0.295934,-0.197233,-0.681500,-0.962914,-0.797162,-0.820610,-0.958811,-0.931953,-0.275645,-0.548685,-0.553581,-0.472238,-1.132338,-0.994214,-0.673909,-0.555797,-0.854777,-0.741125,-1.009131,-0.974369,-0.470327,-0.419444,-0.209048,-0.244404,-0.808442,-0.803849,-0.777782,-0.907276,-1.000694,-0.595032,-0.467348,-0.592795,-1.072622,-0.805298,-0.579978,-0.620301,-0.375932,-0.472863,-0.540296,-0.405558,-0.638971,-0.263694,0,1,1,1,1,1,1,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,1,2,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,1
506,-0.004089,-0.272517,-0.453773,-0.380443,-0.528540,-0.828728,-1.201633,-0.546418,-0.814431,-1.412218,-1.502413,-1.254479,-1.321577,-0.931024,-1.082379,-0.048334,-0.052532,-0.204403,-0.189909,-0.896740,-0.968518,-0.978595,-0.836392,-1.239263,-0.971108,-1.184458,-1.223623,-0.896863,-1.026663,-0.937467,0.207606,0.174675,-1.029670,-1.008764,-1.055993,-0.866073,-0.828458,-0.656697,-0.866935,-0.866695,-0.356869,-0.415643,-0.904896,-0.843566,-0.692944,-0.784434,-0.784852,-0.141866,-0.777010,-0.702947,-0.959360,-0.423885,-0.579254,-0.359160,-0.285098,-0.420233,-0.488066,-0.495288,-0.278926,-0.467688,-0.488296,-0.400925,-0.382289,-0.566777,-0.448930,-0.562259,-0.230448,-0.590816,-0.894116,-0.319919,-0.393784,-0.606148,-0.773094,-0.725085,-1.065923,-0.441021,-0.857104,-0.256480,-0.243099,-0.963129,-1.168170,-0.312237,-0.586914,-0.961054,-0.651340,-0.490957,-0.697862,-0.638720,-0.633582,-0.070613,-0.270099,-0.335704,-0.578795,-0.679102,-0.692190,-0.726375,-0.769133,-0.587247,-0.372971,-0.561928,-0.571806,-0.361488,-0.182891,0.198434,0.045002,-0.678087,-0.771183,-0.588102,-0.699296,-0.459738,-0.556112,-0.547628,-0.683726,-0.394026,-0.267558,-0.580493,-0.741919,-0.586865,-0.669155,-0.516528,-0.645563,-0.763883,-0.787659,-0.460392,-0.255819,-0.539806,-0.538732,-0.966863,-0.703089,-0.680474,-0.663008,-0.569067,-0.555162,-0.187931,-0.415349,-1.052808,-0.800815,-0.444449,-0.561228,-0.457634,-0.527541,-0.626254,-0.721295,-0.034837,-0.333268,0,0,0,0,0,0,0,2,1,0,0,1,1,1,0,0,0,0,0,0,2,1,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0
669,-0.257926,-0.202264,-0.715596,-0.530291,-0.608718,-0.826822,-1.171860,-0.613861,-0.883094,-1.571875,-1.706907,-0.970343,-1.121851,-0.929977,-1.134786,-0.116537,-0.079947,-0.191291,-0.183716,-1.087295,-1.092453,-0.916094,-0.799698,-1.477176,-1.217379,-1.432709,-1.409040,-0.866303,-0.827354,-0.820630,-0.102935,-0.073213,-0.947888,-0.984875,-0.979450,-0.851588,-0.749087,-0.614043,-0.924232,-0.881266,-0.323614,-0.492505,-1.109821,-0.975064,-0.946962,-0.988420,-1.040226,-0.305203,-0.990855,-0.870506,-1.126276,-0.483162,-0.232111,-0.510762,-0.570200,-0.226572,-0.633284,-0.602025,-0.637263,-0.506177,-0.625584,-0.477509,-0.541257,-0.411634,-0.384638,-0.410240,-0.285352,-0.990420,-0.979124,-0.506292,-0.366628,-0.759056,-0.783642,-0.995895,-1.153913,-0.730232,-0.859165,-0.203934,-0.128894,-0.769612,-0.920151,-0.597140,-0.324276,-0.826485,-0.681303,-0.449685,-0.605493,-0.752793,-0.949215,0.027120,-0.053875,-0.350424,-0.618642,-0.727058,-0.521194,-0.775345,-0.646840,-0.498425,-0.629882,-0.406225,-1.086513,-0.187255,-0.586005,-0.048077,0.052666,-0.522897,-0.686304,-0.640660,-0.794009,-0.829261,-0.971182,-0.594473,-0.887204,-0.336673,-0.408885,-0.720118,-0.809514,-0.815920,-0.825524,-0.845412,-0.735468,-0.921178,-0.774593,-0.472317,-0.433625,-0.564500,-0.500468,-1.081758,-0.863139,-0.671944,-0.762562,-0.668227,-0.505152,-0.425505,-0.165198,-0.605569,-0.545772,-0.815505,-0.620228,-0.459985,-0.543825,-0.611470,-0.558466,-0.226783,-0.141460,0,1,1,1,1,1,0,0,1,2,0,0,0,0,0,0,1,0,0,1,1,1,2,2,1,1,1,1,0,2,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0


A few considerations should be kept in mind to further ensure that our approach is unbiased:

* Representation of Control Group: Make sure that the control group (CN) in the training data is representative of the normal population.

* Generalizability of Corrections: Applying corrections based solely on the control group assumes that the relationship between predictors (like age, sex) and the outcome (ROI features) is the same in control and diseased groups. If this assumption does not hold, the model may not capture disease-specific patterns effectively.

* Statistical Assumptions: Linear regression makes certain assumptions (like linearity, normality, homoscedasticity, and independence of errors). Ensure these assumptions hold for your data; otherwise, the corrections might be inappropriate.

Nested CV with GridSearch

In [30]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

X = data_age_filtered.drop(['diagnosis', 'PTID'], axis=1)
y = data_age_filtered['diagnosis']

# Assuming X, y, roi_features are defined
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

param_grid = {
    'n_neighbors': [3, 5, 7, 10, 15, 30],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean']
}

outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

outer_fold_results = []

for train_index, test_index in outer_cv.split(X_train, y_train):
    X_train_outer, X_val_outer = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_outer, y_val_outer = y_train.iloc[train_index], y_train.iloc[test_index]

    inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=inner_cv, scoring='accuracy', n_jobs=-1)

    for inner_train_index, inner_val_index in inner_cv.split(X_train_outer, y_train_outer):
        X_train_inner, X_val_inner = X_train_outer.iloc[inner_train_index], X_train_outer.iloc[inner_val_index]
        y_train_inner, y_val_inner = y_train_outer.iloc[inner_train_index], y_train_outer.iloc[inner_val_index]

        # Preprocess for the inner fold
        data_controls_train_inner = X_train_inner[y_train_inner == 0]
        for roi_feature in roi_features:
            if roi_feature in X_train_inner.columns:
                regr = LinearRegression()
                regr.fit(data_controls_train_inner[['Sex', 'Age', 'DLICV_baseline']], data_controls_train_inner[roi_feature])
                # Apply correction to the training set
                correction_train = regr.predict(X_train_inner[['Sex', 'Age', 'DLICV_baseline']])
                X_train_inner[roi_feature] -= correction_train

                # Apply the same correction to the validation set
                correction_val = regr.predict(X_val_inner[['Sex', 'Age', 'DLICV_baseline']])
                X_val_inner[roi_feature] -= correction_val

        # Z-normalization for training and validation sets
        scaler = MinMaxScaler().fit(data_controls_train_inner[roi_features])
        X_train_inner[roi_features] = scaler.transform(X_train_inner[roi_features])
        X_val_inner[roi_features] = scaler.transform(X_val_inner[roi_features])

        X_train_inner = X_train_inner.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
        X_val_inner = X_val_inner.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

        grid_search.fit(X_train_inner, y_train_inner)

    # Evaluate the best model on the outer validation set
    best_model = grid_search.best_estimator_

    # We apply the linear correction and the z normalisation of the outer
    # loop after we are doe with the inner loop, in order to avoid applying
    # these filter twice.
    data_controls_train_outer = X_train_outer[y_train_outer == 0]
    for roi_feature in roi_features:
        if roi_feature in X_train_outer.columns:
            regr = LinearRegression()
            regr.fit(data_controls_train_outer[['Sex', 'Age', 'DLICV_baseline']], data_controls_train_outer[roi_feature])
            # Apply correction to the training set
            correction_train = regr.predict(X_train_outer[['Sex', 'Age', 'DLICV_baseline']])
            X_train_outer[roi_feature] -= correction_train

            # Apply the same correction to the validation set
            correction_val = regr.predict(X_val_outer[['Sex', 'Age', 'DLICV_baseline']])
            X_val_outer[roi_feature] -= correction_val

    # Z-normalization for training and validation sets
    scaler = MinMaxScaler().fit(data_controls_train_outer[roi_features])
    X_train_outer[roi_features] = scaler.transform(X_train_outer[roi_features])
    X_val_outer[roi_features] = scaler.transform(X_val_outer[roi_features])

    X_train_outer = X_train_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
    X_val_outer = X_val_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)


    y_pred_val = best_model.predict(X_val_outer)

    # # Calculate and print metrics for each outer fold
    # accuracy = best_model.score(X_val_outer, y_val_outer)
    # print(f"Outer Fold Metrics: Accuracy: {accuracy}, Balanced Accuracy: {balanced_accuracy_score(y_val_outer, y_pred_val)}, ...")

    # Append metrics to dict
    fold_metrics = {
        "Accuracy": best_model.score(X_val_outer, y_val_outer),
        "Balanced Accuracy": balanced_accuracy_score(y_val_outer, y_pred_val),
        "Precision": precision_score(y_val_outer, y_pred_val, average='weighted'),
        "Recall": recall_score(y_val_outer, y_pred_val, average='weighted'),
        "F1 Score": f1_score(y_val_outer, y_pred_val, average='weighted'),
        "AUC": roc_auc_score(y_val_outer, best_model.predict_proba(X_val_outer), multi_class='ovr', average='weighted')
    }
    outer_fold_results.append(fold_metrics)
    print(f"Outer Fold Metrics: {fold_metrics}")

# Average metrics across all outer folds
avg_metrics = {metric: np.mean([fold[metric] for fold in outer_fold_results]) for metric in outer_fold_results[0]}
print("\nAverage Metrics Across All Outer Folds:")
print(avg_metrics)



Outer Fold Metrics: {'Accuracy': 0.4700854700854701, 'Balanced Accuracy': 0.3666355431061314, 'Precision': 0.3813215163251068, 'Recall': 0.4700854700854701, 'F1 Score': 0.4208743068958123, 'AUC': 0.5205032876869806}
Outer Fold Metrics: {'Accuracy': 0.46153846153846156, 'Balanced Accuracy': 0.339488804484662, 'Precision': 0.5392489490875586, 'Recall': 0.46153846153846156, 'F1 Score': 0.3980630349847353, 'AUC': 0.5924612615245907}
Outer Fold Metrics: {'Accuracy': 0.49572649572649574, 'Balanced Accuracy': 0.37821720025109856, 'Precision': 0.39923621090224215, 'Recall': 0.49572649572649574, 'F1 Score': 0.43559650673988237, 'AUC': 0.550460748699641}
Outer Fold Metrics: {'Accuracy': 0.47863247863247865, 'Balanced Accuracy': 0.3746147919876733, 'Precision': 0.5650110249007982, 'Recall': 0.47863247863247865, 'F1 Score': 0.4325759976495402, 'AUC': 0.5352396656380994}
Outer Fold Metrics: {'Accuracy': 0.46153846153846156, 'Balanced Accuracy': 0.3700564971751413, 'Precision': 0.3792257933010797, '

In [31]:
# Retrieve the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Retrain the model on the entire training dataset (X_train, y_train) with these parameters
final_knn_model = KNeighborsClassifier(**best_params)

# Apply preprocessing (linear correction and Z-normalization) to the entire training dataset X_train
data_controls = X_train[y_train == 0]  # Control group for the training dataset
for roi_feature in roi_features:
    if roi_feature in X_train.columns:
        regr = LinearRegression()
        regr.fit(data_controls[['Sex', 'Age', 'DLICV_baseline']], data_controls[roi_feature])
        # Apply correction to the training set
        correction_train = regr.predict(X_train[['Sex', 'Age', 'DLICV_baseline']])
        X_train[roi_feature] -= correction_train

        # Apply the same correction to the test set
        correction_test = regr.predict(X_test[['Sex', 'Age', 'DLICV_baseline']])
        X_test[roi_feature] -= correction_test


# Z-normalization
scaler = MinMaxScaler().fit(data_controls[roi_features])
X_train[roi_features] = scaler.transform(X_train[roi_features])
X_test[roi_features] = scaler.transform(X_test[roi_features])


X_train = X_train.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
X_test = X_test.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

# Train the final model on the entire training dataset
final_knn_model.fit(X_train, y_train)

# # Apply linear correction and Z-normalization to X_test
# for roi_feature in roi_features:
#     if roi_feature in X_test.columns:
#         correction_test = regr.predict(X_test[['Sex', 'Age', 'DLICV_baseline']])
#         X_test[roi_feature] -= correction_test
# X_test[roi_features] = scaler.transform(X_test[roi_features])
# X_test = X_test.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

# Evaluate the final model on the test set (X_test, y_test)
y_pred_test = final_knn_model.predict(X_test)
test_accuracy = final_knn_model.score(X_test, y_test)
test_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1 = f1_score(y_test, y_pred_test, average='weighted')
test_auc = roc_auc_score(y_test, final_knn_model.predict_proba(X_test), multi_class='ovr', average='weighted')

print(f"Test set Metrics: Accuracy: {test_accuracy}, Balanced Accuracy: {test_balanced_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}, AUC: {test_auc}")

Test set Metrics: Accuracy: 0.45733788395904434, Balanced Accuracy: 0.3745472745472745, Precision: 0.4978020864007415, Recall: 0.45733788395904434, F1 Score: 0.42838858521282336, AUC: 0.6084543927021203


In [32]:
best_params

{'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'uniform'}

#### Nested CV with Randomized Search

In [33]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
import scipy.stats as stats

X = data_age_filtered.drop(['diagnosis', 'PTID'], axis=1)
y = data_age_filtered['diagnosis']

# Assuming X, y, roi_features are defined
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define parameter distribution (instead of a fixed grid)
param_dist = {
    'n_neighbors': stats.randint(3, 1000),  # Uniformly distributed integers from 3 to 30
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

outer_fold_results = []

for train_index, test_index in outer_cv.split(X_train, y_train):
    X_train_outer, X_val_outer = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_outer, y_val_outer = y_train.iloc[train_index], y_train.iloc[test_index]

    inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # Use RandomizedSearchCV
    n_iter_search = 20  # Number of parameter settings sampled
    random_search = RandomizedSearchCV(KNeighborsClassifier(), param_distributions=param_dist, n_iter=n_iter_search, cv=inner_cv, scoring='accuracy', n_jobs=-1, random_state=42)

    for inner_train_index, inner_val_index in inner_cv.split(X_train_outer, y_train_outer):
        X_train_inner, X_val_inner = X_train_outer.iloc[inner_train_index], X_train_outer.iloc[inner_val_index]
        y_train_inner, y_val_inner = y_train_outer.iloc[inner_train_index], y_train_outer.iloc[inner_val_index]

        # Preprocess for the inner fold
        data_controls_train_inner = X_train_inner[y_train_inner == 0]
        for roi_feature in roi_features:
            if roi_feature in X_train_inner.columns:
                regr = LinearRegression()
                regr.fit(data_controls_train_inner[['Sex', 'Age', 'DLICV_baseline']], data_controls_train_inner[roi_feature])
                # Apply correction to the training set
                correction_train = regr.predict(X_train_inner[['Sex', 'Age', 'DLICV_baseline']])
                X_train_inner[roi_feature] -= correction_train

                # Apply the same correction to the validation set
                correction_val = regr.predict(X_val_inner[['Sex', 'Age', 'DLICV_baseline']])
                X_val_inner[roi_feature] -= correction_val

        # Z-normalization for training and validation sets
        scaler = MinMaxScaler().fit(data_controls_train_inner[roi_features])
        X_train_inner[roi_features] = scaler.transform(X_train_inner[roi_features])
        X_val_inner[roi_features] = scaler.transform(X_val_inner[roi_features])

        X_train_inner = X_train_inner.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
        X_val_inner = X_val_inner.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

        random_search.fit(X_train_inner, y_train_inner)

    # Evaluate the best model on the outer validation set
    best_model = grid_search.best_estimator_

    # We apply the linear correction and the z normalisation of the outer
    # loop after we are doe with the inner loop, in order to avoid applying
    # these filter twice.
    data_controls_train_outer = X_train_outer[y_train_outer == 0]
    for roi_feature in roi_features:
        if roi_feature in X_train_outer.columns:
            regr = LinearRegression()
            regr.fit(data_controls_train_outer[['Sex', 'Age', 'DLICV_baseline']], data_controls_train_outer[roi_feature])
            # Apply correction to the training set
            correction_train = regr.predict(X_train_outer[['Sex', 'Age', 'DLICV_baseline']])
            X_train_outer[roi_feature] -= correction_train

            # Apply the same correction to the validation set
            correction_val = regr.predict(X_val_outer[['Sex', 'Age', 'DLICV_baseline']])
            X_val_outer[roi_feature] -= correction_val

    # Z-normalization for training and validation sets
    scaler = MinMaxScaler().fit(data_controls_train_outer[roi_features])
    X_train_outer[roi_features] = scaler.transform(X_train_outer[roi_features])
    X_val_outer[roi_features] = scaler.transform(X_val_outer[roi_features])

    X_train_outer = X_train_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
    X_val_outer = X_val_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)


    y_pred_val = best_model.predict(X_val_outer)
    # Calculate and print metrics for each outer fold
    accuracy = best_model.score(X_val_outer, y_val_outer)
    print(f"Outer Fold Metrics: Accuracy: {accuracy}, Balanced Accuracy: {balanced_accuracy_score(y_val_outer, y_pred_val)}, ...")

    # Append metrics to dict
    fold_metrics = {
        "Accuracy": best_model.score(X_val_outer, y_val_outer),
        "Balanced Accuracy": balanced_accuracy_score(y_val_outer, y_pred_val),
        "Precision": precision_score(y_val_outer, y_pred_val, average='weighted'),
        "Recall": recall_score(y_val_outer, y_pred_val, average='weighted'),
        "F1 Score": f1_score(y_val_outer, y_pred_val, average='weighted'),
        "AUC": roc_auc_score(y_val_outer, best_model.predict_proba(X_val_outer), multi_class='ovr', average='weighted')
    }
    outer_fold_results.append(fold_metrics)
    print(f"Outer Fold Metrics: {fold_metrics}")

# Average metrics across all outer folds
avg_metrics = {metric: np.mean([fold[metric] for fold in outer_fold_results]) for metric in outer_fold_results[0]}
print("\nAverage Metrics Across All Outer Folds:")
print(avg_metrics)



Outer Fold Metrics: Accuracy: 0.5, Balanced Accuracy: 0.40818549642079055, ...
Outer Fold Metrics: {'Accuracy': 0.5, 'Balanced Accuracy': 0.40818549642079055, 'Precision': 0.421252151173411, 'Recall': 0.5, 'F1 Score': 0.4547975488446758, 'AUC': 0.6277057798721052}
Outer Fold Metrics: Accuracy: 0.5170940170940171, Balanced Accuracy: 0.4224204406474497, ...
Outer Fold Metrics: {'Accuracy': 0.5170940170940171, 'Balanced Accuracy': 0.4224204406474497, 'Precision': 0.539226008387507, 'Recall': 0.5170940170940171, 'F1 Score': 0.4826308627801165, 'AUC': 0.6853537131569168}
Outer Fold Metrics: Accuracy: 0.5, Balanced Accuracy: 0.4187567768076242, ...
Outer Fold Metrics: {'Accuracy': 0.5, 'Balanced Accuracy': 0.4187567768076242, 'Precision': 0.5915039295070255, 'Recall': 0.5, 'F1 Score': 0.4649448134329603, 'AUC': 0.6419251250568048}
Outer Fold Metrics: Accuracy: 0.5427350427350427, Balanced Accuracy: 0.4429178793585573, ...
Outer Fold Metrics: {'Accuracy': 0.5427350427350427, 'Balanced Accurac

In [34]:
# Retrieve the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Retrain the model on the entire training dataset (X_train, y_train) with these parameters
final_knn_model = KNeighborsClassifier(**best_params)

# Apply preprocessing (linear correction and Z-normalization) to the entire training dataset X_train
data_controls = X_train[y_train == 0]  # Control group for the training dataset
for roi_feature in roi_features:
    if roi_feature in X_train.columns:
        regr = LinearRegression()
        regr.fit(data_controls[['Sex', 'Age', 'DLICV_baseline']], data_controls[roi_feature])
        # Apply correction to the training set
        correction_train = regr.predict(X_train[['Sex', 'Age', 'DLICV_baseline']])
        X_train[roi_feature] -= correction_train

        # Apply the same correction to the test set
        correction_test = regr.predict(X_test[['Sex', 'Age', 'DLICV_baseline']])
        X_test[roi_feature] -= correction_test


# Z-normalization
scaler = MinMaxScaler().fit(data_controls[roi_features])
X_train[roi_features] = scaler.transform(X_train[roi_features])
X_test[roi_features] = scaler.transform(X_test[roi_features])


X_train = X_train.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
X_test = X_test.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

# Train the final model on the entire training dataset
final_knn_model.fit(X_train, y_train)

# # Apply linear correction and Z-normalization to X_test
# for roi_feature in roi_features:
#     if roi_feature in X_test.columns:
#         correction_test = regr.predict(X_test[['Sex', 'Age', 'DLICV_baseline']])
#         X_test[roi_feature] -= correction_test
# X_test[roi_features] = scaler.transform(X_test[roi_features])
# X_test = X_test.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

# Evaluate the final model on the test set (X_test, y_test)
y_pred_test = final_knn_model.predict(X_test)
test_accuracy = final_knn_model.score(X_test, y_test)
test_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1 = f1_score(y_test, y_pred_test, average='weighted')
test_auc = roc_auc_score(y_test, final_knn_model.predict_proba(X_test), multi_class='ovr', average='weighted')

print(f"Test set Metrics: Accuracy: {test_accuracy}, Balanced Accuracy: {test_balanced_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}, AUC: {test_auc}")

Test set Metrics: Accuracy: 0.45733788395904434, Balanced Accuracy: 0.3745472745472745, Precision: 0.4978020864007415, Recall: 0.45733788395904434, F1 Score: 0.42838858521282336, AUC: 0.6084543927021203


In [35]:
best_params

{'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'uniform'}

##### back ups


In [36]:
# only corrected and standardized outer folds
# (which include the inner folds, but it is the same problem of data
# leakage even though the whole data are train data. Even though all of these
# data are train data, we should treat them as validation/test data for validating
# and evaluating the model.)

# from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
# from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score
# from sklearn.linear_model import LinearRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.preprocessing import StandardScaler

# # Splitting dataset into train and test
# X = data_age_filtered.drop(['diagnosis', 'PTID'], axis=1)
# y = data_age_filtered['diagnosis']

# # Assuming X, y, roi_features are defined
# # Split the dataset into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# # Define parameter grid for KNN
# param_grid = {
#     'n_neighbors': [3, 5, 7, 10, 15, 30],
#     'weights': ['uniform', 'distance'],
#     'metric': ['euclidean', 'manhattan']
# }

# # Define outer cross-validation
# outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Lists to store metrics for each outer fold
# outer_fold_accuracy = []
# outer_fold_balanced_accuracy = []
# outer_fold_precision = []
# outer_fold_recall = []
# outer_fold_f1 = []
# outer_fold_auc = []

# # Outer loop for model evaluation
# for train_index, test_index in outer_cv.split(X_train, y_train):
#     # Split training data into training and validation for the current outer fold
#     X_train_outer, X_val_outer = X_train.iloc[train_index], X_train.iloc[test_index]
#     y_train_outer, y_val_outer = y_train.iloc[train_index], y_train.iloc[test_index]

#     # Apply preprocessing (linear correction and Z-normalization) to X_train_outer and X_val_outer
#     # Linear correction
#     data_controls_train_outer = X_train_outer[y_train_outer == 0]  # Control group for the fold
#     for roi_feature in roi_features:
#         if roi_feature in X_train_outer.columns:
#             regr = LinearRegression()
#             regr.fit(data_controls_train_outer[['Sex', 'Age', 'DLICV_baseline']], data_controls_train_outer[roi_feature])
#             # Apply correction to the training set
#             correction_train = regr.predict(X_train_outer[['Sex', 'Age', 'DLICV_baseline']])
#             X_train_outer[roi_feature] -= correction_train

#             # Apply the same correction to the validation set
#             correction_val = regr.predict(X_val_outer[['Sex', 'Age', 'DLICV_baseline']])
#             X_val_outer[roi_feature] -= correction_val

#     # Z-normalization for training and validation sets
#     scaler = StandardScaler().fit(data_controls_train_outer[roi_features])
#     X_train_outer[roi_features] = scaler.transform(X_train_outer[roi_features])
#     X_val_outer[roi_features] = scaler.transform(X_val_outer[roi_features])

#     X_train_outer = X_train_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
#     X_val_outer = X_val_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

#     # Inner loop for hyperparameter tuning
#     inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#     grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=inner_cv, scoring='accuracy', n_jobs=-1)
#     grid_search.fit(X_train_outer, y_train_outer)

#     # Train model with best parameters on the entire training set of the outer fold
#     best_model = grid_search.best_estimator_
#     best_model.fit(X_train_outer, y_train_outer)

#     # Evaluate model on the validation set of the outer fold
#     y_pred_val = best_model.predict(X_val_outer)
#     outer_fold_accuracy.append(best_model.score(X_val_outer, y_val_outer))
#     outer_fold_balanced_accuracy.append(balanced_accuracy_score(y_val_outer, y_pred_val))
#     outer_fold_precision.append(precision_score(y_val_outer, y_pred_val, average='weighted'))
#     outer_fold_recall.append(recall_score(y_val_outer, y_pred_val, average='weighted'))
#     outer_fold_f1.append(f1_score(y_val_outer, y_pred_val, average='weighted'))
#     outer_fold_auc.append(roc_auc_score(y_val_outer, best_model.predict_proba(X_val_outer), multi_class='ovr', average='weighted'))

#     # Evaluate model on the validation set of the outer fold
#     y_pred_val = best_model.predict(X_val_outer)
#     accuracy = best_model.score(X_val_outer, y_val_outer)
#     balanced_accuracy = balanced_accuracy_score(y_val_outer, y_pred_val)
#     precision = precision_score(y_val_outer, y_pred_val, average='weighted')
#     recall = recall_score(y_val_outer, y_pred_val, average='weighted')
#     f1 = f1_score(y_val_outer, y_pred_val, average='weighted')
#     auc = roc_auc_score(y_val_outer, best_model.predict_proba(X_val_outer), multi_class='ovr', average='weighted')

#     # Append metrics to lists
#     outer_fold_accuracy.append(accuracy)
#     outer_fold_balanced_accuracy.append(balanced_accuracy)
#     outer_fold_precision.append(precision)
#     outer_fold_recall.append(recall)
#     outer_fold_f1.append(f1)
#     outer_fold_auc.append(auc)


# # Calculate average metrics across all outer folds
# print("Average Metrics Across All Outer Folds:")
# print(f"Accuracy: {np.mean(outer_fold_accuracy)}")
# print(f"Balanced Accuracy: {np.mean(outer_fold_balanced_accuracy)}")
# print(f"Precision: {np.mean(outer_fold_precision)}")
# print(f"Recall: {np.mean(outer_fold_recall)}")
# print(f"F1 Score: {np.mean(outer_fold_f1)}")
# print(f"AUC: {np.mean(outer_fold_auc)}")
