# **Simple Graph Creation and GNN Model Testing**


# Work Environment


#### File Prerequisites

Run the notebook preferably on google colab.


Upload the following csvs in the path: '/thesis/Data_Preprocessing/':
* ADNI_dataset_ROIs_and_SNPs.csv
* ROIs.csv


#### Setting Google Colab as Filesystem

In [1]:
from google.colab import drive
path = "/gdrive/My Drive/thesis/Data_Preprocessing/"
drive.mount('/gdrive')
import os

Mounted at /gdrive


#### Libraries and Installations

In [2]:
import copy
import random
import time

import numpy as np
import pandas as pd

import itertools

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import trange, tqdm

In [3]:
import scipy.sparse as sp

from sklearn import metrics
from sklearn import decomposition
from sklearn import manifold
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import networkx as nx

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision.transforms as transforms

In [4]:
def format_pytorch_version(version):
  return version.split('+')[0]

TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  return 'cu' + version.replace('.', '')

CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

!pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric

Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/torch_scatter-2.1.2%2Bpt21cu121-cp310-cp310-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt21cu121
Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/torch_sparse-0.6.18%2Bpt21cu121-cp310-cp310-linux_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt21cu121
Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html
Collecting torch-cluster
 

#### Versions

Put the following in a txt file and run the following command to instantly download the version requirements for py projects:  
pip install -r /path/to/requirements.txt

In [5]:
!python --version # python v
print("pandas ", pd.__version__) # pandas v
print("numpy ", np.__version__) # numpy v
print("torch ", torch.__version__) # torch v
# print("torchvision ", torchvision.__version__) # torchvision v
# print("pytorch lightning ", pl.__version__) # pytorch lightning v
# print("torch geometric ", pyg.__version__) # torch geometric v

Python 3.10.12
pandas  1.5.3
numpy  1.25.2
torch  2.1.0+cu121


In [6]:
!pip list

Package                          Version
-------------------------------- ---------------------
absl-py                          1.4.0
aiohttp                          3.9.3
aiosignal                        1.3.1
alabaster                        0.7.16
albumentations                   1.3.1
altair                           4.2.2
annotated-types                  0.6.0
anyio                            3.7.1
appdirs                          1.4.4
argon2-cffi                      23.1.0
argon2-cffi-bindings             21.2.0
array-record                     0.5.0
arviz                            0.15.1
astropy                          5.3.4
astunparse                       1.6.3
async-timeout                    4.0.3
atpublic                         4.0
attrs                            23.2.0
audioread                        3.0.1
autograd                         1.6.2
Babel                            2.14.0
backcall                         0.2.0
beautifulsoup4                   4.12.3
bi

#### Setting Up Device

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### Seeds

In [8]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Datasets

#### ADNI

In [9]:
data = pd.read_csv(path + 'ADNI_dataset_ROIs_and_SNPs.csv', encoding='ISO-8859-1')

# Data Preparation


#### Studying the Data




In [10]:
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,PTID,Age,Date,Sex,DLICV_baseline,diagnosis,MUSE_Volume_4,MUSE_Volume_11,MUSE_Volume_23,MUSE_Volume_30,MUSE_Volume_31,MUSE_Volume_32,MUSE_Volume_35,MUSE_Volume_36,MUSE_Volume_37,MUSE_Volume_38,MUSE_Volume_39,MUSE_Volume_40,MUSE_Volume_41,MUSE_Volume_47,MUSE_Volume_48,MUSE_Volume_49,MUSE_Volume_50,MUSE_Volume_51,MUSE_Volume_52,MUSE_Volume_55,MUSE_Volume_56,MUSE_Volume_57,MUSE_Volume_58,MUSE_Volume_59,MUSE_Volume_60,MUSE_Volume_61,MUSE_Volume_62,MUSE_Volume_71,MUSE_Volume_72,MUSE_Volume_73,MUSE_Volume_75,MUSE_Volume_76,MUSE_Volume_81,MUSE_Volume_82,MUSE_Volume_83,MUSE_Volume_84,MUSE_Volume_85,MUSE_Volume_86,MUSE_Volume_87,MUSE_Volume_88,MUSE_Volume_89,MUSE_Volume_90,MUSE_Volume_91,MUSE_Volume_92,MUSE_Volume_93,MUSE_Volume_94,MUSE_Volume_95,MUSE_Volume_100,MUSE_Volume_101,MUSE_Volume_102,MUSE_Volume_103,MUSE_Volume_104,MUSE_Volume_105,MUSE_Volume_106,MUSE_Volume_107,MUSE_Volume_108,MUSE_Volume_109,MUSE_Volume_112,MUSE_Volume_113,MUSE_Volume_114,MUSE_Volume_115,MUSE_Volume_116,MUSE_Volume_117,MUSE_Volume_118,MUSE_Volume_119,MUSE_Volume_120,MUSE_Volume_121,MUSE_Volume_122,MUSE_Volume_123,MUSE_Volume_124,MUSE_Volume_125,MUSE_Volume_128,MUSE_Volume_129,MUSE_Volume_132,MUSE_Volume_133,MUSE_Volume_134,MUSE_Volume_135,MUSE_Volume_136,MUSE_Volume_137,MUSE_Volume_138,MUSE_Volume_139,MUSE_Volume_140,MUSE_Volume_141,MUSE_Volume_142,MUSE_Volume_143,MUSE_Volume_144,MUSE_Volume_145,MUSE_Volume_146,MUSE_Volume_147,MUSE_Volume_148,MUSE_Volume_149,MUSE_Volume_150,MUSE_Volume_151,MUSE_Volume_152,MUSE_Volume_153,MUSE_Volume_154,MUSE_Volume_155,MUSE_Volume_156,MUSE_Volume_157,MUSE_Volume_160,MUSE_Volume_161,MUSE_Volume_162,MUSE_Volume_163,MUSE_Volume_164,MUSE_Volume_165,MUSE_Volume_166,MUSE_Volume_167,MUSE_Volume_168,MUSE_Volume_169,MUSE_Volume_170,MUSE_Volume_171,MUSE_Volume_172,MUSE_Volume_173,MUSE_Volume_174,MUSE_Volume_175,MUSE_Volume_176,MUSE_Volume_177,MUSE_Volume_178,MUSE_Volume_179,MUSE_Volume_180,MUSE_Volume_181,MUSE_Volume_182,MUSE_Volume_183,MUSE_Volume_184,MUSE_Volume_185,MUSE_Volume_186,MUSE_Volume_187,MUSE_Volume_190,MUSE_Volume_191,MUSE_Volume_192,MUSE_Volume_193,MUSE_Volume_194,MUSE_Volume_195,MUSE_Volume_196,MUSE_Volume_197,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207,rs4575098,rs6656401,rs2093760,rs4844610,rs4663105,rs6733839,rs10933431,rs35349669,rs6448453,rs190982,rs9271058,rs9473117,rs9381563,rs10948363,rs2718058,rs4723711,rs1859788,rs1476679,rs12539172,rs10808026,rs7810606,rs11771145,rs28834970,rs73223431,rs4236673,rs9331896,rs11257238,rs7920721,rs3740688,rs10838725,rs983392,rs7933202,rs2081545,rs867611,rs10792832,rs3851179,rs17125924,rs17125944,rs10498633,rs12881735,rs12590654,rs442495,rs59735493,rs113260531,rs28394864,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358
0,002_S_0295,84.742466,2006-04-18,M,1485405.375,CN,1873.124153,1586.249283,302.695176,352.265466,1062.069832,1159.101038,20657.100036,3254.764153,3118.709527,52564.546547,52086.773326,14018.899912,14294.173225,3600.701497,3368.670352,750.93716,587.460672,16514.289409,23626.044006,1544.061802,1339.452519,4182.888734,4105.896581,7365.93417,8007.18388,4747.146291,4789.333772,4638.513528,2017.616275,2812.850291,352.265466,380.742015,107813.271569,102646.359842,26682.526999,30497.329962,51500.367341,52266.07012,63530.127527,63595.518123,426.093557,543.163817,3778.943604,3226.287604,2316.092703,2299.21771,11240.854293,4464.490169,4446.56049,3624.959299,3474.139054,1558.82742,741.444977,9202.144277,10142.925102,3385.545344,3549.021833,2759.061253,2699.998779,5983.239482,4772.45878,2428.944214,2103.045924,954.491756,976.640183,2716.873772,2089.334993,8379.488399,7090.660857,1878.397588,1371.09313,6936.676551,6667.73136,11534.057285,11793.510293,9326.597346,7320.582628,2008.124092,1507.147756,3968.787268,4167.068429,1769.764825,1258.241619,14127.532675,13099.212828,4775.622841,5759.645833,3586.990566,3536.365589,942.890199,769.921527,2916.209619,1978.592855,4195.544978,4115.388764,11762.924369,11668.002537,3131.365772,4021.521619,3495.232795,5277.653864,2280.233344,2930.975237,1168.593222,1541.952428,3406.639085,3815.85765,11444.408888,11739.721255,3415.076581,3348.631299,2000.741283,2098.827176,1443.866535,1758.163268,8868.863178,8873.081926,2589.256642,2578.709772,1332.06971,1373.202504,10701.909224,10142.925102,1171.757283,1339.452519,1123.24168,1319.413466,9906.675209,12091.986721,3378.162535,3411.91252,6568.59078,7252.027971,2965.779909,2489.061375,9220.073956,7986.090139,4336.873039,4313.669925,8816.128827,7512.535666,2559.725405,2955.233039,994.569863,1110.585435,0,0,0,0,1,1,1,0,0,1,0,0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,0,1
1,002_S_0413,76.283562,2006-05-02,F,1364116.0,CN,2131.516933,1505.034469,384.959763,340.663023,988.239172,1051.520229,18405.295394,3021.670466,3151.396633,45240.682248,43280.024169,12993.710346,13640.231811,3586.981241,3350.731962,475.662611,761.482051,16498.426213,13491.521327,1096.871653,1092.652915,2941.514461,3243.154165,6479.980225,6874.432146,4181.823176,4367.447609,3893.894367,1782.416435,2235.930676,503.084402,371.248867,103124.919662,101385.745282,29939.322695,27712.884177,48141.064022,54393.232442,54684.325304,53309.017001,504.139086,497.810981,3042.764152,2457.414376,2021.829767,1968.040869,11103.716114,2839.210085,3769.441622,3323.310171,3544.79387,1840.424071,1424.878464,9169.425142,8470.169464,2715.812024,3414.013019,2874.014667,2941.514461,5370.452361,6264.824631,2298.157049,2118.860721,1387.964514,967.145486,2358.274053,2307.649208,7267.829383,8130.561125,2144.173144,1907.923865,6878.650883,7117.009531,12912.499657,11882.073114,8530.286468,8347.826087,2611.398281,2317.141366,4837.836799,4199.752808,1187.574501,1671.674586,15983.740284,14967.024637,4672.251366,4851.547694,4239.830811,4672.251366,1018.825016,606.443462,3551.121975,3038.545415,6675.096817,4862.094537,12907.226235,11194.418962,3109.209261,3177.76374,3142.959158,4005.690901,2556.554698,2594.523332,1206.558818,1193.902606,3702.996512,3976.159741,10143.953418,11154.34096,3077.568733,3113.427999,1596.792002,1522.964102,1555.659315,1731.79159,9037.589607,8741.223324,2840.26477,2546.007855,1505.034469,1198.121344,12067.697547,11828.284215,902.809745,1252.964926,948.161169,971.364223,10644.928452,12009.689912,3580.653135,4822.016534,8888.879123,7140.212585,4647.993628,3295.888379,9925.633772,11152.231591,7421.813288,6059.161196,6676.151501,7780.405944,2469.015903,2239.094729,1097.926337,744.607103,1,0,0,0,0,0,0,1,0,0,0,0,0,0,2,2,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0
2,002_S_0559,79.223288,2006-05-23,M,1570479.625,CN,2366.71768,3157.732947,512.577893,552.656,1172.81197,1141.171359,23368.700372,3041.717375,3198.865741,49984.782089,49158.96215,17229.36721,17536.281134,4728.161925,4206.091848,764.648092,640.195023,16674.601836,16115.617714,1304.647848,1335.233771,4119.607512,4195.544978,7287.88733,7510.426292,5413.70849,5620.427146,3802.146719,1700.155481,2524.920733,563.20287,591.67942,115548.346197,110692.567142,30553.228374,27892.253015,54122.319281,61104.347374,63260.127649,63511.143161,543.163817,640.195023,3323.31881,3096.5611,2498.553558,2377.26455,12633.041163,3755.740489,4386.443329,3620.740551,3068.08455,2433.162962,2104.100611,11164.916827,10359.135942,3451.990627,2741.131573,3804.256093,3905.506047,4038.396612,3980.388825,2408.905161,2620.897253,1344.725955,1242.421313,3342.303176,2947.85023,8735.972613,10286.362537,1908.983512,2000.741283,5694.255238,4487.693284,12286.049133,15746.477256,9619.800338,9030.230292,2045.038138,2290.780214,3946.638841,4054.216917,1891.053833,1156.991664,16048.117744,14235.110752,4362.185528,5427.419421,4695.466627,4353.748032,653.905954,1110.585435,1854.139787,2276.014596,6314.411208,5783.903635,13022.220675,12608.783362,3445.662505,2907.772123,5668.94275,4525.662016,2641.990993,2326.639573,1201.288519,1131.679176,3535.310902,3904.45136,10796.831056,11263.00272,3952.966963,4306.287116,1615.780519,1374.257191,1595.741466,1631.600825,8051.480735,12965.267576,3138.748581,3280.076642,1138.007298,1420.66342,11015.15127,12643.588034,1885.780397,1861.522596,932.343328,1025.155787,11981.244583,11908.471179,4343.201161,5510.739696,7549.449712,5636.247452,2599.803512,2494.33481,7430.270078,6746.832887,7983.980765,7081.168674,10475.151514,11083.705926,3069.139237,2872.967451,1051.522962,1274.061924,0,0,0,0,1,0,1,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1,0,0,2,0,0,0,0,1,0,0,0,0
3,002_S_0619,77.447945,2006-06-01,M,1859348.25,Dementia,5124.734093,2981.605944,280.547287,356.484899,876.4466,908.087272,21112.765397,2883.519862,2848.715123,56650.512928,55319.495347,13810.098417,14307.911649,3485.747309,3231.567248,1810.901098,1584.142953,47125.61612,60669.932896,1393.244235,1401.681747,3791.607134,3956.138626,7059.03381,7534.698571,5230.202998,5155.320075,4848.405562,2089.339007,3279.028256,557.930507,343.82863,105178.865483,105828.553938,30496.333873,30318.091423,52001.443596,58340.124785,64032.281587,63620.952857,546.328928,608.555582,3098.676428,2852.933879,2209.573559,2206.409492,12681.581134,2698.949278,4427.584631,3850.669721,3989.888675,2088.284318,2110.432788,10630.210932,9738.998685,2335.081556,3525.825493,3081.801403,3971.958961,4032.076237,5876.727385,1759.221335,1517.697543,1132.736039,1098.98599,2337.190934,1737.072865,9832.866011,8968.02099,2304.495574,2317.151842,8209.699563,8048.332138,12752.2453,13784.78588,9587.123462,11270.407185,3097.621739,2240.159542,4577.350476,4796.725798,1399.572369,1617.893002,17917.057577,16677.797943,6813.291261,7985.050795,5135.280983,4916.96035,831.094971,594.844624,2534.417786,2599.808508,5705.867759,4306.29539,11084.781913,13730.996738,5032.976145,4884.264989,4352.701708,5000.280785,2023.948286,2122.034368,1509.26003,1475.509981,5866.180495,5736.453741,11578.376388,11748.181325,3241.05945,2811.801007,1759.221335,2247.542365,2254.925188,2166.331308,9747.436198,10617.554664,2975.277809,3174.61404,1750.783823,1521.916299,12143.689718,12557.127826,1405.900503,1814.065166,1921.643449,1717.033773,12618.299791,13427.246292,3545.864585,3870.708812,7061.143188,8703.294038,3338.090842,2569.222525,10113.413298,11186.032061,7032.666584,6352.392146,7272.080998,7162.393337,2489.066157,2506.995871,1155.939198,1574.650751,0,1,1,1,0,0,0,2,1,1,0,0,0,0,1,1,1,1,1,0,1,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,2
4,002_S_0685,89.561644,2006-07-06,F,1372862.125,CN,2941.520445,1693.826402,278.437217,328.007479,910.194387,966.092767,19718.417441,3275.856043,3304.332576,48017.763654,46338.702862,15370.999989,15619.905986,3663.980648,3558.512005,850.07726,378.632427,19563.378537,20767.830436,1211.834705,1276.170577,3940.308492,3830.621103,6907.141411,7299.484762,4610.034373,4816.752913,4007.808423,1973.318305,3543.746395,275.273157,261.562234,100127.710645,97369.705638,28625.244318,26733.136868,50063.855322,54914.358201,58502.401426,55236.037561,503.085426,562.147866,3130.309316,2807.575269,2454.255316,2260.193013,10907.567029,2037.654177,3700.894673,3134.528061,3218.902976,1961.716754,1551.443734,6058.118837,6199.446819,3072.301562,3788.433646,2181.091531,2566.052077,4183.941056,4580.503153,1784.529435,1827.771578,930.233429,866.952243,2598.747356,1822.498146,5713.236376,5468.549125,1692.771716,1665.349868,6355.54041,6073.939134,10347.528536,10602.762652,7501.984556,7832.101408,2016.560449,1691.717029,3852.769518,4266.206598,996.678674,1531.404692,13870.181203,12898.815004,3552.183887,3546.910454,3434.059007,3966.675652,834.256964,632.811856,2536.520857,2079.841634,6290.149851,3814.800807,10999.324748,9911.943042,3577.496361,3192.535815,3371.832507,3824.292985,1453.357897,1714.92013,1076.834842,984.022436,3021.676614,3154.567104,7552.609505,7804.679561,3350.738779,3221.012348,1459.686015,1302.537737,855.350692,1380.584533,7132.844307,8861.475361,2633.552009,2252.810208,986.131809,1033.592699,9263.310889,9459.482565,714.022711,719.296143,1048.358309,1097.928571,10256.825504,9957.294558,3674.527512,3801.089883,7668.625012,5915.73617,2531.247425,2325.583572,7989.249685,8070.46054,4039.449016,3574.332301,6934.563258,6778.469667,2066.130711,2608.239534,434.530808,710.858652,0,0,0,0,0,0,1,1,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,0,0,1,2,2,0,0,0,0,0,2,0,0,2,1,1,1,0,0,0,0,0,0


In [11]:
data.describe()

Unnamed: 0,Age,DLICV_baseline,MUSE_Volume_4,MUSE_Volume_11,MUSE_Volume_23,MUSE_Volume_30,MUSE_Volume_31,MUSE_Volume_32,MUSE_Volume_35,MUSE_Volume_36,MUSE_Volume_37,MUSE_Volume_38,MUSE_Volume_39,MUSE_Volume_40,MUSE_Volume_41,MUSE_Volume_47,MUSE_Volume_48,MUSE_Volume_49,MUSE_Volume_50,MUSE_Volume_51,MUSE_Volume_52,MUSE_Volume_55,MUSE_Volume_56,MUSE_Volume_57,MUSE_Volume_58,MUSE_Volume_59,MUSE_Volume_60,MUSE_Volume_61,MUSE_Volume_62,MUSE_Volume_71,MUSE_Volume_72,MUSE_Volume_73,MUSE_Volume_75,MUSE_Volume_76,MUSE_Volume_81,MUSE_Volume_82,MUSE_Volume_83,MUSE_Volume_84,MUSE_Volume_85,MUSE_Volume_86,MUSE_Volume_87,MUSE_Volume_88,MUSE_Volume_89,MUSE_Volume_90,MUSE_Volume_91,MUSE_Volume_92,MUSE_Volume_93,MUSE_Volume_94,MUSE_Volume_95,MUSE_Volume_100,MUSE_Volume_101,MUSE_Volume_102,MUSE_Volume_103,MUSE_Volume_104,MUSE_Volume_105,MUSE_Volume_106,MUSE_Volume_107,MUSE_Volume_108,MUSE_Volume_109,MUSE_Volume_112,MUSE_Volume_113,MUSE_Volume_114,MUSE_Volume_115,MUSE_Volume_116,MUSE_Volume_117,MUSE_Volume_118,MUSE_Volume_119,MUSE_Volume_120,MUSE_Volume_121,MUSE_Volume_122,MUSE_Volume_123,MUSE_Volume_124,MUSE_Volume_125,MUSE_Volume_128,MUSE_Volume_129,MUSE_Volume_132,MUSE_Volume_133,MUSE_Volume_134,MUSE_Volume_135,MUSE_Volume_136,MUSE_Volume_137,MUSE_Volume_138,MUSE_Volume_139,MUSE_Volume_140,MUSE_Volume_141,MUSE_Volume_142,MUSE_Volume_143,MUSE_Volume_144,MUSE_Volume_145,MUSE_Volume_146,MUSE_Volume_147,MUSE_Volume_148,MUSE_Volume_149,MUSE_Volume_150,MUSE_Volume_151,MUSE_Volume_152,MUSE_Volume_153,MUSE_Volume_154,MUSE_Volume_155,MUSE_Volume_156,MUSE_Volume_157,MUSE_Volume_160,MUSE_Volume_161,MUSE_Volume_162,MUSE_Volume_163,MUSE_Volume_164,MUSE_Volume_165,MUSE_Volume_166,MUSE_Volume_167,MUSE_Volume_168,MUSE_Volume_169,MUSE_Volume_170,MUSE_Volume_171,MUSE_Volume_172,MUSE_Volume_173,MUSE_Volume_174,MUSE_Volume_175,MUSE_Volume_176,MUSE_Volume_177,MUSE_Volume_178,MUSE_Volume_179,MUSE_Volume_180,MUSE_Volume_181,MUSE_Volume_182,MUSE_Volume_183,MUSE_Volume_184,MUSE_Volume_185,MUSE_Volume_186,MUSE_Volume_187,MUSE_Volume_190,MUSE_Volume_191,MUSE_Volume_192,MUSE_Volume_193,MUSE_Volume_194,MUSE_Volume_195,MUSE_Volume_196,MUSE_Volume_197,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207,rs4575098,rs6656401,rs2093760,rs4844610,rs4663105,rs6733839,rs10933431,rs35349669,rs6448453,rs190982,rs9271058,rs9473117,rs9381563,rs10948363,rs2718058,rs4723711,rs1859788,rs1476679,rs12539172,rs10808026,rs7810606,rs11771145,rs28834970,rs73223431,rs4236673,rs9331896,rs11257238,rs7920721,rs3740688,rs10838725,rs983392,rs7933202,rs2081545,rs867611,rs10792832,rs3851179,rs17125924,rs17125944,rs10498633,rs12881735,rs12590654,rs442495,rs59735493,rs113260531,rs28394864,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358
count,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0
mean,73.81393,1428763.0,1760.170636,1747.376593,354.51935,390.800399,934.421522,982.750502,20204.546261,3176.595664,3073.943628,48502.767251,47544.487474,13477.859096,13760.769017,3558.694047,3263.864868,789.089259,745.75441,20072.757205,22245.285413,1357.105899,1375.6513,3946.335931,3985.483514,6942.422294,7132.958883,4462.614985,4633.40334,4207.894972,2122.038855,2863.019536,342.218076,316.213274,93517.13819,91003.370739,23704.286179,23757.460942,45064.038634,47594.723132,54939.594856,54196.217571,468.51749,513.857501,3058.296978,2746.387878,2103.066347,2056.08248,11296.94778,3133.491107,4067.248729,3657.448684,3649.463509,1536.637447,1425.383938,7690.903778,7393.917601,2897.849903,2993.546567,3002.885283,3149.397027,4497.613369,4400.981967,2129.288967,2022.024455,1410.440053,1316.011305,2765.047791,2371.482681,7484.910847,7112.47125,1877.960776,1831.652737,6845.064304,6559.150427,11411.27218,11106.140979,8425.185999,7683.311672,2169.168064,1885.165281,4192.520679,3978.033455,1482.773103,1545.337928,15593.306724,15546.883475,4457.669816,4919.097513,3933.589762,4376.999607,807.345012,568.688612,2528.324933,2388.297442,5984.481527,4675.459143,12776.632068,12275.818579,3140.713031,3230.752465,3871.49267,3879.486379,2489.984487,2312.472013,1227.131184,1482.934727,3590.328637,3894.922829,9753.021226,9578.031103,3148.077039,3262.949261,1901.655197,1730.543287,1759.374385,1948.75505,8385.291181,9363.44434,2932.532682,2754.70679,1340.007647,1443.574321,11016.685765,11075.781712,1325.970363,1589.365488,1426.429219,1501.694075,11751.565902,12097.737733,4034.722059,4402.293778,7521.862398,7315.938942,3409.253657,2976.968274,8704.286336,8605.153094,5984.63252,6052.626847,8006.932423,7733.883457,2697.332343,2807.495753,1171.346482,1299.51021,0.454371,0.377154,0.394384,0.376516,0.848756,0.789407,0.441608,0.938098,0.587109,0.782387,0.53478,0.577537,0.765795,0.572431,0.751755,0.713465,0.632419,0.555839,0.575622,0.364391,0.959796,0.68411,0.733886,0.736439,0.768347,0.776643,0.797064,0.824505,0.88896,0.643906,0.790045,0.758137,0.775367,0.609445,0.703255,0.703255,0.1806,0.181238,0.444161,0.449266,0.664965,0.649011,0.57626,0.264837,0.950223,0.343331,0.355456,0.360562,0.167837,0.590938,0.176133,0.181876,0.178685,0.569241
std,7.149737,143870.6,777.161321,579.871305,65.326396,70.733256,162.643803,167.684822,2243.884699,489.53855,441.699929,5120.497417,5020.592049,1596.151189,1612.511285,558.676487,518.260944,659.717463,628.743527,10932.51682,12187.913957,167.722216,169.702897,513.177597,512.382842,698.46162,699.304528,468.937838,491.600299,575.009714,298.281983,412.232589,113.936015,117.329251,12341.786604,11942.339011,3414.91706,3624.056625,5779.393875,6056.089689,6784.429505,6889.202109,72.482932,75.253639,397.740369,361.328293,287.241275,270.482462,1723.699442,723.395345,817.928383,565.9253,574.682677,303.696144,289.250441,1482.041218,1552.176925,603.259404,633.180136,569.18226,576.179401,823.90223,823.070353,426.024133,425.138293,297.174011,301.909503,491.996451,452.719165,1228.9528,1132.737766,297.079053,310.391373,1155.142359,1084.372462,1730.931697,1668.070613,1254.502,1158.825576,405.91545,399.729942,647.64952,622.18836,371.07867,354.417481,2367.168831,2409.018703,857.899488,945.782745,554.783199,607.243284,212.361112,186.858918,560.284508,517.459049,1112.149648,792.038558,2083.562888,2082.830997,611.571923,714.194427,702.847957,677.818067,616.539627,592.915181,341.850572,379.27755,607.495133,626.082804,1700.707796,1655.518129,505.260849,524.021379,348.819524,333.493714,431.702909,454.178137,1244.329236,1343.44604,478.38745,442.05741,241.350931,270.463025,1610.663249,1595.176639,331.069016,396.432575,304.985202,318.9956,1652.725931,1711.69288,768.647817,783.578504,1193.36644,1234.212899,658.471423,649.234262,1319.72009,1371.76467,1007.989909,1100.527491,1359.762514,1366.511667,550.951148,589.229934,311.89336,380.021734,0.595045,0.546732,0.559528,0.545418,0.707177,0.699619,0.589612,0.722294,0.637213,0.668959,0.614768,0.636446,0.683245,0.637048,0.689472,0.677284,0.657151,0.639716,0.64564,0.550707,0.703243,0.679303,0.701226,0.704007,0.680372,0.679862,0.692722,0.695148,0.704247,0.64688,0.701178,0.692661,0.6989,0.646723,0.671921,0.671921,0.407379,0.407879,0.586602,0.588153,0.666239,0.674738,0.632575,0.486794,0.711212,0.533242,0.546094,0.550909,0.380612,0.642667,0.406985,0.411493,0.40744,0.66678
min,54.273973,1057343.0,279.599289,649.198349,188.787377,219.599442,428.399974,465.599972,14147.216048,1785.595459,1909.195145,33671.25,32319.375,8848.799473,9111.599457,1739.177918,1655.858626,34.799911,2.475464,2818.799832,3604.799785,913.199946,903.866268,2608.241008,2521.249264,4339.488159,4380.115207,3168.75,3248.4375,2570.267046,1265.625,1640.625,2.4,0.0,63106.796239,61430.396338,15850.759689,14431.253177,29985.714386,30093.598206,38335.102509,36892.797801,151.199991,224.648336,1925.625,1840.795319,1420.799915,1371.599918,7014.73687,1244.528155,1628.848322,1678.364502,1702.264855,639.140625,542.398621,3305.625,2831.999831,1336.875,1305.708143,1291.988975,1553.550824,2206.875,2169.375,933.75,836.39995,575.545349,521.999969,1183.35884,1076.835451,3627.599784,3786.326413,958.710504,820.799951,3543.599789,2583.988172,6137.999634,5839.18515,4623.738503,4325.262682,612.677307,616.390503,1891.875,2371.989775,0.0,1.199997,8025.0,7935.46875,2213.792315,2062.5,1734.960153,1858.358535,0.0,9.375,1005.0,924.960519,2716.873772,2355.116123,6550.657399,5364.132137,1314.137357,1425.939595,1412.225924,1730.625,750.0,870.116794,0.0,0.0,1953.75,2213.788062,4640.625,4296.149335,1775.625,1678.125,881.718351,741.443469,614.883716,681.328125,5220.731172,5583.385091,1638.983634,1535.999908,778.358583,747.590088,6710.973528,4726.052551,542.108272,556.875,586.405985,454.570107,5368.356948,5285.036673,1730.737884,1789.752791,3833.787329,3688.79978,1214.999451,1129.567504,4633.125,4793.573109,2890.799828,1906.875,3224.291687,2693.670657,1095.0,1081.875,416.399975,423.984375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,69.256164,1323742.0,1181.114188,1327.802492,310.077739,345.937011,832.798916,869.998868,18651.099886,2827.918127,2771.884766,44934.739444,44147.342547,12397.848759,12617.234438,3179.376055,2916.101102,344.69999,304.802856,12306.509566,13577.359251,1239.599926,1263.662114,3611.630461,3638.534576,6454.217291,6641.024802,4119.385503,4283.924872,3805.263448,1919.625004,2571.599847,263.999984,245.999985,84630.9375,82494.990204,21172.5,21082.753998,40779.107855,43278.068912,50009.02351,49106.90411,418.799975,461.999972,2770.661557,2484.609375,1901.024943,1860.937445,10027.642013,2641.875,3493.762396,3290.170815,3268.640585,1334.69996,1232.137463,6715.773102,6368.15354,2472.186522,2551.151324,2600.130998,2757.014426,3922.372172,3845.017759,1853.904695,1736.390272,1204.199928,1098.75,2417.665795,2071.307043,6642.778318,6368.88731,1681.53745,1633.984265,6028.269592,5844.199219,10249.312195,9961.868487,7510.836873,6909.224789,1885.421254,1607.579947,3758.913473,3551.138833,1229.477187,1299.609226,13970.080285,13939.192639,3887.253059,4273.610414,3566.399787,3959.133244,659.235279,438.599974,2130.599936,2016.350886,5236.821749,4137.862377,11371.86343,10906.204601,2722.799838,2726.243596,3410.626116,3401.853076,2067.599877,1867.676626,1015.955768,1231.3716,3167.684233,3449.0625,8527.465834,8446.789008,2804.411842,2906.48996,1657.949951,1499.882768,1450.3125,1632.128168,7520.399552,8459.999748,2606.12913,2449.34042,1166.481277,1250.625,9923.087024,9985.259549,1077.888858,1307.897461,1223.439539,1279.812108,10634.477204,10936.416527,3478.390076,3865.19977,6660.071564,6456.909543,2942.491137,2515.773302,7766.25,7652.283426,5342.899782,5347.799681,7160.454479,6845.999592,2324.401478,2417.867951,952.909455,1022.999939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,73.967123,1423724.0,1636.88129,1643.708008,352.265346,388.125,939.599944,984.375,20012.663645,3114.120348,3028.811855,48298.797121,47384.397176,13397.999201,13660.847351,3592.5,3281.25,588.515625,554.399967,17745.073062,19592.398832,1347.890016,1369.196518,3911.999767,3959.296875,6903.599589,7123.199575,4443.457642,4595.27136,4185.589355,2103.044736,2839.199831,325.199981,301.640625,92283.75,89994.303797,23521.198598,23578.125,44751.435791,47198.397187,54347.047472,53805.596793,466.875,515.741663,3028.799819,2724.25227,2070.350626,2029.207741,11217.564514,3086.009346,4004.399761,3635.509867,3631.199784,1520.859375,1414.799916,7666.511597,7368.043544,2853.59983,2941.199825,2966.843445,3127.148438,4466.599543,4348.125,2130.0,2047.148438,1395.0,1293.040883,2743.125,2330.625,7481.947205,7107.517372,1852.79989,1799.295451,6791.178916,6510.469971,11388.510476,11119.199337,8364.375,7625.390625,2149.940369,1867.5,4138.125,3930.0,1468.124336,1507.19991,15485.264221,15430.79908,4424.412062,4885.315261,3907.617188,4357.5,788.435242,539.999968,2487.599852,2336.25,5886.20495,4621.199725,12758.486704,12263.999269,3087.599816,3166.799811,3845.990219,3832.799772,2433.75,2260.80009,1217.999927,1466.399913,3519.505713,3856.790192,9723.119004,9524.878506,3133.199813,3251.600092,1897.199887,1713.75,1741.875,1911.092886,8335.191544,9253.199448,2923.125,2725.199838,1320.0,1420.084746,10848.75,10990.93819,1285.664062,1545.599908,1408.799916,1486.516052,11699.042236,12007.169464,3976.875,4348.125,7456.875,7265.738903,3368.412849,2945.742188,8654.739213,8546.399491,5963.984833,6040.190534,8018.027466,7719.375,2645.033142,2764.799835,1147.961323,1271.999924,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,78.953425,1527860.0,2230.3125,2039.2882,394.799976,432.211664,1041.29025,1093.455436,21697.919751,3441.866151,3311.257966,51966.086067,50885.2862,14513.078394,14822.02137,3952.361832,3624.328481,1012.194883,991.902871,25057.945899,28160.675369,1466.889835,1479.54961,4235.159212,4254.375,7420.799558,7610.524435,4771.228581,4964.931701,4575.417051,2317.796983,3115.195946,408.30652,387.419498,101146.121986,98605.696952,25918.536728,26148.949825,48869.720837,51710.178185,59564.979757,58827.780823,514.38126,561.599967,3319.099874,2984.585846,2278.125,2227.207031,12391.783877,3580.79534,4579.366209,4014.142667,4008.431435,1719.597762,1595.810471,8641.84436,8372.779884,3253.457584,3378.037399,3373.263371,3512.634932,5001.449851,4905.169682,2403.086004,2318.271912,1597.596946,1521.5625,3064.259634,2624.76356,8302.039444,7852.60555,2045.970886,1997.9974,7614.941955,7235.803299,12555.575993,12203.805615,9193.703663,8375.859375,2421.553878,2134.359355,4631.71875,4360.973738,1719.898091,1775.520782,17051.432014,17059.97115,4975.219593,5489.141792,4279.922117,4772.474858,932.899191,661.244175,2870.996263,2705.036027,6636.656195,5177.130022,14087.321027,13635.595335,3527.92889,3685.738953,4301.516018,4318.623229,2861.63623,2693.996493,1445.398118,1707.599898,3967.995534,4307.070773,10880.420428,10700.345398,3482.399792,3602.144371,2122.344465,1952.997458,2028.97887,2236.195868,9180.429108,10228.144512,3216.997614,3024.077057,1488.577453,1601.810524,12070.905419,12104.699639,1540.886169,1842.267186,1616.442036,1707.603206,12812.999236,13225.724606,4518.854645,4896.268424,8302.498123,8114.635061,3845.830536,3380.995599,9571.799429,9463.5241,6612.589433,6742.799598,8845.946033,8618.514329,3033.680969,3169.199811,1358.399919,1554.902359,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
max,91.306849,1905572.0,5464.820847,4804.855004,621.341431,661.875,1616.396692,1691.996537,31839.946542,5561.985855,5312.38649,67704.578765,66360.907497,23236.864494,23956.161044,5383.188984,5128.789504,5745.935552,4957.5,70383.340608,84133.438525,1959.599883,2030.625,7181.981735,6802.799595,9519.585703,9660.913477,6308.083086,6631.872002,6821.744966,3135.0,4216.637289,974.095032,756.254211,141230.625,136239.591879,37100.725414,36256.875,66583.453995,75606.288414,82662.315331,84714.571074,779.767814,787.19751,4855.779055,4260.935574,3480.473864,3112.387386,18686.039001,5829.599653,7468.784716,5692.143934,5672.104887,2945.802002,2601.599845,13671.921789,12558.75,5812.822174,5480.164302,5255.986633,5609.999666,7508.399552,7674.029274,3439.657043,3332.802329,2581.197795,2399.999857,4903.893921,5514.095764,11909.406196,10935.788185,3954.553528,6711.181083,10529.999372,11524.541655,18003.598927,16764.216126,13418.3992,11733.75,3612.304688,3490.540202,6809.999594,7790.384058,2794.802887,3046.058289,26441.25,26130.0,7886.39953,8400.0,6853.300212,7898.37825,1577.999906,1653.674365,5092.799696,4209.375,12530.374357,8068.351167,20258.387125,19915.61454,6283.199625,5856.702194,6285.599625,6643.194324,5121.599695,4709.570007,2489.999852,2900.00592,6249.599627,6295.187117,15816.976379,14477.75095,5035.093506,5474.388797,3553.199788,2864.099557,4300.799744,4163.904367,13021.875,14338.799145,5376.1226,4664.399722,2519.645874,2568.282878,19388.398844,17305.198969,2951.999824,3166.118286,2754.84157,2673.599841,18711.561708,18617.297456,7352.399562,6998.900712,12270.0,12236.399271,5843.985138,5578.434185,13815.599177,13048.125,9442.5,11927.972096,16396.234924,16002.636169,4601.887329,6470.399614,2335.600159,2987.922258,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [12]:
categorical_features = [column_name for column_name in data.columns if data[column_name].dtype == 'O']
print("Number of Categorical Features: {}".format(len(categorical_features)))
print("Categorical Features: ",categorical_features)

Number of Categorical Features: 4
Categorical Features:  ['PTID', 'Date', 'Sex', 'diagnosis']


In [13]:
numerical_features = [column_name for column_name in data.columns if data[column_name].dtype != 'O']
print("Number of Numerical Features: {}".format(len(numerical_features)))
print("Numerical Features: ",numerical_features)

Number of Numerical Features: 201
Numerical Features:  ['Age', 'DLICV_baseline', 'MUSE_Volume_4', 'MUSE_Volume_11', 'MUSE_Volume_23', 'MUSE_Volume_30', 'MUSE_Volume_31', 'MUSE_Volume_32', 'MUSE_Volume_35', 'MUSE_Volume_36', 'MUSE_Volume_37', 'MUSE_Volume_38', 'MUSE_Volume_39', 'MUSE_Volume_40', 'MUSE_Volume_41', 'MUSE_Volume_47', 'MUSE_Volume_48', 'MUSE_Volume_49', 'MUSE_Volume_50', 'MUSE_Volume_51', 'MUSE_Volume_52', 'MUSE_Volume_55', 'MUSE_Volume_56', 'MUSE_Volume_57', 'MUSE_Volume_58', 'MUSE_Volume_59', 'MUSE_Volume_60', 'MUSE_Volume_61', 'MUSE_Volume_62', 'MUSE_Volume_71', 'MUSE_Volume_72', 'MUSE_Volume_73', 'MUSE_Volume_75', 'MUSE_Volume_76', 'MUSE_Volume_81', 'MUSE_Volume_82', 'MUSE_Volume_83', 'MUSE_Volume_84', 'MUSE_Volume_85', 'MUSE_Volume_86', 'MUSE_Volume_87', 'MUSE_Volume_88', 'MUSE_Volume_89', 'MUSE_Volume_90', 'MUSE_Volume_91', 'MUSE_Volume_92', 'MUSE_Volume_93', 'MUSE_Volume_94', 'MUSE_Volume_95', 'MUSE_Volume_100', 'MUSE_Volume_101', 'MUSE_Volume_102', 'MUSE_Volume_103'

Turning categorical data to numerical.

In [14]:
data['diagnosis'] = data['diagnosis'].map({'CN': 0, 'MCI': 1, 'Dementia': 2}) #multi-class or binary?

In [15]:
data['Sex'] = data['Sex'].map({'M': 0, 'F': 1})

In [16]:
data_new= data.drop(['Date'], axis=1)
data=data_new

#### Data Preparation

In [17]:
data_controls_age = data.loc[(data['diagnosis'] == 0)]
data_controls_age_60 = data_controls_age.loc[(data_controls_age['Age'] > 60)]
data_controls = data_controls_age_60.loc[(data_controls_age_60['Age'] < 86)]
print(data_controls.shape) #controls ages 60-86

data_age_60 = data.loc[(data['Age'] > 60)]
data_age_filtered = data_age_60.loc[(data_age_60['Age'] < 86)]
print(data_age_filtered.shape) #ages 60-86

(449, 204)
(1463, 204)


In [18]:
roi_features = numerical_features[2:147]
snip_features = numerical_features[147:]
first_features = ['PTID','Sex','Age','DLICV_baseline','diagnosis']
all_but_roi_features = ['PTID','Sex','Age','DLICV_baseline','diagnosis']
all_but_roi_features.extend(snip_features)

In [19]:
print(all_but_roi_features)

['PTID', 'Sex', 'Age', 'DLICV_baseline', 'diagnosis', 'rs4575098', 'rs6656401', 'rs2093760', 'rs4844610', 'rs4663105', 'rs6733839', 'rs10933431', 'rs35349669', 'rs6448453', 'rs190982', 'rs9271058', 'rs9473117', 'rs9381563', 'rs10948363', 'rs2718058', 'rs4723711', 'rs1859788', 'rs1476679', 'rs12539172', 'rs10808026', 'rs7810606', 'rs11771145', 'rs28834970', 'rs73223431', 'rs4236673', 'rs9331896', 'rs11257238', 'rs7920721', 'rs3740688', 'rs10838725', 'rs983392', 'rs7933202', 'rs2081545', 'rs867611', 'rs10792832', 'rs3851179', 'rs17125924', 'rs17125944', 'rs10498633', 'rs12881735', 'rs12590654', 'rs442495', 'rs59735493', 'rs113260531', 'rs28394864', 'rs111278892', 'rs3752246', 'rs4147929', 'rs41289512', 'rs3865444', 'rs6024870', 'rs6014724', 'rs7274581', 'rs429358']


In [20]:
roi_mapping = pd.read_csv(path + 'ROIs.csv')

In [21]:
roi = roi_mapping.values.tolist()
roi_names = []
for i in roi:
  roi_names.append(i[0])

In [22]:
roi_names.remove('42,Right Cerebral Exterior')
roi_names.remove('43,Left Cerebral Exterior')
roi_names.remove('44,Cerebral')
roi_names.remove('45,Cerebral')
roi_names.remove('46,CSF')
roi_names.remove('63,Right vessel')
roi_names.remove('64,Left vessel')
roi_names.remove('69,Optic Chiasm')

In [23]:
ch = ','
roi_name=[]
# Remove all characters before the character ',' from string
for roi in roi_names:
  listOfWords = roi.split(ch, 1)
  if len(listOfWords) > 0:
      strValue = listOfWords[1]
      roi_name.append(strValue)

In [24]:
import warnings
warnings.filterwarnings('ignore')

# Data preprocessing & Model

Undersampling

In [37]:
# Count the number of instances in each class
class_counts = data_age_filtered['diagnosis'].value_counts()
print("Original class counts:\n", class_counts)

# Find the count of the least represented class
min_class_count = class_counts.min()

# Perform undersampling
data_balanced = pd.concat([
    data_age_filtered[data_age_filtered['diagnosis'] == class_label].sample(min_class_count, random_state=42)
    for class_label in class_counts.index
])

# Shuffle the dataset (optional but recommended)
data_balanced = data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print("Balanced class counts:\n", data_balanced['diagnosis'].value_counts())

Original class counts:
 1    740
0    449
2    274
Name: diagnosis, dtype: int64
Balanced class counts:
 2    274
1    274
0    274
Name: diagnosis, dtype: int64


In [38]:
data_age_filtered

Unnamed: 0,PTID,Age,Sex,DLICV_baseline,diagnosis,MUSE_Volume_4,MUSE_Volume_11,MUSE_Volume_23,MUSE_Volume_30,MUSE_Volume_31,MUSE_Volume_32,MUSE_Volume_35,MUSE_Volume_36,MUSE_Volume_37,MUSE_Volume_38,MUSE_Volume_39,MUSE_Volume_40,MUSE_Volume_41,MUSE_Volume_47,MUSE_Volume_48,MUSE_Volume_49,MUSE_Volume_50,MUSE_Volume_51,MUSE_Volume_52,MUSE_Volume_55,MUSE_Volume_56,MUSE_Volume_57,MUSE_Volume_58,MUSE_Volume_59,MUSE_Volume_60,MUSE_Volume_61,MUSE_Volume_62,MUSE_Volume_71,MUSE_Volume_72,MUSE_Volume_73,MUSE_Volume_75,MUSE_Volume_76,MUSE_Volume_81,MUSE_Volume_82,MUSE_Volume_83,MUSE_Volume_84,MUSE_Volume_85,MUSE_Volume_86,MUSE_Volume_87,MUSE_Volume_88,MUSE_Volume_89,MUSE_Volume_90,MUSE_Volume_91,MUSE_Volume_92,MUSE_Volume_93,MUSE_Volume_94,MUSE_Volume_95,MUSE_Volume_100,MUSE_Volume_101,MUSE_Volume_102,MUSE_Volume_103,MUSE_Volume_104,MUSE_Volume_105,MUSE_Volume_106,MUSE_Volume_107,MUSE_Volume_108,MUSE_Volume_109,MUSE_Volume_112,MUSE_Volume_113,MUSE_Volume_114,MUSE_Volume_115,MUSE_Volume_116,MUSE_Volume_117,MUSE_Volume_118,MUSE_Volume_119,MUSE_Volume_120,MUSE_Volume_121,MUSE_Volume_122,MUSE_Volume_123,MUSE_Volume_124,MUSE_Volume_125,MUSE_Volume_128,MUSE_Volume_129,MUSE_Volume_132,MUSE_Volume_133,MUSE_Volume_134,MUSE_Volume_135,MUSE_Volume_136,MUSE_Volume_137,MUSE_Volume_138,MUSE_Volume_139,MUSE_Volume_140,MUSE_Volume_141,MUSE_Volume_142,MUSE_Volume_143,MUSE_Volume_144,MUSE_Volume_145,MUSE_Volume_146,MUSE_Volume_147,MUSE_Volume_148,MUSE_Volume_149,MUSE_Volume_150,MUSE_Volume_151,MUSE_Volume_152,MUSE_Volume_153,MUSE_Volume_154,MUSE_Volume_155,MUSE_Volume_156,MUSE_Volume_157,MUSE_Volume_160,MUSE_Volume_161,MUSE_Volume_162,MUSE_Volume_163,MUSE_Volume_164,MUSE_Volume_165,MUSE_Volume_166,MUSE_Volume_167,MUSE_Volume_168,MUSE_Volume_169,MUSE_Volume_170,MUSE_Volume_171,MUSE_Volume_172,MUSE_Volume_173,MUSE_Volume_174,MUSE_Volume_175,MUSE_Volume_176,MUSE_Volume_177,MUSE_Volume_178,MUSE_Volume_179,MUSE_Volume_180,MUSE_Volume_181,MUSE_Volume_182,MUSE_Volume_183,MUSE_Volume_184,MUSE_Volume_185,MUSE_Volume_186,MUSE_Volume_187,MUSE_Volume_190,MUSE_Volume_191,MUSE_Volume_192,MUSE_Volume_193,MUSE_Volume_194,MUSE_Volume_195,MUSE_Volume_196,MUSE_Volume_197,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207,rs4575098,rs6656401,rs2093760,rs4844610,rs4663105,rs6733839,rs10933431,rs35349669,rs6448453,rs190982,rs9271058,rs9473117,rs9381563,rs10948363,rs2718058,rs4723711,rs1859788,rs1476679,rs12539172,rs10808026,rs7810606,rs11771145,rs28834970,rs73223431,rs4236673,rs9331896,rs11257238,rs7920721,rs3740688,rs10838725,rs983392,rs7933202,rs2081545,rs867611,rs10792832,rs3851179,rs17125924,rs17125944,rs10498633,rs12881735,rs12590654,rs442495,rs59735493,rs113260531,rs28394864,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358
0,002_S_0295,84.742466,0,1485405.375,0,1873.124153,1586.249283,302.695176,352.265466,1062.069832,1159.101038,20657.100036,3254.764153,3118.709527,52564.546547,52086.773326,14018.899912,14294.173225,3600.701497,3368.670352,750.937160,587.460672,16514.289409,23626.044006,1544.061802,1339.452519,4182.888734,4105.896581,7365.934170,8007.183880,4747.146291,4789.333772,4638.513528,2017.616275,2812.850291,352.265466,380.742015,107813.271569,102646.359842,26682.526999,30497.329962,51500.367341,52266.070120,63530.127527,63595.518123,426.093557,543.163817,3778.943604,3226.287604,2316.092703,2299.217710,11240.854293,4464.490169,4446.560490,3624.959299,3474.139054,1558.827420,741.444977,9202.144277,10142.925102,3385.545344,3549.021833,2759.061253,2699.998779,5983.239482,4772.458780,2428.944214,2103.045924,954.491756,976.640183,2716.873772,2089.334993,8379.488399,7090.660857,1878.397588,1371.093130,6936.676551,6667.731360,11534.057285,11793.510293,9326.597346,7320.582628,2008.124092,1507.147756,3968.787268,4167.068429,1769.764825,1258.241619,14127.532675,13099.212828,4775.622841,5759.645833,3586.990566,3536.365589,942.890199,769.921527,2916.209619,1978.592855,4195.544978,4115.388764,11762.924369,11668.002537,3131.365772,4021.521619,3495.232795,5277.653864,2280.233344,2930.975237,1168.593222,1541.952428,3406.639085,3815.857650,11444.408888,11739.721255,3415.076581,3348.631299,2000.741283,2098.827176,1443.866535,1758.163268,8868.863178,8873.081926,2589.256642,2578.709772,1332.069710,1373.202504,10701.909224,10142.925102,1171.757283,1339.452519,1123.241680,1319.413466,9906.675209,12091.986721,3378.162535,3411.912520,6568.590780,7252.027971,2965.779909,2489.061375,9220.073956,7986.090139,4336.873039,4313.669925,8816.128827,7512.535666,2559.725405,2955.233039,994.569863,1110.585435,0,0,0,0,1,1,1,0,0,1,0,0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,0,1
1,002_S_0413,76.283562,1,1364116.000,0,2131.516933,1505.034469,384.959763,340.663023,988.239172,1051.520229,18405.295394,3021.670466,3151.396633,45240.682248,43280.024169,12993.710346,13640.231811,3586.981241,3350.731962,475.662611,761.482051,16498.426213,13491.521327,1096.871653,1092.652915,2941.514461,3243.154165,6479.980225,6874.432146,4181.823176,4367.447609,3893.894367,1782.416435,2235.930676,503.084402,371.248867,103124.919662,101385.745282,29939.322695,27712.884177,48141.064022,54393.232442,54684.325304,53309.017001,504.139086,497.810981,3042.764152,2457.414376,2021.829767,1968.040869,11103.716114,2839.210085,3769.441622,3323.310171,3544.793870,1840.424071,1424.878464,9169.425142,8470.169464,2715.812024,3414.013019,2874.014667,2941.514461,5370.452361,6264.824631,2298.157049,2118.860721,1387.964514,967.145486,2358.274053,2307.649208,7267.829383,8130.561125,2144.173144,1907.923865,6878.650883,7117.009531,12912.499657,11882.073114,8530.286468,8347.826087,2611.398281,2317.141366,4837.836799,4199.752808,1187.574501,1671.674586,15983.740284,14967.024637,4672.251366,4851.547694,4239.830811,4672.251366,1018.825016,606.443462,3551.121975,3038.545415,6675.096817,4862.094537,12907.226235,11194.418962,3109.209261,3177.763740,3142.959158,4005.690901,2556.554698,2594.523332,1206.558818,1193.902606,3702.996512,3976.159741,10143.953418,11154.340960,3077.568733,3113.427999,1596.792002,1522.964102,1555.659315,1731.791590,9037.589607,8741.223324,2840.264770,2546.007855,1505.034469,1198.121344,12067.697547,11828.284215,902.809745,1252.964926,948.161169,971.364223,10644.928452,12009.689912,3580.653135,4822.016534,8888.879123,7140.212585,4647.993628,3295.888379,9925.633772,11152.231591,7421.813288,6059.161196,6676.151501,7780.405944,2469.015903,2239.094729,1097.926337,744.607103,1,0,0,0,0,0,0,1,0,0,0,0,0,0,2,2,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0
2,002_S_0559,79.223288,0,1570479.625,0,2366.717680,3157.732947,512.577893,552.656000,1172.811970,1141.171359,23368.700372,3041.717375,3198.865741,49984.782089,49158.962150,17229.367210,17536.281134,4728.161925,4206.091848,764.648092,640.195023,16674.601836,16115.617714,1304.647848,1335.233771,4119.607512,4195.544978,7287.887330,7510.426292,5413.708490,5620.427146,3802.146719,1700.155481,2524.920733,563.202870,591.679420,115548.346197,110692.567142,30553.228374,27892.253015,54122.319281,61104.347374,63260.127649,63511.143161,543.163817,640.195023,3323.318810,3096.561100,2498.553558,2377.264550,12633.041163,3755.740489,4386.443329,3620.740551,3068.084550,2433.162962,2104.100611,11164.916827,10359.135942,3451.990627,2741.131573,3804.256093,3905.506047,4038.396612,3980.388825,2408.905161,2620.897253,1344.725955,1242.421313,3342.303176,2947.850230,8735.972613,10286.362537,1908.983512,2000.741283,5694.255238,4487.693284,12286.049133,15746.477256,9619.800338,9030.230292,2045.038138,2290.780214,3946.638841,4054.216917,1891.053833,1156.991664,16048.117744,14235.110752,4362.185528,5427.419421,4695.466627,4353.748032,653.905954,1110.585435,1854.139787,2276.014596,6314.411208,5783.903635,13022.220675,12608.783362,3445.662505,2907.772123,5668.942750,4525.662016,2641.990993,2326.639573,1201.288519,1131.679176,3535.310902,3904.451360,10796.831056,11263.002720,3952.966963,4306.287116,1615.780519,1374.257191,1595.741466,1631.600825,8051.480735,12965.267576,3138.748581,3280.076642,1138.007298,1420.663420,11015.151270,12643.588034,1885.780397,1861.522596,932.343328,1025.155787,11981.244583,11908.471179,4343.201161,5510.739696,7549.449712,5636.247452,2599.803512,2494.334810,7430.270078,6746.832887,7983.980765,7081.168674,10475.151514,11083.705926,3069.139237,2872.967451,1051.522962,1274.061924,0,0,0,0,1,0,1,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1,0,0,2,0,0,0,0,1,0,0,0,0
3,002_S_0619,77.447945,0,1859348.250,2,5124.734093,2981.605944,280.547287,356.484899,876.446600,908.087272,21112.765397,2883.519862,2848.715123,56650.512928,55319.495347,13810.098417,14307.911649,3485.747309,3231.567248,1810.901098,1584.142953,47125.616120,60669.932896,1393.244235,1401.681747,3791.607134,3956.138626,7059.033810,7534.698571,5230.202998,5155.320075,4848.405562,2089.339007,3279.028256,557.930507,343.828630,105178.865483,105828.553938,30496.333873,30318.091423,52001.443596,58340.124785,64032.281587,63620.952857,546.328928,608.555582,3098.676428,2852.933879,2209.573559,2206.409492,12681.581134,2698.949278,4427.584631,3850.669721,3989.888675,2088.284318,2110.432788,10630.210932,9738.998685,2335.081556,3525.825493,3081.801403,3971.958961,4032.076237,5876.727385,1759.221335,1517.697543,1132.736039,1098.985990,2337.190934,1737.072865,9832.866011,8968.020990,2304.495574,2317.151842,8209.699563,8048.332138,12752.245300,13784.785880,9587.123462,11270.407185,3097.621739,2240.159542,4577.350476,4796.725798,1399.572369,1617.893002,17917.057577,16677.797943,6813.291261,7985.050795,5135.280983,4916.960350,831.094971,594.844624,2534.417786,2599.808508,5705.867759,4306.295390,11084.781913,13730.996738,5032.976145,4884.264989,4352.701708,5000.280785,2023.948286,2122.034368,1509.260030,1475.509981,5866.180495,5736.453741,11578.376388,11748.181325,3241.059450,2811.801007,1759.221335,2247.542365,2254.925188,2166.331308,9747.436198,10617.554664,2975.277809,3174.614040,1750.783823,1521.916299,12143.689718,12557.127826,1405.900503,1814.065166,1921.643449,1717.033773,12618.299791,13427.246292,3545.864585,3870.708812,7061.143188,8703.294038,3338.090842,2569.222525,10113.413298,11186.032061,7032.666584,6352.392146,7272.080998,7162.393337,2489.066157,2506.995871,1155.939198,1574.650751,0,1,1,1,0,0,0,2,1,1,0,0,0,0,1,1,1,1,1,0,1,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,2
5,002_S_0729,65.056164,1,1166961.750,1,966.095170,1921.643449,356.484899,415.547486,761.485494,853.243441,18195.495486,3485.747309,2921.488668,44029.049070,42879.438005,11626.892084,12026.619234,2887.738618,2592.425684,392.344326,283.711354,8403.762348,9388.841921,1287.775330,1255.079969,3543.755207,3659.771003,6422.001624,6685.673886,3798.989957,4033.130926,3400.317496,1551.447592,2534.417786,268.945708,158.203357,87173.214027,83566.177477,21149.679514,18514.011579,40216.348155,41546.311047,49603.080697,48494.602506,435.586578,433.477199,2722.152437,2497.503670,2012.346707,1917.424692,9808.608162,2805.472872,3514.223914,3042.777908,3393.989362,1351.056673,769.923006,9316.068376,7022.119693,2155.784418,2744.300907,2465.862998,2690.511766,3676.646027,3777.896176,1416.447394,1371.095765,1420.666150,1110.587569,1976.487279,1818.283922,7236.221570,7037.940029,1790.862006,1707.541571,5660.516130,4246.178114,10451.968483,10478.335709,6965.166484,7380.713970,1843.596459,1515.588164,3769.458664,3579.614635,1319.416001,1700.158748,15817.171679,13515.840172,4775.632017,4807.272689,3063.871689,3464.653528,1248.751835,534.727348,2865.590148,2599.808508,5004.499541,3889.693215,10419.273122,10018.491283,2405.745722,1425.939595,4143.873276,4313.678213,2103.049965,2000.745127,988.243640,1255.079969,2139.964082,3043.832597,8480.754649,6326.024920,2684.183632,2847.660434,1367.931697,1286.720641,1517.697543,1421.720839,6882.900738,9298.138662,2307.659641,2541.800610,1081.056276,1029.376513,8876.263042,9259.115167,980.860816,1336.291026,1059.962495,1116.915704,9093.528987,11257.750917,4128.052941,4293.639121,6690.947331,7654.933123,2551.292811,2318.206531,7825.792749,7452.432825,5885.164897,6838.603798,6198.407545,5983.250979,2492.230224,2634.613246,907.032583,965.040480,0,0,0,0,1,1,0,1,0,0,2,0,1,0,2,1,2,1,1,0,1,1,0,0,2,2,1,1,0,1,1,1,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,941_S_4377,69.187671,1,1213846.750,1,881.999947,1006.799940,291.599983,368.399978,848.399949,842.399950,16942.798990,2813.999832,2732.399837,38367.597713,37732.797751,10575.599370,10729.199360,3331.199801,3163.199811,277.199983,122.399993,10655.999365,10243.199389,1113.599934,1213.199928,3841.199771,3997.199762,6476.399614,6640.799604,3844.799771,3826.799772,3287.999804,2023.199879,2435.999855,275.999984,219.599987,70166.395818,70043.995825,18130.798919,17141.998978,34400.397950,34846.797923,39651.597637,39454.797648,511.199970,561.599967,2305.199863,2029.199879,1618.799904,1685.999900,9392.399440,2234.399867,2710.799838,3969.599763,3922.799766,1229.999927,1067.999936,4531.199730,4151.999753,3071.999817,3301.199803,2642.399843,2969.999823,3874.799769,3181.199810,1715.999898,1789.199893,1654.799901,1523.999909,2671.199841,2146.799872,6149.999633,5678.399662,1540.799908,1766.399895,6223.199629,5893.199649,9580.799429,9399.599440,6956.399585,6032.399640,1623.599903,1963.199883,3746.399777,3551.999788,673.199960,1451.999913,14074.799161,13335.599205,3883.199769,3945.599765,3530.399790,3260.399806,664.799960,518.399969,2203.199869,2095.199875,6124.799635,4739.999717,11431.199319,11432.399319,2571.599847,2919.599826,3221.999808,2474.399853,3680.399781,2359.199859,1059.599937,1625.999903,3466.799793,3680.399781,7255.199568,7193.999571,3002.399821,2816.399832,1857.599889,1628.399903,2129.999873,1987.199882,7388.399560,9505.199433,2341.199860,2143.199872,1143.599932,1201.199928,9118.799456,10125.599396,1430.399915,2164.799871,1171.199930,1341.599920,11763.599299,14017.199165,3299.999803,4031.999760,6173.999632,6542.399610,3784.799774,3429.599796,7781.999536,6890.399589,5427.599676,5349.599681,6283.199625,6148.799634,2323.199862,2611.199844,1282.799924,1429.199915,1,0,0,0,1,1,2,0,2,0,1,1,1,1,0,0,0,0,0,0,1,2,0,0,1,1,1,1,0,2,0,0,0,2,2,2,0,0,0,0,1,2,1,0,2,0,1,1,0,1,0,0,0,1
1563,941_S_4420,81.383562,0,1536545.875,1,1079.999936,1298.399923,431.999974,445.199973,1088.399935,985.199941,19977.598809,3214.799808,3257.999806,47859.597147,48692.397098,12937.199229,13166.399215,4255.199746,3687.599780,751.199955,795.599953,19527.598836,24886.798517,1491.599911,1505.999910,4939.199706,5123.999695,7327.199563,7389.599560,4948.799705,4453.199735,3733.199777,1881.599888,2569.199847,214.799987,379.199977,104933.993745,101272.793964,24487.198540,25827.598461,48183.597128,51826.796911,62361.596283,61167.596354,466.799972,519.599969,3023.999820,2663.999841,2312.399862,2177.999870,11644.799306,4216.799749,3838.799771,4249.199747,4172.399751,1487.999911,1598.399905,8042.399521,9107.999457,2579.999846,2102.399875,3128.399814,3187.199810,5668.799662,5121.599695,2408.399856,2120.399874,1171.199930,1093.199935,3076.799817,2522.399850,8249.999508,6717.599600,1777.199894,1523.999909,7329.599563,7115.999576,13821.599176,13209.599213,8264.399507,7203.599571,2499.599851,1689.599899,4246.799747,4184.399751,1691.999899,1109.999934,17227.198973,17866.798935,4737.599718,5257.199687,3725.999778,3925.199766,743.999956,706.799958,2959.199824,2786.399834,4959.599704,5493.599673,11245.199330,13256.399210,3637.199783,4102.799755,3964.799764,2979.599822,2301.599863,1179.599930,1507.199910,1688.399899,3853.199770,3777.599775,11365.199323,10028.399402,4081.199757,3471.599793,1966.799883,1672.799900,2194.799869,2072.399876,6458.399615,8288.399506,3418.799796,3122.399814,1657.199901,1630.799903,12458.399257,13774.799179,1852.799890,2234.399867,1373.999918,1565.999907,11845.199294,13184.399214,3931.199766,4689.599720,6287.999625,5626.799665,3124.799814,2771.999835,8386.799500,8369.999501,6985.199584,7204.799571,9337.199443,7413.599558,3394.799798,2974.799823,1454.399913,1228.799927,1,1,1,1,0,0,1,1,0,1,0,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,1,1,2,0,0,0,0,2,2,2,0,0,2,2,2,1,1,2,2,0,0,0,0,1,0,0,0,0
1564,941_S_4764,82.672603,1,1438682.375,1,2911.199826,2806.799833,203.999988,257.999985,872.399948,879.599948,20135.998800,2771.999835,2692.799839,49766.397034,49005.597079,14546.399133,14725.199122,3502.799791,3417.599796,1269.599924,898.799946,34168.797963,24159.598560,1229.999927,1301.999922,3470.399793,3406.799797,6903.599589,7033.199581,4495.199732,4612.799725,3977.999763,2102.399875,2887.199828,340.799980,275.999984,77746.795366,74926.795534,21007.198748,18071.998923,36950.397798,39129.597668,46949.997202,45447.597291,604.799964,455.999973,2845.199830,2633.999843,1772.399894,1849.199890,8560.799490,2911.199826,2881.199828,3688.799780,3614.399785,1567.199907,973.199942,7833.599533,5200.799690,2761.199835,2303.999863,2530.799849,2887.199828,3716.399778,3257.999806,2343.599860,2126.399873,1429.199915,1045.199938,2788.799834,2362.799859,7852.799532,7195.199571,1484.399912,1898.399887,6519.599611,4475.999733,10491.599375,9629.999426,8755.199478,7741.199539,1605.599904,1519.199909,3568.799787,3154.799812,1018.799939,1431.599915,14619.599129,14492.399136,4258.799746,4612.799725,2867.999829,3763.199776,862.799949,463.199972,2491.199852,1895.999887,5236.799688,4847.999711,10013.999403,11163.599335,3194.399810,2228.399867,3721.199778,3015.599820,1940.399884,1718.399898,970.799942,1102.799934,2851.199830,3519.599790,8074.799519,7475.999554,3034.799819,3064.799817,1535.999908,1312.799922,1397.999917,1745.999896,6812.399594,9147.599455,2774.399835,2467.199853,1150.799931,1268.399924,10103.999398,10629.599366,1148.399932,1419.599915,1233.599926,1661.999901,11665.199305,10906.799350,3868.799769,4814.399713,5631.599664,6375.599620,3123.599814,3779.999775,8471.999495,6921.599587,5782.799655,6337.199622,7234.799569,7052.399580,1897.199887,1976.399882,826.799951,1083.599935,0,0,0,0,2,2,0,2,0,1,1,2,2,2,1,1,1,0,0,0,0,1,2,2,1,1,0,0,1,0,2,2,2,1,1,1,1,1,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0
1565,941_S_5124,76.664384,1,1353772.750,0,1496.399911,1142.399932,226.799986,238.799986,814.799951,986.399941,17121.598979,2571.599847,2613.599844,40799.997568,37919.997740,11252.399329,11056.799341,2611.199844,2401.199857,1173.599930,903.599946,39554.397642,49455.597052,1089.599935,1198.799929,3371.999799,3441.599795,6752.399598,7288.799566,3669.599781,4016.399761,4837.199712,2242.799866,2045.999878,368.399978,356.399979,72938.395653,72790.795661,16736.399002,19359.598846,37213.197782,37405.197770,52543.196868,48931.197083,151.199991,237.599986,2701.199839,2535.599849,1742.399896,1747.199896,9619.199427,3761.999776,4827.599712,3044.399819,3215.999808,1461.599913,1097.999935,5925.599647,8337.599503,3215.999808,2239.199867,2836.799831,2810.399832,5439.599676,4383.599739,2066.399877,2001.599881,1113.599934,1605.599904,3344.399801,2876.399829,5933.999646,6645.599604,1389.599917,1534.799909,7534.799551,6320.399623,11071.199340,9958.799406,7286.399566,8525.999492,2161.199871,2290.799863,4663.199722,4142.399753,1066.799936,1808.399892,16085.999041,15931.199050,2968.799823,5188.799691,3347.999800,3628.799784,843.599950,481.199971,1402.799916,1694.399899,5456.399675,3436.799795,15909.599052,13811.999177,3209.999809,4238.399747,3220.799808,3424.799796,2506.799851,2048.399878,1117.199933,1532.399909,3375.599799,3566.399787,8815.199475,7204.799571,3658.799782,3085.199816,1631.999903,1789.199893,1604.399904,1658.399901,8210.399511,9099.599458,2953.199824,2977.199823,1442.399914,1610.399904,9107.999457,10970.399346,1537.199908,1352.399919,1473.599912,1432.799915,10263.599388,11079.599340,3147.599812,3183.599810,7949.999526,7853.999532,3065.999817,4151.999753,9046.799461,7985.999524,5089.199697,5537.999670,6945.599586,7079.999578,2033.999879,1708.799898,739.199956,791.999953,1,0,0,0,1,1,0,1,0,0,2,1,1,1,0,0,0,0,1,1,2,2,2,2,1,1,1,1,0,1,0,0,0,1,1,1,0,0,1,1,1,2,2,1,1,2,0,0,0,0,1,1,1,0


10-fold cross validation

In [39]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier


# Splitting dataset into train and test
X = data_age_filtered.drop(['diagnosis', 'PTID'], axis=1)
y = data_age_filtered['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Define the cross-validator
# Define the cross-validator
kf = KFold(n_splits=10, shuffle=True, random_state=42)


# Define the parameter grid for manual hyperparameter tuning
# param_grid = {
#     'n_neighbors': [3, 5, 7, 10, 15],  # Example: trying different numbers of neighbors
#     'weights': ['uniform', 'distance'],  # Example: uniform weights or distance-based
#     'metric': ['euclidean', 'manhattan']  # Example: different distance metrics
# }
# Define the KNN classifier
knn_model = KNeighborsClassifier(n_neighbors=30)

i=0

# Define a function for visualization
def visualize_data(X, y, title):
    # Separate the data based on diagnosis
    data_age_cn = X[y == 0]['Age']
    data_vol_cn = X[y == 0]['MUSE_Volume_48']

    data_age_mci = X[y == 1]['Age']
    data_vol_mci = X[y == 1]['MUSE_Volume_48']

    data_age_dem = X[y == 2]['Age']
    data_vol_dem = X[y == 2]['MUSE_Volume_48']

    # Scatter plot
    plt.scatter(data_age_cn, data_vol_cn, s=10, c='blue')
    plt.scatter(data_age_mci, data_vol_mci, s=10, c='green')
    plt.scatter(data_age_dem, data_vol_dem, s=10, c='red')

    plt.xlabel("Age (years)")
    plt.ylabel("Volume (mm^3)")
    plt.legend(["CN", "MCI", "DEM"])
    plt.title(title)
    plt.show()

# Iterate over each fold
for train_index, val_index in kf.split(X_train):
    # Split the data into the current fold's training and validation partitions
    fold_X_train, fold_X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    fold_y_train, fold_y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # Visualize initial data
    # visualize_data(fold_X_train, fold_y_train, "Initial Data - Fold")

    # Linear correction
    data_controls_train = fold_X_train[fold_y_train == 0]  # Control group for the fold
    for roi_feature in roi_features:
        if roi_feature in fold_X_train.columns:
            regr = LinearRegression()
            regr.fit(data_controls_train[['Sex', 'Age', 'DLICV_baseline']], data_controls_train[roi_feature])
            # Apply correction to the training set
            correction_train = regr.predict(fold_X_train[['Sex', 'Age', 'DLICV_baseline']])
            fold_X_train[roi_feature] -= correction_train

            # Apply the same correction to the validation set
            correction_val = regr.predict(fold_X_val[['Sex', 'Age', 'DLICV_baseline']])
            fold_X_val[roi_feature] -= correction_val
    # visualize_data(fold_X_train, fold_y_train, "Linearly Corrected Data - Fold")

    # Z-normalization using control group in fold_X_train
    scaler = StandardScaler().fit(data_controls_train[roi_features])
    fold_X_train[roi_features] = scaler.transform(fold_X_train[roi_features])
    fold_X_val[roi_features] = scaler.transform(fold_X_val[roi_features])
    # visualize_data(fold_X_train, fold_y_train, "Z-Normalized Data - Fold")

    fold_X_train = fold_X_train.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
    fold_X_val = fold_X_val.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

    # Train the KNN model
    knn_model.fit(fold_X_train, fold_y_train)

    # Make predictions on the validation set
    y_pred_val = knn_model.predict(fold_X_val)

    # Calculate metrics (weighted because of dataset's imbalance)
    accuracy = knn_model.score(fold_X_val, fold_y_val)
    precision = precision_score(fold_y_val, y_pred_val, average='weighted')
    recall = recall_score(fold_y_val, y_pred_val, average='weighted')
    f1 = f1_score(fold_y_val, y_pred_val, average='weighted')
    auc = roc_auc_score(fold_y_val, knn_model.predict_proba(fold_X_val), multi_class='ovr', average='weighted')

    # Print the validation metrics
    print(f"Validation Metrics, Fold {i}: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, AUC: {auc}")
    i+=1

Validation Metrics, Fold 0: Accuracy: 0.48, Precision: 0.4911641276228801, Recall: 0.48, F1 Score: 0.4493849802371541, AUC: 0.6391716125157197
Validation Metrics, Fold 1: Accuracy: 0.6, Precision: 0.6170588344641695, Recall: 0.6, F1 Score: 0.5939223443223444, AUC: 0.7048422769567597
Validation Metrics, Fold 2: Accuracy: 0.616, Precision: 0.6436821428571429, Recall: 0.616, F1 Score: 0.5881713300249886, AUC: 0.7153595930611711
Validation Metrics, Fold 3: Accuracy: 0.47580645161290325, Precision: 0.48266567425220497, Recall: 0.47580645161290325, F1 Score: 0.4685674162106638, AUC: 0.6401079222011385
Validation Metrics, Fold 4: Accuracy: 0.5806451612903226, Precision: 0.6296121373475082, Recall: 0.5806451612903226, F1 Score: 0.546616981860382, AUC: 0.734309064745661
Validation Metrics, Fold 5: Accuracy: 0.4274193548387097, Precision: 0.478494623655914, Recall: 0.4274193548387097, F1 Score: 0.4198457223001402, AUC: 0.6312397990190891
Validation Metrics, Fold 6: Accuracy: 0.4838709677419355, 

Training on the whole train dataset and fitting on both the train and the test data

In [40]:
from sklearn.metrics import balanced_accuracy_score

# Define the control group from the training set
data_controls_train = X_train[y_train == 0]

# Linear Correction
for roi_feature in roi_features:
    if roi_feature in X_train.columns:
        regr = LinearRegression()
        regr.fit(data_controls_train[['Sex', 'Age', 'DLICV_baseline']], data_controls_train[roi_feature])
        # Apply correction to the training set
        correction_train = regr.predict(X_train[['Sex', 'Age', 'DLICV_baseline']])
        X_train[roi_feature] -= correction_train

        # Apply the same correction to the test set
        correction_test = regr.predict(X_test[['Sex', 'Age', 'DLICV_baseline']])
        X_test[roi_feature] -= correction_test

X_train = X_train.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
X_test = X_test.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

# Z-Normalization
scaler = StandardScaler().fit(data_controls_train[roi_features])
X_train[roi_features] = scaler.transform(X_train[roi_features])
X_test[roi_features] = scaler.transform(X_test[roi_features])

# After cross-validation, train the model on the entire training set
knn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_test = knn_model.predict(X_test)

# Calculate and print standard accuracy for the test set
test_accuracy = knn_model.score(X_test, y_test)
print(f"Test set accuracy: {test_accuracy}")

# Calculate and print balanced accuracy for the test set
test_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_test)
print(f"Test set Balanced Accuracy: {test_balanced_accuracy}")

# Calculate and print metrics for the test set
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1 = f1_score(y_test, y_pred_test, average='weighted')
test_auc = roc_auc_score(y_test, knn_model.predict_proba(X_test), multi_class='ovr', average='weighted')

print(f"Test set Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}, AUC: {test_auc}")


Test set accuracy: 0.4681818181818182
Test set Balanced Accuracy: 0.4415488042071307
Test set Precision: 0.48012741361637795, Recall: 0.4681818181818182, F1 Score: 0.4427316404851951, AUC: 0.6243919510270833


In [41]:
X_train

Unnamed: 0,MUSE_Volume_4,MUSE_Volume_11,MUSE_Volume_23,MUSE_Volume_30,MUSE_Volume_31,MUSE_Volume_32,MUSE_Volume_35,MUSE_Volume_36,MUSE_Volume_37,MUSE_Volume_38,MUSE_Volume_39,MUSE_Volume_40,MUSE_Volume_41,MUSE_Volume_47,MUSE_Volume_48,MUSE_Volume_49,MUSE_Volume_50,MUSE_Volume_51,MUSE_Volume_52,MUSE_Volume_55,MUSE_Volume_56,MUSE_Volume_57,MUSE_Volume_58,MUSE_Volume_59,MUSE_Volume_60,MUSE_Volume_61,MUSE_Volume_62,MUSE_Volume_71,MUSE_Volume_72,MUSE_Volume_73,MUSE_Volume_75,MUSE_Volume_76,MUSE_Volume_81,MUSE_Volume_82,MUSE_Volume_83,MUSE_Volume_84,MUSE_Volume_85,MUSE_Volume_86,MUSE_Volume_87,MUSE_Volume_88,MUSE_Volume_89,MUSE_Volume_90,MUSE_Volume_91,MUSE_Volume_92,MUSE_Volume_93,MUSE_Volume_94,MUSE_Volume_95,MUSE_Volume_100,MUSE_Volume_101,MUSE_Volume_102,MUSE_Volume_103,MUSE_Volume_104,MUSE_Volume_105,MUSE_Volume_106,MUSE_Volume_107,MUSE_Volume_108,MUSE_Volume_109,MUSE_Volume_112,MUSE_Volume_113,MUSE_Volume_114,MUSE_Volume_115,MUSE_Volume_116,MUSE_Volume_117,MUSE_Volume_118,MUSE_Volume_119,MUSE_Volume_120,MUSE_Volume_121,MUSE_Volume_122,MUSE_Volume_123,MUSE_Volume_124,MUSE_Volume_125,MUSE_Volume_128,MUSE_Volume_129,MUSE_Volume_132,MUSE_Volume_133,MUSE_Volume_134,MUSE_Volume_135,MUSE_Volume_136,MUSE_Volume_137,MUSE_Volume_138,MUSE_Volume_139,MUSE_Volume_140,MUSE_Volume_141,MUSE_Volume_142,MUSE_Volume_143,MUSE_Volume_144,MUSE_Volume_145,MUSE_Volume_146,MUSE_Volume_147,MUSE_Volume_148,MUSE_Volume_149,MUSE_Volume_150,MUSE_Volume_151,MUSE_Volume_152,MUSE_Volume_153,MUSE_Volume_154,MUSE_Volume_155,MUSE_Volume_156,MUSE_Volume_157,MUSE_Volume_160,MUSE_Volume_161,MUSE_Volume_162,MUSE_Volume_163,MUSE_Volume_164,MUSE_Volume_165,MUSE_Volume_166,MUSE_Volume_167,MUSE_Volume_168,MUSE_Volume_169,MUSE_Volume_170,MUSE_Volume_171,MUSE_Volume_172,MUSE_Volume_173,MUSE_Volume_174,MUSE_Volume_175,MUSE_Volume_176,MUSE_Volume_177,MUSE_Volume_178,MUSE_Volume_179,MUSE_Volume_180,MUSE_Volume_181,MUSE_Volume_182,MUSE_Volume_183,MUSE_Volume_184,MUSE_Volume_185,MUSE_Volume_186,MUSE_Volume_187,MUSE_Volume_190,MUSE_Volume_191,MUSE_Volume_192,MUSE_Volume_193,MUSE_Volume_194,MUSE_Volume_195,MUSE_Volume_196,MUSE_Volume_197,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207,rs4575098,rs6656401,rs2093760,rs4844610,rs4663105,rs6733839,rs10933431,rs35349669,rs6448453,rs190982,rs9271058,rs9473117,rs9381563,rs10948363,rs2718058,rs4723711,rs1859788,rs1476679,rs12539172,rs10808026,rs7810606,rs11771145,rs28834970,rs73223431,rs4236673,rs9331896,rs11257238,rs7920721,rs3740688,rs10838725,rs983392,rs7933202,rs2081545,rs867611,rs10792832,rs3851179,rs17125924,rs17125944,rs10498633,rs12881735,rs12590654,rs442495,rs59735493,rs113260531,rs28394864,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358
512,-1.948789,-3.519628,-6.864576,-7.094928,-8.650670,-8.982399,-8.842779,-6.923820,-8.182858,-9.977695,-10.012131,-9.037972,-9.857895,-9.708276,-10.364709,-0.142393,-0.055402,0.030433,0.302972,-9.157386,-9.346332,-8.979470,-9.637544,-12.712174,-12.822045,-9.894595,-9.703516,-6.102152,-7.726123,-6.333825,-2.633948,-1.630805,-8.063320,-8.203913,-7.216565,-7.159445,-8.777602,-8.325936,-8.389574,-8.803952,-6.036176,-7.158957,-7.537406,-7.070318,-7.335203,-7.715266,-7.462522,-6.514412,-6.046324,-7.996275,-8.673601,-4.885418,-4.193093,-6.554702,-5.105966,-5.017504,-5.733212,-6.192827,-5.549694,-8.559681,-8.020253,-7.053861,-8.356034,-6.292032,-3.896864,-5.588596,-5.247532,-6.559102,-7.080791,-6.818210,-6.048775,-5.720854,-6.408606,-7.265054,-8.219865,-8.397948,-7.023477,-4.746934,-4.704919,-7.188947,-7.803646,-5.033622,-3.270467,-6.652709,-6.591331,-4.628801,-6.724356,-6.976084,-7.474191,-4.906424,-3.031026,-4.351703,-4.244168,-4.921928,-6.569336,-8.845398,-9.714961,-3.600114,-4.549492,-5.223332,-7.693628,-4.638961,-3.904311,-2.882125,-2.760562,-5.791632,-6.679109,-7.548102,-7.874723,-7.560746,-8.456730,-6.391372,-5.388566,-4.622870,-4.559614,-7.205274,-8.622123,-5.445830,-5.306905,-6.261344,-4.006021,-7.463475,-8.525057,-3.626543,-5.165306,-4.231103,-5.286721,-8.097333,-7.740874,-4.645647,-4.567676,-9.078165,-7.700008,-7.086495,-5.332105,-8.366084,-8.524313,-7.424042,-8.305104,-5.860586,-6.746749,-5.970426,-5.548002,-4.473527,-3.859377,1,0,0,0,0,0,2,0,1,1,1,0,0,0,1,1,0,0,0,0,1,0,0,0,1,1,1,1,0,1,2,2,2,2,2,2,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,1
298,-1.434834,-2.198937,-7.269048,-6.586277,-7.954955,-8.797102,-10.499959,-7.302622,-7.430222,-10.725687,-10.830201,-9.335841,-9.671028,-9.018257,-9.446377,-1.282818,-0.097160,-1.145097,-0.594534,-10.328835,-10.590601,-8.885402,-9.387713,-12.402899,-12.237226,-10.020527,-10.548008,-8.795988,-9.630844,-8.718587,-3.801347,-3.542276,-8.020289,-8.069396,-7.517905,-7.155770,-7.738319,-8.554843,-8.467086,-8.115072,-9.086758,-7.887541,-8.230440,-8.080836,-8.223895,-8.147744,-8.589088,-6.730650,-5.577287,-6.736796,-6.523127,-5.908805,-5.314039,-6.870509,-7.355878,-5.747821,-5.293708,-6.286288,-6.878733,-7.849789,-6.456637,-7.296699,-7.287396,-5.882071,-5.203495,-5.309232,-5.444604,-7.343218,-8.136756,-7.282723,-7.332423,-6.607534,-7.407126,-7.048259,-7.762914,-8.613857,-8.237352,-5.866154,-5.013521,-9.590194,-9.410863,-5.572733,-5.428775,-7.509491,-7.117613,-5.767464,-5.183426,-7.999004,-9.731276,-6.144387,-2.066418,-5.298239,-6.060603,-5.090068,-6.982145,-7.734655,-7.831658,-6.306708,-5.050745,-6.513002,-6.664972,-4.772427,-3.252290,-3.472368,-3.872162,-7.094198,-7.751763,-8.072881,-8.294960,-7.607368,-7.930755,-5.725073,-6.489447,-4.084394,-4.182473,-8.996184,-8.512622,-6.163249,-5.641967,-6.600672,-5.815179,-8.222529,-8.178878,-4.402790,-3.769142,-5.491839,-4.951347,-8.530947,-8.248961,-7.119578,-7.220229,-8.778469,-7.903819,-6.438227,-5.603591,-6.834209,-8.944727,-5.740882,-5.338974,-6.485209,-7.466340,-3.175904,-3.475261,-3.731061,-3.108884,1,1,2,1,0,0,1,1,1,0,0,1,1,1,2,2,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1,2,2,2,1,2,0,0,0,1,1,0,0,1,1,1,0
1363,-1.510570,-3.228827,-6.579765,-5.268921,-7.254737,-7.520929,-10.380487,-7.080267,-8.072123,-11.377982,-11.083613,-9.918559,-10.123656,-7.861189,-9.078482,-1.449831,-0.643617,-0.927299,-0.947294,-9.849947,-10.240994,-8.434200,-8.358461,-12.334899,-11.888577,-11.211889,-10.915037,-7.898506,-8.489394,-9.844849,-3.362742,-2.928514,-7.431367,-7.729324,-7.837172,-7.575687,-7.862444,-7.713804,-8.973864,-8.378510,-7.591939,-7.229142,-8.864399,-9.145463,-7.766223,-9.080499,-7.613294,-4.001407,-2.570407,-5.437212,-6.247891,-4.906263,-4.000885,-4.201254,-5.562987,-5.748920,-6.278955,-3.787946,-4.661209,-7.461507,-6.359973,-6.215575,-4.574577,-4.609558,-4.695205,-6.294794,-5.237400,-5.631909,-6.493038,-5.326334,-5.137497,-4.914433,-7.418850,-7.849109,-7.384649,-6.659681,-6.199920,-4.767435,-6.139460,-7.745419,-6.992435,-4.696608,-3.770617,-5.421510,-5.454047,-6.440853,-5.773031,-6.541537,-7.598348,-4.950493,-3.454689,-3.430851,-4.896983,-4.764681,-6.741770,-8.286985,-7.402054,-5.025719,-5.921427,-6.932571,-7.076700,-4.648830,-3.073413,-4.145375,-3.805760,-5.911536,-5.654423,-5.883223,-6.331162,-7.178010,-6.394812,-5.017504,-5.383020,-2.674935,-3.920216,-7.040448,-7.238116,-6.722592,-6.935228,-3.497086,-4.603677,-6.264432,-7.156067,-4.278538,-4.763518,-2.109274,-3.021883,-6.546782,-6.364132,-5.449127,-6.011212,-7.095633,-4.923509,-6.573222,-6.224721,-6.003910,-5.475735,-7.174658,-6.244177,-6.428294,-5.446823,-5.069435,-3.951185,-3.734238,-3.423174,0,0,0,0,2,2,0,1,0,2,1,1,1,1,1,1,0,0,0,1,2,1,0,0,1,1,0,0,1,1,1,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0
641,-1.974048,-3.740999,-6.410763,-6.621980,-7.165562,-7.990831,-9.918566,-7.458152,-8.134446,-10.364532,-10.192699,-9.581755,-9.390247,-7.360944,-7.374948,-1.052191,-1.426215,-2.489631,-2.314355,-8.944846,-9.448906,-8.913390,-8.579584,-11.571284,-11.863467,-10.123616,-10.414532,-6.984602,-6.562749,-8.756769,-2.235764,-2.315399,-8.225529,-8.267927,-7.363898,-5.911849,-8.375150,-8.357772,-8.138115,-8.150457,-7.658863,-6.331038,-8.494853,-8.576426,-8.354563,-8.659856,-7.681045,-5.690813,-4.803776,-7.668149,-7.591104,-4.101944,-5.631571,-7.432043,-5.588937,-3.945648,-3.899320,-5.867630,-6.230304,-4.274509,-4.839850,-5.604117,-5.288617,-4.777994,-4.465549,-4.381638,-5.378907,-6.105558,-6.300515,-5.557488,-5.569761,-8.376177,-6.407938,-7.300602,-7.818032,-5.795108,-5.681640,-5.201184,-3.817523,-7.092472,-6.650726,-2.319536,-4.760722,-6.862005,-7.721854,-4.427164,-5.092218,-6.820685,-8.296624,-4.984758,-2.594287,-4.663104,-4.552596,-5.464604,-6.253710,-6.122144,-7.331891,-6.260469,-3.814282,-6.009686,-6.541878,-5.228644,-4.559001,-2.854511,-3.272183,-7.077526,-6.459557,-6.573579,-6.178351,-6.605020,-5.670196,-6.485791,-6.457696,-6.212968,-6.550647,-6.001589,-7.796001,-6.758324,-6.684154,-5.641474,-4.709116,-8.185884,-7.447775,-4.266349,-6.130032,-3.765974,-3.196808,-7.722913,-6.983057,-4.570855,-5.920439,-7.115013,-6.292528,-4.798258,-4.719049,-7.072690,-7.114981,-6.479754,-4.733507,-5.755783,-6.072705,-5.309987,-4.528270,-4.950449,-3.386491,0,0,0,0,1,1,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,2,2,0,0,0,0,1,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,1
951,-1.292084,-3.661094,-7.820499,-7.505824,-8.284537,-9.396333,-9.450188,-6.564946,-7.282691,-10.661381,-10.698376,-8.898874,-9.002517,-9.513228,-9.761840,-1.180694,-0.878081,-0.603183,-0.719884,-8.797151,-9.394577,-8.384294,-8.098654,-11.217447,-11.117691,-9.798189,-10.229973,-7.942824,-7.267279,-7.626696,-2.622311,-2.750169,-7.718968,-7.841531,-7.967199,-6.885053,-7.996051,-8.046069,-8.259558,-8.026165,-6.584107,-7.705709,-7.815856,-7.673214,-8.342612,-8.774200,-8.302771,-4.222698,-4.629914,-8.098581,-8.584151,-5.066066,-5.295170,-7.739554,-6.580412,-5.516680,-5.626182,-6.898229,-6.456151,-5.662569,-7.176553,-6.666673,-6.971202,-6.734314,-6.230263,-4.945084,-5.040200,-6.876400,-7.754809,-6.554503,-7.126180,-7.681194,-6.484341,-8.159707,-9.172382,-7.977185,-7.579585,-4.178609,-5.104372,-7.779134,-7.241929,-5.247489,-5.428269,-7.099252,-7.070121,-4.773942,-5.424451,-8.116150,-6.149353,-4.427450,-4.049493,-4.097404,-4.083851,-6.406340,-6.507472,-8.442089,-7.719698,-4.640063,-3.051329,-5.045850,-7.430742,-5.314562,-6.148259,-3.257746,-3.666362,-7.087132,-7.034602,-6.121501,-5.885705,-6.999917,-7.152024,-6.476779,-5.764053,-4.467525,-4.805751,-7.815321,-8.426496,-6.749071,-6.442604,-5.213704,-3.652050,-7.252364,-6.606512,-4.712799,-4.949602,-3.517781,-2.891048,-7.274299,-7.255090,-4.628447,-5.834825,-8.094743,-7.653189,-6.065780,-4.578790,-7.717210,-5.997106,-6.788509,-7.692890,-7.155761,-7.248952,-6.509690,-5.132794,-4.165504,-3.959767,1,0,0,0,0,0,1,1,0,0,0,1,1,1,2,1,0,0,0,1,2,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,0,0,1,1,2,1,1,0,1,2,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1217,-2.317147,-4.258141,-7.137593,-5.951904,-6.266274,-7.897361,-9.254543,-6.949989,-7.373099,-9.880680,-9.357987,-8.475751,-8.167887,-7.755109,-8.139379,-1.535678,-1.680341,-2.155849,-2.084222,-8.472594,-8.357831,-8.896208,-8.824702,-11.587706,-11.074782,-9.434537,-9.330830,-7.303154,-7.830771,-6.833760,-1.218687,-1.844449,-7.698889,-7.514107,-5.663955,-5.595973,-7.663410,-7.631754,-8.068225,-7.367298,-7.208193,-7.265106,-7.403511,-7.417762,-6.637227,-7.032531,-7.083857,-4.381187,-5.731564,-7.639119,-6.627119,-4.282720,-5.729339,-8.279861,-6.905135,-4.022915,-4.499138,-5.737569,-6.084992,-6.363678,-4.456262,-5.124391,-6.100639,-5.203127,-5.222174,-5.704033,-5.318813,-7.978050,-8.638012,-3.940175,-5.548988,-6.893223,-5.928105,-6.918574,-6.899299,-8.048442,-6.276818,-5.056781,-4.672061,-6.529209,-6.261204,-4.679997,-4.151904,-7.838799,-7.266830,-5.949744,-6.165929,-7.396895,-7.558203,-4.379118,-3.512918,-5.384434,-5.617050,-6.644860,-6.552491,-7.781748,-6.229130,-5.188008,-3.824867,-6.063937,-5.952862,-5.707550,-4.888973,-4.477762,-5.395998,-7.521695,-7.498917,-7.908837,-6.521637,-6.694660,-7.174500,-5.564165,-6.852670,-4.423080,-4.358568,-6.983141,-8.043188,-6.726693,-7.083563,-4.399256,-4.328339,-8.872208,-8.210550,-4.314714,-3.338749,-5.668413,-5.513031,-8.294329,-8.671362,-6.130637,-6.265475,-6.636752,-7.192180,-4.769343,-4.062421,-7.115699,-6.240681,-5.714979,-7.011635,-5.861368,-6.624895,-4.591765,-4.365140,-3.555967,-2.245882,0,0,0,0,1,1,1,0,0,0,0,1,1,1,2,1,1,1,1,0,1,0,2,2,0,0,0,0,0,2,0,0,0,1,1,1,0,0,1,1,1,0,0,0,1,0,1,1,0,1,1,1,1,0
1389,-2.584269,-3.087932,-6.473044,-6.125954,-9.545250,-9.307922,-9.400060,-6.404127,-6.512326,-9.891591,-9.560693,-8.464698,-8.717482,-11.626388,-11.409050,0.716313,1.103377,-1.196079,-1.133523,-8.314425,-9.077054,-8.020484,-8.107316,-12.115909,-11.412459,-10.460833,-9.966650,-7.730730,-7.128650,-5.985762,-3.522714,-3.136715,-7.455801,-7.485361,-6.048854,-5.056514,-7.527666,-7.373676,-8.290886,-7.802622,-7.463729,-9.170676,-8.238103,-8.116768,-7.586838,-7.335016,-8.264510,-5.625693,-5.229770,-7.545736,-7.031460,-4.644194,-4.816911,-8.156144,-7.150448,-4.587208,-4.560885,-6.777845,-6.221551,-5.785325,-5.336639,-8.207697,-8.094238,-6.003864,-4.403686,-5.796594,-6.197991,-9.195074,-8.131890,-6.207734,-6.373324,-7.889186,-7.757009,-9.251095,-9.324096,-8.438490,-6.606806,-4.970705,-4.285717,-8.057287,-6.399196,-4.605340,-4.432116,-7.497709,-7.619794,-7.084476,-7.425245,-8.111092,-8.520692,-4.455427,-3.440926,-5.162306,-5.992583,-6.375419,-6.289774,-9.842477,-8.812606,-5.473639,-4.748028,-7.005390,-6.542468,-5.579149,-4.879938,-3.548168,-5.612609,-7.645192,-6.991951,-7.877351,-7.750620,-9.079795,-9.208414,-7.677815,-6.993881,-4.824892,-4.357163,-7.936659,-7.214750,-6.588384,-6.135443,-6.683854,-5.999331,-8.298767,-8.685530,-4.834170,-4.796062,-5.088525,-5.004210,-8.203639,-7.800109,-6.071714,-7.194555,-6.966768,-7.663372,-7.006227,-5.634786,-7.556070,-7.609878,-7.542782,-8.360947,-7.987769,-8.715824,-4.295097,-3.913567,-4.136603,-2.876520,0,0,0,0,0,0,0,0,0,1,0,2,2,2,0,0,1,1,1,1,1,1,2,2,0,0,0,0,2,0,1,1,1,1,1,1,0,0,2,2,2,1,1,1,0,1,0,0,0,1,0,0,0,1
922,-1.464831,-3.767686,-7.689270,-7.363183,-7.853829,-8.943712,-9.843380,-7.542737,-8.320689,-10.240528,-10.203675,-9.837351,-10.196522,-8.733380,-9.672708,-0.330069,-0.797137,-0.482458,-0.665626,-9.258171,-9.453722,-8.642220,-9.362908,-10.395508,-10.487368,-9.998100,-10.093627,-8.488969,-6.576502,-7.639172,-3.292382,-3.298440,-7.275939,-7.342112,-6.775677,-6.650759,-7.586355,-7.282717,-7.734623,-7.587034,-6.875242,-8.642157,-8.343823,-8.796612,-8.260693,-8.827288,-7.761281,-5.385608,-5.118976,-8.162282,-8.887927,-5.455661,-6.592407,-5.128587,-7.031413,-5.453413,-6.012848,-6.176780,-6.334570,-4.968037,-5.346631,-6.981332,-8.664105,-5.705228,-6.526564,-5.628726,-5.355275,-7.099508,-7.756570,-6.400543,-7.222769,-7.260975,-6.781747,-8.032963,-6.947967,-7.808391,-7.889214,-5.344331,-5.429806,-6.963521,-7.007491,-4.387119,-4.429444,-6.541650,-6.172557,-4.994825,-5.962796,-9.094755,-8.800290,-3.093158,-3.542500,-4.760968,-4.825710,-5.845611,-6.021955,-8.965154,-8.939262,-5.698054,-5.719905,-5.966230,-6.838294,-5.096140,-4.615165,-4.794199,-4.638001,-6.379821,-6.839647,-5.750040,-6.812745,-8.309185,-8.724174,-7.127524,-6.979722,-4.712704,-5.673222,-6.582362,-7.452538,-7.787704,-8.433979,-6.203870,-6.615093,-6.630635,-7.251637,-4.860903,-5.835109,-4.837131,-4.554506,-7.380736,-7.008252,-6.011054,-5.799299,-8.517243,-4.280354,-6.394038,-4.813497,-7.492336,-5.580856,-6.420604,-5.209694,-7.050706,-7.236646,-5.400382,-5.021622,-3.665739,-3.238301,1,0,0,0,0,0,0,2,0,1,0,1,1,1,2,2,0,0,0,0,1,1,0,0,1,1,1,1,0,2,1,1,1,1,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,1,1,0
1563,-3.936370,-4.358209,-4.947124,-5.275294,-7.324943,-8.814710,-10.040405,-7.164168,-7.400924,-10.404935,-10.138720,-9.439179,-9.673792,-7.649724,-8.390936,-1.482313,-1.196085,-2.300046,-1.996537,-8.148271,-8.425950,-6.648030,-6.411529,-11.206091,-11.308527,-9.625964,-11.043032,-8.510102,-8.387588,-8.278320,-3.893318,-2.234085,-7.347388,-7.424127,-7.384909,-6.487309,-7.716879,-7.580592,-7.799010,-7.631759,-7.367216,-7.817640,-8.565103,-8.688723,-7.372506,-7.883663,-7.291256,-3.489701,-6.043831,-6.779867,-6.882758,-5.661074,-4.722013,-6.307691,-4.422263,-5.515477,-6.366006,-5.455840,-6.029543,-4.481998,-4.981246,-6.414800,-7.119956,-6.105830,-5.809823,-5.022774,-4.805420,-6.928583,-8.259474,-6.426470,-7.621316,-6.554134,-6.654112,-6.703695,-7.153446,-8.008375,-7.717794,-4.630102,-5.407614,-7.449696,-6.588666,-3.835414,-5.957170,-6.748965,-6.181433,-5.374458,-5.750794,-8.399708,-8.831902,-4.566860,-2.454007,-4.036008,-4.205367,-7.170979,-5.313573,-9.074305,-7.361185,-4.859171,-3.613016,-5.880811,-8.381921,-4.650886,-5.867598,-3.165738,-3.764353,-6.400316,-7.426428,-6.123301,-6.969560,-5.976244,-7.694137,-6.263982,-6.424675,-3.405397,-4.506103,-8.636355,-8.088637,-5.819750,-6.011319,-4.421765,-4.972184,-6.632649,-5.994794,-2.718920,-2.775560,-5.325857,-5.028276,-7.814583,-7.092005,-6.113987,-6.088803,-8.158023,-8.230320,-5.972340,-4.762147,-7.650128,-7.330467,-5.670211,-5.510896,-6.199551,-7.477993,-3.797604,-4.786747,-3.008891,-3.715744,1,1,1,1,0,0,1,1,0,1,0,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,1,1,2,0,0,0,0,2,2,2,0,0,2,2,2,1,1,2,2,0,0,0,0,1,0,0,0,0


In [42]:
X_test

Unnamed: 0,MUSE_Volume_4,MUSE_Volume_11,MUSE_Volume_23,MUSE_Volume_30,MUSE_Volume_31,MUSE_Volume_32,MUSE_Volume_35,MUSE_Volume_36,MUSE_Volume_37,MUSE_Volume_38,MUSE_Volume_39,MUSE_Volume_40,MUSE_Volume_41,MUSE_Volume_47,MUSE_Volume_48,MUSE_Volume_49,MUSE_Volume_50,MUSE_Volume_51,MUSE_Volume_52,MUSE_Volume_55,MUSE_Volume_56,MUSE_Volume_57,MUSE_Volume_58,MUSE_Volume_59,MUSE_Volume_60,MUSE_Volume_61,MUSE_Volume_62,MUSE_Volume_71,MUSE_Volume_72,MUSE_Volume_73,MUSE_Volume_75,MUSE_Volume_76,MUSE_Volume_81,MUSE_Volume_82,MUSE_Volume_83,MUSE_Volume_84,MUSE_Volume_85,MUSE_Volume_86,MUSE_Volume_87,MUSE_Volume_88,MUSE_Volume_89,MUSE_Volume_90,MUSE_Volume_91,MUSE_Volume_92,MUSE_Volume_93,MUSE_Volume_94,MUSE_Volume_95,MUSE_Volume_100,MUSE_Volume_101,MUSE_Volume_102,MUSE_Volume_103,MUSE_Volume_104,MUSE_Volume_105,MUSE_Volume_106,MUSE_Volume_107,MUSE_Volume_108,MUSE_Volume_109,MUSE_Volume_112,MUSE_Volume_113,MUSE_Volume_114,MUSE_Volume_115,MUSE_Volume_116,MUSE_Volume_117,MUSE_Volume_118,MUSE_Volume_119,MUSE_Volume_120,MUSE_Volume_121,MUSE_Volume_122,MUSE_Volume_123,MUSE_Volume_124,MUSE_Volume_125,MUSE_Volume_128,MUSE_Volume_129,MUSE_Volume_132,MUSE_Volume_133,MUSE_Volume_134,MUSE_Volume_135,MUSE_Volume_136,MUSE_Volume_137,MUSE_Volume_138,MUSE_Volume_139,MUSE_Volume_140,MUSE_Volume_141,MUSE_Volume_142,MUSE_Volume_143,MUSE_Volume_144,MUSE_Volume_145,MUSE_Volume_146,MUSE_Volume_147,MUSE_Volume_148,MUSE_Volume_149,MUSE_Volume_150,MUSE_Volume_151,MUSE_Volume_152,MUSE_Volume_153,MUSE_Volume_154,MUSE_Volume_155,MUSE_Volume_156,MUSE_Volume_157,MUSE_Volume_160,MUSE_Volume_161,MUSE_Volume_162,MUSE_Volume_163,MUSE_Volume_164,MUSE_Volume_165,MUSE_Volume_166,MUSE_Volume_167,MUSE_Volume_168,MUSE_Volume_169,MUSE_Volume_170,MUSE_Volume_171,MUSE_Volume_172,MUSE_Volume_173,MUSE_Volume_174,MUSE_Volume_175,MUSE_Volume_176,MUSE_Volume_177,MUSE_Volume_178,MUSE_Volume_179,MUSE_Volume_180,MUSE_Volume_181,MUSE_Volume_182,MUSE_Volume_183,MUSE_Volume_184,MUSE_Volume_185,MUSE_Volume_186,MUSE_Volume_187,MUSE_Volume_190,MUSE_Volume_191,MUSE_Volume_192,MUSE_Volume_193,MUSE_Volume_194,MUSE_Volume_195,MUSE_Volume_196,MUSE_Volume_197,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207,rs4575098,rs6656401,rs2093760,rs4844610,rs4663105,rs6733839,rs10933431,rs35349669,rs6448453,rs190982,rs9271058,rs9473117,rs9381563,rs10948363,rs2718058,rs4723711,rs1859788,rs1476679,rs12539172,rs10808026,rs7810606,rs11771145,rs28834970,rs73223431,rs4236673,rs9331896,rs11257238,rs7920721,rs3740688,rs10838725,rs983392,rs7933202,rs2081545,rs867611,rs10792832,rs3851179,rs17125924,rs17125944,rs10498633,rs12881735,rs12590654,rs442495,rs59735493,rs113260531,rs28394864,rs111278892,rs3752246,rs4147929,rs41289512,rs3865444,rs6024870,rs6014724,rs7274581,rs429358
1265,-3.309464,-4.589947,-5.964318,-6.509761,-9.947689,-10.780832,-9.899748,-7.468103,-8.306570,-9.352233,-9.396177,-9.058342,-9.188435,-10.534271,-11.049213,1.695588,0.398668,-1.916758,-2.108035,-10.899753,-11.506332,-9.845572,-9.939792,-12.738586,-12.791460,-11.685824,-11.397368,-7.009572,-6.922915,-6.419431,-3.486047,-2.997706,-8.243422,-7.530000,-6.153934,-6.000205,-7.516836,-7.653246,-8.959659,-8.470608,-7.913952,-9.257210,-9.771446,-9.640706,-9.152201,-8.965747,-5.436624,-6.759787,-6.196322,-8.482249,-8.650926,-5.414305,-4.544198,-6.212716,-5.792334,-4.578755,-5.010197,-4.544963,-5.145586,-4.837992,-5.648015,-8.742161,-9.839436,-3.687005,-4.859218,-6.182925,-5.293910,-8.853400,-8.039916,-4.181795,-4.881480,-5.606954,-4.923812,-6.836528,-8.221920,-7.567308,-7.782377,-4.833783,-3.128559,-6.227546,-6.310260,-5.355004,-4.996329,-7.178063,-7.121728,-4.822446,-5.467314,-6.797282,-7.181904,-3.148319,-3.249608,-5.192592,-5.912716,-4.636811,-6.412822,-7.626504,-7.535902,-3.838355,-3.416350,-5.167452,-6.762384,-5.734198,-3.785097,-2.769875,-2.769170,-5.870201,-6.156860,-7.336142,-7.758991,-8.930510,-9.740600,-7.139965,-6.782006,-4.115482,-3.295785,-5.124248,-7.043680,-7.360928,-6.588384,-7.114730,-6.185791,-7.342422,-7.095875,-2.375967,-4.055555,-5.329970,-5.302848,-6.732715,-6.610646,-5.780499,-5.915326,-6.171352,-6.030063,-5.612252,-4.534737,-5.637116,-5.176893,-6.821106,-6.563247,-7.561916,-7.998300,-4.511867,-5.768109,-3.940361,-4.490651,0,0,0,0,1,1,0,1,0,1,0,0,1,0,1,1,1,1,1,1,1,2,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,1,2,0,2,1,0,0,0,0,0,0,0,0
186,-3.248195,-3.367730,-6.958884,-6.498109,-10.150273,-11.383250,-10.973846,-6.608165,-7.178596,-10.658116,-10.917881,-10.217798,-10.238819,-10.110306,-10.951233,-1.485354,-1.323790,-2.160882,-2.112806,-8.825879,-8.053383,-7.198752,-7.723210,-12.047313,-12.517356,-11.854895,-12.103397,-8.371021,-7.769874,-7.825388,-4.718086,-3.555547,-8.306734,-8.551732,-5.902991,-5.912051,-8.427015,-8.758220,-8.669314,-8.638991,-7.290880,-7.009413,-9.503513,-9.677894,-8.325630,-8.757985,-7.906235,-6.533238,-5.762344,-7.333796,-8.824320,-4.958318,-4.630988,-6.813664,-8.198264,-3.441112,-4.985711,-6.260885,-7.290065,-5.572457,-5.522731,-8.392934,-8.279309,-4.708229,-5.331303,-6.272114,-4.250581,-6.921688,-7.353067,-6.669126,-6.556052,-5.215868,-5.847810,-7.721140,-8.458499,-7.767961,-7.828129,-6.829117,-6.499986,-8.890230,-6.568439,-4.278762,-3.536146,-7.758662,-7.345536,-5.958094,-7.345200,-9.280557,-7.709454,-2.534871,-3.543066,-5.942088,-5.628308,-5.969876,-7.122941,-8.550157,-8.694689,-4.861478,-5.491233,-5.302277,-5.880503,-4.473657,-5.175518,-4.709334,-5.456203,-5.932323,-7.199753,-8.254632,-8.057060,-8.704374,-8.626103,-6.731287,-7.868896,-3.402024,-4.062801,-7.149487,-7.008502,-7.791176,-8.185474,-5.494143,-6.293479,-7.106168,-8.334961,-4.454707,-3.930846,-6.026830,-6.241231,-7.817716,-7.409230,-6.461295,-7.503290,-6.860485,-7.871531,-3.973506,-4.885175,-6.446642,-6.324956,-8.132708,-7.206085,-6.833268,-7.205140,-3.811595,-4.147376,-4.028892,-3.921519,1,1,1,1,1,1,2,0,0,1,1,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,2,0,0,0,0,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
299,0.933365,-1.714624,-6.691904,-7.318087,-8.064043,-8.696009,-10.990192,-6.214290,-6.067269,-11.027398,-11.410622,-9.706984,-9.948723,-10.501825,-10.527679,4.567259,4.047826,3.197629,3.083488,-7.919615,-8.558300,-8.815332,-8.337228,-12.947379,-12.701551,-11.458321,-11.642754,-8.410829,-7.560220,-7.797942,-3.797369,-4.132747,-7.225406,-7.762438,-5.853962,-5.269011,-7.758682,-8.259328,-8.298062,-7.887868,-5.794558,-6.688441,-9.493957,-9.661730,-7.418189,-7.629449,-7.691158,-4.715570,-6.859819,-7.618024,-8.989487,-6.745595,-5.935544,-7.613200,-6.972286,-5.023079,-5.343258,-7.596590,-7.387399,-6.110207,-4.378397,-8.830494,-9.160655,-5.843113,-6.091661,-6.939909,-6.492773,-9.738148,-9.139909,-6.691135,-7.140956,-6.617789,-6.624801,-9.795591,-9.097767,-7.876977,-7.703964,-5.868309,-7.584253,-6.681905,-5.145413,-4.734483,-5.480836,-7.784373,-7.669135,-6.015809,-7.369243,-8.614939,-8.066673,-3.675441,-2.999721,-4.399746,-4.889360,-7.455237,-6.462043,-9.311493,-8.596857,-5.806111,-5.015574,-6.757770,-5.575933,-4.280156,-4.416592,-4.496447,-6.298221,-7.174122,-6.894112,-8.873158,-8.413486,-9.321836,-9.494892,-7.172693,-7.350612,-6.074942,-8.088802,-8.397988,-8.869530,-7.049621,-7.232826,-7.360987,-7.439983,-8.766414,-8.685451,-5.500136,-6.280889,-4.337828,-5.096291,-7.857005,-8.449517,-8.262636,-8.567932,-8.150276,-7.463087,-6.539600,-5.263845,-7.143803,-8.177714,-7.673488,-9.123000,-7.634040,-7.614616,-5.027774,-4.831536,-5.165933,-4.749970,0,1,1,1,2,2,1,1,0,1,1,0,1,0,0,0,1,1,1,0,1,1,2,2,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,1,0,0,0,1
586,-2.691670,-2.779815,-5.612280,-5.666983,-8.693145,-9.446912,-8.786760,-6.878811,-7.821506,-10.434491,-10.661598,-7.914008,-7.883709,-9.671662,-10.446839,-0.241495,-0.755553,-2.400902,-2.228832,-7.752846,-8.036935,-7.633106,-7.874443,-10.979405,-10.328093,-9.829272,-9.842531,-7.410496,-6.798440,-7.819192,-3.351338,-4.465671,-6.606632,-6.850112,-6.958171,-5.969355,-7.050775,-7.244094,-7.864152,-8.113246,-8.206397,-8.375854,-7.163247,-6.848543,-7.033264,-7.168549,-7.168362,-4.667936,-5.338548,-8.546298,-8.158304,-5.247436,-5.206215,-7.011528,-6.815776,-5.287861,-5.137352,-6.988762,-7.703604,-6.419560,-6.306651,-6.851620,-7.524767,-5.725612,-5.961371,-5.176760,-4.040917,-7.258793,-8.085899,-6.678590,-7.727903,-7.184209,-8.481825,-7.177094,-9.457879,-8.201753,-7.888322,-6.154078,-6.187742,-6.859253,-6.123582,-5.103824,-3.831253,-7.285802,-7.026088,-6.824671,-7.491759,-8.096858,-7.758707,-3.534681,-3.179653,-6.630999,-5.809318,-6.389786,-6.442955,-8.330833,-9.582217,-6.767830,-4.581424,-5.414733,-7.927633,-4.016667,-5.119141,-6.083458,-5.152320,-6.330901,-7.543526,-8.246962,-8.996862,-7.989863,-9.279221,-6.531264,-6.718927,-4.694009,-4.586340,-7.343899,-8.011823,-7.152845,-6.773262,-6.334870,-5.708258,-7.075740,-7.723927,-4.993096,-5.410486,-5.981239,-6.285141,-7.024243,-8.582252,-6.329221,-8.333041,-7.027868,-7.385747,-7.550455,-6.553128,-9.017185,-10.044787,-8.015947,-7.493986,-6.784311,-7.427112,-6.073975,-5.152971,-4.851237,-4.098831,1,0,0,0,0,0,0,2,0,1,0,0,1,0,1,1,0,0,0,0,0,1,1,1,0,0,2,2,0,1,1,1,1,0,1,1,0,0,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,2
930,-2.478494,-3.242058,-6.174988,-5.792485,-9.611072,-9.205823,-10.045327,-7.257725,-7.957543,-9.220369,-9.965554,-9.282370,-9.519268,-9.646127,-9.810606,1.009850,-0.367362,-2.357862,-1.808706,-7.930821,-7.661441,-6.444216,-6.417120,-12.145347,-12.324204,-9.991031,-10.337941,-7.769419,-6.528713,-8.619572,-3.166196,-3.253543,-8.606235,-8.625383,-7.331635,-6.532524,-8.321040,-8.227157,-8.220355,-7.955899,-8.390141,-8.568037,-9.111749,-8.710729,-8.295828,-8.376328,-8.306551,-5.277807,-6.435872,-7.938519,-7.386026,-6.554325,-5.231733,-7.265022,-4.750291,-5.563425,-5.373960,-5.945472,-6.413049,-6.224797,-5.686973,-7.881855,-8.266820,-5.091709,-5.261020,-6.301360,-4.331026,-7.199377,-5.599020,-6.663352,-7.977927,-6.516217,-6.280831,-7.360790,-7.394613,-6.182616,-5.441119,-5.255876,-5.113976,-8.004983,-7.079983,-3.994884,-3.809997,-8.630521,-8.165343,-6.593369,-5.408610,-7.882440,-8.897447,-4.396451,-4.369171,-5.946176,-6.259261,-5.569638,-5.100483,-7.516008,-6.542801,-6.295273,-3.847867,-3.694256,-5.701995,-4.847134,-4.633003,-3.488200,-4.483231,-5.891387,-5.347059,-6.896945,-6.722858,-8.254928,-7.664302,-6.807057,-6.101954,-4.893062,-5.060588,-6.494209,-7.090980,-5.584213,-6.072452,-5.721749,-4.336972,-8.568733,-9.294242,-4.545318,-4.903001,-4.978342,-5.005127,-8.005165,-7.430382,-4.835473,-5.760020,-5.455345,-6.080402,-4.838972,-4.555535,-8.214348,-8.065983,-6.475063,-7.168956,-8.349798,-7.694172,-5.416786,-5.836909,-4.143079,-4.304580,0,0,0,0,0,0,0,1,0,0,1,2,2,2,0,0,1,1,1,1,2,1,1,1,2,2,2,2,0,0,1,1,1,1,1,1,0,0,0,0,0,2,1,0,2,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,-2.867220,-3.581308,-6.361901,-6.215734,-7.817603,-7.543501,-9.297890,-6.915793,-7.457082,-10.325205,-10.293419,-7.932760,-8.191824,-7.745994,-7.883950,-1.181644,-1.731945,-2.669506,-2.658771,-7.001940,-6.908001,-8.122273,-7.938749,-10.738256,-10.716272,-9.535832,-9.626056,-7.447088,-7.071887,-8.064955,-4.112303,-2.979037,-7.831460,-7.888623,-6.946006,-6.428841,-8.319311,-7.901651,-8.265949,-7.771800,-6.908803,-7.066906,-7.627186,-7.697350,-8.090065,-8.366597,-6.278698,-5.666354,-5.577213,-7.609488,-7.243359,-5.590927,-5.755792,-6.131420,-6.061960,-4.400686,-4.076921,-4.369934,-4.109685,-5.502975,-5.955956,-5.718446,-6.567422,-4.487820,-3.631242,-7.382377,-6.608785,-8.162584,-7.572011,-6.290773,-7.215178,-6.451147,-6.394658,-8.073640,-7.631143,-9.166197,-7.618533,-6.236525,-5.747570,-8.125980,-7.687684,-5.350674,-4.355626,-6.518774,-6.156586,-5.475348,-6.366268,-7.405216,-7.777784,-2.708492,-4.092319,-4.583032,-5.948893,-6.226901,-7.678408,-8.329079,-7.455453,-4.351759,-4.285495,-5.736124,-5.990633,-3.350343,-4.777295,-4.546421,-2.939922,-6.466858,-6.442785,-7.169296,-7.102752,-7.505809,-7.167997,-5.483239,-4.130513,-4.541743,-5.060283,-6.313241,-7.541856,-6.801791,-7.737116,-4.669859,-5.749042,-6.383046,-6.956795,-4.382444,-2.609898,-5.824203,-5.322454,-8.175697,-6.949020,-5.074391,-6.609883,-7.634216,-6.562300,-6.861913,-5.041157,-7.990920,-6.831467,-5.640162,-6.578260,-5.387940,-6.192689,-5.208323,-3.961762,-2.926476,-2.859290,0,0,0,0,0,0,1,2,0,1,1,0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,0,0,2,0,0,0,0,2,2,2,0,0,0,0,0,1,0,0,1,0,0,0,0,2,0,0,0,0
872,-1.747928,-4.449989,-5.372048,-6.560492,-7.944585,-8.105243,-8.994594,-6.540410,-7.336239,-10.135839,-9.985911,-9.143183,-9.333754,-8.174982,-9.155202,-1.134225,-0.523497,-2.293758,-1.970192,-8.396465,-8.801958,-7.842027,-7.897650,-10.736984,-11.157666,-9.296067,-9.372590,-7.666681,-6.322588,-7.506737,-3.173963,-4.458481,-7.063141,-7.341984,-6.798877,-5.590144,-6.571441,-6.892676,-7.973932,-7.692331,-7.087997,-8.432980,-6.892139,-7.083571,-7.131714,-7.869683,-6.740646,-4.758738,-6.999163,-9.009403,-8.549909,-5.407547,-3.660242,-7.200995,-5.139194,-5.045063,-4.103549,-6.132653,-6.473145,-5.913386,-6.860583,-5.483001,-5.493941,-7.054822,-5.237486,-3.493995,-5.065436,-7.949756,-8.232868,-6.194532,-6.088019,-7.175467,-6.712580,-7.299726,-8.278236,-6.508470,-5.985790,-5.358678,-4.747276,-6.804773,-7.536499,-2.998458,-2.718739,-7.555903,-7.545112,-6.439805,-6.249846,-6.098869,-7.029415,-3.179776,-3.384879,-4.090179,-4.154538,-6.194044,-6.732482,-7.702503,-6.877585,-5.601679,-3.742391,-5.269391,-7.788725,-4.994603,-3.877213,-2.388131,-4.365915,-6.800779,-7.128434,-5.548472,-5.404759,-7.329993,-7.784298,-6.289595,-6.529269,-4.303375,-3.799403,-6.189659,-7.062585,-4.499328,-5.904967,-5.776401,-5.282827,-7.590305,-7.191616,-4.929659,-5.873552,-4.234394,-2.471939,-7.487332,-6.499194,-5.996086,-6.538206,-5.727576,-7.434768,-6.347007,-4.937631,-7.765279,-6.938893,-7.416206,-7.518516,-7.174912,-6.320928,-6.207793,-4.614854,-4.380567,-3.753092,1,1,1,1,0,0,2,0,2,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0
335,-1.382597,-3.500232,-5.882939,-5.921523,-7.737600,-8.844399,-8.856868,-7.550093,-7.948671,-9.863292,-9.688529,-7.861574,-7.809546,-9.276080,-10.093100,-1.189537,0.023869,-1.898105,-0.272359,-9.004804,-9.308040,-8.392371,-8.866366,-12.158949,-12.033212,-9.466644,-9.737862,-9.039653,-7.004527,-6.823449,-3.131669,-3.038979,-7.449006,-7.743461,-7.663066,-6.774057,-8.019411,-8.154445,-8.886547,-8.557160,-5.733959,-5.100146,-7.786900,-7.795978,-8.080023,-7.884673,-7.293446,-3.555398,-5.806026,-9.092939,-9.088044,-6.210588,-5.977443,-6.815739,-6.634535,-5.215782,-5.093414,-6.551122,-7.383790,-6.819246,-5.130825,-6.207796,-7.366977,-5.195635,-6.533332,-5.597506,-5.692346,-7.286571,-9.085072,-6.761538,-7.608101,-6.252616,-7.915556,-8.736465,-10.227658,-8.343840,-7.063081,-4.502292,-6.470369,-6.206415,-5.955582,-4.441139,-5.942519,-7.989668,-7.303537,-5.770640,-6.997111,-8.934860,-8.559319,-4.109296,-3.630364,-6.262893,-5.117399,-7.607260,-7.160445,-9.222999,-9.404113,-5.214482,-4.269139,-6.668910,-8.085919,-3.186914,-5.237458,-2.422275,-2.202471,-6.460872,-6.654243,-8.180724,-7.725736,-7.487440,-8.651043,-6.669822,-8.245122,-4.584192,-3.933858,-6.930715,-8.032425,-6.772500,-6.053491,-6.161284,-6.757883,-6.017277,-6.779178,-5.360744,-4.797317,-5.625081,-5.812740,-8.068952,-7.988905,-6.785750,-5.858204,-7.921898,-7.859751,-6.179422,-4.663970,-7.519290,-8.357227,-7.230486,-7.133592,-7.548747,-7.653474,-6.385857,-6.092414,-5.150743,-4.497060,1,0,0,0,2,2,0,1,2,1,2,1,1,1,0,0,2,2,2,1,1,0,1,1,2,2,1,1,0,2,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0
1051,-1.233843,-3.103492,-4.564680,-5.882902,-9.364782,-9.139636,-10.036721,-7.364762,-7.914036,-10.657225,-10.513251,-10.122156,-10.136928,-9.076927,-9.182691,-1.831261,-1.567340,-1.789439,-1.683559,-8.360910,-9.023203,-7.634691,-7.539055,-12.057859,-11.912377,-10.372388,-10.341365,-7.786916,-8.710766,-6.505665,-2.555011,-2.713418,-8.345359,-8.448978,-6.648949,-6.373930,-8.134665,-8.375230,-8.226169,-8.509271,-6.247842,-6.749297,-8.719278,-8.615213,-7.507224,-8.139108,-6.259379,-4.039079,-6.891122,-7.271388,-7.306616,-5.490433,-6.315223,-5.154155,-5.370865,-4.589079,-4.894810,-4.998760,-6.176752,-6.529801,-6.254127,-6.678470,-6.251280,-4.767854,-3.756980,-4.571850,-4.230685,-6.206469,-6.269814,-5.285903,-6.429165,-6.275057,-6.095670,-6.780406,-7.639595,-6.888316,-5.491262,-4.536519,-3.116248,-6.167922,-7.449190,-3.991753,-3.910913,-7.455243,-7.099066,-5.002952,-6.090908,-7.536154,-7.646632,-4.210648,-3.016891,-5.002508,-4.932732,-4.870985,-3.448643,-6.760559,-7.305810,-4.975448,-3.442743,-4.547054,-4.619559,-4.041627,-1.789465,-2.527260,-5.027366,-4.451465,-5.708625,-7.092382,-6.335925,-5.933244,-6.606695,-7.324626,-7.499034,-3.932600,-4.247305,-6.107916,-7.229488,-6.019880,-6.037811,-5.432615,-5.663182,-6.907516,-7.808976,-3.288968,-4.379272,-5.370269,-6.242127,-6.908476,-7.243173,-6.916129,-6.500662,-6.713818,-5.858223,-6.764612,-5.011513,-7.307429,-7.250113,-5.263575,-6.718475,-7.445584,-6.788950,-4.916251,-5.332387,-4.064450,-4.733032,0,1,1,1,1,1,0,2,0,2,0,0,0,0,0,1,1,1,1,0,0,1,1,0,0,0,1,1,1,1,0,0,0,1,1,1,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0


A few considerations should be kept in mind to further ensure that our approach is unbiased:

* Representation of Control Group: Make sure that the control group (CN) in the training data is representative of the normal population.

* Generalizability of Corrections: Applying corrections based solely on the control group assumes that the relationship between predictors (like age, sex) and the outcome (ROI features) is the same in control and diseased groups. If this assumption does not hold, the model may not capture disease-specific patterns effectively.

* Statistical Assumptions: Linear regression makes certain assumptions (like linearity, normality, homoscedasticity, and independence of errors). Ensure these assumptions hold for your data; otherwise, the corrections might be inappropriate.

Nested CV with GridSearch

In [43]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

X = data_age_filtered.drop(['diagnosis', 'PTID'], axis=1)
y = data_age_filtered['diagnosis']

# Assuming X, y, roi_features are defined
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

param_grid = {
    'n_neighbors': [3, 5, 7, 10, 15, 30],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean']
}

outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

outer_fold_results = []

for train_index, test_index in outer_cv.split(X_train):
    X_train_outer, X_val_outer = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_outer, y_val_outer = y_train.iloc[train_index], y_train.iloc[test_index]

    inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
    grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=inner_cv, scoring='accuracy', n_jobs=-1)

    for inner_train_index, inner_val_index in inner_cv.split(X_train_outer):
        X_train_inner, X_val_inner = X_train_outer.iloc[inner_train_index], X_train_outer.iloc[inner_val_index]
        y_train_inner, y_val_inner = y_train_outer.iloc[inner_train_index], y_train_outer.iloc[inner_val_index]

        # Preprocess for the inner fold
        data_controls_train_inner = X_train_inner[y_train_inner == 0]
        for roi_feature in roi_features:
            if roi_feature in X_train_inner.columns:
                regr = LinearRegression()
                regr.fit(data_controls_train_inner[['Sex', 'Age', 'DLICV_baseline']], data_controls_train_inner[roi_feature])
                # Apply correction to the training set
                correction_train = regr.predict(X_train_inner[['Sex', 'Age', 'DLICV_baseline']])
                X_train_inner[roi_feature] -= correction_train

                # Apply the same correction to the validation set
                correction_val = regr.predict(X_val_inner[['Sex', 'Age', 'DLICV_baseline']])
                X_val_inner[roi_feature] -= correction_val

        # Z-normalization for training and validation sets
        scaler = StandardScaler().fit(data_controls_train_inner[roi_features])
        X_train_inner[roi_features] = scaler.transform(X_train_inner[roi_features])
        X_val_inner[roi_features] = scaler.transform(X_val_inner[roi_features])

        X_train_inner = X_train_inner.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
        X_val_inner = X_val_inner.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

        grid_search.fit(X_train_inner, y_train_inner)

    # Evaluate the best model on the outer validation set
    best_model = grid_search.best_estimator_

    # We apply the linear correction and the z normalisation of the outer
    # loop after we are doe with the inner loop, in order to avoid applying
    # these filter twice.
    data_controls_train_outer = X_train_outer[y_train_outer == 0]
    for roi_feature in roi_features:
        if roi_feature in X_train_outer.columns:
            regr = LinearRegression()
            regr.fit(data_controls_train_outer[['Sex', 'Age', 'DLICV_baseline']], data_controls_train_outer[roi_feature])
            # Apply correction to the training set
            correction_train = regr.predict(X_train_outer[['Sex', 'Age', 'DLICV_baseline']])
            X_train_outer[roi_feature] -= correction_train

            # Apply the same correction to the validation set
            correction_val = regr.predict(X_val_outer[['Sex', 'Age', 'DLICV_baseline']])
            X_val_outer[roi_feature] -= correction_val

    # Z-normalization for training and validation sets
    scaler = StandardScaler().fit(data_controls_train_outer[roi_features])
    X_train_outer[roi_features] = scaler.transform(X_train_outer[roi_features])
    X_val_outer[roi_features] = scaler.transform(X_val_outer[roi_features])

    X_train_outer = X_train_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
    X_val_outer = X_val_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)


    y_pred_val = best_model.predict(X_val_outer)

    # # Calculate and print metrics for each outer fold
    # accuracy = best_model.score(X_val_outer, y_val_outer)
    # print(f"Outer Fold Metrics: Accuracy: {accuracy}, Balanced Accuracy: {balanced_accuracy_score(y_val_outer, y_pred_val)}, ...")

    # Append metrics to dict
    fold_metrics = {
        "Accuracy": best_model.score(X_val_outer, y_val_outer),
        "Balanced Accuracy": balanced_accuracy_score(y_val_outer, y_pred_val),
        "Precision": precision_score(y_val_outer, y_pred_val, average='weighted'),
        "Recall": recall_score(y_val_outer, y_pred_val, average='weighted'),
        "F1 Score": f1_score(y_val_outer, y_pred_val, average='weighted'),
        "AUC": roc_auc_score(y_val_outer, best_model.predict_proba(X_val_outer), multi_class='ovr', average='weighted')
    }
    outer_fold_results.append(fold_metrics)
    print(f"Outer Fold Metrics: {fold_metrics}")

# Average metrics across all outer folds
avg_metrics = {metric: np.mean([fold[metric] for fold in outer_fold_results]) for metric in outer_fold_results[0]}
print("\nAverage Metrics Across All Outer Folds:")
print(avg_metrics)



Outer Fold Metrics: {'Accuracy': 0.5261044176706827, 'Balanced Accuracy': 0.5438342934405926, 'Precision': 0.541455697481864, 'Recall': 0.5261044176706827, 'F1 Score': 0.524197643029713, 'AUC': 0.6545805117264375}
Outer Fold Metrics: {'Accuracy': 0.5461847389558233, 'Balanced Accuracy': 0.48808061333014846, 'Precision': 0.5498400220266408, 'Recall': 0.5461847389558233, 'F1 Score': 0.5033236844813651, 'AUC': 0.6991129134026501}
Outer Fold Metrics: {'Accuracy': 0.4819277108433735, 'Balanced Accuracy': 0.50904838585998, 'Precision': 0.524770410312579, 'Recall': 0.4819277108433735, 'F1 Score': 0.46561906856635465, 'AUC': 0.6521063197965931}
Outer Fold Metrics: {'Accuracy': 0.592741935483871, 'Balanced Accuracy': 0.5621432549963464, 'Precision': 0.5922003842414023, 'Recall': 0.592741935483871, 'F1 Score': 0.5910676531188976, 'AUC': 0.7102515659010042}
Outer Fold Metrics: {'Accuracy': 0.5483870967741935, 'Balanced Accuracy': 0.505707643126998, 'Precision': 0.5435227854582693, 'Recall': 0.548

In [44]:
# Retrieve the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Retrain the model on the entire training dataset (X_train, y_train) with these parameters
final_knn_model = KNeighborsClassifier(**best_params)

# Apply preprocessing (linear correction and Z-normalization) to the entire training dataset X_train
data_controls = X_train[y_train == 0]  # Control group for the training dataset
for roi_feature in roi_features:
    if roi_feature in X_train.columns:
        regr = LinearRegression()
        regr.fit(data_controls[['Sex', 'Age', 'DLICV_baseline']], data_controls[roi_feature])
        # Apply correction to the training set
        correction_train = regr.predict(X_train[['Sex', 'Age', 'DLICV_baseline']])
        X_train[roi_feature] -= correction_train

        # Apply the same correction to the test set
        correction_test = regr.predict(X_test[['Sex', 'Age', 'DLICV_baseline']])
        X_test[roi_feature] -= correction_test


# Z-normalization
scaler = StandardScaler().fit(data_controls[roi_features])
X_train[roi_features] = scaler.transform(X_train[roi_features])
X_test[roi_features] = scaler.transform(X_test[roi_features])


X_train = X_train.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
X_test = X_test.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

# Train the final model on the entire training dataset
final_knn_model.fit(X_train, y_train)

# # Apply linear correction and Z-normalization to X_test
# for roi_feature in roi_features:
#     if roi_feature in X_test.columns:
#         correction_test = regr.predict(X_test[['Sex', 'Age', 'DLICV_baseline']])
#         X_test[roi_feature] -= correction_test
# X_test[roi_features] = scaler.transform(X_test[roi_features])
# X_test = X_test.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

# Evaluate the final model on the test set (X_test, y_test)
y_pred_test = final_knn_model.predict(X_test)
test_accuracy = final_knn_model.score(X_test, y_test)
test_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1 = f1_score(y_test, y_pred_test, average='weighted')
test_auc = roc_auc_score(y_test, final_knn_model.predict_proba(X_test), multi_class='ovr', average='weighted')

print(f"Test set Metrics: Accuracy: {test_accuracy}, Balanced Accuracy: {test_balanced_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}, AUC: {test_auc}")

Test set Metrics: Accuracy: 0.4909090909090909, Balanced Accuracy: 0.47158876562369395, Precision: 0.5185294325239977, Recall: 0.4909090909090909, F1 Score: 0.47370538801022527, AUC: 0.6329923516691478


In [32]:
best_params

{'metric': 'euclidean', 'n_neighbors': 30, 'weights': 'uniform'}

#### Nested CV with Randomized Search

In [52]:
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats

X = data_age_filtered.drop(['diagnosis', 'PTID'], axis=1)
y = data_age_filtered['diagnosis']

# Assuming X, y, roi_features are defined
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Define parameter distribution (instead of a fixed grid)
param_dist = {
    'n_neighbors': stats.randint(3, 1000),  # Uniformly distributed integers from 3 to 30
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

outer_fold_results = []

for train_index, test_index in outer_cv.split(X_train):
    X_train_outer, X_val_outer = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_outer, y_val_outer = y_train.iloc[train_index], y_train.iloc[test_index]

    inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # Use RandomizedSearchCV
    n_iter_search = 20  # Number of parameter settings sampled
    random_search = RandomizedSearchCV(KNeighborsClassifier(), param_distributions=param_dist, n_iter=n_iter_search, cv=inner_cv, scoring='accuracy', n_jobs=-1, random_state=42)

    for inner_train_index, inner_val_index in inner_cv.split(X_train_outer):
        X_train_inner, X_val_inner = X_train_outer.iloc[inner_train_index], X_train_outer.iloc[inner_val_index]
        y_train_inner, y_val_inner = y_train_outer.iloc[inner_train_index], y_train_outer.iloc[inner_val_index]

        # Preprocess for the inner fold
        data_controls_train_inner = X_train_inner[y_train_inner == 0]
        for roi_feature in roi_features:
            if roi_feature in X_train_inner.columns:
                regr = LinearRegression()
                regr.fit(data_controls_train_inner[['Sex', 'Age', 'DLICV_baseline']], data_controls_train_inner[roi_feature])
                # Apply correction to the training set
                correction_train = regr.predict(X_train_inner[['Sex', 'Age', 'DLICV_baseline']])
                X_train_inner[roi_feature] -= correction_train

                # Apply the same correction to the validation set
                correction_val = regr.predict(X_val_inner[['Sex', 'Age', 'DLICV_baseline']])
                X_val_inner[roi_feature] -= correction_val

        # Z-normalization for training and validation sets
        scaler = StandardScaler().fit(data_controls_train_inner[roi_features])
        X_train_inner[roi_features] = scaler.transform(X_train_inner[roi_features])
        X_val_inner[roi_features] = scaler.transform(X_val_inner[roi_features])

        X_train_inner = X_train_inner.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
        X_val_inner = X_val_inner.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

        random_search.fit(X_train_inner, y_train_inner)

    # Evaluate the best model on the outer validation set
    best_model = grid_search.best_estimator_

    # We apply the linear correction and the z normalisation of the outer
    # loop after we are doe with the inner loop, in order to avoid applying
    # these filter twice.
    data_controls_train_outer = X_train_outer[y_train_outer == 0]
    for roi_feature in roi_features:
        if roi_feature in X_train_outer.columns:
            regr = LinearRegression()
            regr.fit(data_controls_train_outer[['Sex', 'Age', 'DLICV_baseline']], data_controls_train_outer[roi_feature])
            # Apply correction to the training set
            correction_train = regr.predict(X_train_outer[['Sex', 'Age', 'DLICV_baseline']])
            X_train_outer[roi_feature] -= correction_train

            # Apply the same correction to the validation set
            correction_val = regr.predict(X_val_outer[['Sex', 'Age', 'DLICV_baseline']])
            X_val_outer[roi_feature] -= correction_val

    # Z-normalization for training and validation sets
    scaler = StandardScaler().fit(data_controls_train_outer[roi_features])
    X_train_outer[roi_features] = scaler.transform(X_train_outer[roi_features])
    X_val_outer[roi_features] = scaler.transform(X_val_outer[roi_features])

    X_train_outer = X_train_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
    X_val_outer = X_val_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)


    y_pred_val = best_model.predict(X_val_outer)
    # Calculate and print metrics for each outer fold
    accuracy = best_model.score(X_val_outer, y_val_outer)
    print(f"Outer Fold Metrics: Accuracy: {accuracy}, Balanced Accuracy: {balanced_accuracy_score(y_val_outer, y_pred_val)}, ...")

    # Append metrics to dict
    fold_metrics = {
        "Accuracy": best_model.score(X_val_outer, y_val_outer),
        "Balanced Accuracy": balanced_accuracy_score(y_val_outer, y_pred_val),
        "Precision": precision_score(y_val_outer, y_pred_val, average='macro'),
        "Recall": recall_score(y_val_outer, y_pred_val, average='macro'),
        "F1 Score": f1_score(y_val_outer, y_pred_val, average='macro'),
        "AUC": roc_auc_score(y_val_outer, best_model.predict_proba(X_val_outer), multi_class='ovr', average='macro')
    }

    outer_fold_results.append(fold_metrics)
    print(f"Outer Fold Metrics: {fold_metrics}")

# Average metrics across all outer folds
avg_metrics = {metric: np.mean([fold[metric] for fold in outer_fold_results]) for metric in outer_fold_results[0]}
print("\nAverage Metrics Across All Outer Folds:")
print(avg_metrics)



Outer Fold Metrics: Accuracy: 0.5742971887550201, Balanced Accuracy: 0.5233277658474509, ...
Outer Fold Metrics: {'Accuracy': 0.5742971887550201, 'Balanced Accuracy': 0.5233277658474509, 'Precision': 0.7129629629629629, 'Recall': 0.5233277658474509, 'F1 Score': 0.4886280806983572, 'AUC': 0.8455802086677028}
Outer Fold Metrics: Accuracy: 0.5542168674698795, Balanced Accuracy: 0.4881886740944224, ...
Outer Fold Metrics: {'Accuracy': 0.5542168674698795, 'Balanced Accuracy': 0.4881886740944224, 'Precision': 0.705850436423685, 'Recall': 0.4881886740944224, 'F1 Score': 0.42644111062460066, 'AUC': 0.8262809783490722}
Outer Fold Metrics: Accuracy: 0.5542168674698795, Balanced Accuracy: 0.5450885668276974, ...
Outer Fold Metrics: {'Accuracy': 0.5542168674698795, 'Balanced Accuracy': 0.5450885668276974, 'Precision': 0.6741519350215003, 'Recall': 0.5450885668276974, 'F1 Score': 0.5098015357992646, 'AUC': 0.8404921981038395}
Outer Fold Metrics: Accuracy: 0.5846774193548387, Balanced Accuracy: 0.54

In [53]:
# Retrieve the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Retrain the model on the entire training dataset (X_train, y_train) with these parameters
final_knn_model = KNeighborsClassifier(**best_params)

# Apply preprocessing (linear correction and Z-normalization) to the entire training dataset X_train
data_controls = X_train[y_train == 0]  # Control group for the training dataset
for roi_feature in roi_features:
    if roi_feature in X_train.columns:
        regr = LinearRegression()
        regr.fit(data_controls[['Sex', 'Age', 'DLICV_baseline']], data_controls[roi_feature])
        # Apply correction to the training set
        correction_train = regr.predict(X_train[['Sex', 'Age', 'DLICV_baseline']])
        X_train[roi_feature] -= correction_train

        # Apply the same correction to the test set
        correction_test = regr.predict(X_test[['Sex', 'Age', 'DLICV_baseline']])
        X_test[roi_feature] -= correction_test


# Z-normalization
scaler = StandardScaler().fit(data_controls[roi_features])
X_train[roi_features] = scaler.transform(X_train[roi_features])
X_test[roi_features] = scaler.transform(X_test[roi_features])


X_train = X_train.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
X_test = X_test.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

# Train the final model on the entire training dataset
final_knn_model.fit(X_train, y_train)

# # Apply linear correction and Z-normalization to X_test
# for roi_feature in roi_features:
#     if roi_feature in X_test.columns:
#         correction_test = regr.predict(X_test[['Sex', 'Age', 'DLICV_baseline']])
#         X_test[roi_feature] -= correction_test
# X_test[roi_features] = scaler.transform(X_test[roi_features])
# X_test = X_test.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

# Evaluate the final model on the test set (X_test, y_test)
y_pred_test = final_knn_model.predict(X_test)
test_accuracy = final_knn_model.score(X_test, y_test)
test_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='weighted')
test_recall = recall_score(y_test, y_pred_test, average='weighted')
test_f1 = f1_score(y_test, y_pred_test, average='weighted')
test_auc = roc_auc_score(y_test, final_knn_model.predict_proba(X_test), multi_class='ovr', average='weighted')

print(f"Test set Metrics: Accuracy: {test_accuracy}, Balanced Accuracy: {test_balanced_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1 Score: {test_f1}, AUC: {test_auc}")

Test set Metrics: Accuracy: 0.4909090909090909, Balanced Accuracy: 0.47158876562369395, Precision: 0.5185294325239977, Recall: 0.4909090909090909, F1 Score: 0.47370538801022527, AUC: 0.6329923516691478


In [50]:
best_params

{'metric': 'euclidean', 'n_neighbors': 30, 'weights': 'distance'}

##### back ups


In [36]:
# only corrected and standardized outer folds
# (which include the inner folds, but it is the same problem of data
# leakage even though the whole data are train data. Even though all of these
# data are train data, we should treat them as validation/test data for validating
# and evaluating the model.)

# from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
# from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score
# from sklearn.linear_model import LinearRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.preprocessing import StandardScaler

# # Splitting dataset into train and test
# X = data_age_filtered.drop(['diagnosis', 'PTID'], axis=1)
# y = data_age_filtered['diagnosis']

# # Assuming X, y, roi_features are defined
# # Split the dataset into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# # Define parameter grid for KNN
# param_grid = {
#     'n_neighbors': [3, 5, 7, 10, 15, 30],
#     'weights': ['uniform', 'distance'],
#     'metric': ['euclidean', 'manhattan']
# }

# # Define outer cross-validation
# outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Lists to store metrics for each outer fold
# outer_fold_accuracy = []
# outer_fold_balanced_accuracy = []
# outer_fold_precision = []
# outer_fold_recall = []
# outer_fold_f1 = []
# outer_fold_auc = []

# # Outer loop for model evaluation
# for train_index, test_index in outer_cv.split(X_train, y_train):
#     # Split training data into training and validation for the current outer fold
#     X_train_outer, X_val_outer = X_train.iloc[train_index], X_train.iloc[test_index]
#     y_train_outer, y_val_outer = y_train.iloc[train_index], y_train.iloc[test_index]

#     # Apply preprocessing (linear correction and Z-normalization) to X_train_outer and X_val_outer
#     # Linear correction
#     data_controls_train_outer = X_train_outer[y_train_outer == 0]  # Control group for the fold
#     for roi_feature in roi_features:
#         if roi_feature in X_train_outer.columns:
#             regr = LinearRegression()
#             regr.fit(data_controls_train_outer[['Sex', 'Age', 'DLICV_baseline']], data_controls_train_outer[roi_feature])
#             # Apply correction to the training set
#             correction_train = regr.predict(X_train_outer[['Sex', 'Age', 'DLICV_baseline']])
#             X_train_outer[roi_feature] -= correction_train

#             # Apply the same correction to the validation set
#             correction_val = regr.predict(X_val_outer[['Sex', 'Age', 'DLICV_baseline']])
#             X_val_outer[roi_feature] -= correction_val

#     # Z-normalization for training and validation sets
#     scaler = StandardScaler().fit(data_controls_train_outer[roi_features])
#     X_train_outer[roi_features] = scaler.transform(X_train_outer[roi_features])
#     X_val_outer[roi_features] = scaler.transform(X_val_outer[roi_features])

#     X_train_outer = X_train_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)
#     X_val_outer = X_val_outer.drop(['Sex', 'Age', 'DLICV_baseline'], axis=1)

#     # Inner loop for hyperparameter tuning
#     inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#     grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=inner_cv, scoring='accuracy', n_jobs=-1)
#     grid_search.fit(X_train_outer, y_train_outer)

#     # Train model with best parameters on the entire training set of the outer fold
#     best_model = grid_search.best_estimator_
#     best_model.fit(X_train_outer, y_train_outer)

#     # Evaluate model on the validation set of the outer fold
#     y_pred_val = best_model.predict(X_val_outer)
#     outer_fold_accuracy.append(best_model.score(X_val_outer, y_val_outer))
#     outer_fold_balanced_accuracy.append(balanced_accuracy_score(y_val_outer, y_pred_val))
#     outer_fold_precision.append(precision_score(y_val_outer, y_pred_val, average='weighted'))
#     outer_fold_recall.append(recall_score(y_val_outer, y_pred_val, average='weighted'))
#     outer_fold_f1.append(f1_score(y_val_outer, y_pred_val, average='weighted'))
#     outer_fold_auc.append(roc_auc_score(y_val_outer, best_model.predict_proba(X_val_outer), multi_class='ovr', average='weighted'))

#     # Evaluate model on the validation set of the outer fold
#     y_pred_val = best_model.predict(X_val_outer)
#     accuracy = best_model.score(X_val_outer, y_val_outer)
#     balanced_accuracy = balanced_accuracy_score(y_val_outer, y_pred_val)
#     precision = precision_score(y_val_outer, y_pred_val, average='weighted')
#     recall = recall_score(y_val_outer, y_pred_val, average='weighted')
#     f1 = f1_score(y_val_outer, y_pred_val, average='weighted')
#     auc = roc_auc_score(y_val_outer, best_model.predict_proba(X_val_outer), multi_class='ovr', average='weighted')

#     # Append metrics to lists
#     outer_fold_accuracy.append(accuracy)
#     outer_fold_balanced_accuracy.append(balanced_accuracy)
#     outer_fold_precision.append(precision)
#     outer_fold_recall.append(recall)
#     outer_fold_f1.append(f1)
#     outer_fold_auc.append(auc)


# # Calculate average metrics across all outer folds
# print("Average Metrics Across All Outer Folds:")
# print(f"Accuracy: {np.mean(outer_fold_accuracy)}")
# print(f"Balanced Accuracy: {np.mean(outer_fold_balanced_accuracy)}")
# print(f"Precision: {np.mean(outer_fold_precision)}")
# print(f"Recall: {np.mean(outer_fold_recall)}")
# print(f"F1 Score: {np.mean(outer_fold_f1)}")
# print(f"AUC: {np.mean(outer_fold_auc)}")
