In [121]:
import os
from google.colab import drive

# Mount Google Drive (forcing a fresh mount) and make the project folder
# the working directory, so the relative paths used later resolve against it.
project_dir = '/content/drive/MyDrive/Colab Notebooks/Innolab'
drive.mount('/content/drive', force_remount=True)
os.chdir(project_dir)

Mounted at /content/drive


In [126]:
import pandas as pd
import numpy as np
from data_preparation import prepare_data

In [129]:
# Importing prepare_data from Drive does not work here, so the function is redefined below (same function as in data_preparation.py).

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def prepare_data(data: pd.DataFrame, classification: bool = True,
                 columns_drop: list = ["ConnID", "Repseudonym", "siteid", "visdat", "MEM_score", "Apoe", "IDs"],
                 target: str = "prmdiag", y_0: list = [0], y_1: list = [2, 3],
                 train_size: float = 0.8, seed: int = 123) -> list:
    """
    Prepare a data set for modelling: build the response column ``y``, drop the
    columns that must not be used as predictors and split into train and test.

    The passed ``data`` frame is NOT modified; all work is done on new objects,
    so the function can safely be called several times on the same frame.
    The list defaults are only read, never mutated.

    Args:
        data: A pd.DataFrame containing ``target`` and all ``columns_drop``.
        classification: True for a binary classification task, False for regression.
        columns_drop: columns that should be dropped (all other columns are used
            for modelling). Listing ``target`` here as well is allowed.
        target: name of the target variable.
        y_0: (classification only) values of ``target`` that are encoded as 0.
        y_1: (classification only) values of ``target`` that are encoded as 1.
            A value that appears in both ``y_0`` and ``y_1`` is encoded as 1.
        train_size: fraction of rows used for training, strictly between 0 and 1.
        seed: seed for the reproducibility of the train/test split.

    Returns:
        A list with the training data (index 0) and the test data (index 1).
        Each frame has ``y`` as first column followed by the remaining
        predictors. For classification ``y`` is 0.0/1.0 and rows whose target
        value is in neither ``y_0`` nor ``y_1`` are removed; for regression
        ``y`` is a copy of ``target``.

    Raises:
        AssertionError: if an argument has the wrong type or an invalid value
            (checks are done with ``assert``, so they are skipped under ``python -O``).
    """

    # check input types
    assert isinstance(data, pd.DataFrame), "provided data is no pd.DataFrame"
    assert isinstance(classification, bool), "classification is no boolean"
    assert isinstance(columns_drop, list), "invalid columns_drop, must be list"
    assert all(isinstance(x, str) for x in columns_drop), "invalid columns_drop, elements must be strings"
    assert isinstance(target, str), "invalid target, must be string"
    assert isinstance(y_0, list), "invalid, y_0, must be list"
    assert all(isinstance(x, int) for x in y_0), "invalid y_0, elements must be integers"
    assert isinstance(y_1, list), "invalid, y_1, must be list"
    assert all(isinstance(x, int) for x in y_1), "invalid y_1, elements must be integers"
    assert isinstance(train_size, float), "invalid train size, must be float"
    assert isinstance(seed, int), "provided seed is no integer"

    # check if inputs are valid
    assert 0.0 < train_size < 1.0, "invalid train size, must be > 0 and < 1"
    assert target in data.columns, "target not found in columns of data"
    assert all(x in data.columns for x in columns_drop), "columns_drop not found in columns of data"
    if classification:
        # compute the observed target values once instead of once per element
        observed_values = data[target].unique()
        assert all(x in observed_values for x in y_0), "y_0 not found in target variable"
        assert all(x in observed_values for x in y_1), "y_1 not found in target variable"

    # create y variable (on new objects, the caller's frame stays untouched)
    if classification:
        # positional boolean masks, so a non-unique row index is no problem
        in_class_0 = data[target].isin(y_0).to_numpy()
        in_class_1 = data[target].isin(y_1).to_numpy()

        # rows belonging to neither class would get a missing y and are removed
        keep_rows = in_class_0 | in_class_1

        # among the kept rows: 1.0 if the target is in y_1, otherwise 0.0
        y = pd.Series(in_class_1[keep_rows].astype(float),
                      index=data.index[keep_rows], name="y")
    else:  # regression
        keep_rows = None
        y = data[target].rename("y")

    # drop target (as y was created) and other columns; a pre-existing "y"
    # column is replaced by the new one. dict.fromkeys removes duplicates
    # (e.g. target also listed in columns_drop) while keeping the order.
    removable = dict.fromkeys([*columns_drop, target, "y"])
    features = data.drop(columns=[col for col in removable if col in data.columns])
    if keep_rows is not None:
        features = features.loc[keep_rows]

    # y becomes the first variable, followed by all predictors
    prepared = pd.concat([y, features], axis=1)

    # perform train/test split
    data_list = train_test_split(prepared, train_size=train_size, random_state=seed, shuffle=True)

    return data_list


 

In [128]:
# TODO: switch to the output of the preprocessing step
train, test = (
    pd.read_csv(csv_file)
    for csv_file in ("data_DELCODE/train.csv", "data_DELCODE/test.csv")
)
pd.concat([train, test])

Unnamed: 0,ConnID,Repseudonym,siteid,age,visdat,sex,prmdiag,edyears,MEM_score,Apoe,IDs,1_2,1_3,1_4,1_5,1_6,1_7,1_8,1_9,1_10,1_11,1_12,1_13,1_14,1_15,1_16,1_17,1_18,1_19,1_20,1_21,1_22,1_23,1_24,1_25,1_26,1_27,1_28,1_29,1_30,...,237_243,237_244,237_245,237_246,238_239,238_240,238_241,238_242,238_243,238_244,238_245,238_246,239_240,239_241,239_242,239_243,239_244,239_245,239_246,240_241,240_242,240_243,240_244,240_245,240_246,241_242,241_243,241_244,241_245,241_246,242_243,242_244,242_245,242_246,243_244,243_245,243_246,244_245,244_246,245_246
0,85,3fd91223d,14,63,18.05.2016,1,0,13,1.215577,1.0,85.0,0.749860,0.230923,-0.065474,0.412777,0.303270,0.341117,-0.027277,0.574057,0.290517,0.862302,0.479994,0.419541,0.089797,0.211377,-0.069165,0.243698,0.138934,0.203588,-0.128424,-0.142274,-0.097212,0.342343,-0.006937,0.953543,0.271842,-0.215807,-0.345942,0.288377,-0.002376,...,0.675390,0.641759,0.256006,0.161096,0.051992,0.335806,0.274249,0.464632,0.346927,0.197321,0.093035,0.169661,0.843718,0.758413,0.297913,0.398200,-0.033476,1.027731,1.040865,1.111977,0.796641,0.750172,0.242670,0.693493,0.663318,0.677533,0.545846,0.246136,0.600611,0.532708,0.910899,0.423027,0.168445,0.086235,0.553605,0.260301,0.262766,-0.064755,-0.121692,1.193451
1,11,0f1b4b7ac,11,68,26.10.2017,1,2,13,-0.583224,1.0,11.0,0.769255,0.488714,0.116482,-0.021402,-0.150269,0.307751,0.393959,0.304850,0.355440,0.476269,0.125607,-0.130721,-0.072421,0.430517,0.186178,0.727480,0.388229,0.120676,0.009006,0.496298,0.096529,0.357324,0.193164,0.485271,0.248289,-0.113434,-0.227484,0.485454,0.268980,...,1.170770,0.347485,0.377663,0.224154,0.323828,0.439175,0.538379,0.322498,0.745291,0.248104,0.554985,0.353254,0.220629,0.451341,0.322443,0.568902,0.357294,0.450699,0.659226,0.461063,0.722502,0.556016,0.228903,0.295915,0.322317,0.484691,0.791189,0.268268,0.141795,0.034816,0.381080,0.852528,0.141102,0.182648,0.196211,0.387882,0.299217,0.194711,0.191894,1.018621
2,619,bf5da3838,11,78,22.06.2016,0,1,14,-0.856116,1.0,619.0,0.168419,0.257018,0.016907,0.091702,-0.010532,0.435234,0.106881,0.182423,0.056001,0.502006,0.317397,0.217119,0.021138,0.640308,0.071841,0.204547,0.161851,0.247751,-0.140406,0.114880,-0.060171,0.269094,0.016566,0.430953,0.037941,0.160389,-0.093853,0.430277,-0.035102,...,0.709562,0.446627,0.172218,-0.172940,0.199733,0.216898,0.212193,0.245178,0.138725,0.192206,0.234587,0.136768,0.221140,0.219693,0.082570,0.190661,0.110283,0.411547,0.318120,0.673747,0.659346,0.442534,0.396549,0.123856,0.012965,0.577570,0.563080,0.140217,0.097882,-0.028234,0.639703,0.427146,0.088723,-0.245473,0.663036,0.142216,-0.300101,0.115183,-0.298258,0.131574
3,252,47e5a2600,2,63,28.11.2016,1,1,17,0.862633,0.0,252.0,0.868713,0.325486,0.395653,0.391204,0.349502,0.079478,0.199210,0.158742,-0.060597,0.801271,0.418965,0.180624,0.031938,0.310969,0.231340,0.663267,0.339705,0.427437,0.265302,0.042056,0.125814,0.487544,0.190261,0.516608,0.238371,0.118187,0.057087,0.064976,0.018432,...,0.695289,0.720458,0.383823,0.223196,0.268081,0.140685,0.303392,0.211814,0.215745,0.049197,0.347162,0.521063,0.218042,0.429776,0.175440,0.284785,-0.061578,0.478259,0.531669,0.544807,0.729218,0.118570,-0.128986,0.300403,0.406977,0.579221,0.414116,0.185960,0.494446,0.406204,0.331876,0.222091,0.354670,0.261393,0.797265,0.431262,0.195765,0.176242,-0.108549,0.525855
4,872,48387a07e,8,79,2015-08-31,1,3,11,-2.116266,1.0,872.0,0.128000,0.545008,-0.343972,0.862140,-0.015617,0.179164,-0.233017,0.034938,-0.105519,0.733050,0.088092,0.277305,-0.179329,0.332651,-0.231090,0.702441,-0.072943,0.645076,-0.215037,0.274248,-0.451468,0.574711,-0.173056,0.595247,-0.396143,0.378200,-0.196846,0.674376,-0.166660,...,0.301599,0.281604,-0.257081,0.164976,-0.064623,0.205962,0.259190,0.369864,0.119939,0.430113,-0.128200,0.036388,-0.062767,0.013018,0.002274,0.071944,0.069917,0.173637,0.107760,0.456734,0.444816,0.072618,0.086900,-0.095910,0.172357,0.220310,0.484933,0.004595,-0.263606,-0.013920,0.155383,0.416782,-0.300804,0.057657,0.031142,-0.086161,0.055487,-0.082463,-0.029868,0.177041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,582,b67b41df4,17,75,17.02.2016,1,2,13,-1.129996,0.0,582.0,0.755933,0.279517,0.149091,0.052978,-0.072814,0.281643,0.166668,0.390767,0.308243,0.709724,0.292785,0.370437,0.055550,0.247428,0.166505,0.731781,0.158642,0.320686,0.040845,-0.040871,0.127707,0.350518,0.005856,0.384852,0.204982,0.300694,0.136284,0.565172,0.010010,...,0.745143,0.516429,0.099301,0.061510,0.410268,0.014048,0.137245,0.048413,0.348382,0.040848,0.205206,0.140130,0.269176,0.436009,-0.102222,0.342045,0.045286,0.244485,0.279450,0.478796,0.524764,0.253968,0.172204,0.087065,0.302393,0.425745,0.503829,0.308995,0.107462,-0.061540,0.484448,0.467015,-0.105310,-0.141064,0.913408,-0.074612,-0.111399,-0.224415,-0.286836,0.336503
178,871,16437ed4f,8,75,2017-01-10,1,3,10,-1.866769,0.0,871.0,0.449709,0.094578,0.087171,0.062335,-0.102834,0.305695,0.164338,0.143627,0.174432,0.370682,0.116743,0.084464,-0.183651,0.476749,0.327601,0.424649,0.157659,0.498215,0.215625,0.425984,0.415790,0.213661,0.132123,0.450839,0.167749,0.097507,-0.050451,0.284726,0.028092,...,-0.081792,0.047471,0.020948,-0.053185,-0.157380,-0.043794,0.096956,0.094565,-0.124034,0.313645,-0.049446,0.325272,0.166447,0.042219,0.279172,0.283053,-0.022596,0.260205,0.020118,0.349210,0.626126,0.498804,0.286380,-0.090909,0.195216,0.283289,0.371555,0.059083,-0.081508,0.103415,0.353635,0.199127,-0.061212,0.007186,0.129561,-0.229676,-0.135857,-0.170095,0.085570,0.265964
179,447,9017e3b68,13,69,22.04.2015,0,1,18,-0.070936,0.0,447.0,0.999838,0.265437,0.064838,0.273070,0.127590,0.413145,-0.130358,0.596543,0.421639,0.872816,0.682815,0.531743,0.238102,0.697317,0.136118,0.212304,0.078386,0.710412,0.305626,0.097869,0.029067,0.314796,-0.004899,0.384429,0.063470,0.089283,-0.080691,0.558093,0.172003,...,0.644016,0.181685,0.142625,-0.029439,-0.084909,0.083132,-0.257401,0.521824,0.280201,0.189806,0.167511,-0.023666,-0.037617,0.152923,0.029774,0.230403,0.165138,0.322878,0.538333,0.219641,0.332110,0.171374,0.372054,0.015542,-0.090596,0.003481,0.388222,0.103221,-0.191940,0.014135,0.389067,0.475096,0.135471,-0.109216,0.456896,0.109648,0.123226,0.146611,0.130343,0.344798
180,789,f85559075,2,71,10.04.2018,0,1,15,-0.327773,0.0,789.0,1.035043,0.522144,0.175784,0.893588,0.361376,-0.034800,0.176660,0.112255,0.006876,0.873531,0.532428,0.536440,0.331871,-0.074082,-0.102804,0.367833,0.019475,0.615560,-0.234014,-0.524451,-0.077777,0.350643,0.309283,0.647946,-0.057053,0.255685,-0.100881,0.397163,-0.082778,...,0.480073,0.462388,0.238441,0.183852,0.305138,0.405770,0.368763,0.480553,0.536894,0.483155,0.216093,0.373432,0.495368,0.508624,0.314638,0.447226,0.249061,0.465490,0.765885,0.578834,0.787369,0.650306,0.653662,0.255215,0.522276,0.346901,0.421487,0.221639,0.381781,0.532531,0.865034,0.808278,0.162444,0.310823,0.739667,0.143182,0.257853,0.039514,0.323592,0.510412


# Data preparation
Preparation of the data for modelling. Creates the target variable, drops unnecessary columns, performs a train/test split. \\
The user has to specify:
- *classification*: is it a classification task (True) or a regression task (False)
- *columns_drop*: which variables shouldn't be used for modelling
- *target*: what is the name of the target variable
- *y_0, y_1* (only relevant for classification task): which values of the target variable are 0, which are 1
- *train_size*: size of the training data
- *seed*: a seed to ensure reproducibility of train/test split

In [130]:
# settings for the classification example (target: prmdiag)
data = pd.concat([train, test])
classification = True
target = "prmdiag"
columns_drop = ["ConnID", "Repseudonym", "siteid", "visdat", "MEM_score", "Apoe", "IDs"]
y_0, y_1 = [0], [2, 3]  # target values encoded as 0 / as 1
train_size, seed = 0.8, 123

# build the modelling data; index 0 holds the training part, index 1 the test part
data_list = prepare_data(
    data=data,
    classification=classification,
    columns_drop=columns_drop,
    target=target,
    y_0=y_0,
    y_1=y_1,
    train_size=train_size,
    seed=seed,
)

# split both parts into response (y) and predictors (X)
ytrain = data_list[0]["y"]
Xtrain = data_list[0].drop(columns="y")
ytest = data_list[1]["y"]
Xtest = data_list[1].drop(columns="y")

data_list[0] # training data

Unnamed: 0,y,age,sex,edyears,1_2,1_3,1_4,1_5,1_6,1_7,1_8,1_9,1_10,1_11,1_12,1_13,1_14,1_15,1_16,1_17,1_18,1_19,1_20,1_21,1_22,1_23,1_24,1_25,1_26,1_27,1_28,1_29,1_30,1_31,1_32,1_33,1_34,1_35,1_36,1_37,...,237_243,237_244,237_245,237_246,238_239,238_240,238_241,238_242,238_243,238_244,238_245,238_246,239_240,239_241,239_242,239_243,239_244,239_245,239_246,240_241,240_242,240_243,240_244,240_245,240_246,241_242,241_243,241_244,241_245,241_246,242_243,242_244,242_245,242_246,243_244,243_245,243_246,244_245,244_246,245_246
669,1.0,69,1,16,0.397735,0.040334,-0.073844,0.253680,0.017894,0.253501,0.015033,0.303579,0.077040,0.427777,0.304312,-0.047886,-0.217151,0.250177,-0.106913,0.584106,0.100530,0.207814,-0.001066,0.139676,0.194629,-0.038919,-0.053973,0.360676,0.242022,0.022354,-0.202717,0.376343,0.307500,0.376860,0.229468,0.239679,0.177953,0.262597,0.362132,0.327453,...,0.549100,0.209986,0.251222,0.360105,0.015905,0.155677,0.220320,0.319087,0.445713,0.267303,0.152654,0.212705,0.090667,0.207281,0.071938,0.140225,0.028960,0.357334,0.217831,0.481808,0.550746,0.213317,0.068699,0.133928,0.257917,0.314921,0.304969,0.024338,0.022405,0.242821,0.597018,0.426241,0.026967,0.308450,0.461265,-0.051504,0.351591,0.010037,0.088215,0.338404
650,0.0,77,0,13,0.847041,0.424220,-0.094463,0.183973,-0.133949,0.453239,0.310502,0.598839,0.266454,0.422876,0.104657,0.006132,-0.279225,0.730799,0.312803,0.869461,0.379569,0.619814,-0.017184,0.752033,0.057020,0.526096,0.239660,0.766580,0.267483,-0.038492,-0.134617,0.590717,0.399014,0.557775,0.126777,0.671707,0.339354,0.449808,0.601821,0.818308,...,0.472060,0.558828,0.052223,0.109709,0.195441,-0.082049,0.003831,-0.211826,-0.227409,-0.210751,0.067685,0.502165,0.404586,0.156615,0.323459,0.242844,0.043661,0.334176,0.331084,0.178195,0.312208,0.288080,0.215453,0.055860,0.177040,0.255538,0.210682,0.137751,0.031600,0.217645,0.645937,0.664577,0.090622,-0.235904,0.517791,-0.103706,-0.203406,0.178297,-0.175383,0.121168
11,1.0,61,2,9,0.481190,0.794886,0.042472,0.622561,0.240239,0.205222,0.044472,0.125473,0.154784,1.006971,0.258262,0.299779,0.086156,0.505519,-0.228502,0.547644,0.156033,0.302560,-0.393507,0.289591,-0.061863,0.463038,0.088506,0.363152,-0.322339,0.029781,-0.315391,0.292243,0.054527,0.448504,0.197302,0.660328,0.090805,0.497627,0.278157,0.765816,...,0.359010,0.279815,0.128684,-0.066307,0.034740,0.047412,0.162660,0.322722,0.137922,0.251697,0.077119,-0.147284,0.197675,0.342841,0.059047,0.151311,0.114551,0.556033,0.340676,0.250657,0.681586,0.307239,0.383454,-0.045115,0.227992,0.256245,0.641780,0.167375,-0.073806,-0.217689,0.410023,0.437427,-0.139135,-0.026834,0.342825,-0.028371,-0.036341,-0.033416,0.173324,0.531999
674,1.0,87,2,14,1.066774,0.869239,0.725535,0.229903,0.179936,0.829748,0.816613,0.971331,0.951879,0.399652,0.427439,-0.029118,-0.261042,0.229482,0.356209,0.564830,0.457979,-0.190621,-0.131628,-0.087768,-0.060402,0.723381,0.496605,0.816182,0.518333,-0.041053,-0.047264,0.011901,0.162064,-0.015769,0.093115,-0.033180,-0.072842,-0.085097,-0.208300,-0.176630,...,0.403854,0.584699,-0.057364,-0.000677,0.316397,0.030609,0.375501,0.020062,0.209640,0.168422,0.383994,0.202414,0.332031,0.611686,0.260466,0.693609,0.249280,0.558320,0.637682,0.516076,0.753107,0.452391,0.403021,0.062627,0.264926,0.669121,0.828608,0.373555,0.272088,0.260842,0.609479,0.231749,0.074244,0.148052,0.736523,0.188902,0.165349,-0.045028,-0.035415,0.828018
637,1.0,74,0,10,0.962072,0.369855,0.175164,0.358260,0.073656,0.299314,0.141797,0.523828,0.306021,0.612470,0.403345,0.042869,-0.049235,0.149535,0.324251,0.707533,0.479002,0.370430,0.088622,0.383909,0.459646,0.774959,0.381947,0.757195,0.640151,-0.052982,0.054337,0.310751,0.392032,0.157920,0.322120,0.616631,0.409036,0.654030,0.283597,0.439387,...,0.387958,0.582100,0.154563,-0.068274,0.288830,0.106149,0.400668,0.249772,0.388697,0.483921,0.037182,0.109250,0.402702,0.523418,0.516259,0.640029,0.149295,0.111501,0.092212,0.532549,0.863914,0.356736,0.279993,0.181851,0.111977,0.650426,0.830183,0.465798,0.084209,0.160812,0.501794,0.437849,0.096676,0.194939,0.549400,0.134639,0.106644,0.253842,0.075348,0.307396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,0.0,66,1,16,0.591591,-0.080666,-0.119921,-0.170848,-0.163674,0.746350,0.108373,0.131851,0.284916,0.201573,-0.161298,-0.001199,0.019906,0.256448,0.245150,0.595105,0.412356,0.069354,-0.243103,0.304795,0.161993,0.341332,0.374304,0.520962,0.163664,0.055415,0.078614,0.482236,0.663434,0.320140,0.216273,0.065586,0.031506,0.345773,0.122352,0.454819,...,0.591811,0.563833,0.126006,0.292782,0.102766,0.186524,0.002918,0.297806,0.204306,0.309211,0.061142,0.236060,-0.001013,0.052442,-0.031927,0.086679,-0.069848,0.315528,0.392464,0.137417,0.504110,0.653560,0.675432,-0.245136,0.287906,0.137331,0.259579,0.142239,0.144927,0.211091,0.520529,0.464105,-0.197139,-0.148327,0.623523,-0.107716,0.302789,-0.116674,0.115593,0.332073
200,1.0,76,1,19,0.703348,0.311199,0.202594,0.184349,-0.164339,0.316550,0.300254,0.471908,0.394808,0.367342,0.243604,0.207934,0.100042,0.140426,-0.148956,0.177015,0.071797,0.133394,-0.154737,0.027378,0.021687,0.083267,-0.143621,0.189734,0.192284,-0.085283,0.023141,0.190981,-0.029748,0.259340,-0.112341,-0.000238,0.031534,0.238256,0.039471,0.203832,...,0.226323,0.154674,-0.169593,0.090783,-0.123699,0.132949,0.296999,0.333559,0.231471,0.257140,-0.104753,-0.134106,-0.009317,0.108919,0.008266,0.153991,-0.068738,0.378966,0.362029,0.342296,0.495314,0.073656,0.000457,0.033699,0.142592,0.303661,0.469306,0.039811,-0.156650,-0.194369,0.429060,0.065457,0.194998,0.079488,0.229309,0.103587,0.022405,-0.121601,-0.130421,0.399422
609,1.0,78,2,16,1.139782,1.040654,0.705003,0.784820,0.745172,0.722767,0.717316,0.847528,0.581609,0.912884,0.751832,0.261196,-0.018033,0.987280,0.604109,0.749289,0.403240,0.164351,0.215037,0.240506,0.310976,0.874105,0.381825,0.908636,0.762537,0.116363,0.064640,0.378171,0.517819,0.106710,0.222006,0.227095,0.293250,0.317157,0.305643,0.312875,...,0.422437,0.413825,0.137794,0.176956,0.113861,-0.146293,-0.173039,0.176393,-0.008011,-0.001513,0.393012,0.258803,0.346315,0.470451,0.131392,0.446760,0.280501,0.267100,0.731304,0.809620,0.714327,0.641689,0.285061,-0.196872,0.383385,0.409945,0.805700,0.329858,-0.120461,0.390700,0.708941,0.163157,-0.157718,0.203308,0.556826,-0.361246,0.377343,-0.010021,0.202764,0.177333
723,1.0,73,0,13,0.413874,0.816672,0.727668,0.686160,0.369191,0.377622,0.383998,0.468565,0.320248,0.962821,0.606392,0.275312,0.106233,0.311046,0.147262,0.839586,0.452478,0.220322,0.285735,0.161195,0.250844,0.772266,0.556147,1.058303,0.581259,-0.062867,-0.041767,0.328358,0.397087,0.386015,0.245009,0.484123,0.275475,0.443392,-0.008765,0.371003,...,0.389541,0.355539,0.098511,-0.064406,0.107869,-0.023698,0.163605,0.038816,0.185567,0.226136,0.191353,-0.047254,0.476238,0.245406,0.222835,0.428800,0.301875,0.440159,0.567543,0.465077,0.548427,0.315052,0.145238,0.122402,0.200132,0.617246,0.526820,0.404212,-0.010015,-0.077069,0.390945,0.267829,-0.122892,-0.155694,0.671668,0.062467,0.036910,0.111245,0.039767,0.926998


Another example, this time with `MEM_score` as the target (regression task):

In [132]:
# settings for the regression example (target: MEM_score)
data = pd.concat([train, test])
classification = False
target = "MEM_score"
columns_drop = ["ConnID", "Repseudonym", "siteid", "visdat", "prmdiag", "Apoe", "IDs"]
y_0, y_1 = [0], [2, 3]  # only relevant for classification, passed along unchanged
train_size, seed = 0.8, 123

# build the modelling data; index 0 holds the training part, index 1 the test part
data_list = prepare_data(
    data=data,
    classification=classification,
    columns_drop=columns_drop,
    target=target,
    y_0=y_0,
    y_1=y_1,
    train_size=train_size,
    seed=seed,
)

# split both parts into response (y) and predictors (X)
ytrain = data_list[0]["y"]
Xtrain = data_list[0].drop(columns="y")
ytest = data_list[1]["y"]
Xtest = data_list[1].drop(columns="y")

data_list[0] # training data

Unnamed: 0,y,age,sex,edyears,1_2,1_3,1_4,1_5,1_6,1_7,1_8,1_9,1_10,1_11,1_12,1_13,1_14,1_15,1_16,1_17,1_18,1_19,1_20,1_21,1_22,1_23,1_24,1_25,1_26,1_27,1_28,1_29,1_30,1_31,1_32,1_33,1_34,1_35,1_36,1_37,...,237_243,237_244,237_245,237_246,238_239,238_240,238_241,238_242,238_243,238_244,238_245,238_246,239_240,239_241,239_242,239_243,239_244,239_245,239_246,240_241,240_242,240_243,240_244,240_245,240_246,241_242,241_243,241_244,241_245,241_246,242_243,242_244,242_245,242_246,243_244,243_245,243_246,244_245,244_246,245_246
85,0.830339,71,1,11,0.767906,0.056220,-0.191566,-0.244958,-0.377368,0.780359,0.470434,0.318310,0.422032,0.336066,-0.067664,-0.473637,-0.515097,0.542722,-0.085543,0.240851,0.141410,0.247546,0.287977,-0.001957,0.003456,-0.040433,-0.095059,0.570300,0.306050,-0.160144,-0.277290,0.531407,0.270788,0.287958,0.048609,0.286695,0.338601,0.437407,0.154246,0.503904,...,0.497441,0.322514,0.119558,-0.048838,-0.194195,0.006896,0.225845,-0.035846,0.072159,0.131081,0.053261,0.047987,0.387873,0.084009,-0.010600,0.014793,-0.028289,0.228420,0.273085,0.313660,0.317083,-0.009386,0.046839,0.150708,0.189433,0.320523,0.267146,-0.060157,-0.104290,-0.073376,0.360488,0.286452,-0.136397,-0.029781,0.519910,-0.133808,-0.001784,-0.018899,0.006354,0.052184
280,1.075548,67,1,20,0.632305,0.122269,-0.065212,0.310342,0.114617,0.055222,0.146389,0.350888,0.327145,0.549286,0.177110,0.075499,-0.178849,0.066174,0.014865,0.934519,0.426773,0.062329,0.122356,0.166538,-0.062092,-0.095138,0.343392,0.812595,0.665973,0.081818,-0.182844,0.840187,0.349794,0.437773,0.210806,0.694573,0.182819,0.445632,-0.038769,0.762921,...,0.665591,0.753848,0.153437,0.343715,0.136250,0.295695,0.284476,0.540125,0.541932,0.628564,0.222249,0.304928,0.465632,0.471032,0.281071,0.304226,0.103514,0.443129,0.592817,0.752621,0.771478,0.460596,0.481971,0.268661,0.605224,0.496833,0.537713,0.249762,0.235250,0.372427,0.479991,0.490536,0.337091,0.520007,0.779492,0.129865,0.359081,0.119128,0.267850,0.693103
22,0.299585,64,1,12,1.068486,0.019988,-0.335863,0.438401,0.200815,0.131925,0.119408,0.267346,0.194102,0.531772,-0.140940,0.175174,-0.465277,-0.508721,-0.769327,0.439135,0.274128,0.178859,-0.146428,-0.128952,-0.281115,-0.232679,-0.399160,0.629497,0.286658,-0.371478,-0.280265,0.085306,0.487755,0.591229,-0.105266,0.852589,0.338155,0.890911,0.494788,0.679687,...,0.574083,0.568303,0.112032,0.098130,-0.027973,0.042002,0.378820,0.251402,0.205820,0.276267,-0.150490,0.037068,-0.099211,0.182688,-0.060949,0.030911,-0.133580,0.354023,0.238774,0.077786,0.200204,0.065359,0.155531,0.166575,0.242475,0.168337,0.279380,0.096143,0.031594,0.029706,0.421610,0.295630,-0.156735,-0.257543,0.694109,-0.056509,-0.099359,0.078788,0.083461,0.401951
95,-0.756945,80,0,19,0.965386,0.338413,0.362495,0.463829,0.419779,0.454792,0.545673,0.670866,0.733551,0.641806,0.717910,0.295008,0.357987,0.301754,0.532511,0.320057,0.408016,0.347903,0.465962,0.437063,0.585565,0.099838,0.290324,0.339111,0.315600,0.083185,0.254962,0.597327,0.755477,0.333294,0.330752,0.276136,0.487378,0.540499,0.571876,0.312129,...,0.268238,0.533636,0.180410,0.195162,0.036663,0.096458,0.154068,0.245838,0.168122,0.243695,0.041057,0.296734,0.647556,0.564308,0.237222,0.357820,0.316757,0.296806,0.625452,0.414413,0.324272,0.242434,0.362159,0.145644,0.451993,0.558549,0.621197,0.326981,0.084076,0.380877,0.601891,0.327681,0.257669,0.468455,0.562519,0.011162,0.201665,0.092215,0.171483,0.721934
95,0.196150,69,0,15,0.582970,0.814798,0.307050,0.509316,0.340621,0.497388,0.132869,0.116887,0.046590,1.051315,0.813630,0.417164,0.260787,0.187558,-0.068696,0.750454,0.321029,0.470878,-0.068336,0.336157,-0.230155,0.438588,0.199648,0.849679,0.475600,-0.054731,-0.326851,0.461771,0.121850,0.307797,0.106577,0.442878,0.395678,0.528618,0.291409,0.815094,...,0.806901,0.636123,0.015660,0.045483,-0.063224,0.134211,0.652862,0.484442,0.567206,0.551116,-0.121229,-0.042118,0.178622,0.229412,-0.103117,0.245121,0.129150,0.659157,0.923568,0.580381,0.737222,0.508388,0.419125,0.142539,0.092468,0.549129,0.615035,0.428164,0.005052,0.133685,0.893866,0.451249,-0.269048,-0.170075,0.675042,-0.041024,0.105165,0.198872,0.223391,0.866219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,0.271843,66,0,13,0.309567,0.784300,0.695819,0.495037,0.461287,0.401084,0.191859,-0.009053,0.003949,0.520390,0.313753,0.217057,0.047981,0.244592,0.299688,0.502897,0.146103,0.436709,0.206714,0.293692,-0.049301,0.552276,0.248448,0.627810,0.250976,0.215825,0.182073,0.382691,0.083260,0.226870,0.034288,0.177456,0.098995,0.340032,-0.223954,0.350586,...,0.694942,0.772290,-0.010105,-0.017833,0.161538,0.138042,0.109590,0.374814,0.216049,0.276234,0.055434,0.178987,0.411089,0.272537,0.140467,0.416243,0.088238,0.392817,0.441776,0.315466,0.529594,0.210335,-0.010215,0.298070,0.522507,0.356114,0.434808,0.169270,0.010769,0.149916,0.528237,0.462630,0.071885,0.059461,0.769568,0.215778,0.067598,0.116100,0.033366,0.645295
322,0.606709,76,0,18,0.996305,0.367788,0.100133,0.155016,0.210940,0.482775,0.436797,0.517272,0.527191,0.609713,0.244215,0.218868,-0.042540,0.384766,0.069887,0.286859,0.170408,-0.016061,0.076602,0.333278,-0.127353,0.248675,0.148621,0.698001,0.384967,0.003928,-0.092166,0.264603,0.051190,0.226810,-0.033720,0.423768,0.013696,0.437217,0.127093,0.693571,...,0.445291,0.286866,-0.066200,-0.171848,0.141119,0.288409,0.266052,0.371950,0.107502,0.319375,-0.103283,-0.113134,0.425278,0.195848,0.005066,0.324981,0.049247,0.502599,0.416253,0.593753,0.601472,0.529717,0.374421,-0.150056,0.106474,0.457063,0.509857,0.222785,-0.275757,-0.172497,0.595886,0.631427,-0.428729,-0.214912,0.508756,-0.030101,-0.036561,-0.222306,-0.279244,0.717552
382,-0.163983,62,1,13,0.873288,-0.046148,-0.315427,0.350704,0.169319,0.173718,0.316063,0.392927,0.482953,0.582847,0.462913,0.090758,-0.133639,0.339489,0.294914,0.274875,0.375263,0.358088,0.536259,0.528242,0.533147,0.161617,0.180238,0.193421,0.129724,0.300292,0.168615,0.560312,0.501526,0.402661,0.483545,0.395355,0.550690,0.574788,0.630992,0.443973,...,0.576479,0.864874,0.184840,0.046495,0.107867,0.216999,0.068548,0.273899,0.230465,0.416567,0.270366,0.006504,0.458365,0.495116,0.316858,0.338061,0.095789,0.294211,0.320817,0.485426,0.844144,0.499252,0.153969,0.486708,0.735878,0.362819,0.519730,-0.065310,0.144757,0.327905,0.627011,0.473357,0.266737,0.392032,0.584833,0.313824,0.353401,0.110811,0.012943,0.373251
365,1.130514,65,1,18,0.528959,0.488725,0.157809,0.356719,0.179142,0.504637,0.067757,0.422028,0.219201,0.634333,0.243780,0.192300,0.119785,0.113851,-0.037461,0.155785,0.254528,0.098092,-0.172644,-0.052243,-0.236113,0.093952,0.034302,0.048197,-0.206041,-0.029814,-0.302936,-0.295824,-0.389679,0.028725,-0.021519,0.330913,-0.037730,0.201175,0.018180,0.344326,...,0.687950,0.371665,0.434697,0.563138,0.105321,0.283588,0.047995,0.190626,0.337856,0.304558,0.429797,0.401419,0.268275,0.320105,0.023039,0.503689,0.175418,0.398468,0.531148,0.392709,0.633951,0.248559,0.098334,0.328772,0.209889,0.261163,0.245482,0.032098,0.114855,0.121580,0.350188,0.163117,0.141572,0.074027,0.654689,0.404851,0.745199,0.313952,0.430205,0.544585
