In [28]:
import tqdm
import pandas as pd
import numpy as np
from pathlib import Path
from brainprint.predictive_modelling.utils.targets import targets
from brainprint.utils.parcellations import parcellations
from brainprint.predictive_modelling.utils.data import get_data
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [33]:
# Look up the Brainnetome parcellation entry once and pull out its two fields.
brainnetome = parcellations.get("Brainnetome")
atlas = brainnetome.get("atlas")
# The "labels" entry is passed to read_csv; its first column becomes the index.
labels = pd.read_csv(brainnetome.get("labels"), index_col=0)

In [34]:
# Hyper-parameter grid explored by GridSearchCV: three values per tuned
# parameter, plus a single fixed random_state.
param_grid = {
    "learning_rate": np.linspace(0.1, 1, 3),
    "n_estimators": np.linspace(100, 1000, num=3, endpoint=True, dtype="int"),
    "max_depth": [3, 5, 7],
    "random_state": [42],
}
# Default estimator handed to `predict_single_parameter`.
clf = GradientBoostingClassifier()


def predict_single_parameter(X:pd.DataFrame,y:pd.DataFrame,param_grid=param_grid,estimator=clf):
    """Fit a scaled grid-search classifier on (X, y) and return its log-probabilities on X.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix used both for fitting and for the returned predictions.
    y : pd.DataFrame
        Target values passed to ``Pipeline.fit`` alongside ``X``.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``; defaults to the module-level ``param_grid``.
    estimator : estimator, optional
        Estimator handed to ``GridSearchCV``; defaults to the module-level ``clf``.

    Returns
    -------
    tuple
        ``(log_proba, pipe)`` where ``log_proba`` is the first column of
        ``pipe.predict_log_proba(X)`` and ``pipe`` is the fitted
        scaler + grid-search pipeline.
    """
    search = GridSearchCV(estimator, param_grid)
    # NOTE(review): the scaler is fitted on all of X before the search splits
    # it into folds, so the CV folds share scaling statistics.
    pipe = Pipeline([("scaler", StandardScaler()), ("clf", search)])
    pipe.fit(X, y)
    # predict_log_proba takes only the feature matrix; passing `y` as a second
    # positional argument raises a TypeError.
    log_proba = pipe.predict_log_proba(X)
    return log_proba[:, 0], pipe
# pipe.predict_log_proba(X_train)


In [35]:
# Name of the prediction target and the parameters used as feature sets.
target_name = "sex"
parameters = [
    "MD", "FA", "AD", "RD",
    "CL", "CS", "CP",
    "EigenValue", "EigenVector",
    "Thickness", "Volume", "Sulc",
]
target = targets.get(target_name)

# Candidate subjects come from the first index level of the "MD" data and are
# kept only when they also appear in the target's index.
data = get_data("MD")
available_subjects = [subj for subj in data.index.levels[0] if subj in target.index]

# Fixed-seed split: 25% of the subjects are held out for testing.
train_indices, test_indices = train_test_split(
    available_subjects,
    test_size=0.25,
    random_state=42,
)

In [47]:
# Build one wide frame holding every parameter side by side, with
# (parameter, label) MultiIndex columns and one row per available subject.
multi_columns = pd.MultiIndex.from_product([parameters,labels.index])

# Collect one numeric frame per parameter and concatenate once, instead of
# pre-allocating an all-NaN object-dtype frame and filling it slice by slice
# (which leaves every column as dtype=object).
param_frames = []
for param in parameters:
    param_df = get_data(param).loc[available_subjects]
    # Impute missing values with the per-column mean; non-inplace so the
    # selection returned by .loc is never mutated.
    param_df = param_df.fillna(param_df.mean())
    # Values are mapped to subjects/labels by position, as before: rows are
    # re-indexed by subject and columns are assigned after concatenation.
    # assumes one row per subject in get_data(param) — TODO confirm
    param_frames.append(pd.DataFrame(param_df.values,index=available_subjects))

multi_df = pd.concat(param_frames,axis=1)
# Raises if the total number of data columns does not match
# len(parameters) * len(labels.index).
multi_df.columns = multi_columns


In [25]:
# Empty frame with one row per subject and one column per parameter; nothing
# in this cell fills it because the prediction step below is disabled.
ensemble_df = pd.DataFrame(columns=parameters, index=available_subjects)

# Only "Thickness" is prepared here; the sweep over every entry of
# `parameters` (with a tqdm progress bar) is currently switched off.
for parameter in ["Thickness"]:
    data = get_data(parameter)
    # Replace missing values with the mean of their column.
    data = data.fillna(data.mean())
    X = data.loc[train_indices]
    y = target.loc[train_indices].values.ravel()
    # Disabled: per-parameter fit and held-out score.
    # ensemble_df[parameter],clf = predict_single_parameter(X,y)
    # print(parameter,"---",clf.score(data.loc[test_indices],target.loc[test_indices].values.ravel()))
    break



In [48]:
# Grid-search a gradient-boosting classifier on the combined multi-parameter
# features, with standardisation applied before the search.
clf = GradientBoostingClassifier()
search = GridSearchCV(estimator=clf, param_grid=param_grid, verbose=2)
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", search),
    ]
)
# Fit on the training subjects only; the fitted pipeline is the cell output.
pipe.fit(
    multi_df.loc[train_indices],
    target.loc[train_indices].values.ravel(),
)

Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', GradientBoostingClassifier())])

In [50]:
# Score the fitted pipeline on the held-out test subjects.
pipe.score(
    X=multi_df.loc[test_indices],
    y=target.loc[test_indices].values.ravel(),
)

0.68

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,264,265,266,267,268,269,270,271,272,273
Subject,Session,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
233,1,-3.188418,-3.629296,-1.291021,-1.582649,-6.754386,-6.518717,-3.532388,-3.48618,-1.695136,-3.326542,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
234,1,-5.533547,-3.676166,-4.053463,-2.853448,-8.849649,-6.679481,-5.515294,-3.27329,-3.243403,-2.649091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
235,1,-4.494731,-4.805337,-2.104465,-1.405643,-7.898177,-8.152022,-4.828654,-5.035045,-2.26836,-4.00787,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
236,1,-4.151017,-4.159572,-4.151161,-2.223843,-10.825301,-6.102279,-5.88969,-4.139164,-3.458094,-3.497199,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
240,1,-4.21434,-3.356278,-1.586674,-2.869891,-7.918996,-6.792164,-2.901212,-4.890003,-3.635218,-4.094037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692,1,-4.78202,-3.701256,-2.156351,-1.933797,-8.672385,-8.011079,-4.442943,-2.374937,-2.435338,-2.82509,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
693,1,-4.472471,-4.255897,-3.365269,-2.173974,-8.877484,-9.383181,-5.792247,-4.242427,-2.771431,-3.96568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
694,1,-2.521248,-4.156634,-2.94217,-2.684197,-9.413647,-7.494935,-3.723435,-3.726248,-2.265643,-3.740619,...,-0.983083,0.0,-1.135794,-0.088504,-0.007342,-1.024593,-0.333748,0.0,0.0,-0.109596
695,1,-5.090229,-5.611099,-3.807065,-3.713528,-10.642318,-10.97793,-4.725028,-4.100146,-3.251541,-4.854484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Standardise the features, then fit a seeded gradient-boosting classifier.
# NOTE(review): X_train / y_train are not defined in any cell visible here —
# presumably left over from an earlier kernel session; confirm before
# relying on this cell after a restart.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("gbooster", GradientBoostingClassifier(random_state=42)),
    ]
)
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('gbooster', GradientBoostingClassifier(random_state=42))])

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.])