# **This notebook is a baseline showing how to handle the HDF5 data and submit correctly. It includes no complex feature selection, categorical data, or multi-site data, all of which you will most certainly need to obtain a good score. Good luck!**

In [1]:
import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
import sys
import time
import sklearn

In [2]:
# Open the HDF5 feature store read-only, then read the submission template
# and the two phenotype tables.
func = h5py.File('../input/abide/comp_data/abide.hdf5', 'r')
ss = pd.read_csv('../input/abide/comp_data/sample_submission.csv')
pheno_test = pd.read_csv('../input/abide/comp_data/pheno_test.csv')
pheno_train = pd.read_csv('../input/abide/comp_data/pheno_train.csv')

# Shift the training labels down by 1.0 to match the submission format
# (presumably mapping {1, 2} onto {0, 1} -- TODO confirm against the data).
pheno_train['DX_GROUP'] = pheno_train['DX_GROUP'].sub(1.0)

In [3]:
def get_data_train(data, pheno, derivative):
    """Load the feature vectors and diagnosis labels for the training subjects.

    Parameters
    ----------
    data : mapping-like (e.g. an open ``h5py.File``)
        Indexed as ``data['patients'][file_id][derivative][()]`` to read one
        subject's features.
    pheno : pandas.DataFrame
        One row per subject; must contain ``FILE_ID`` and ``DX_GROUP`` columns.
    derivative : str
        Name of the per-subject dataset to read (e.g. ``'aal'``).

    Returns
    -------
    X : numpy.ndarray of float32
        The subjects' features stacked in the row order of ``pheno``
        (assumes every subject's array has the same shape).
    y : numpy.ndarray of float32
        The ``DX_GROUP`` values in the same order.
    """
    X = []
    y = []
    total = pheno.shape[0]
    subjects = zip(pheno['FILE_ID'], pheno['DX_GROUP'])
    for done, (file_id, dx_group) in enumerate(subjects, start=1):
        # Indexing with `()` pulls the full dataset out as an array.
        X.append(data['patients'][file_id][derivative][()])
        y.append(dx_group)
        # Report a real percentage of subjects already loaded, so the
        # indicator ends at 100.00% instead of showing a 0..1 fraction.
        sys.stdout.write("\r{:.2f}%>".format(100.0 * done / total))
        sys.stdout.flush()

    X = np.array(X).astype(np.float32)
    y = np.array(y).astype(np.float32)
    return X, y

def get_data_test(data, pheno, derivative):
    """Load the feature vectors and subject ids for the test subjects.

    Parameters
    ----------
    data : mapping-like (e.g. an open ``h5py.File``)
        Indexed as ``data['patients'][file_id][derivative][()]`` to read one
        subject's features.
    pheno : pandas.DataFrame
        One row per subject; must contain ``FILE_ID`` and ``SUB_ID`` columns.
    derivative : str
        Name of the per-subject dataset to read (e.g. ``'aal'``).

    Returns
    -------
    X_test : numpy.ndarray of float32
        The subjects' features stacked in the row order of ``pheno``
        (assumes every subject's array has the same shape).
    sub_ids : list
        The ``SUB_ID`` values in the same order.
    """
    X_test = []
    sub_ids = []
    total = pheno.shape[0]
    subjects = zip(pheno['FILE_ID'], pheno['SUB_ID'])
    for done, (file_id, sub_id) in enumerate(subjects, start=1):
        # Indexing with `()` pulls the full dataset out as an array.
        X_test.append(data['patients'][file_id][derivative][()])
        sub_ids.append(sub_id)
        # Report a real percentage of subjects already loaded, so the
        # indicator ends at 100.00% instead of showing a 0..1 fraction.
        sys.stdout.write("\r{:.2f}%>".format(100.0 * done / total))
        sys.stdout.flush()

    X_test = np.array(X_test).astype(np.float32)
    return X_test, sub_ids

In [4]:
# Read the 'aal' derivative for every training subject, then display both shapes.
X, y = get_data_train(data=func, pheno=pheno_train, derivative='aal')
(X.shape, y.shape)

1.00%>

((931, 6670), (931,))

In [5]:
# Read the 'aal' derivative for every test subject, then display the sizes.
X_test, sub_ids = get_data_test(data=func, pheno=pheno_test, derivative='aal')
(X_test.shape, len(sub_ids))

0.99%>

((104, 6670), 104)

In [6]:
%%time
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier as hgbc
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import gc

N_SPLITS = 10
kf = KFold(n_splits=N_SPLITS)
final_preds = np.zeros((X_test.shape[0], N_SPLITS))
i = 0
for tr_idx, val_idx in kf.split(X):
    clf = hgbc(max_depth=6, max_leaf_nodes=25, verbose=0, max_iter=30)
    clf.fit(X[tr_idx], y[tr_idx])
    final_preds[:, i] = clf.predict_proba(X_test)[:, 1]
    fold_preds = clf.predict_proba(X[val_idx])[:, 1]
    print('Fold: {0}, score: {1}'.format(i, roc_auc_score(y[val_idx], fold_preds)))
    del clf
    gc.collect()
    i += 1

Fold: 0, score: 0.6535326086956522
Fold: 1, score: 0.6932573599240266
Fold: 2, score: 0.6077705827937097
Fold: 3, score: 0.6808905380333952
Fold: 4, score: 0.7437258687258688
Fold: 5, score: 0.651031894934334
Fold: 6, score: 0.7078703703703703
Fold: 7, score: 0.7574074074074074
Fold: 8, score: 0.6875
Fold: 9, score: 0.6796296296296298
CPU times: user 7min 43s, sys: 11min 2s, total: 18min 45s
Wall time: 5min 28s


In [7]:
# Collapse the per-fold columns into one averaged prediction per test subject
# and pair each with its subject id.
preds = final_preds.mean(axis=1)
sub = pd.DataFrame({'SUB_ID': sub_ids, 'preds': preds})

In [8]:
# Put the predictions in the row order of the sample submission by joining on
# SUB_ID. validate='many_to_one' makes pandas raise if `sub` holds duplicate
# SUB_IDs, which would otherwise add rows and shift later predictions.
almost_final_sub = ss.merge(sub, on='SUB_ID', how='left', validate='many_to_one')

# A left join leaves NaN for any submission id without a prediction; fail
# loudly here rather than writing an incomplete submission file.
missing = almost_final_sub['preds'].isna()
if missing.any():
    raise ValueError(
        'No prediction for SUB_ID(s): {}'.format(
            almost_final_sub.loc[missing, 'SUB_ID'].tolist()))

# The merge result has exactly one row per row of `ss`, in the same order, so
# assign positionally instead of relying on index alignment.
ss['DX_GROUP'] = almost_final_sub['preds'].values
ss # final sub

Unnamed: 0,SUB_ID,DX_GROUP
0,51260,0.795921
1,50184,0.281604
2,51306,0.549330
3,50818,0.633983
4,51491,0.420823
...,...,...
99,50408,0.745693
100,50212,0.430002
101,51492,0.661661
102,50514,0.575077


In [9]:
# dont forget correct submission type
ss['SUB_ID'] = ss['SUB_ID'].map(str)
ss.to_csv('first_submission_abide.csv', index=False)