In [110]:
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import pickle
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, f1_score,  confusion_matrix


# Machine learning, prototyping

## Granger Causality data

In [48]:
def get_spec_sense(true_y, preds):
    """
    Calculate specificity and sensitivity
    true_y: True classes
    preds: predicted classes
    """
    # Use sklearn confusion matrixs to get true positive, etc
    tn, fp, fn, tp = confusion_matrix(true_y, preds).ravel()
    # calculate specificity and sensitivity
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    # output specificity and sensitivity
    return specificity, sensitivity

In [2]:
gci_df = pd.read_csv('/Users/admin/Documents/MscProject/vectorised/gci_cc200_vectorised.csv.gz', compression='gzip', header=0, sep='\t')


In [3]:
gci_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39792,39793,39794,39795,39796,39797,39798,39799,DX_GROUP,DSM_IV_TR
0,0.232827,0.310001,0.022694,0.17194,0.201924,1.564648,0.507979,0.157911,0.26136,0.02722,...,0.0,0.0,0.365369,0.0,0.159129,0.0,0.0,0.0,1,1
1,0.0,0.0,0.034329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
2,0.0,0.211977,0.06318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.300281,0.064557,0.0,0.0,0.314101,0.0,0.087941,0.244485,1,1
3,0.003462,0.176266,0.119884,0.0,0.0,0.061226,0.0,0.272025,0.083786,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.130785,0.0,1,1
4,0.348111,0.137865,0.170946,0.007856,0.477393,0.028955,0.093623,0.025476,0.086795,0.0,...,0.082194,0.0,0.270727,0.0,0.139575,0.108761,0.019303,0.329178,1,1


In [4]:
X = gci_df.iloc[:,:-2].to_numpy()
y = gci_df.loc[:,'DX_GROUP']

In [5]:
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=2)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# find all relevant features - 5 features should be selected
feat_selector.fit(X, y)

# check selected features - first 5 features are selected
feat_selector.support_

# check ranking of features
feat_selector.ranking_

# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	39796
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	39796
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	39796
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	39796
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	39796
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	39796
Iteration: 	14 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	39796
Iteration: 	15 / 100
Confirmed: 	0
Tentative: 	4
Rejected: 	39796
Iteration: 	16 / 10

In [8]:
filtered_df = pd.DataFrame(X_filtered)
filtered_df['DX_GROUP'] = gci_df['DX_GROUP']

In [109]:
filtered_df

Unnamed: 0,0,1,2,3,DX_GROUP
0,0.000000,0.000000,0.174199,0.000000,1
1,0.000000,0.000000,0.000000,0.000000,1
2,0.000000,0.000000,0.068504,0.000000,1
3,0.005836,0.238610,0.000000,0.000000,1
4,0.000000,0.000000,0.115350,0.000000,1
...,...,...,...,...,...
1021,0.007061,0.544092,0.268287,0.000000,1
1022,0.184271,0.542669,0.000000,0.084724,1
1023,0.197216,0.060624,0.346220,0.000000,1
1024,0.024830,0.013563,0.000000,0.000000,1


In [9]:
filtered_df.to_csv('/Users/admin/Documents/MscProject/vectorised/gci_cc200_filtered.csv', index=False)

In [21]:
svm = SVC(kernel='rbf')
boost = xgb.XGBClassifier()
lr = LogisticRegression()

In [49]:
cv = StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2)
scores_svm = []
f1_svm = []
spec_svm = []
sense_svm = []
for train, test in cv.split(X_filtered, y):
    # *ConnectivityMeasure* can output the estimated subjects coefficients
    # as a 1D arrays through the parameter *vectorize*.

    classifier = SVC(kernel='rbf').fit(X_filtered[train], y[train])
    # make predictions for the left-out test subjects
    predictions = classifier.predict(X_filtered[test])
    # store the accuracy for this cross-validation fold
    score = accuracy_score(y[test], predictions)
    f1 = f1_score(y[test], predictions)
    spec, sense = get_spec_sense(y[test], predictions)

    scores_svm.append(score)
    f1_svm.append(f1)
    spec_svm.append(spec)
    sense_svm.append(sense)

In [75]:
scores_boost = []
f1_boost = []
spec_boost = []
sense_boost = []
for train, test in cv.split(X_filtered, y):
    # *ConnectivityMeasure* can output the estimated subjects coefficients
    # as a 1D arrays through the parameter *vectorize*.

    classifier = boost.fit(X_filtered[train], y[train])
    # make predictions for the left-out test subjects
    predictions = classifier.predict(X_filtered[test])
    # store the accuracy for this cross-validation fold
    score = accuracy_score(y[test], predictions)
    f1 = f1_score(y[test], predictions)
    spec, sense = get_spec_sense(y[test], predictions)

    scores_boost.append(score)
    f1_boost.append(f1)
    spec_boost.append(spec)
    sense_boost.append(sense)


In [76]:
scores_lr = []
f1_lr = []
spec_lr = []
sense_lr = []
for train, test in cv.split(X_filtered, y):
    # *ConnectivityMeasure* can output the estimated subjects coefficients
    # as a 1D arrays through the parameter *vectorize*.

    classifier = lr.fit(X_filtered[train], y[train])
    # make predictions for the left-out test subjects
    predictions = classifier.predict(X_filtered[test])
    # store the accuracy for this cross-validation fold
    score = accuracy_score(y[test], predictions)
    f1 = f1_score(y[test], predictions)
    spec, sense = get_spec_sense(y[test], predictions)

    scores_lr.append(score)
    f1_lr.append(f1)
    spec_lr.append(spec)
    sense_lr.append(sense)

In [56]:
np.round(np.mean(scores_svm) *100, 4), np.round(np.std(scores_svm)*100,4)

(55.9223, 3.2039)

In [53]:
np.round(np.mean(scores_boost), 4), np.round(np.std(scores_boost),4)

(0.5316, 0.027)

In [54]:
np.round(np.mean(scores_lr), 4), np.round(np.std(scores_lr),4)

(0.5684, 0.031)

In [84]:
def get_avg_std(metric):
    return np.round(np.mean(metric) *100, 2), np.round(np.std(metric)*100,2)

In [85]:
def get_all(acc, sensitivity, specificity, f1_s):
    return {'Accuracy' : get_avg_std(acc), 'Sensitivty': get_avg_std(sensitivity), 'Specifitcy': get_avg_std(specificity), 'F1': get_avg_std(f1_s)}

In [86]:
get_all(scores_boost,sense_boost,spec_boost,f1_boost)

{'Accuracy': (53.16, 2.7),
 'Sensitivty': (50.71, 3.37),
 'Specifitcy': (55.42, 4.4),
 'F1': (50.97, 2.76)}

In [87]:
get_all(scores_lr,sense_lr,spec_lr,f1_lr)

{'Accuracy': (56.84, 3.1),
 'Sensitivty': (30.3, 5.27),
 'Specifitcy': (81.4, 3.14),
 'F1': (40.12, 5.73)}

## Large scale granger causality, machine learning

In [90]:
lsgci_df = pd.read_csv('/Users/admin/Documents/MscProject/vectorised/gci_cc200_vectorised.csv.gz', compression='gzip', header=0, sep='\t')


In [108]:
lsgci_df.shape

(1026, 39802)

In [97]:
X = lsgci_df.iloc[:,:-2].to_numpy()
y = lsgci_df.loc[:,'DX_GROUP']

In [93]:
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=2)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# find all relevant features - 5 features should be selected
feat_selector.fit(X, y)

# check selected features - first 5 features are selected
feat_selector.support_

# check ranking of features
feat_selector.ranking_

# call transform() on X to filter it down to selected features
lsgciX_filtered = feat_selector.transform(X)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	39800
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	39798
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	39798
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	39798
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	39798
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	39798
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	39798
Iteration: 	14 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	39798
Iteration: 	15 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	39798
Iteration: 	16 / 10

In [94]:
lsgciX_filtered

array([[0.        , 0.02042338],
       [0.00183179, 0.        ],
       [0.0064784 , 0.03929353],
       ...,
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ]])

In [98]:
lsgci_df_F = pd.DataFrame(lsgciX_filtered)
lsgci_df_F['DX_GROUP'] = lsgci_df['DX_GROUP']

In [99]:
lsgci_df_F.to_csv('/Users/admin/Documents/MscProject/vectorised/gci_ls_cc200_filtered.csv', index=False)

In [100]:
svm = SVC(kernel='rbf')
boost = xgb.XGBClassifier()
lr = LogisticRegression()

In [101]:
cv = StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2)
scores_svm = []
f1_svm = []
spec_svm = []
sense_svm = []
for train, test in cv.split(lsgciX_filtered, y):
    # *ConnectivityMeasure* can output the estimated subjects coefficients
    # as a 1D arrays through the parameter *vectorize*.

    classifier = SVC(kernel='rbf').fit(lsgciX_filtered[train], y[train])
    # make predictions for the left-out test subjects
    predictions = classifier.predict(lsgciX_filtered[test])
    # store the accuracy for this cross-validation fold
    score = accuracy_score(y[test], predictions)
    f1 = f1_score(y[test], predictions)
    spec, sense = get_spec_sense(y[test], predictions)

    scores_svm.append(score)
    f1_svm.append(f1)
    spec_svm.append(spec)
    sense_svm.append(sense)

In [102]:
scores_boost = []
f1_boost = []
spec_boost = []
sense_boost = []
for train, test in cv.split(lsgciX_filtered, y):
    # *ConnectivityMeasure* can output the estimated subjects coefficients
    # as a 1D arrays through the parameter *vectorize*.

    classifier = boost.fit(lsgciX_filtered[train], y[train])
    # make predictions for the left-out test subjects
    predictions = classifier.predict(lsgciX_filtered[test])
    # store the accuracy for this cross-validation fold
    score = accuracy_score(y[test], predictions)
    f1 = f1_score(y[test], predictions)
    spec, sense = get_spec_sense(y[test], predictions)

    scores_boost.append(score)
    f1_boost.append(f1)
    spec_boost.append(spec)
    sense_boost.append(sense)


In [103]:
scores_lr = []
f1_lr = []
spec_lr = []
sense_lr = []
for train, test in cv.split(lsgciX_filtered, y):
    # *ConnectivityMeasure* can output the estimated subjects coefficients
    # as a 1D arrays through the parameter *vectorize*.

    classifier = lr.fit(lsgciX_filtered[train], y[train])
    # make predictions for the left-out test subjects
    predictions = classifier.predict(lsgciX_filtered[test])
    # store the accuracy for this cross-validation fold
    score = accuracy_score(y[test], predictions)
    f1 = f1_score(y[test], predictions)
    spec, sense = get_spec_sense(y[test], predictions)

    scores_lr.append(score)
    f1_lr.append(f1)
    spec_lr.append(spec)
    sense_lr.append(sense)

In [104]:
get_all(scores_svm,sense_svm,spec_svm,f1_svm)

{'Accuracy': (55.87, 3.25),
 'Sensitivty': (34.34, 5.51),
 'Specifitcy': (75.79, 4.15),
 'F1': (42.62, 5.6)}

In [105]:
get_all(scores_boost,sense_boost,spec_boost,f1_boost)

{'Accuracy': (55.0, 2.99),
 'Sensitivty': (41.82, 3.68),
 'Specifitcy': (67.2, 5.65),
 'F1': (47.15, 3.22)}

In [106]:
get_all(s,sense_lr,spec_lr,f1_lr)

{'Accuracy': (51.94, 0.0),
 'Sensitivty': (0.0, 0.0),
 'Specifitcy': (100.0, 0.0),
 'F1': (0.0, 0.0)}

## FC machine

In [111]:
fc_df = pd.read_csv('/Users/admin/Documents/MscProject/vectorised/fc_cc200_vectorised.csv.gz', compression='gzip', header=0, sep='\t')

In [112]:
fc_df.head()

Unnamed: 0,#1-#2,#1-#3,#1-#4,#1-#5,#1-#6,#1-#7,#1-#8,#1-#9,#1-#10,#1-#11,...,#196-#199,#196-#200,#197-#198,#197-#199,#197-#200,#198-#199,#198-#200,#199-#200,DX_GROUP,DSM_IV_TR
0,-0.318127,-0.069796,-0.036556,-0.163779,0.154582,-0.468522,0.149931,0.085749,-0.091261,-0.097561,...,0.288202,0.346534,0.169769,0.402399,0.235802,0.328523,0.022539,0.275308,1,1
1,0.248915,0.138,-0.209563,-0.142492,-0.541118,-0.22042,-0.038429,-0.57957,-0.27624,0.080674,...,-0.217907,0.152274,-0.022263,0.387047,0.030468,0.100718,0.109483,-0.272489,1,1
2,0.138845,0.060143,-0.0742,-0.062667,0.065521,0.034017,-0.131597,-0.27295,-0.267307,0.056912,...,-0.101845,0.25816,-0.350446,0.157305,-0.163685,-0.071646,0.114257,-0.299283,1,1
3,-0.255444,0.107162,-0.054165,0.085518,-0.007979,-0.277307,-0.052372,0.300137,0.187776,-0.014143,...,0.214707,0.010837,-0.220701,0.121951,0.140008,-0.150833,0.227749,-0.012394,1,1
4,-0.377935,-0.060415,-0.005243,-0.431189,-0.117298,-0.135625,0.15358,0.104868,0.151153,-0.198125,...,-0.051302,-0.137138,-0.20722,-0.160206,-0.037513,0.401884,0.275285,-0.304601,1,1


In [113]:
X = fc_df.iloc[:,:-2].to_numpy()
y = fc_df.loc[:,'DX_GROUP']

In [None]:
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=1)

# find all relevant features 
feat_selector.fit(X, y)


# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X)

In [None]:
support = feat_selector.support_ # 1D boolean mask to be applied to feature list gives list of selected features


In [None]:
selected = fc_df.columns[:-2][support]# selected features
selected = np.r_[selected, ['DX_GROUP','DSM_IV_TR']] # add target column names to list

In [None]:
X_filtered = fc_df[selected] # filter out selected columns and store 

In [None]:
X_filtered.to_csv('vectorised/cc200_filtered.csv',index=False) # store as csv

In [None]:
X = X_filtered.iloc[:,:-2].to_numpy()
y = X_filtered.loc[:,'DX_GROUP']

In [None]:
X_filtered.head()

In [114]:
cv = StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2)
scores = []
for train, test in cv.split(X, y):
        # 

    rf = xgb.XGBClassifier().fit(X[train], y[train])
        # 
    predictions = rf.predict(X[test])
        # 
    score= accuracy_score(y[test], predictions)
    scores.append(score)

In [115]:
np.mean(scores)

0.6699029126213593

In [116]:
scores

[0.6650485436893204,
 0.6990291262135923,
 0.6844660194174758,
 0.6456310679611651,
 0.7087378640776699,
 0.6844660194174758,
 0.6650485436893204,
 0.6553398058252428,
 0.6650485436893204,
 0.6262135922330098]

In [670]:
cv = StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2)
scores = []
for train, test in cv.split(X, y):
    # 

    svm = SVC(kernel='rbf').fit(X[train], y[train])
    # 
    predictions = svm.predict(X[test])
    # 
    score = accuracy_score(y[test], predictions)
    scores.append(score)

KeyError: "None of [Int64Index([226, 349, 666, 568, 563, 254, 586, 148, 799,  94,\n            ...\n            471, 613, 519, 439, 811, 305, 612,  95, 647, 689],\n           dtype='int64', length=759)] are in the [columns]"

In [669]:
np.mean(scores)

TypeError: unsupported operand type(s) for /: 'dict' and 'int'

In [668]:
scores

{'fit_time': array([0.15300322, 0.02082396, 0.02080488, 0.01933885, 0.01970482,
        0.01949716, 0.01910305, 0.02134585, 0.0182209 , 0.02264905]),
 'score_time': array([0.00555897, 0.00334096, 0.00318098, 0.0033741 , 0.00331521,
        0.00556493, 0.00319719, 0.00338531, 0.00390506, 0.00314283]),
 'test_accuracy': array([0.69902913, 0.66019417, 0.7184466 , 0.80582524, 0.80582524,
        0.66019417, 0.69607843, 0.68627451, 0.71568627, 0.68627451]),
 'train_accuracy': array([0.82015168, 0.83315276, 0.83098592, 0.82340195, 0.82015168,
        0.83640303, 0.83008658, 0.83874459, 0.83225108, 0.82683983]),
 'test_precision': array([0.69387755, 0.66666667, 0.69811321, 0.79591837, 0.82222222,
        0.70588235, 0.69565217, 0.66666667, 0.7       , 0.71794872]),
 'train_precision': array([0.83373494, 0.83682984, 0.83449883, 0.8353222 , 0.83095238,
        0.83990719, 0.83449883, 0.84705882, 0.84987893, 0.83971292]),
 'test_recall': array([0.68      , 0.6       , 0.74      , 0.79591837, 0.7

## Feature Importances

In [None]:
feature_names = X_filtered.columns[:-2] #feature names

In [None]:
#fit permutation importance model to test set
perm_importance = permutation_importance(svm, X_filtered.iloc[test,:], y[test])

feature_names = X_filtered.columns[:-2]
features = np.array(feature_names)

sorted_idx = perm_importance.importances_mean.argsort()


In [None]:
perm_imps = np.c_[feature_names[sorted_idx[::-1]],perm_importance.importances_mean[sorted_idx[::-1]],
                  perm_importance.importances[sorted_idx[::-1]]]

In [None]:
perm_imp_df = pd.DataFrame(perm_imps)


In [None]:
perm_imp_df.columns = ['features','mean_perm_imp', 1,2,3,4,5]

In [None]:
perm_imp_df

In [None]:
tree_importance_sorted_idx = np.argsort(rf.feature_importances_)
tree_indices = np.arange(0, len(rf.feature_importances_)) + 0.5

In [None]:

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 10))
ax1.barh(tree_indices[:50], rf.feature_importances_[tree_importance_sorted_idx][-50:], height=0.2)
ax1.set_yticks(tree_indices[:50])
ax1.set_yticklabels(feature_names[tree_importance_sorted_idx][-50:])
ax1.set_ylim((0, len(rf.feature_importances_[-50:])))
ax2.boxplot(
    perm_importance.importances[sorted_idx[-50:]].T,
    vert=False,
    labels=feature_names[sorted_idx[-50:]],
)
fig.tight_layout()
plt.show()


In [None]:
perm_imps = np.c_[feature_names[sorted_idx[::-1]],perm_importance.importances_mean[sorted_idx[::-1]],
                  perm_importance.importances[sorted_idx[::-1]]]

In [None]:
perm_imp_df = pd.DataFrame(perm_imps)


In [None]:
perm_imp_df.columns = ['features','mean_perm_imp', 1,2,3,4,5]

In [None]:
perm_imp_df

## Removing least important features

In [None]:
threshold_0_X = perm_imp_df.loc[perm_imp_df['mean_perm_imp']>0, 'features']

In [None]:
threshold_0_X.shape

In [None]:
X_thresh_0 = X_filtered.loc[:,threshold_0_X].to_numpy()


In [None]:
cv = StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2)
scores2 = []
for train, test in cv.split(X_thresh_0, y):
    # 

    svm2 = SVC(kernel='rbf').fit(X_thresh_0[train,:], y[train])
    # 
    predictions = svm2.predict(X_thresh_0[test])
    # 
    score2 = accuracy_score(y[test], predictions)
    scores2.append(score2)

In [654]:
np.mean(scores2)

0.7378640776699028

In [655]:
scores2

[0.7524271844660194,
 0.7864077669902912,
 0.7427184466019418,
 0.7427184466019418,
 0.7766990291262136,
 0.7378640776699029,
 0.6941747572815534,
 0.6650485436893204,
 0.7330097087378641,
 0.7475728155339806]

In [None]:
feature_names[sorted_idx][::-1][:50]

In [None]:
features[tree_importance_sorted_idx[::-1] ][:50]

In [None]:
highest = fc_df.loc[:,feature_names[sorted_idx][::-1][:50]]

In [None]:
highest['DX_GROUP']= fc_df['DX_GROUP']

In [None]:
highest

In [None]:
highest_feats = feature_names[sorted_idx][::-1][:50].to_list()

In [None]:
split_feats = [highest_feats[i].split("-")for i in range(len(highest_feats))]

In [None]:
grouped = highest.groupby(by= 'DX_GROUP').mean()
grouped

In [None]:
highest_0,highest_1 = grouped.iloc[0,:],grouped.iloc[1,:]

In [None]:
high_0_val, high_1_val = highest_0.values.tolist(), highest_1.values.tolist()

In [None]:
control_highest = [nodes + [high_0_val[i]] for i,nodes in enumerate(split_feats)]

In [None]:
asd_highest = [nodes + [high_1_val[i]] for i,nodes in enumerate(split_feats)]

In [None]:
asd_highest

In [None]:
with open('asd_highest_50.pkl', 'wb') as handle:
    pickle.dump(asd_highest, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('control_highest_50.pkl', 'wb') as handle:
    pickle.dump(control_highest, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [653]:
pd.read_csv('/Users/admin/Documents/MscProject/vectorised/fc_cc400_filtered.csv')

Unnamed: 0,#5-#182,#5-#230,#5-#390,#8-#224,#9-#55,#10-#18,#10-#133,#10-#239,#10-#390,#11-#82,...,#310-#390,#315-#346,#317-#347,#318-#362,#323-#382,#323-#391,#335-#360,#343-#382,#356-#366,DX_GROUP
0,-0.465351,-0.038908,-0.303455,0.061997,0.285353,-0.302591,-0.575635,0.093608,-0.311941,0.502320,...,0.105564,0.281348,-0.078890,0.431270,-0.367768,0.088068,0.272958,-0.402212,0.349068,1
1,-0.009303,0.247179,0.019453,-0.089782,0.118145,0.045821,-0.036047,0.138405,0.046357,0.364785,...,0.006089,0.052901,-0.005255,0.166180,0.029733,-0.006374,0.513472,0.115737,0.519810,1
2,-0.131458,0.072728,0.090999,0.324135,0.235671,-0.226198,-0.300777,0.442815,-0.039074,0.477168,...,-0.112499,-0.089670,-0.184399,0.130349,0.320956,0.134839,0.131533,-0.027364,0.234810,1
3,0.155183,0.077839,0.129694,-0.299517,0.191523,-0.196390,-0.279142,0.075303,0.181871,0.709533,...,0.117378,-0.332721,-0.203306,0.292701,0.187288,-0.461183,0.044323,-0.016915,0.198380,1
4,-0.005694,-0.116649,0.046553,-0.101360,-0.211751,0.016708,-0.093205,0.279462,0.240016,0.256403,...,0.069215,-0.025669,-0.043941,0.474094,0.048160,-0.178296,0.238861,-0.083691,-0.080434,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
944,0.242785,0.333870,0.324636,-0.013359,0.227179,0.246499,0.280949,-0.473235,0.376639,0.405462,...,0.177930,-0.399635,-0.147217,0.157535,-0.168862,0.141573,0.085821,0.174607,0.022434,1
945,-0.190760,-0.086610,-0.045279,-0.212172,0.302947,0.140238,0.121516,-0.039889,0.031693,0.288423,...,-0.062911,0.255855,0.144863,0.246999,0.206618,-0.105380,-0.088745,0.283497,0.294862,1
946,0.138792,0.152113,0.253272,-0.215632,0.052178,-0.371396,-0.358475,-0.308918,0.345141,0.394936,...,-0.033570,-0.459476,0.112224,0.646445,-0.334266,-0.294695,0.314053,0.049902,-0.000490,1
947,-0.027725,0.081142,0.384957,-0.146337,-0.330887,0.068879,0.120493,0.010479,0.087033,0.419849,...,0.002085,0.024210,-0.080221,0.254440,-0.138745,0.110950,-0.319803,0.143677,-0.130846,1
