# UCSD classification on VABS domain

In this notebook, UCSD subjects whose first assessment (T1) took place before 72 months of age are classified into their respective VABS domain clusters.

The classifier is trained on the NDAR test set.

In [1]:
#import libraries
from strat.create_dataset import dataset, prepare_imputation
#from strat.run_rcv import RCV, relabel
import strat.utils as ut
import logging
import numpy as np
import pandas as pd
import logging
from strat.create_long_ndar import build_long
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
import re
from sklearn.preprocessing import StandardScaler
from umap import UMAP
from strat.ucsd_dataset_mod import  create_new,predict_labels,predict_labels_no_umap #prepare_ucsd,
import pickle as pkl
import os
from reval.best_nclust_cv import FindBestClustCV
from sklearn.cluster import KMeans
from strat.visualization import _scatter_plot
from strat.run_rcv import _build_distmat
from sklearn.impute import KNNImputer

## import data and reformat them 
Select only a subset of them and divide the input features from the labels.

In [2]:
# Project root. Set the VINELAND_PROJ_DIR environment variable to run the notebook
# on another machine; when it is not set, the original local folder is used.
main_path = os.environ.get(
    'VINELAND_PROJ_DIR',
    '/Users/vmandelli/OneDrive - Fondazione Istituto Italiano Tecnologia/vineland_proj_edition',
)
# Sub-folders derived from the project root
data_path = os.path.join(main_path,"data","tidy","nda")
results_path = os.path.join(main_path,"results")
plot_path = os.path.join(main_path,"plot")

In [3]:
# Load the NDA table (already split into age bins, per the original note),
# indexed by subject key.
file = "VABS_withcluster_edition_041022.csv"
nda_csv = os.path.join(results_path, file)
new_dict_p1 = pd.read_csv(nda_csv, header=0, low_memory=False, index_col='subjectkey')

# Separate the rows flagged 'tr' (train) from the rows flagged 'ts' (test)
is_train = new_dict_p1['TR_TS'] == 'tr'
is_test = new_dict_p1['TR_TS'] == 'ts'
x_tr = new_dict_p1[is_train]
x_ts = new_dict_p1[is_test]

In [4]:
# Missing-value imputation with k-nearest neighbours (k = 5): the imputer is fitted
# on the train split only and then applied as-is to the test split.
col2use = ['communicationdomain_totalb','livingskillsdomain_totalb',
           'socializationdomain_totalb','motorskillsdomain_totalb']

impute = KNNImputer(n_neighbors=5)
tr_imputed = impute.fit_transform(x_tr[col2use])
ts_imputed = impute.transform(x_ts[col2use])

# No `columns=` is passed, so both frames get positional column labels (0..3)
# in the order of col2use; the subject index is preserved.
X_tr_prepr = pd.DataFrame(tr_imputed, index=x_tr.index)
X_ts_prepr = pd.DataFrame(ts_imputed, index=x_ts.index)


In [5]:
# Import BETA CORRECTION data from UCSD, indexed by subject id.
# Earlier input file, kept for reference: 'tidy_ucsd_long_asd_onlyT1.csv'
ucsd_csv = os.path.join(ut.tidy_data_folder, 'ucsd', 'UCSD_ASD_T1_4reval_edition_corrected.csv')
ucsd_data = pd.read_csv(ucsd_csv, index_col='subjectid')
ucsd_data.shape


(1201, 28)

In [6]:
# Feature columns (VABS domain scores), listed in the same domain order for both
# datasets: communication, daily living skills, socialization, motor skills.

# NDA column names
ndar_feat = [f'{domain}domain_totalb'
             for domain in ('communication', 'livingskills', 'socialization', 'motorskills')]

# UCSD column names (original scores)
ucsd_feat = [f'vine_{domain}Total_DomStd' for domain in ('Com', 'Dly', 'Soc', 'Mtr')]

# Beta-corrected alternative (not used here): the same UCSD names prefixed with
# 'new_', e.g. 'new_vine_ComTotal_DomStd'.

In [8]:
# Use the NDA *test* split as the training set of the classifier, so that the same
# data are not used multiple times (avoids circularity).
new_TR_data = X_ts_prepr

# The columns keep their positional labels 0..3 (one per imputed domain column).
# The former `new_TR_data.colnames = ndar_feat` line was dropped: `colnames` is not a
# pandas attribute, so it never renamed anything and only triggered a pandas UserWarning.
# NOTE(review): if real names are wanted, assign to `new_TR_data.columns` instead, but
# first confirm that predict_labels accepts train/test frames whose column names differ
# (the UCSD frame uses the `vine_*` names).
new_TR_data.head()

Unnamed: 0_level_0,0,1,2,3
subjectkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NDARFE664RE3,85.114274,80.097414,73.081418,77.312881
NDARTH626RWH,42.0,46.0,55.0,75.0
NDARPM268PMP,48.994312,69.682299,51.249438,66.268552
NDARCL082FGC,97.994312,112.682299,83.249438,111.268552
NDARVL893WB0,77.114274,76.097414,87.081418,86.312881


In [9]:
# Training inputs for the classifier: the imputed NDA test-split features and,
# as targets, the 'cluster_domain' labels of the same split.
X_tr, y_tr = new_TR_data, x_ts['cluster_domain']

# Visual check: subjects on the rows and the 4 VABS subscales on the columns
# (COM-DLS-SOC-MOT)
print(y_tr)
X_tr.head()

subjectkey
NDARFE664RE3        0
NDARTH626RWH        2
NDARPM268PMP        2
NDARCL082FGC        1
NDARVL893WB0        1
                   ..
NDARFR137BN3        1
NDAR_INVYY983ZGL    2
NDARKV375JZW        2
NDARHV294XMW        2
NDARHX114VE2        2
Name: cluster_domain, Length: 495, dtype: int64


Unnamed: 0_level_0,0,1,2,3
subjectkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NDARFE664RE3,85.114274,80.097414,73.081418,77.312881
NDARTH626RWH,42.0,46.0,55.0,75.0
NDARPM268PMP,48.994312,69.682299,51.249438,66.268552
NDARCL082FGC,97.994312,112.682299,83.249438,111.268552
NDARVL893WB0,77.114274,76.097414,87.081418,86.312881


In [10]:
# Prepare the UCSD data in the format expected by the classifier (the subjects to label).
# NOTE(review): the filter below is inclusive (<= 72) while the log message and the
# notebook title say "<72" - confirm which bound is intended.
X_ts = ucsd_data[ucsd_data['vine_agemo']<=72]

# Summary statistics of the interview age, computed once and rounded for the log line
age_stats = X_ts.vine_agemo.describe()
age_mean, age_sd, age_min, age_max = (round(age_stats[key], 2)
                                      for key in ("mean", "std", "min", "max"))
logging.info(f' Selected subjects at their first encounter done at age included in '
             f'period P1 (<72). Mean (sd) interview age is {age_mean} ({age_sd}) '
             f'-- Min / Max:[{age_min}; {age_max}]')

# Keep a full copy (all columns) to attach the predicted labels to later,
# then restrict the classifier input to the four feature columns.
uscd_used_in_reval = X_ts.copy()
X_ts = X_ts[ucsd_feat]

print(X_ts.shape)


10:21:05, INFO  Selected subjects at their first encounter done at age included in period P1 (<72). Mean (sd) interview age is 25.45 (8.47) -- Min / Max:[9.56; 69.91]


(1185, 4)


In [None]:
# prepare the DBs in the format the algorithm requires

#X_tr, y_tr, X_ts, ucsd = prepare_ucsd(new_TR_data, ucsd_data, period='P1')
# remember that Python starts counting from 0, so in new_dict_p1['P1'][1] the 1 means ts (= test set);
# we train the algorithm on the NDAR test set to avoid circularity -- from ucsd_data select only the subjects whose T1 is in the P1 range.
# The problem is that reval overfits the training set.

## train the classifier

In [11]:
# Train the classifier and label the UCSD subjects

# k-nearest-neighbours classifier with k = 5
classifier = KNeighborsClassifier(n_neighbors = 5)

# predict_labels is the project helper imported from strat.ucsd_dataset_mod.
# It receives the training features (X_tr) and labels (y_tr) from the NDA test split,
# the UCSD features (X_ts) and the unfitted classifier, and returns four objects.
# label_dict is used below as a subject-id -> predicted-cluster mapping.
# NOTE(review): model_fit, Scaler and Umap are presumably the fitted classifier,
# scaler and UMAP reducer - confirm in strat/ucsd_dataset_mod.py.
label_dict, model_fit, Scaler, Umap = predict_labels(X_tr, X_ts, y_tr, classifier)


In [12]:
# Attach the predicted cluster to the full UCSD frame by looking up each
# subject id (the index) in label_dict.
predicted_cluster = uscd_used_in_reval.index.map(label_dict)
uscd_used_in_reval['cluster_domain'] = predicted_cluster

## Merge the long_ucsd DB with the cluster + create fake clusters for other diagnosis

Take the database in long format, with both ASD and TD (and other diagnoses) and all the measures (Vineland, Mullen, ADOS).

In [13]:
# Long-format UCSD table with all subjects, indexed by subject id
all_ucsd_csv = os.path.join(ut.tidy_data_folder, 'ucsd', 'tidy_ucsd_long_allsubj.csv')
all_ucsd = pd.read_csv(all_ucsd_csv, index_col='subjectid')
print(all_ucsd.shape)


(14880, 74)


In [14]:
# Mapping from the UCSD diagnosis codes to the coarse group used as a "cluster"
# label for the non-clustered subjects (TD, DD, LD, MD and syb).
# Each group is listed with the diagnosis codes that fall into it:
_dx_groups = {
    "TD": ("TD", "PrevLDDTyp", "PrevDDTyp"),   # typical development
    "DD": ("DD", "GDD"),                       # developmental delay
    "LD": ("LD",),                             # language delay
    "MD": ("FMD", "MD"),                       # motor delay
    "syb": ("Typ Sib ASD",),                   # siblings
}

# Invert the grouping into a flat {diagnosis code: group} lookup
diagnosis_dict = {code: group for group, codes in _dx_groups.items() for code in codes}

In [15]:
# Work on a copy of the long-format table and expose the subject id (the index)
# as a regular 'sub_id' column, so it can be mapped on label_dict.
all_ucsd_clust = all_ucsd.assign(sub_id=all_ucsd.index)

In [16]:
### Create the "cluster" column
# 1) map column 'sub_id' on label_dict (classifier output for the UCSD subjects selected
#    above, i.e. first assessment at vine_agemo <= 72).
#    NOTE(review): the original note also says these subjects have longitudinal data
#    (at least 2 time points) - that selection is not visible in this notebook, confirm.
# 2) map column 'recentDxJ_dxCode' on diagnosis_dict (gives the diagnosis group
#    TD / DD / LD / MD / syb instead of a cluster).
# combine_first keeps the value from (1) where it is present and falls back to (2)
# where (1) is missing.

all_ucsd_clust['cluster'] = all_ucsd_clust['sub_id'].map(label_dict).to_frame(name='cluster').combine_first(all_ucsd_clust['recentDxJ_dxCode'].map(diagnosis_dict).to_frame(name='cluster'))

print(all_ucsd_clust.shape)
all_ucsd_clust.head()


(14880, 76)


Unnamed: 0_level_0,Unnamed: 0,gender,ethnicity,race,recentDxJ_dxCode,vine_subjectid,vine_agemo,vine_ComRecep_Raw,vine_ComRecep_Adap,vine_ComRecep_AgeEq_mo,...,mullen_RLT,mullen_RL_Raw,mullen_RL_AgeEq,mullen_ELT,mullen_EL_Raw,mullen_EL_AgeEq,mullen_ELC_Std,time,sub_id,cluster
subjectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2A4U,0,M,Unknown,Unknown,TD,A2A4U,23.0,23.0,Adequate,23.0,...,47.0,23.0,23.0,46.0,21.0,22.0,101.0,1,A2A4U,TD
A2A4U,0,M,Unknown,Unknown,TD,A2A4U,35.0,75.0,ModHigh,102.0,...,49.0,31.0,34.0,47.0,30.0,33.0,102.0,2,A2A4U,TD
A2A4U,0,M,Unknown,Unknown,TD,A2A4U,,,,,...,,,,,,,,3,A2A4U,TD
A2A4U,0,M,Unknown,Unknown,TD,A2A4U,,,,,...,,,,,,,,4,A2A4U,TD
A2A4U,0,M,Unknown,Unknown,TD,A2A4U,,,,,...,,,,,,,,5,A2A4U,TD


In [18]:
# Write the labelled long-format table to the project output folder
path2save = ut.out_folder
out_csv = os.path.join(path2save, 'ucsd_long_allSubj_clusters_061022.csv')
all_ucsd_clust.to_csv(out_csv)

# Show where the file was written
ut.out_folder

'/Users/vmandelli/OneDrive - Fondazione Istituto Italiano Tecnologia/vineland_proj_edition/results'

In [19]:
# Sanity check: count the rows at time point 1 that carry a classifier cluster
# (0, 1 or 2); the expected count is 1185 ASD subjects with T1 < 72.
is_t1 = all_ucsd_clust['time'] == 1
all_ucsd_clust_t1 = all_ucsd_clust[is_t1]
n_clustered_t1 = all_ucsd_clust_t1['cluster'].isin([1,2,0]).sum()
n_clustered_t1


1185

## run the model! in future studies ...

In [20]:
# Prepare the objects needed to run the classifier on any new dataset

def _load_pickle(filename):
    """Load one pickled object from the project output folder (ut.out_folder).

    The file is opened in a `with` block so the handle is closed as soon as the
    object has been read.
    NOTE: unpickling can execute arbitrary code - only load files you produced
    yourself or otherwise trust.
    """
    with open(os.path.join(ut.out_folder, filename), 'rb') as fh:
        return pkl.load(fh)


## with UMAP
Model = _load_pickle("fittedmodelDomainONLY_NDARts.sav")
umap = _load_pickle('umapDomainONLY_NDARts.sav')
scaler = _load_pickle('scalerDomainONLY_NDARts.sav')

## no umap
#Model = _load_pickle("NO_UMAP/fittedmodelDomainONLY_NDARts.sav")
#scaler = _load_pickle('NO_UMAP/scalerDomainONLY_NDARts.sav')


def run_KNN_VABS(X):
    """Predict a cluster label for every row of X with the reloaded pipeline.

    X is passed through the notebook-level `scaler`, then `umap`, and the result
    is given to `Model.predict`.

    Parameters
    ----------
    X : object exposing `.index` and accepted by `scaler.transform`
        (in this notebook, a DataFrame of features indexed by subject id).

    Returns
    -------
    dict
        {index entry of X: predicted label}, paired in row order.
    """
    embedded = umap.transform(scaler.transform(X))
    predictions = Model.predict(embedded)
    return dict(zip(X.index, predictions))

In [21]:
# Check that the reloaded pipeline works the same as before: RESULTS should be
# equal to label_dict, since the same UCSD features (X_ts) are used.
RESULTS = run_KNN_VABS(X_ts)

# Make the comparison explicit instead of leaving it to the reader: count the
# subjects whose reloaded label differs from (or is missing in) label_dict.
n_diff = sum(label_dict.get(subj) != lab for subj, lab in RESULTS.items())
print(f'{n_diff} of {len(RESULTS)} subjects got a different label from the reloaded model')


In [28]:
### STOP here