# UCSD classification on VABS domain

In this notebook, UCSD subjects whose T1 is < 72 months are classified into their respective VABS domain clusters.

The classifier is trained on the NDAR test set.

In [1]:
#import libraries
from strat.create_dataset import dataset, prepare_imputation
#from strat.run_rcv import RCV, relabel
import strat.utils as ut
import logging
import numpy as np
import pandas as pd
import logging
from strat.create_long_ndar import build_long
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
import re
from sklearn.preprocessing import StandardScaler
from umap import UMAP
from strat.ucsd_dataset_mod import  create_new,predict_labels,predict_labels_no_umap #prepare_ucsd,
import pickle as pkl
import os
from reval.best_nclust_cv import FindBestClustCV
from sklearn.cluster import KMeans
from strat.visualization import _scatter_plot
from strat.run_rcv import _build_distmat

## import data and reformat them 
Select only a subset of them and divide the input features from the labels.

In [2]:
## NDA data come already split into age bins.
## Only the P1 bin (less than 72 months) is used, with missing data already imputed;
## both splits are stored in a dictionary (new_dict_p1).

# training split, indexed by subject key
new_dict_p1_tr = pd.read_csv(
    os.path.join(ut.out_folder, 'imputed_data_P1_tr.csv'),
    delimiter=',',
    index_col='subjectkey',
)
# test split, indexed by subject key
new_dict_p1_ts = pd.read_csv(
    os.path.join(ut.out_folder, 'imputed_data_P1_ts.csv'),
    delimiter=',',
    index_col='subjectkey',
)

# store copies of both splits under 'P1' as a (train, test) tuple
new_dict_p1 = {
    'P1': (new_dict_p1_tr.copy(), new_dict_p1_ts.copy()),
}


In [3]:
# Read the UCSD table from the tidy-data folder, indexed by subject id
ucsd_data = pd.read_csv(
    os.path.join(ut.tidy_data_folder, 'ucsd', 'tidy_ucsd_long_asd_onlyT1.csv'),
    index_col='subjectid',
)


In [5]:
# Columns used as features (VABS domain columns), listed in the same
# order for both datasets.

# NDAR column names
ndar_feat = [
    'communicationdomain_totalb',
    'livingskillsdomain_totalb',
    'socializationdomain_totalb',
    'motorskillsdomain_totalb',
]

# UCSD column names
ucsd_feat = [
    'vine_ComTotal_DomStd',
    'vine_DlyTotal_DomStd',
    'vine_SocTotal_DomStd',
    'vine_MtrTotal_DomStd',
]

In [6]:
# The NDA test split (position 1 of the 'P1' tuple) becomes the new training set
# for the classifier, so that the same data are not used multiple times (circularity).
new_TR_data = new_dict_p1['P1'][1]

# Split it into input features and the cluster labels to learn
X_tr = new_TR_data[list(ndar_feat)]
y_tr = new_TR_data['cluster_domain']

# Sanity check: subjects on the rows, the 4 VABS subscales on the columns (COM-DLS-SOC-MOT)
X_tr.head()


Unnamed: 0_level_0,communicationdomain_totalb,livingskillsdomain_totalb,socializationdomain_totalb,motorskillsdomain_totalb
subjectkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NDARAA668AC0,72.0,55.0,66.0,72.0
NDARAB839WV5,81.0,62.0,65.0,67.0
NDARAB989DG5,63.0,73.0,68.0,70.0
NDARAC070DYV,89.8,98.0,95.0,82.0
NDARAC098YNP,91.0,71.0,77.0,84.0


In [7]:
# Prepare UCSD in the format the classifier expects (these are the subjects to be labelled).
# NOTE(review): this filter is inclusive (<= 72) while the notebook text says "< 72" -- confirm the intended bound.
X_ts = ucsd_data[ucsd_data['vine_agemo'] <= 72]

# describe() is computed once and reused for every statistic; the logged message is unchanged.
age_stats = X_ts['vine_agemo'].describe()
logging.info(f' Selected subjects at their first encounter done at age included in '
             f'period P1. Mean (sd) interview age is {round(age_stats["mean"], 2)} '
             f'({round(age_stats["std"], 2)}) '
             f'-- Min / Max:[{round(age_stats["min"], 2)}; '
             f'{round(age_stats["max"], 2)}]')

# keep only the UCSD feature columns as classifier input
X_ts = X_ts[ucsd_feat]
print(X_ts.shape)


17:31:59, INFO  Selected subjects at their first encounter done at age included in period P1. Mean (sd) interview age is 25.45 (8.45) -- Min / Max:[9.56; 69.91]


(1216, 4)


In [8]:
# prepare the DBs in the format the algorithm requires

#X_tr, y_tr, X_ts, ucsd = prepare_ucsd(new_TR_data, ucsd_data, period='P1') 
# remember that Python starts counting from 0, so the 1 in new_dict_p1['P1'][1] means ts (= test set);
# we train the algorithm on the NDAR test set to avoid circularity -- from ucsd_data, select only subjects whose T1 is in the P1 range.
# The problem is that reval overfits the training set.

## train the classifier

In [8]:
# Train the classifier and label the UCSD subjects

# k-nearest-neighbours classifier with 5 neighbours
classifier = KNeighborsClassifier(n_neighbors=5)

# predict_labels is a project helper (strat/ucsd_dataset_mod.predict_labels).
# NOTE(review): judging by the returned names it also hands back the fitted
# model, scaler and UMAP objects -- confirm against the helper.
label_dict, model_fit, Scaler, Umap = predict_labels(
    X_tr,
    X_ts,
    y_tr,
    classifier,
)


In [9]:
# Map the predicted labels back onto the original UCSD dataframe, looking each
# subject id (the index) up in label_dict; the result is stored in 'cluster_domain'.
ucsd_data['cluster_domain']= ucsd_data.index.map(label_dict)

## Merge the long_ucsd DB with the cluster + create fake clusters for other diagnosis

Take the database in long format with both ASD and TD (other diagnoses) and all the measures (Vineland, Mullen, ADOS).

In [10]:
# Load the long-format UCSD table with all subjects, indexed by subject id
all_ucsd = pd.read_csv(
    os.path.join(ut.tidy_data_folder, 'ucsd', 'tidy_ucsd_long_allsubj.csv'),
    index_col='subjectid',
)
print(all_ucsd.shape)


(14880, 74)


In [11]:
# Lookup that maps the raw diagnosis codes onto the coarse groups used as
# cluster labels for TD, DD, LD, MD and syb:
#   TD  (typical development) <- TD, PrevLDDTyp, PrevDDTyp
#   DD  (developmental delay) <- DD, GDD
#   LD  (language delay)      <- LD
#   MD  (motor delay)         <- FMD, MD
#   syb (siblings)            <- Typ Sib ASD
diagnosis_dict = {
    'TD': 'TD',
    'PrevLDDTyp': 'TD',
    'PrevDDTyp': 'TD',
    'DD': 'DD',
    'GDD': 'DD',
    'LD': 'LD',
    'FMD': 'MD',
    'MD': 'MD',
    'Typ Sib ASD': 'syb',
}

In [12]:
# Work on a copy of the all-subject table, with the subject id (the index)
# also exposed as a 'sub_id' column so it can be mapped on label_dict
all_ucsd_clust = all_ucsd.assign(sub_id=all_ucsd.index)

In [13]:
### Build the 'cluster' column
# 1) map col 'sub_id' on label_dict (output of the classifier for ASD subjects whose 1st assessment is
#    before 72 months and who have longitudinal data (at least 2 time points))
# 2) map col 'recentDxJ_dxCode' on diagnosis_dict (to get the diagnosis group instead of a cluster for TD, DD, LD, MD and syb)
# combine_first keeps the value from 1) where it is present and falls back to the value from 2) otherwise.
all_ucsd_clust['cluster'] = all_ucsd_clust['sub_id'].map(label_dict).to_frame(name='cluster').combine_first(all_ucsd_clust['recentDxJ_dxCode'].map(diagnosis_dict).to_frame(name='cluster'))

print(all_ucsd_clust.shape)



(14880, 76)


In [15]:
# Write the cluster-annotated table to the output folder
path2save = ut.out_folder
all_ucsd_clust.to_csv(
    os.path.join(path2save, 'ucsd_long_allSubj_clusters.csv')
)


In [16]:
# Sanity check: there should be 1216 ASD subjects at T1 < 72,
# i.e. rows at time 1 whose 'cluster' value is 1, 2 or 3
all_ucsd_clust_t1 = all_ucsd_clust.loc[all_ucsd_clust['time'] == 1]
all_ucsd_clust_t1['cluster'].isin([1, 2, 3]).sum()


1216

## run the model! in future studies ...

In [17]:
# Prepare the objects needed to run the classifier on any kind of DB.
# NOTE: unpickling can execute arbitrary code -- only load .sav files produced by this project.
# Each file is opened in a `with` block so its handle is closed right after loading.

## with UMAP
with open(os.path.join(ut.out_folder, "fittedmodelDomainONLY_NDARts.sav"), 'rb') as fh:
    Model = pkl.load(fh)
with open(os.path.join(ut.out_folder, 'umapDomainONLY_NDARts.sav'), 'rb') as fh:
    umap = pkl.load(fh)
with open(os.path.join(ut.out_folder, 'scalerDomainONLY_NDARts.sav'), 'rb') as fh:
    scaler = pkl.load(fh)

## no umap (alternative set of files, kept for reference)
#Model =  pkl.load(open(os.path.join(ut.out_folder, "NO_UMAP/fittedmodelDomainONLY_NDARts.sav"), 'rb'))
#scaler = pkl.load(open(os.path.join(ut.out_folder, 'NO_UMAP/scalerDomainONLY_NDARts.sav'), 'rb'))


def run_KNN_VABS(X):
    """Predict a label for every row of ``X`` and return them keyed by index.

    The rows are passed through the module-level ``scaler``, then ``umap``,
    and the result is given to the module-level ``Model`` for prediction.

    Parameters
    ----------
    X
        Table whose ``.index`` identifies the subjects and which
        ``scaler.transform`` accepts.

    Returns
    -------
    dict
        Maps each entry of ``X.index`` to its predicted label.
    """
    scaled = scaler.transform(X)
    embedded = umap.transform(scaled)
    predictions = Model.predict(embedded)

    return dict(zip(X.index, predictions))

In [19]:
# Check that the reloaded objects behave like the original run: with the same X_ts (from UCSD),
# RESULTS should be equal to label_dict. The comparison itself is not performed here.
RESULTS = run_KNN_VABS(X_ts)


In [None]:
### STOP here