In [21]:
import numpy as np
import pandas as pd
from tqdm import tqdm

### Load TADPOLE data

In [9]:
# Load the raw table with every column kept as dtype=object, so pandas does
# not infer numeric types on load; numeric conversion is done later, per
# column.
input_path = '../data/TADPOLE_D1_D2.csv'
data = pd.read_csv(filepath_or_buffer=input_path, dtype=object)

### Basic data stats

In [11]:
# Report the number of distinct patient IDs and the total number of rows.
print('Number of patients = {}, entries = {}'.format(
    len(set(data['PTID'].values)),
    len(data),
))

Number of patients = 1737, entries = 12741


### Available visits in TADPOLE data


In [12]:
# Distinct VISCODE values. The sort is lexicographic on the strings, which is
# why e.g. 'm102' is listed before 'm12' in the output.
visits = sorted(set(data['VISCODE'].values))
print('Number of available visits : ', len(visits))
print('Available Visit IDs : ', visits)

('Number of available visits : ', 22)
('Available Visit IDs : ', ['bl', 'm03', 'm06', 'm102', 'm108', 'm114', 'm12', 'm120', 'm18', 'm24', 'm30', 'm36', 'm42', 'm48', 'm54', 'm60', 'm66', 'm72', 'm78', 'm84', 'm90', 'm96'])


### Dictionary to map visit to VISNUM

In [15]:
# Visit codes listed in the order that defines VISNUM: a code's VISNUM is its
# position in this list (presumably baseline followed by increasing month
# offsets -- confirm against the dataset documentation).
visit_codes = [
    'bl', 'm03', 'm06', 'm12', 'm18', 'm24', 'm30', 'm36',
    'm42', 'm48', 'm54', 'm60', 'm66', 'm72', 'm78', 'm84',
    'm90', 'm96', 'm102', 'm108', 'm114', 'm120',
]
visit_id = dict(zip(visit_codes, range(len(visit_codes))))
print(visit_id)

{'m102': 18, 'm120': 21, 'm90': 16, 'm108': 19, 'm96': 17, 'm12': 3, 'm18': 4, 'm54': 10, 'm36': 7, 'm78': 14, 'm30': 6, 'm72': 13, 'bl': 0, 'm114': 20, 'm84': 15, 'm06': 2, 'm03': 1, 'm60': 11, 'm48': 9, 'm66': 12, 'm24': 5, 'm42': 8}


### Split patients for train and test

In [19]:
# Build a reproducible patient-level train/test split.
#
# Iterating a set of strings has no guaranteed order (and the order can change
# between interpreter sessions), so the IDs are first sorted into a canonical
# order and then shuffled with a fixed seed. Re-running this cell therefore
# always writes the same two ID files.
patient_ids = sorted(data['PTID'].dropna().unique())
num_patients = len(patient_ids)
split = 0.8        # fraction of patients assigned to the training set
split_seed = 0     # fixed seed so the shuffle (and the split) is repeatable
np.random.RandomState(split_seed).shuffle(patient_ids)
num_train = int(split*num_patients)
train_ids = patient_ids[:num_train]
test_ids = patient_ids[num_train:]

print('Number of patients = {}'.format(num_patients))
print('Number of patients in train data = {}'.format(num_train))
print('Number of patients in test data = {}'.format(num_patients - num_train))

# One patient ID per line; np.savetxt opens and closes the files itself.
np.savetxt('../data/patientID_train_all.txt', train_ids, fmt = '%s')
np.savetxt('../data/patientID_test_all.txt', test_ids, fmt = '%s')

Number of patients = 1737
Number of patients in train data = 1389
Number of patients in test data = 348


### Preprocess data 
- Add VISNUM column.  
- Retain only rows with the required visit ids.  
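- Impute missing data: forward-fill each patient's missing values from earlier visits, then fill the remaining missing image features with column means. Remove image features with no values.  
- Fill NaN values of the clinical features (ADAS13, MMSE, ADAS11, RAVLT_immediate, RAVLT_forgetting, AGE, CDRSB) with column means, and missing APOE4 values with 0.  
- Normalize the image features using the mean and standard deviation computed on the training patients.  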

In [22]:
def _is_image_feature(name):
    """Return True if a column name matches the image-feature naming pattern.

    A column is treated as an image feature when its name contains 'UCSFFSX'
    or 'UCSFFSL', starts with 'ST', and does not contain 'STATUS'.
    """
    has_source_tag = 'UCSFFSX' in name or 'UCSFFSL' in name
    return has_source_tag and name.startswith('ST') and 'STATUS' not in name


def preprocess_adni(input_path, output_path,
                    train_ids_path = '../data/patientID_train_all.txt'):
    """Clean, impute and normalize the TADPOLE table and save it as CSV.

    Steps, in order:
      1. Add an integer VISNUM column derived from VISCODE and drop rows
         whose VISCODE is not one of the known visit codes.
      2. Sort by (PTID, VISNUM) and forward-fill missing values within each
         patient.
      3. Convert image-feature columns to numeric, fill remaining NaNs with
         the column mean, and drop image-feature columns that are still NaN
         afterwards (i.e. columns with no usable values).
      4. Convert the listed clinical columns to numeric and fill NaNs with the
         column mean; fill missing APOE4 with 0.
      5. Standardize each image-feature column with the mean and standard
         deviation computed on the training patients only.

    Args:
        input_path: Path of the raw CSV file to read.
        output_path: Path the processed CSV is written to.
        train_ids_path: Text file with one training patient ID per line
            (the file written by the train/test split cell).

    Returns:
        None. The processed table is written to ``output_path``.

    Raises:
        KeyError: If a required column (e.g. PTID, VISCODE, APOE4 or one of
            the clinical columns) is missing from the input.
    """
    data = pd.read_csv(input_path, dtype = object)

    # Add VISNUM column (position of the visit code in this list; -1 marks
    # codes that are not in the list, including missing VISCODE values).
    visit_codes = ['bl', 'm03', 'm06', 'm12', 'm18', 'm24',
                   'm30', 'm36', 'm42', 'm48', 'm54', 'm60',
                   'm66', 'm72', 'm78', 'm84', 'm90', 'm96',
                   'm102', 'm108', 'm114', 'm120']
    visit_id = {code : num for num, code in enumerate(visit_codes)}
    data['VISNUM'] = data['VISCODE'].map(visit_id).fillna(-1).astype(int)

    # Retain only rows with required visit_id. sort_values returns a new
    # frame, so nothing below mutates a slice of the original table.
    data = data.loc[data['VISNUM'] != -1]
    data = data.sort_values(by = ['PTID', 'VISNUM'])

    # Forward-fill within each patient. groupby(...).ffill() does not return
    # the grouping column on current pandas versions, so only the value
    # columns are filled and PTID is joined back explicitly, keeping the
    # original column order.
    column_order = list(data.columns)
    value_cols = [col for col in column_order if col != 'PTID']
    filled = data.groupby('PTID')[value_cols].ffill()
    data = data[['PTID']].join(filled[value_cols])[column_order]

    # Impute missing image feature data with the column mean. A column whose
    # mean is NaN (no numeric values at all) stays NaN and is dropped.
    # NOTE(review): the means here and for the clinical columns below are
    # computed over all rows (train and test patients), whereas the
    # normalization step uses training patients only -- confirm intended.
    all_nan_cols = []
    for name in tqdm(data.columns.values):
        if not _is_image_feature(name):
            continue
        numeric = pd.to_numeric(data[name], errors = 'coerce')
        numeric = numeric.fillna(numeric.mean())
        data[name] = numeric
        if numeric.isnull().any():
            all_nan_cols.append(name)
    data = data.drop(all_nan_cols, axis = 1)

    # Fill Nan values of features with mean
    cols = ['ADAS13', 'MMSE', 'ADAS11', 'RAVLT_immediate',
            'RAVLT_forgetting', 'AGE', 'CDRSB']
    for col in cols:
        numeric = pd.to_numeric(data[col], errors = 'coerce')
        data[col] = numeric.fillna(numeric.mean())

    # Fill Nan values of APOE4 gene with 0
    data['APOE4'] = pd.to_numeric(data['APOE4'], errors = 'coerce').fillna(0)

    # Normalize the image feature columns using training-patient statistics.
    # atleast_1d keeps a single-line ID file usable (loadtxt returns a 0-d
    # array in that case); the row mask does not change inside the loop.
    train_ids = np.atleast_1d(np.loadtxt(train_ids_path, dtype = str))
    in_train = data['PTID'].isin(train_ids)
    for name in tqdm(data.columns.values):
        if _is_image_feature(name):
            featcol = data.loc[in_train, name].values
            mean, std = np.mean(featcol), np.std(featcol)
            # The 1e-4 term avoids division by zero for constant columns.
            data[name] = (data[name] - mean)/(std + 1e-4)

    # Save processed Dataframe to output_path. The row index is written too
    # (to_csv default), so the file has one extra leading unnamed column.
    data.to_csv(output_path)

# Run the preprocessing on the raw TADPOLE table and write the processed,
# normalized table next to it.
if __name__ == '__main__':
    input_path = '../data/TADPOLE_D1_D2.csv'
    output_path = '../data/TADPOLE_D1_D2_proc_norm_all.csv'
    preprocess_adni(input_path = input_path, output_path = output_path)

100%|██████████| 1908/1908 [05:01<00:00,  6.33it/s]
100%|██████████| 1872/1872 [01:26<00:00, 21.69it/s]


### Analyze preprocessed data

In [24]:
# Reload the processed table from disk and report its shape.
data = pd.read_csv('../data/TADPOLE_D1_D2_proc_norm_all.csv')
print(data.shape)

# Count the columns whose names match the image-feature pattern: contains
# 'UCSFFSX' or 'UCSFFSL', starts with 'ST', and does not contain 'STATUS'.
num_features = 0
for name in tqdm(data.columns.values):
    if 'UCSFFSX' not in name and 'UCSFFSL' not in name:
        continue
    if name.startswith('ST') and 'STATUS' not in name:
        num_features += 1
print('Number of image features = {}'.format(num_features))
print('TODO : Number of entries for each visit id :')


100%|██████████| 1873/1873 [00:00<00:00, 1116851.21it/s]

(12741, 1873)
Number of image features = 656



