# Target label definition

Describe the problem.

In [1]:
import pandas as pd
import os
import sys

# Make the parent directory importable for the custom modules imported below;
# the membership check keeps re-runs from appending a duplicate entry.
parent_dir = '../'
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import src.modules.label_extraction as le
import src.modules.plotting as p
import importlib

from glob import glob

# Reload the local helper modules so that edits made to them on disk are
# picked up without restarting the kernel.
for module in (le, p):
    importlib.reload(module)

- `df_fsr`: contains the FreeSurfer information
- `df_cdr`: contains data about the subjects' visits, from which the target label can be extracted

In [2]:
# Input tables: the FreeSurfer output and the CDR visit records.
fsr_csv_path = '../data/csv/OASIS3_Freesurfer_output.csv'
cdr_csv_path = '../data/csv/OASIS3_UDSb4_cdr.csv'

df_fsr = pd.read_csv(fsr_csv_path)
df_cdr = pd.read_csv(cdr_csv_path)

- Remove from `df_fsr` the rows whose MR session is listed in `../data/empty` (sessions with an empty FreeSurfer output)

In [4]:
# Every entry found in ../data/empty is treated as an MR session to exclude.
empty_sessions = os.listdir('../data/empty')

# .copy() makes the filtered frame independent of the original, so adding
# columns to df_fsr in later cells does not raise SettingWithCopyWarning.
df_fsr = df_fsr[~df_fsr['MR_session'].isin(empty_sessions)].copy()

- Define a new column in `df_cdr` and `df_fsr` that encapsulates the temporal information

In [5]:
# Derive the `time` column of each table from its session-label column
# (the extraction itself is delegated to le.get_time_column).
for frame, session_column in ((df_cdr, 'OASIS_session_label'), (df_fsr, 'MR_session')):
    frame['time'] = le.get_time_column(frame[session_column])

- Check for negative values in the `time` column of `df_cdr`:

In [6]:
# Number of CDR rows whose `time` value is negative.
cdr_negative_time = df_cdr['time'] < 0
len(df_cdr[cdr_negative_time])

5

- Check for negative values in the `time` column of `df_fsr`:

In [7]:
# Number of FreeSurfer rows whose `time` value is negative.
fsr_negative_time = df_fsr['time'] < 0
len(df_fsr[fsr_negative_time])

0

## Fixing negative time columns

- Clearly OAS30753 is a mistake: the CDR is null for this patient, so we cannot infer anything about possible dementia
- The other ones look like typing errors; they will be fixed

In [8]:
# Display the CDR rows with a negative `time` so they can be inspected one by one.
df_cdr.loc[df_cdr['time'] < 0]

Unnamed: 0,OASISID,OASIS_session_label,days_to_visit,age at visit,MMSE,memory,orient,judgment,commun,homehobb,...,dx2_code,dx3_code,dx4_code,dx5_code,dx1,dx2,dx3,dx4,dx5,time
1883,OAS30290,OAS30290_UDSb4_d-0002,-2,47.45,,0.0,0.0,0.0,0.0,0.0,...,,,,,No dementia,.,.,.,.,-2
2116,OAS30330,OAS30330_UDSb4_d-0101,-101,80.53,24.0,0.5,0.5,0.0,0.0,0.5,...,,,,,"DLBD, primary",.,.,.,.,-101
2462,OAS30380,OAS30380_UDSb4_d-0015,-15,61.39,28.0,0.5,0.0,0.0,0.0,0.0,...,,,,,uncertain dementia,Active Other neurol/med diagnoses,.,.,.,-15
4914,OAS30753,OAS30753_UDSb4_d-39520,-39520,-47.25,,,,,,,...,,,,,,,,,,-39520
5667,OAS30851,OAS30851_UDSb4_d-0001,-1,73.49,,0.5,0.0,0.5,0.5,0.5,...,,,,,"Non DAT, Other primary",.,.,.,.,-1


- Remove the error instance

In [9]:
# OAS30753 is the erroneous record shown above (visit offset d-39520 and no
# CDR values); remove that visit from df_cdr and the subject from df_fsr.
bad_subject = 'OAS30753'
bad_session = 'OAS30753_UDSb4_d-39520'

# Select the CDR row by its session label instead of the hard-coded index
# label 4914: this does not depend on the row order of the CSV, and an empty
# selection drops nothing, so re-running the cell no longer raises KeyError.
df_cdr = df_cdr.drop(index=df_cdr.index[df_cdr['OASIS_session_label'] == bad_session])
df_fsr = df_fsr.drop(index=df_fsr.index[df_fsr['Subject'] == bad_subject])

- Fix the column and subject label names

In [10]:
# le.fix_negative_time_label supplies the corrected values; they are written
# back into the session-label and time columns, in that order.
fixed_columns = le.fix_negative_time_label(df_cdr, 'time', 'OASIS_session_label')
df_cdr[['OASIS_session_label', 'time']] = fixed_columns

- Reindex the columns to prepare the two dataframes for the matchup

In [11]:
# Columns handed to le.put_first for each table (session label, then subject ID).
cdr_leading_columns = ['OASIS_session_label', 'OASISID']
fsr_leading_columns = ['MR_session', 'Subject']

df_cdr = le.put_first(df_cdr, cdr_leading_columns)
df_fsr = le.put_first(df_fsr, fsr_leading_columns)

- Match diagnosis with sessions

In [12]:
# NOTE(review): the two 365 arguments are passed positionally; presumably they
# are the allowed day gaps on either side of an MR session — confirm against
# le.session_matchup.
match_window = 365
df_matched = le.session_matchup(df_fsr, df_cdr, match_window, match_window)

- Align the cdr values

In [13]:
# The values returned by le.align_labels replace the CDRTOT column.
# NOTE(review): the subject IDs listed in the cell output appear to be printed
# by align_labels itself — confirm in the module.
aligned_cdr = le.align_labels(df_matched, 'Subject', 'CDRTOT')
df_matched['CDRTOT'] = aligned_cdr

OAS30040
OAS30139
OAS30161
OAS30173
OAS30181
OAS30185
OAS30194
OAS30206
OAS30208
OAS30241
OAS30342
OAS30369
OAS30418
OAS30445
OAS30452
OAS30466
OAS30596
OAS30597
OAS30612
OAS30667
OAS30810
OAS30867
OAS30978
OAS31037
OAS31043
OAS31046
OAS31111
OAS31127
OAS31128


In [14]:
def cdr_to_label(cdr):
    """Map a CDRTOT value to the categorical target label.

    0 -> 'Cognitevely-normal', 0.5 -> 'Early-stage', any other value -> 'Demented'.
    """
    if cdr == 0:
        return 'Cognitevely-normal'
    if cdr == .5:
        return 'Early-stage'
    return 'Demented'


# na_action='ignore' keeps a missing CDRTOT as a missing label; without it NaN
# fails both equality checks and would silently be labelled 'Demented'.
# NOTE(review): 'Cognitevely-normal' is misspelled but kept as-is, because the
# label is written to final_dataset.csv and may be matched by downstream code.
df_matched['label'] = df_matched['CDRTOT'].map(cdr_to_label, na_action='ignore')

- Perform a final check

In [15]:
def session_name_from_path(path):
    """Return the session directory name of a `<root>/<session>/mri/<file>` path."""
    # Two dirname() calls strip the file name and the 'mri' folder. Using
    # os.path instead of splitting on "\\" makes this work on every OS and
    # independent of how deep the data root is.
    return os.path.basename(os.path.dirname(os.path.dirname(path)))


f = os.path.join('..', 'data', 'freesurfers')

left_hippo_files = sorted(glob(os.path.join(f, '*', 'mri', '*Left-Hippocampus.mgz')))
right_hippo_files = sorted(glob(os.path.join(f, '*', 'mri', '*Right-Hippocampus.mgz')))

l1 = [session_name_from_path(s) for s in left_hippo_files]
l2 = [session_name_from_path(s) for s in right_hippo_files]

# Symmetric difference: sessions that have only one of the two hippocampus
# files, whichever side is missing. A one-sided difference would not report
# sessions that have only the right-hippocampus file.
print(f"The number of elements NOT in common is: {set(l1).symmetric_difference(l2)}")

The number of elements NOT in common is: set()


In [16]:
# Columns kept in the exported dataset: identifiers, hippocampus and
# parahippocampal measurements, and the target (label plus CDRTOT).
export_columns = [
    'Subject', 'MR_session', 'TOTAL_HIPPOCAMPUS_VOLUME', 'Left-Hippocampus_volume',
    'lh_parahippocampal_thickness', 'lh_parahippocampal_volume', 'rh_parahippocampal_volume',
    'rh_parahippocampal_thickness', 'Right-Hippocampus_volume', 'label', 'CDRTOT',
]

# The DataFrame index is written as well (to_csv default), so it shows up as an
# extra unnamed first column when the file is read back without index_col.
df_matched[export_columns].to_csv('../data/csv/final_dataset.csv')

In [17]:
# Read the exported dataset back from disk to check what was actually written.
final_dataset_path = '../data/csv/final_dataset.csv'
df = pd.read_csv(final_dataset_path)

In [18]:
# Number of rows per target label in the exported dataset.
label_counts = df['label'].value_counts()
label_counts

label
Cognitevely-normal    1892
Early-stage            368
Demented               119
Name: count, dtype: int64