# Target label definition

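This notebook builds the target label for each FreeSurfer MR session of the OASIS-3 dataset: the CDR score recorded in the subjects' clinical visits (`df_cdr`) is mapped onto the FreeSurfer sessions (`df_fsr`) using the temporal information encoded in the session labels.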

In [1]:
import pandas as pd
import os
import sys

# Put the parent directory on the import path so the project-local `src`
# package can be imported from this notebook; the membership check keeps the
# entry from being appended again when the cell is re-run.
if '../' not in sys.path:
    sys.path.append('../')

import src.modules.label_extraction as le
import importlib

# Reload the helper module so edits made to it on disk are picked up without
# restarting the kernel; the trailing `;` suppresses the cell's output.
importlib.reload(le);

- `df_fsr`: contains the FreeSurfer output information
- `df_cdr`: contains data about the subjects' visits, from which the target label can be extracted

In [2]:
# Load the two input tables; both paths are relative to the notebook's
# working directory.
df_fsr = pd.read_csv('../data/csv/OASIS3_Freesurfer_output.csv')  # FreeSurfer output table
df_cdr = pd.read_csv('../data/csv/OASIS3_UDSb4_cdr.csv')  # visit table holding the label source

- Remove from `df_fsr` the sessions whose FreeSurfer output is empty (the ones listed in `../data/empty`)

In [None]:
# Every entry of ../data/empty names an MR session to discard.
# The explicit .copy() makes df_fsr an independent frame, so the columns added
# to it in later cells ('time', 'CDR') do not raise SettingWithCopyWarning.
empty_sessions = os.listdir('../data/empty')
df_fsr = df_fsr[~df_fsr['MR_session'].isin(empty_sessions)].copy()

- Define a new column in `df_cdr` and `df_fsr` that encapsulates the temporal information

In [4]:
# Add a 'time' column to each table, computed by the helper from that table's
# own session-label column.
for frame, session_col in ((df_cdr, 'OASIS_session_label'), (df_fsr, 'MR_session')):
    frame['time'] = le.get_time_column(frame[session_col])

- Count the rows of `df_cdr` with a negative `time` value:

In [5]:
# Number of clinical visits whose 'time' value is negative.
int(df_cdr['time'].lt(0).sum())

5

- Count the rows of `df_fsr` with a negative `time` value:

In [6]:
# Number of FreeSurfer sessions whose 'time' value is negative.
int(df_fsr['time'].lt(0).sum())

0

## Fixing negative time columns

- Clearly OAS30753 is a mistake: the CDR is null for this patient, so nothing can be inferred about their possible dementia
- The other ones look like typing errors; they will be fixed

In [7]:
# Show the clinical visits with a negative 'time' value for manual inspection.
df_cdr.loc[df_cdr['time'].lt(0)]

Unnamed: 0,OASISID,OASIS_session_label,days_to_visit,age at visit,MMSE,memory,orient,judgment,commun,homehobb,...,dx2_code,dx3_code,dx4_code,dx5_code,dx1,dx2,dx3,dx4,dx5,time
1883,OAS30290,OAS30290_UDSb4_d-0002,-2,47.45,,0.0,0.0,0.0,0.0,0.0,...,,,,,No dementia,.,.,.,.,-2
2116,OAS30330,OAS30330_UDSb4_d-0101,-101,80.53,24.0,0.5,0.5,0.0,0.0,0.5,...,,,,,"DLBD, primary",.,.,.,.,-101
2462,OAS30380,OAS30380_UDSb4_d-0015,-15,61.39,28.0,0.5,0.0,0.0,0.0,0.0,...,,,,,uncertain dementia,Active Other neurol/med diagnoses,.,.,.,-15
4914,OAS30753,OAS30753_UDSb4_d-39520,-39520,-47.25,,,,,,,...,,,,,,,,,,-39520
5667,OAS30851,OAS30851_UDSb4_d-0001,-1,73.49,,0.5,0.0,0.5,0.5,0.5,...,,,,,"Non DAT, Other primary",.,.,.,.,-1


- Remove the erroneous instance (subject OAS30753) from both tables

In [8]:
# Drop the corrupted clinical record of subject OAS30753 (its negative-time
# visit, which carries no CDR data) and every FreeSurfer session of that subject.
# The row is selected by content instead of by the hard-coded index label 4914:
# re-running the cell is then a no-op rather than a KeyError, and the cell keeps
# working if the row order of the CSV changes.
bad_subject = 'OAS30753'
is_bad_visit = (df_cdr['OASISID'] == bad_subject) & (df_cdr['time'] < 0)
# .copy() keeps both results independent frames, safe for later column assignment.
df_cdr = df_cdr[~is_bad_visit].copy()
df_fsr = df_fsr[df_fsr['Subject'] != bad_subject].copy()

- Fix the session labels and the `time` values of the remaining negative-time rows

In [9]:
# Let the helper compute the corrected values, then write them back into the
# session-label and time columns of df_cdr.
fixed_label_and_time = le.fix_negative_time_label(df_cdr, 'time', 'OASIS_session_label')
df_cdr[['OASIS_session_label', 'time']] = fixed_label_and_time

## Target mapping

In this section we extract the target label from `df_cdr` and map it onto the sessions in `df_fsr`, subject by subject, using the `time` column computed in the previous cells.

In [10]:
# Rename the clinical table's subject key so it matches the 'Subject' column of
# df_fsr, then store the values the helper builds from 'CDRTOT' as df_fsr['CDR'].
cdr_by_subject = df_cdr.rename(columns={'OASISID': 'Subject'})
df_fsr['CDR'] = le.get_CDR_column(cdr_by_subject, df_fsr, 'Subject', target_col_name='CDRTOT')

In [11]:
# Distribution of the CDR labels mapped onto the FreeSurfer sessions.
df_fsr['CDR'].value_counts()

CDR
0.0    2021
0.5     344
1.0     103
2.0       8
Name: count, dtype: int64

In [6]:
# Distribution of the CDRTOT scores in the clinical table, for comparison with
# the labels mapped onto df_fsr.
# NOTE(review): this cell's execution count is out of sequence with the cells
# around it; re-run the notebook top to bottom to confirm the numbers shown.
df_cdr['CDRTOT'].value_counts()

CDRTOT
0.0    6479
0.5    1444
1.0     528
2.0     155
3.0      19
Name: count, dtype: int64

## Checking the alignment