# Preprocessing

## Import modules and reading datasets

In [45]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler


In [46]:
visit_1_df = pd.read_csv('../data/tabular/visit-1.csv')
visit_2_df = pd.read_csv('../data/tabular/visit-2.csv')
visit_3_df = pd.read_csv('../data/tabular/visit-3.csv')
visit_4_df = pd.read_csv('../data/tabular/visit-4.csv')
visit_5_df = pd.read_csv('../data/tabular/visit-5.csv')

visit_1_df.head()

visit_1_df['SES'].describe()


count    142.000000
mean       2.492958
std        1.128273
min        1.000000
25%        2.000000
50%        2.000000
75%        3.000000
max        5.000000
Name: SES, dtype: float64

## Visit 1

### 1. Converting spelling mistakes to single value

In [47]:
visit_1_df['CDR'] = visit_1_df['CDR'].replace(['very miId','very midl','vry mild'], 'very mild')
visit_1_df['CDR'].value_counts()



CDR
none         85
very mild    52
mild         13
Name: count, dtype: int64

## Visit 2

In [48]:
visit_2_df['CDR'].value_counts()


CDR
none         73
very mild    48
mild         18
moderate      3
midl          1
very miId     1
Name: count, dtype: int64

### 1. Converting spelling mistakes to single value

In [49]:
mild_replace = {'midl':'mild'}
very_mild_replace = {'very miId': 'very mild'}

visit_2_df['CDR'] = visit_2_df['CDR'].replace(mild_replace)
visit_2_df['CDR'] = visit_2_df['CDR'].replace(very_mild_replace)

visit_2_df['CDR'].value_counts()





CDR
none         73
very mild    49
mild         19
moderate      3
Name: count, dtype: int64

## Visit 3

### 1. Converting spelling mistakes to single value
    - No need as none

In [50]:
visit_2_df['CDR'].value_counts()

CDR
none         73
very mild    49
mild         19
moderate      3
Name: count, dtype: int64

## Visit 4

### 1. Converting spelling mistakes to singla value
    - No need as none

In [51]:
visit_4_df['CDR'].value_counts()

CDR
none         10
very mild     3
mild          2
Name: count, dtype: int64

## Visit 5

### 1. Converting spelling mistakes to singla value
    - No need as none

In [52]:
visit_5_df['CDR'].value_counts()

CDR
none         4
mild         1
very mild    1
Name: count, dtype: int64

## Master Dataset


### 1. Merging 5 datasets

In [53]:
merged_df = pd.concat([visit_1_df, visit_2_df, visit_3_df, visit_4_df, visit_5_df], axis=0, ignore_index=True)


merged_df.isna().sum()

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      373 non-null    int64  
 1   MRI_ID  373 non-null    object 
 2   visit   373 non-null    int64  
 3   delay   373 non-null    int64  
 4   sex     373 non-null    object 
 5   hand    373 non-null    object 
 6   age     373 non-null    int64  
 7   YOE     373 non-null    int64  
 8   SES     354 non-null    float64
 9   MMSE    371 non-null    float64
 10  CDR     373 non-null    object 
 11  eTIV    371 non-null    float64
 12  nWBV    373 non-null    float64
 13  ASF     371 non-null    object 
dtypes: float64(4), int64(5), object(5)
memory usage: 40.9+ KB


In [54]:
merged_df.drop(['hand'],axis=1,inplace=True)
merged_df



Unnamed: 0,ID,MRI_ID,visit,delay,sex,age,YOE,SES,MMSE,CDR,eTIV,nWBV,ASF
0,1,0001_MR1,1,0,M,87,14,2.0,27.0,none,1987.0,0.696,0.883
1,2,0002_MR1,1,0,M,75,12,,23.0,very mild,1678.0,0.736,1.046
2,4,0004_MR1,1,0,F,88,18,3.0,28.0,none,1215.0,0.710,1.444
3,5,0005_MR1,1,0,M,80,12,4.0,28.0,none,1689.0,0.712,1.039
4,7,0007_MR1,1,0,M,71,16,,28.0,very mild,1357.0,0.748,1.293
...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,36,0036_MR5,5,2369,F,75,13,4.0,29.0,none,1349.0,0.778,1.301
369,48,0048_MR5,5,1233,M,69,16,1.0,4.0,mild,1701.0,0.676,1.032
370,70,0070_MR5,5,2386,M,86,17,1.0,30.0,none,1720.0,0.705,1.02
371,73,0073_MR5,5,2517,F,77,14,3.0,29.0,none,1504.0,0.769,1.167


### 2. Changing types to category for memory efficiency

- reduced memory usage from memory usage: 40.9+ KB to 25.7

In [55]:
merged_df['sex'] = merged_df['sex'].astype('category')
merged_df['hand'] = merged_df['hand'].astype('category')
merged_df['YOE'] = merged_df['YOE'].astype('category')
merged_df['age'] = merged_df['age'].astype('category')
merged_df['SES'] = merged_df['SES'].astype('category')
merged_df['CDR'] = merged_df['CDR'].astype('category')
merged_df['visit'] = merged_df['visit'].astype('category')


merged_df.info()

KeyError: 'hand'

### Modifying erroneous string value in ASF column

In [None]:
# merged_df['ASF'] = merged_df['ASF'].astype('float') - Returns error as there is a string in the column '1,38'

merged_df['ASF'] = merged_df['ASF'].replace({'1,38': 1.38})

merged_df['ASF'] = merged_df['ASF'].astype('float')

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   ID      373 non-null    int64   
 1   MRI_ID  373 non-null    object  
 2   visit   373 non-null    category
 3   delay   373 non-null    int64   
 4   sex     373 non-null    category
 5   hand    373 non-null    category
 6   age     373 non-null    category
 7   YOE     373 non-null    category
 8   SES     354 non-null    category
 9   MMSE    371 non-null    float64 
 10  CDR     373 non-null    category
 11  eTIV    371 non-null    float64 
 12  nWBV    373 non-null    float64 
 13  ASF     371 non-null    float64 
dtypes: category(7), float64(4), int64(2), object(1)
memory usage: 25.7+ KB


### Removing leading zeros

In [None]:
merged_df['MRI_ID'] = merged_df['MRI_ID'].astype(str).str.lstrip('0')

In [None]:
merged_df.isna().sum()

merged_df.to_csv('../data/eda.csv', index=False)

### Imputing nan values

In [None]:
nan_values = ['SES','MMSE','ASF','eTIV']

def impute_na_values(feature):
    mode = merged_df[feature].mode()[0]
    merged_df[feature].fillna(mode,inplace=True)

for feature in nan_values:
    impute_na_values(feature)


### Convert CDR values into numbers 

In [None]:
CDR_mapping = {'none':0, 'very mild':1, 'mild':2, 'moderate':3, 'severe':4}

merged_df['CDR_rating'] = merged_df['CDR'].replace(CDR_mapping)

merged_df['CDR'].value_counts()

(merged_df['ASF'].unique() == '1,38').sum()

0

### Convert Sex into 0 and 1

In [None]:
sex_mapping = {'M':1,'F':0}

merged_df['sex'].replace(sex_mapping, inplace=True)

merged_df

Unnamed: 0,ID,MRI_ID,visit,delay,sex,hand,age,YOE,SES,MMSE,CDR,eTIV,nWBV,ASF,CDR_rating
0,1,1_MR1,1,0,1,R,87,14,2.0,27.0,none,1987.0,0.696,0.883,0
1,2,2_MR1,1,0,1,R,75,12,2.0,23.0,very mild,1678.0,0.736,1.046,1
2,4,4_MR1,1,0,0,R,88,18,3.0,28.0,none,1215.0,0.710,1.444,0
3,5,5_MR1,1,0,1,R,80,12,4.0,28.0,none,1689.0,0.712,1.039,0
4,7,7_MR1,1,0,1,R,71,16,2.0,28.0,very mild,1357.0,0.748,1.293,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,36,36_MR5,5,2369,0,R,75,13,4.0,29.0,none,1349.0,0.778,1.301,0
369,48,48_MR5,5,1233,1,R,69,16,1.0,4.0,mild,1701.0,0.676,1.032,2
370,70,70_MR5,5,2386,1,R,86,17,1.0,30.0,none,1720.0,0.705,1.020,0
371,73,73_MR5,5,2517,0,R,77,14,3.0,29.0,none,1504.0,0.769,1.167,0


### Scale 'eTIV' and 'delay' values

In [None]:
""" scaler = MinMaxScaler()

features = ['eTIV', 'delay']

def scale_feature(feature):
    arr = np.array((merged_df[feature]))
    arr = arr.reshape(-1,1)
    arr = scaler.fit_transform(arr)
    merged_df[feature] = arr

for feature in features:
    scale_feature(feature)

merged_df """

" scaler = MinMaxScaler()\n\nfeatures = ['eTIV', 'delay']\n\ndef scale_feature(feature):\n    arr = np.array((merged_df[feature]))\n    arr = arr.reshape(-1,1)\n    arr = scaler.fit_transform(arr)\n    merged_df[feature] = arr\n\nfor feature in features:\n    scale_feature(feature)\n\nmerged_df "

### Create bins for MMSE 



In [None]:
bins = [0,10,20,26,np.inf]
labels = ['Severe','Moderate','Mild','None']

merged_df['MMSE_labels'] = pd.cut(merged_df['MMSE'], bins=bins, labels=labels)

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   ID           373 non-null    int64   
 1   MRI_ID       373 non-null    object  
 2   visit        373 non-null    category
 3   delay        373 non-null    int64   
 4   sex          373 non-null    category
 5   hand         373 non-null    category
 6   age          373 non-null    category
 7   YOE          373 non-null    category
 8   SES          373 non-null    category
 9   MMSE         373 non-null    float64 
 10  CDR          373 non-null    category
 11  eTIV         373 non-null    float64 
 12  nWBV         373 non-null    float64 
 13  ASF          373 non-null    float64 
 14  CDR_rating   373 non-null    category
 15  MMSE_labels  373 non-null    category
dtypes: category(9), float64(4), int64(2), object(1)
memory usage: 26.8+ KB


### Create dementia 

In [None]:

def dementiaBool(x):
    if(x > 0):
        return 1
    else:
        return 0

merged_df['dementia'] = merged_df['CDR_rating'].apply(dementiaBool)

merged_df.info()

merged_df['dementia'] = merged_df['dementia'].astype('category')

merged_df.isna().sum()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   ID           373 non-null    int64   
 1   MRI_ID       373 non-null    object  
 2   visit        373 non-null    category
 3   delay        373 non-null    int64   
 4   sex          373 non-null    category
 5   hand         373 non-null    category
 6   age          373 non-null    category
 7   YOE          373 non-null    category
 8   SES          373 non-null    category
 9   MMSE         373 non-null    float64 
 10  CDR          373 non-null    category
 11  eTIV         373 non-null    float64 
 12  nWBV         373 non-null    float64 
 13  ASF          373 non-null    float64 
 14  CDR_rating   373 non-null    category
 15  MMSE_labels  373 non-null    category
 16  dementia     373 non-null    int64   
dtypes: category(9), float64(4), int64(3), object(1)
memory usage: 29.7+ KB


ID             0
MRI_ID         0
visit          0
delay          0
sex            0
hand           0
age            0
YOE            0
SES            0
MMSE           0
CDR            0
eTIV           0
nWBV           0
ASF            0
CDR_rating     0
MMSE_labels    0
dementia       0
dtype: int64

## Creating final pkl file

In [None]:
merged_df.to_pickle('../data/processed/proecessed.pkl.gz')