# Initialize Notebook

### Data-preprocessing

In [1]:
# Import basic libraries for EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from dotenv import load_dotenv
import os
import pingouin as pg

In [2]:
# Import dataset
load_dotenv()
dataset_path=os.getenv("DATASET_PATH_TWO")
dataset = pd.read_csv(dataset_path)
dataset.head()

Unnamed: 0,Label,Gender,Age,MMSE,CDR,APOE,DIAGNOSIS,Unnamed: 7,ERC ASM,ERC CONTRAST,...,rh_superiorparietal_thickness,rh_superiortemporal_volume,rh_superiortemporal_thickness,rh_supramarginal_volume,rh_supramarginal_thickness,rh_temporalpole_volume,rh_temporalpole_thickness,rh_transversetemporal_volume,rh_transversetemporal_thickness,TOTAL_HIPPOCAMPUS_VOLUME
0,OAS30001,female,65.149895,30.0,0.0,23.0,Cognitively normal,,0.108767,245.139997,...,2.175,8928,2.698,6622,2.369,1965,3.612,578,2.313,6861.9
1,OAS30002,male,67.206024,30.0,0.0,34.0,Cognitively normal,,0.129492,221.932844,...,1.938,12989,2.883,9930,2.432,2774,3.54,961,2.588,7378.1
2,OAS30003,female,58.77344,30.0,0.0,33.0,Cognitively normal,,0.133642,283.711655,...,2.181,10897,2.62,10879,2.493,2137,3.756,874,2.671,7983.5
3,OAS30004,female,55.096508,30.0,0.0,23.0,Cognitively normal,,0.180788,231.188679,...,2.094,11493,2.53,8862,2.232,1918,3.309,792,1.834,8525.1
4,OAS30005,female,48.030117,29.0,0.0,33.0,Cognitively normal,,0.118517,245.120671,...,2.105,12135,2.926,11602,2.609,2057,3.655,993,2.557,9320.5


In [3]:
dataset.columns.tolist()

['Label',
 'Gender',
 'Age',
 'MMSE',
 'CDR',
 'APOE',
 'DIAGNOSIS',
 'Unnamed: 7',
 'ERC ASM',
 'ERC CONTRAST',
 'ERC CORRELATION',
 'ERC VARIANCE ',
 'ERC SUM AVERAGE',
 'ERC SUM VARIANCE',
 'ERC ENTROPY',
 'ERC CLUSTER SHADE',
 'Hip ASM',
 'Hip Contrast',
 'Hip Correlation',
 'Hip Variance ',
 'Hip Sum Average',
 'Hip Sum Variance',
 'Hip Entropy',
 'Hip Clusterhade',
 'IntraCranialVol',
 'lhCortexVol',
 'rhCortexVol',
 'CortexVol',
 'SubCortGrayVol',
 'TotalGrayVol',
 'SupraTentorialVol',
 'lhCorticalWhiteMatterVol',
 'rhCorticalWhiteMatterVol',
 'CorticalWhiteMatterVol',
 '3rd-Ventricle_volume',
 '4th-Ventricle_volume',
 '5th-Ventricle_volume',
 'Brain-Stem_volume',
 'CC_Anterior_volume',
 'CC_Central_volume',
 'CC_Mid_Anterior_volume',
 'CC_Mid_Posterior_volume',
 'CC_Posterior_volume',
 'CSF_volume',
 'Left-Accumbens-area_volume',
 'Left-Amygdala_volume',
 'Left-Caudate_volume',
 'Left-Cerebellum-White-Matter_volume',
 'Left-Cerebellum-Cortex_volume',
 'Left-choroid-plexus_volum

In [4]:
dataset["ERCs_thicknessbaseline"] = dataset["rh_entorhinal_thickness"] + dataset["lh_entorhinal_thickness"]
dataset["ERCsVolumebaseline"] = dataset["rh_entorhinal_volume"] + dataset["lh_entorhinal_volume"]

In [5]:
columns = [
    "Gender", "Age", "DIAGNOSIS", "MMSE",
    "Hip ASM", "Hip Contrast", "Hip Correlation", "Hip Variance ", 
    "Hip Sum Average", "Hip Sum Variance", "Hip Entropy", "Hip Clusterhade",
    "ERC ASM", "ERC CONTRAST", "ERC CORRELATION", "ERC VARIANCE ", 
    "ERC SUM AVERAGE", "ERC SUM VARIANCE", "ERC ENTROPY", "ERC CLUSTER SHADE",
    "ERCs_thicknessbaseline",
    "ERCsVolumebaseline", "TOTAL_HIPPOCAMPUS_VOLUME"
]

In [6]:
dataset = dataset[columns]
dataset

Unnamed: 0,Gender,Age,DIAGNOSIS,MMSE,Hip ASM,Hip Contrast,Hip Correlation,Hip Variance,Hip Sum Average,Hip Sum Variance,...,ERC CONTRAST,ERC CORRELATION,ERC VARIANCE,ERC SUM AVERAGE,ERC SUM VARIANCE,ERC ENTROPY,ERC CLUSTER SHADE,ERCs_thicknessbaseline,ERCsVolumebaseline,TOTAL_HIPPOCAMPUS_VOLUME
0,female,65.149895,Cognitively normal,30.0,0.460200411,83.37129666,0.508610428,89.06630373,10.31846143,272.8939183,...,245.139997,0.385395,209.332939,41.129640,592.191757,3.234648,18110.527190,6.948,2826,6861.9
1,male,67.206024,Cognitively normal,30.0,0.329647269,149.7122651,0.328176378,109.8242096,13.76140257,289.5845731,...,221.932844,0.406722,197.689209,42.149774,568.823992,3.240436,23801.869010,7.090,3792,7378.1
2,female,58.773440,Cognitively normal,30.0,0.385797323,104.7693541,0.545985695,125.6787833,15.52117681,397.9457789,...,283.711655,0.336695,217.002950,41.289080,584.300146,3.069283,17579.374770,6.832,2964,7983.5
3,female,55.096508,Cognitively normal,30.0,0.229011059,166.9534306,0.27560138,112.1868343,18.12678643,281.7939065,...,231.188679,0.498026,232.827358,33.062621,700.120751,3.093719,4888.606116,6.523,3184,8525.1
4,female,48.030117,Cognitively normal,29.0,0.24291323,86.8162215,0.377438554,73.48465063,13.46392035,207.122381,...,245.120671,0.458936,233.016858,44.574015,686.946759,3.149858,22134.931110,8.253,3419,9320.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
744,male,51.520878,Cognitively normal,30.0,0.241905071,185.9517258,0.26081542,119.7282993,20.4719512,292.9614715,...,211.115923,0.541205,229.997956,29.612012,708.875901,3.026495,-1155.195119,7.339,4268,8947.8
745,male,65.171800,Cognitively normal,28.0,0.460262312,110.9184814,0.380646123,84.41390623,11.36978687,226.7371435,...,247.267295,0.481906,238.429315,32.328782,706.449967,3.072458,1949.812723,6.363,3880,7373.2
746,female,62.965090,Cognitively normal,29.0,0.456332442,87.97432739,0.494138979,102.0331982,10.98650287,320.1584654,...,252.115540,0.435960,221.698071,38.821700,634.676744,3.122909,14482.990380,6.368,3310,6828.5
747,female,64.774810,Unc: ques. Impairment,27.0,0.165,478.2875,-0.201970198,200.5570312,23.4375,323.940625,...,253.821524,0.432143,225.827181,27.061909,649.487198,2.661899,-2314.065889,5.174,2044,4501.3


In [7]:
# Check column names in case 
# we need to do some spelling correction
dataset.columns

Index(['Gender', 'Age', 'DIAGNOSIS', 'MMSE', 'Hip ASM', 'Hip Contrast',
       'Hip Correlation', 'Hip Variance ', 'Hip Sum Average',
       'Hip Sum Variance', 'Hip Entropy', 'Hip Clusterhade', 'ERC ASM',
       'ERC CONTRAST', 'ERC CORRELATION', 'ERC VARIANCE ', 'ERC SUM AVERAGE',
       'ERC SUM VARIANCE', 'ERC ENTROPY', 'ERC CLUSTER SHADE',
       'ERCs_thicknessbaseline', 'ERCsVolumebaseline',
       'TOTAL_HIPPOCAMPUS_VOLUME'],
      dtype='object')

In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749 entries, 0 to 748
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Gender                    749 non-null    object 
 1   Age                       749 non-null    float64
 2   DIAGNOSIS                 747 non-null    object 
 3   MMSE                      736 non-null    float64
 4   Hip ASM                   749 non-null    object 
 5   Hip Contrast              749 non-null    object 
 6   Hip Correlation           749 non-null    object 
 7   Hip Variance              749 non-null    object 
 8   Hip Sum Average           749 non-null    object 
 9   Hip Sum Variance          749 non-null    object 
 10  Hip Entropy               749 non-null    object 
 11  Hip Clusterhade           749 non-null    object 
 12  ERC ASM                   749 non-null    float64
 13  ERC CONTRAST              749 non-null    float64
 14  ERC CORREL

In [9]:
dataset.select_dtypes(include='object').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749 entries, 0 to 748
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Gender            749 non-null    object
 1   DIAGNOSIS         747 non-null    object
 2   Hip ASM           749 non-null    object
 3   Hip Contrast      749 non-null    object
 4   Hip Correlation   749 non-null    object
 5   Hip Variance      749 non-null    object
 6   Hip Sum Average   749 non-null    object
 7   Hip Sum Variance  749 non-null    object
 8   Hip Entropy       749 non-null    object
 9   Hip Clusterhade   749 non-null    object
dtypes: object(10)
memory usage: 58.6+ KB


In [10]:
cols_to_convert = ['Hip Contrast', 'Hip ASM', 'Hip Correlation', 'Hip Variance ', 
                   'Hip Sum Average', 'Hip Sum Variance', 'Hip Entropy', 'Hip Clusterhade']

for col in cols_to_convert:
    dataset[col] = pd.to_numeric(dataset[col], errors='coerce')

In [11]:
dataset["Gender"] = dataset["Gender"].map({'female': 0, 'male': 1})

In [12]:
dataset["Gender"]

0      0
1      1
2      0
3      0
4      0
      ..
744    1
745    1
746    0
747    0
748    0
Name: Gender, Length: 749, dtype: int64

In [13]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749 entries, 0 to 748
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Gender                    749 non-null    int64  
 1   Age                       749 non-null    float64
 2   DIAGNOSIS                 747 non-null    object 
 3   MMSE                      736 non-null    float64
 4   Hip ASM                   747 non-null    float64
 5   Hip Contrast              747 non-null    float64
 6   Hip Correlation           747 non-null    float64
 7   Hip Variance              747 non-null    float64
 8   Hip Sum Average           747 non-null    float64
 9   Hip Sum Variance          747 non-null    float64
 10  Hip Entropy               747 non-null    float64
 11  Hip Clusterhade           747 non-null    float64
 12  ERC ASM                   749 non-null    float64
 13  ERC CONTRAST              749 non-null    float64
 14  ERC CORREL

In [14]:
dataset["DIAGNOSIS"].unique()

array(['Cognitively normal', 'uncertain dementia',
       'AD dem w/CVD contribut', 'AD Dementia',
       'AD dem Language dysf after', 'AD dem w/depresss- not contribut',
       'AD dem distrubed social- prior', 'Unc: ques. Impairment',
       'DLBD- primary', 'AD dem w/depresss- contribut',
       'AD dem w/CVD not contrib', '0.5 in memory only',
       'Vascular Demt- primary', 'Frontotemporal demt. prim',
       'AD dem Language dysf with', 'AD dem w/PDI after AD dem contribut',
       'DAT', 'Dementia/PD- primary', 'AD dem w/oth (list B) not contrib',
       'uncertain- possible NON AD dem', 'DLBD, primary', nan,
       'No dementia', 'AD dem cannot be primary',
       'AD dem w/oth unusual features/demt on', '.',
       'Non AD dem- Other primary', 'AD dem w/oth (list B) contribut',
       'AD dem/FLD prior to AD dem', 'AD dem Language dysf prior'],
      dtype=object)

In [15]:
import numpy as np

diagnosis_mapping = {
    # 0 - Normal controls
    "Cognitively normal": 0,
    "No dementia": 0,

    # 1 - Mild Cognitive Impairment (MCI / uncertain)
    "uncertain dementia": 1,
    "0.5 in memory only": 1,
    "Unc: ques. Impairment": 1,
    "uncertain- possible NON AD dem": 1,

    # 2 - MCI Converters (transitioning to AD)
    "AD dem w/CVD contribut": 2,
    "AD dem w/depresss- contribut": 2,
    "AD dem w/oth (list B) contribut": 2,
    "AD dem w/PDI after AD dem contribut": 2,
    "AD dem/FLD prior to AD dem": 2,
    "AD dem Language dysf prior": 2,

    # 3 - Alzheimer's Disease and other dementias
    "AD Dementia": 3,
    "AD dem Language dysf after": 3,
    "AD dem w/depresss- not contribut": 3,
    "AD dem distrubed social- prior": 3,
    "AD dem w/CVD not contrib": 3,
    "Vascular Demt- primary": 3,
    "Frontotemporal demt. prim": 3,
    "AD dem Language dysf with": 3,
    "DAT": 3,
    "Dementia/PD- primary": 3,
    "AD dem w/oth (list B) not contrib": 3,
    "DLBD- primary": 3,
    "DLBD, primary": 3,
    "AD dem cannot be primary": 3,
    "AD dem w/oth unusual features/demt on": 3,
    "Non AD dem- Other primary": 3,
}

# Then replace
dataset["DIAGNOSIS"] = dataset["DIAGNOSIS"].replace(diagnosis_mapping)

In [16]:
dataset["DIAGNOSIS"].unique()

array([0, 1, 2, 3, nan, '.'], dtype=object)

In [17]:
# Drop rows where DIAGNOSIS is NaN or '.'
dataset = dataset[~dataset["DIAGNOSIS"].isin([np.nan, '.'])]

In [18]:
dataset["DIAGNOSIS"].unique()

array([0, 1, 2, 3], dtype=object)

In [19]:
dataset["DIAGNOSIS"] = dataset["DIAGNOSIS"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["DIAGNOSIS"] = dataset["DIAGNOSIS"].astype(int)


In [20]:
# Check for duplicated instances
dataset.duplicated().sum()

np.int64(0)

In [21]:
# Quick descriptive overview
dataset.describe(include = "all")

Unnamed: 0,Gender,Age,DIAGNOSIS,MMSE,Hip ASM,Hip Contrast,Hip Correlation,Hip Variance,Hip Sum Average,Hip Sum Variance,...,ERC CONTRAST,ERC CORRELATION,ERC VARIANCE,ERC SUM AVERAGE,ERC SUM VARIANCE,ERC ENTROPY,ERC CLUSTER SHADE,ERCs_thicknessbaseline,ERCsVolumebaseline,TOTAL_HIPPOCAMPUS_VOLUME
count,746.0,746.0,746.0,736.0,744.0,744.0,744.0,744.0,744.0,744.0,...,746.0,746.0,746.0,746.0,746.0,746.0,746.0,746.0,746.0,746.0
mean,0.454424,68.670536,0.72252,27.675272,0.412985,100.708001,0.482648,111.411195,14.098334,344.93678,...,251.045018,0.436716,225.402946,34.10342,650.566767,3.012379,6813.99986,6.406003,3210.647453,7109.631367
std,0.498253,9.06698,1.217083,3.539359,0.094918,78.542801,0.195306,22.992787,4.266376,89.68101,...,32.977273,0.081976,14.020532,5.381479,61.361237,0.228906,9033.289914,0.920498,705.216865,1231.323149
min,0.0,45.18549,0.0,7.0,0.118298,28.042811,-0.597412,48.366313,7.193376,112.5,...,171.080919,-0.070953,154.961986,21.073748,313.471204,2.172023,-4288.108246,3.2,1307.0,2728.8
25%,0.0,63.724846,0.0,27.0,0.351253,54.931038,0.414983,96.866341,11.704381,279.411924,...,228.960739,0.402088,218.554944,30.293043,625.984953,2.88423,821.72373,5.96225,2781.25,6332.225
50%,0.0,68.89391,0.0,29.0,0.430127,82.149331,0.545802,110.715394,13.515363,353.421997,...,245.546329,0.452831,227.574671,33.065735,664.374671,3.035445,3503.964875,6.536,3164.5,7176.2
75%,1.0,74.471596,1.0,30.0,0.479648,120.727373,0.611231,124.957074,15.427247,414.563729,...,270.649778,0.493264,234.874824,37.484351,689.969948,3.171801,10096.127355,7.01625,3646.75,7966.45
max,1.0,95.56742,3.0,30.0,0.676056,765.5,0.748333,277.306858,60.5,590.975677,...,424.328054,0.583185,256.835283,52.663876,784.45522,3.536395,57862.31854,9.176,5425.0,10549.6


In [22]:
# Export to CSV
dataset.to_csv('C:/Users/steve/Desktop/Notebooks/Thesis-Project/datasets/processed/oasis_processed_dataset.csv', index=False)