In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to imported source files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd

In [3]:
# Directory that is scanned for gzip-compressed NIfTI files.
image_dir = Path('data/IXI/T1/raw/')

# One ID per file: the part of the file name before the first '-', with its
# first three characters dropped (presumably the 'IXI' prefix -- confirm
# against the actual file naming). IDs stay strings, so any leading zeros
# are preserved.
ixi_ids = [
    nii_file.name.partition('-')[0][3:]
    for nii_file in image_dir.glob('*.nii.gz')
]

# A set collapses repeated IDs, so it is strictly smaller than the list
# exactly when at least one ID occurs more than once.
has_repeated_id = len(set(ixi_ids)) < len(ixi_ids)
print(
    "There are multiple scans for the same subject"
    if has_repeated_id
    else "There are no multiple scans for the same subject"
)

There are no multiple scans for the same subject


In [4]:
# Read the demographic table into memory; the path is relative to the
# notebook's working directory.
demographics_csv = 'data/IXI/IXI_Demographic.csv'
df = pd.read_csv(demographics_csv)

In [5]:
# Report the table dimensions as a (rows, columns) tuple.
table_shape = df.shape
print(f'number of rows and columns: {table_shape}')

number of rows and columns: (619, 12)


In [6]:
# Count missing values column by column; the trailing expression is rendered
# as the cell's output.
print('number of nulls per column:')
df.isna().sum()

number of nulls per column:


IXI_ID                0
SEX_ID (1=m, 2=f)     0
HEIGHT                0
WEIGHT                0
ETHNIC_ID             0
MARITAL_ID            0
OCCUPATION_ID         0
QUALIFICATION_ID      0
DOB                   3
DATE_AVAILABLE        0
STUDY_DATE           26
AGE                  29
dtype: int64

In [7]:
# Occurrences of each IXI_ID across the table.
vc = df['IXI_ID'].value_counts()

# Keep only the IDs that appear on more than one row.
vc_gt1 = vc[vc > 1]
print('Number of duplicate IXI_IDs:', len(vc_gt1))

# Pull every row that belongs to one of the repeated IDs so the conflicting
# records can be compared side by side.
ixi_ids_gt1 = vc_gt1.index.tolist()
is_repeated_row = df['IXI_ID'].isin(ixi_ids_gt1)
df_dup = df.loc[is_repeated_row]

print('inspecting the duplicates:')
df_dup

Number of duplicate IXI_IDs: 26
inspecting the duplicates:


Unnamed: 0,IXI_ID,"SEX_ID (1=m, 2=f)",HEIGHT,WEIGHT,ETHNIC_ID,MARITAL_ID,OCCUPATION_ID,QUALIFICATION_ID,DOB,DATE_AVAILABLE,STUDY_DATE,AGE
195,219,2,170,104,1,2,1,4,1953-02-08,1,2006-03-10,53.08
196,219,2,164,73,1,4,1,5,2/12/48,1,2006-03-10,58.07
212,237,2,167,0,1,5,5,5,1928-06-11,1,2006-08-25,78.2
213,237,2,0,0,0,0,0,0,1928-06-11,1,2006-08-25,78.2
225,251,2,167,74,1,5,5,1,1926-06-28,1,2006-08-29,80.17
226,251,2,0,0,0,0,0,0,1926-06-28,1,2006-08-29,80.17
299,328,2,155,70,1,2,5,3,1944-11-03,1,2006-08-17,61.79
300,328,1,0,0,0,0,0,0,1944-07-17,1,2006-08-17,62.08
324,360,2,165,64,1,3,6,4,1952-01-27,1,2006-04-07,54.19
325,360,2,0,0,0,0,0,0,1952-01-27,1,2006-04-07,54.19


In [8]:
# Narrow the duplicate rows to the ID and age columns to make age
# disagreements between records of the same ID easy to spot.
print('inspect the duplicates (IXI_ID, AGE):')
df_dup.loc[:, ['IXI_ID', 'AGE']]

inspect the duplicates (IXI_ID, AGE):


Unnamed: 0,IXI_ID,AGE
195,219,53.08
196,219,58.07
212,237,78.2
213,237,78.2
225,251,80.17
226,251,80.17
299,328,61.79
300,328,62.08
324,360,54.19
325,360,54.19


In [9]:
# Discard rows without an AGE, then keep ages in the closed interval
# [44, 80] (`between` includes both endpoints by default).
df_age = (
    df
    .dropna(subset=['AGE'])
    .loc[lambda frame: frame['AGE'].between(44, 80)]
)

# Row count versus distinct-ID count shows how many rows are repeats.
print('number of rows with age 44-80:', len(df_age))
print('number of unique IXI_IDs with age 44-80:', df_age['IXI_ID'].nunique())

print('inspecting the rows with age 44-80:')
df_age.head(10)


number of rows with age 44-80: 340
number of unique IXI_IDs with age 44-80: 318
inspecting the rows with age 44-80:


Unnamed: 0,IXI_ID,"SEX_ID (1=m, 2=f)",HEIGHT,WEIGHT,ETHNIC_ID,MARITAL_ID,OCCUPATION_ID,QUALIFICATION_ID,DOB,DATE_AVAILABLE,STUDY_DATE,AGE
3,13,1,182,70,1,2,1,5,1958-09-15,1,2005-06-01,46.71
6,16,1,172,63,1,2,1,5,1950-04-24,1,2005-06-24,55.17
8,19,1,180,88,1,2,1,4,1946-11-02,1,2005-06-30,58.66
17,28,1,172,70,1,2,5,3,1932-08-11,1,2006-08-21,74.03
18,29,2,155,79,4,5,1,5,1946-08-29,1,2005-11-18,59.22
28,40,2,0,68,1,3,2,5,1961-06-18,1,2005-07-22,44.09
32,44,2,163,69,1,3,3,5,1960-09-05,1,2005-07-13,44.85
36,48,1,194,90,1,2,1,5,1954-11-26,1,2005-07-21,50.65
38,50,1,180,79,1,4,2,5,1942-05-06,1,2005-07-13,63.19
41,53,1,185,101,1,2,1,5,1952-04-01,1,2005-07-22,53.31


In [10]:
print('inspect duplicates of age 44-80 (IXI_ID, AGE):')
# keep=False flags every occurrence of a repeated IXI_ID, not just the
# second and later ones, so all conflicting rows are retained.
shares_id = df_age['IXI_ID'].duplicated(keep=False)
df_age_dup = df_age[shares_id]
df_age_dup.loc[:, ['IXI_ID', 'AGE']]

inspect duplicates of age 44-80 (IXI_ID, AGE):


Unnamed: 0,IXI_ID,AGE
195,219,53.08
196,219,58.07
212,237,78.2
213,237,78.2
299,328,61.79
300,328,62.08
324,360,54.19
325,360,54.19
357,392,62.46
358,392,62.46


In [11]:
# Number of distinct AGE values recorded for each repeated IXI_ID; more than
# one means the duplicate rows disagree about the subject's age.
distinct_ages_per_id = df_age_dup.groupby('IXI_ID')['AGE'].nunique()

print('number of duplicated IXI_IDs (age 44-80):', df_age_dup['IXI_ID'].nunique())
print('number of duplicated IXI_IDs with age disagreement:', distinct_ages_per_id.gt(1).sum())

number of duplicated IXI_IDs (age 44-80): 22
number of duplicated IXI_IDs with age disagreement: 2
