In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set() 
import os

In [None]:
# Work from inside the processed-data folder so the CSV file names used further
# down can stay relative. The guard keeps this cell re-runnable: once the kernel
# is already inside 'processedData', a second relative chdir would fail.
if os.path.basename(os.getcwd()) != 'processedData':
    # No trailing backslash: a bare folder name resolves on every platform.
    os.chdir('processedData')
# Call getcwd() so `cwd` holds the directory path, not the function object.
cwd = os.getcwd()

In [None]:
# Axes style for every figure in this notebook, passed as an explicit
# dictionary of matplotlib rc parameters.
sns.set_style(
    style={
        # axes box and grid
        "axes.facecolor": "white",
        "axes.edgecolor": ".15",
        "axes.grid": True,
        "axes.axisbelow": True,
        "axes.labelcolor": ".15",
        "figure.facecolor": "white",
        "grid.color": ".8",
        "grid.linestyle": "-",
        # text and tick colours / direction
        "text.color": ".15",
        "xtick.color": ".15",
        "ytick.color": ".15",
        "xtick.direction": "out",
        "ytick.direction": "out",
        # "lines.solid_capstyle" is left out: its dumped value,
        # <CapStyle.round: 'round'>, is not a valid Python literal.
        "patch.edgecolor": "w",
        "patch.force_edgecolor": True,
        "image.cmap": "rocket",
        # fonts (first available sans-serif family wins)
        "font.family": ["sans-serif"],
        "font.sans-serif": [
            "Arial",
            "DejaVu Sans",
            "Liberation Sans",
            "Bitstream Vera Sans",
            "sans-serif",
        ],
        # no tick marks on any side
        "xtick.bottom": False,
        "xtick.top": False,
        "ytick.left": False,
        "ytick.right": False,
        # all four spines drawn
        "axes.spines.left": True,
        "axes.spines.bottom": True,
        "axes.spines.right": True,
        "axes.spines.top": True,
    }
)

In [None]:
# Plotting context (font sizes, line widths, tick sizes) for every figure in
# this notebook, passed as an explicit dictionary of matplotlib rc parameters.
sns.set_context(
    context={
        # font sizes: named sizes are relative to the base "font.size"
        "font.size": 15.0,
        "axes.labelsize": "medium",
        "axes.titlesize": "large",
        "xtick.labelsize": "medium",
        "ytick.labelsize": "medium",
        "legend.fontsize": "medium",
        # line widths and marker size
        "axes.linewidth": 0.8,
        "grid.linewidth": 0.8,
        "lines.linewidth": 1.5,
        "lines.markersize": 6.0,
        "patch.linewidth": 1.0,
        # tick widths
        "xtick.major.width": 0.8,
        "ytick.major.width": 0.8,
        "xtick.minor.width": 0.6,
        "ytick.minor.width": 0.6,
        # tick lengths
        "xtick.major.size": 3.5,
        "ytick.major.size": 3.5,
        "xtick.minor.size": 2.0,
        "ytick.minor.size": 2.0,
        "legend.title_fontsize": None,
    }
)

In [None]:
# Load the two preprocessed tables; both paths are relative to the current
# working directory. First file -> MRI, second file -> puberty.
MRI, puberty = (
    pd.read_csv(csv_name)
    for csv_name in (
        'processedMRIDataMenarcheSubs.csv',
        'processedPubertyDataMenarcheSubs.csv',
    )
)

In [None]:
# Keep just the subject identifier and the menarche item from the puberty table.
y = puberty[['subjectkey', 'pds_f5_y_P']]
# Attach the menarche item to the imaging table, matching rows on the subject key.
# NOTE(review): the outer join also keeps subjects that appear in only one of the
# two tables (their missing columns become NaN) - confirm this is intended.
completeData = pd.merge(left=MRI, right=y, how='outer', on='subjectkey')

In [None]:
# Drop columns that only aggregate other columns (averages or totals), built as
# one pipeline starting from completeData.
reduced = (
    completeData
    # averaged thickness columns (names matching "cdk_mean")
    .pipe(lambda frame: frame.drop(columns=frame.filter(regex='.*cdk_mean.*').columns))
    # summed-up volume and area columns (names matching "cdk_total")
    .pipe(lambda frame: frame.drop(columns=frame.filter(regex='.*.*cdk_total').columns))
    # subcortical volume sums
    .drop(columns=[
        'smri_vol_scs_intracranialv_M1',
        'smri_vol_scs_allventricles_M1',
        'smri_vol_scs_wholeb_M1',
    ])
)

In [None]:
# Number of rows per menarche status (pre vs. post) in the full reduced table,
# as a two-column frame: status value and 'count'.
pub = reduced.groupby(['pds_f5_y_P']).size().rename('count').reset_index()
pub

In [None]:
# Draw a random 20 % of the rows as held-out test data; the fixed seed makes the
# draw repeatable across runs.
holdoutTest = reduced.sample(random_state=666, frac=0.2)

In [None]:
# Training set = every row of `reduced` that was not drawn into the hold-out sample.
TrainingData = reduced.drop(holdoutTest.index, axis='index')

In [None]:
# Number of rows per menarche status in the held-out test data.
testdist = holdoutTest.groupby(['pds_f5_y_P']).size().rename('count').reset_index()
testdist

In [None]:
# Mean interview age per menarche status in the test data, divided by 12
# (presumably months -> years; confirm the unit of interview_age_M1).
holdoutTest.groupby('pds_f5_y_P')['interview_age_M1'].mean().div(12)

In [None]:
# Standard deviation of interview age per menarche status in the test data,
# divided by 12 (presumably months -> years; confirm the unit of interview_age_M1).
holdoutTest.groupby('pds_f5_y_P')['interview_age_M1'].std().div(12)

In [None]:
# Number of rows per menarche status in the training data.
traindist = TrainingData.groupby(['pds_f5_y_P']).size().rename('count').reset_index()
traindist

In [None]:
# Mean interview age per menarche status in the training data, divided by 12
# (presumably months -> years; confirm the unit of interview_age_M1).
TrainingData.groupby('pds_f5_y_P')['interview_age_M1'].mean().div(12)

In [None]:
# Standard deviation of interview age per menarche status in the training data,
# divided by 12 (presumably months -> years; confirm the unit of interview_age_M1).
TrainingData.groupby('pds_f5_y_P')['interview_age_M1'].std().div(12)

In [None]:
# Compare distributions -> No visible distortion of distribution introduced by random sampling
#
# Three side-by-side bar charts (complete data, training split, held-out test
# split), each showing the number of rows per menarche status.

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize = (15,6))

# One entry per panel: target axes, count table to plot, panel title.
panels = (
    (ax1, pub, 'Complete Data'),
    (ax2, traindist, 'Training Data - pre matching'),
    (ax3, testdist, 'Test Data'),
)

for panel_ax, counts, panel_title in panels:
    sns.barplot(x = 'pds_f5_y_P', y = 'count', data = counts, palette="Reds", ax=panel_ax)
    panel_ax.set_xlabel('Menarche Status')
    # Relabel the two status bars in the order the count tables list them.
    panel_ax.set_xticklabels(['Pre','Post'])
    panel_ax.set_title(panel_title, size = 15)

# Fit the layout only after labels and titles exist; tight_layout is a one-off
# adjustment and ignores anything added to the figure afterwards.
fig.tight_layout(pad = 2)

# Build the output path from its parts so the separator matches the platform
# (on Windows this is the same '..\\Plots\\...' location as before).
fig.savefig(os.path.join(os.pardir, 'Plots', 'MenarcheDistributions.pdf'), dpi = 1000)

In [None]:
# Write both splits to CSV (index column omitted) as input for the next
# preprocessing stage: scan-site harmonisation + propensity score matching.

TrainingData.to_csv(path_or_buf='menarcheTrain_redONLYSMRI.csv', index=False)
holdoutTest.to_csv(path_or_buf='menarcheTest_redONLYSMRI.csv', index=False)