# Creating test spreadsheet
## Import pandas and load files

In [97]:
import pandas as pd
import numpy as np
import os

In [98]:
# Folder holding the blinded GENFI spreadsheets. This absolute path is specific
# to one machine and has to be edited before running anywhere else.
data_dir = '/Users/davecash/Data/IDEAS/sample'

# Build the two workbook paths first, then read each into its own DataFrame.
xls_demo = os.path.join(data_dir, 'GENFI_DEMOGRAPHICS_DF3_FINAL_BLINDED.xlsx')
xls_img = os.path.join(data_dir, 'GENFI_IMAGING_DF3_FINAL_BLINDED.xlsx')
df_demo = pd.read_excel(xls_demo)
df_img = pd.read_excel(xls_img)

## Combining data
Join the data so that only visits with both demographics and imaging remain; the following cells get rid of unneeded variables and keep the first such visit per participant.

In [99]:
# Inner join on participant code and visit: only visits present in both the
# imaging and the demographics sheets survive. Any other column name shared by
# both sheets gets the default suffixes (_x for imaging, _y for demographics).
merge_keys = ['Blinded Code', 'Visit']
df_combined = df_img.merge(df_demo, how='inner', on=merge_keys)

Categorize key variables and get rid of any rare mutations

In [100]:
# Only these three genetic groups are kept as categories; any other value in
# 'Genetic Group' becomes missing in 'MutationType'.
mutation_groups = ['C9orf72', 'GRN', 'MAPT']
df_combined['MutationType'] = pd.Categorical(df_combined['Genetic Group'],
                                             categories=mutation_groups)

# Discard rows that lack any of the key variables (this also removes the rows
# whose mutation was not one of the three categories above).
required_columns = ['MutationType', 'Age at visit', 'DRC_QC', 'Scanner']
df_combined = df_combined.dropna(subset=required_columns)

Now it's time to get rid of some missing values. Start by assuming TIV is constant over time: within each participant, a missing TIV is filled with the most recent earlier value (forward fill).

In [101]:
# Forward-fill missing TIV within each participant (rows grouped by
# 'Blinded Code', in their current row order). A missing value with no earlier
# measurement for that participant stays missing.
# GroupBy.ffill() is used instead of fillna(method="ffill"), which is
# deprecated in recent pandas releases; the filled values are the same.
df_combined['TIV'] = df_combined.groupby(['Blinded Code'])["TIV mm3"].ffill()

Remove a bunch of columns that we don't need.

In [102]:
# Columns to discard, grouped by where they appear in the original drop list.

# Scan date, the imaging-side copy of the site column, and the raw TIV column
# (its values were already carried over into 'TIV').
imaging_extras = ['Date of scan', 'Blinded Site_x', 'TIV mm3']

# Numbered per-region columns, LEFT side. Spacing inside the names is
# irregular and must match the spreadsheet headers exactly.
left_numbered_cols = [
    '1  Orbitofrontal LEFT',
    '2.  DLPFC LEFT',
    '3.  VMPFC LEFT',
    '4.  Motor LEFT',
    '5.  Opercular LEFT',
    '6.  FRP LEFT',
    '7.  Medial Temp LEFT',
    '8.  Lateral Temp  LEFT',
    '9.  Temporal Pole LEFT',
    '10.  Supra Temp LEFT',
    '11.  Medial Parietal LEFT',
    '12.  Lateral Parietal LEFT',
    '13.  Sensory LEFT',
    '14.  Medial occ LEFT',
    '15.  Lateral Occ LEFT',
    '16.  Anterior Cing LEFT',
    '17.  Middle Cing LEFT',
    '18.  Posterior Cing LEFT',
    '19.  Ant Insula  LEFT',
    '20.  Post Insula LEFT',
]

# The same numbered columns, RIGHT side.
right_numbered_cols = [
    '1  Orbitofrontal RIGHT',
    '2.  DLPFC RIGHT',
    '3.  VMPFC RIGHT',
    '4.  Motor RIGHT',
    '5.  Opercular RIGHT',
    '6.  FRP RIGHT',
    '7.  Medial Temp RIGHT',
    '8.  Lateral Temp  RIGHT',
    '9.  Temporal Pole RIGHT',
    '10.  Supra Temp RIGHT',
    '11.  Medial Parietal RIGHT',
    '12.  Lateral Parietal RIGHT',
    '13.  Sensory RIGHT',
    '14.  Medial occ RIGHT',
    '15.  Lateral Occ RIGHT',
    '16.  Anterior Cing RIGHT',
    '17.  Middle Cing RIGHT',
    '18.  Posterior Cing RIGHT',
    '19.  Ant Insula  RIGHT',
    '20.  Post Insula RIGHT',
]

# 'Genetic Group' is dropped because 'MutationType' now carries that
# information; the remaining names are other unused columns.
other_unused_cols = [
    'Genetic Group',
    'Blinded Family',
    'Date of assessment',
    'Handedness',
    'Employment',
    'Ethnicity',
]

unneeded_columns = (imaging_extras + left_numbered_cols
                    + right_numbered_cols + other_unused_cols)
df_combined = df_combined.drop(columns=unneeded_columns)

Keep only images that passed QC for GIF

In [103]:
# Retain only the rows whose 'QC_include in GIF' flag equals 1.
passed_gif_qc = df_combined['QC_include in GIF'].eq(1)
df_combined = df_combined[passed_gif_qc]

Keep only one timepoint

In [104]:
# One row per participant: the first row encountered for each 'Blinded Code'
# (in the current row order) is kept, all later rows for that code are dropped.
df_xsec = df_combined.drop_duplicates(subset=['Blinded Code'], keep='first')

Find all of the sites, sort them, and assign each a new identifying number to use instead of the official GENFI one.

In [105]:
# Build a lookup table from the demographics-side site value to a new number:
# take the distinct site values, sort them, and use each value's position in
# the sorted order (starting at 0) as its number. The result is a one-column
# frame indexed by 'Blinded Site_y', with the number in a column named 'index'.
site_list = (
    df_xsec['Blinded Site_y']
    .drop_duplicates()
    .sort_values(ignore_index=True)
    .reset_index()
    .set_index('Blinded Site_y')
)

In [106]:
# Attach the new site number to every participant row by looking up the row's
# 'Blinded Site_y' value in the index of site_list; a left join keeps all rows.
df_xsec = df_xsec.merge(
    site_list,
    how='left',
    left_on='Blinded Site_y',
    right_index=True,
)

In [107]:
# Give the output columns their final names; 'index' is the site number that
# came from site_list.
column_renames = {
    'index': 'Site',
    'Gender': 'Sex',
    'Genetic status 2': 'Group',
    'Age at visit': 'Age',
}
df_xsec = df_xsec.rename(columns=column_renames)

Finally, replace the blinded code with a new randomly assigned code and re-sort on that random code.

In [108]:
# The integers 1..N (N = number of participant rows), shuffled in place into a
# random order. No seed is set, so the order differs on every run.
n_participants = len(df_xsec.index)
new_id = np.arange(1, n_participants + 1)
np.random.shuffle(new_id)

In [109]:
# Store the shuffled numbers as the 'New Code' column, one per row in current
# row order.
df_xsec = df_xsec.assign(**{'New Code': new_id})

In [110]:
def _format_subject_id(code):
    """Return the public ID for a numeric code: 'GENFI' plus the code zero-padded to 3 digits."""
    return 'GENFI{:03d}'.format(code)


# Derive the string ID from the random numeric code.
df_xsec['ID'] = df_xsec['New Code'].map(_format_subject_id)

In [111]:
# Columns written to the final spreadsheet, in output order. Everything else
# (including the blinded code, the original site value and 'New Code') is left out.
id_and_covariates = [
    'ID', 'Site', 'Scanner', 'Age', 'Sex',
    'Education', 'EYO', 'Group', 'MutationType', 'TIV',
]
structure_columns = [
    'Right Accumbens Area', 'Left Accumbens Area',
    'Right Amygdala', 'Left Amygdala',
    'Pons', 'Brain Stem',
    'Right Caudate', 'Left Caudate',
    'Right Hippocampus', 'Left Hippocampus',
    'Right Pallidum', 'Left Pallidum',
    'Right Putamen', 'Left Putamen',
    'Right Thalamus Proper', 'Left Thalamus Proper',
    'Total_Brain',
]
lobe_columns = [
    'Frontal lobe volume', 'Temporal lobe volume',
    'Parietal lobe volume', 'Occipital lobe volume',
    'Cingulate volume', 'Insula volume',
    'Left Frontal lobe volume', 'Right Frontal lobe volume',
    'Left Temporal lobe volume', 'Right Temporal lobe volume',
    'Left Parietal lobe volume', 'Right Parietal lobe volume',
    'Left Occipital lobe volume', 'Right Occipital lobe volume',
    'Left Cingulate volume', 'Right Cingulate volume',
    'Left Insula volume', 'Right Insula volume',
    'Total Cerebellum',
]
output_columns = id_and_covariates + structure_columns + lobe_columns

# Select those columns, then make 'ID' the row index (removing it as a column).
df_xsec = df_xsec.loc[:, output_columns].set_index('ID')

In [112]:
# Order the rows by their index (the 'ID' strings), ascending.
df_xsec = df_xsec.sort_index(axis='index', ascending=True)

In [113]:
# Divide TIV by 1000 (the source column was named 'TIV mm3', so this presumably
# converts mm^3 to cm^3 -- confirm), then round every numeric column to one
# decimal place. Re-running this cell would divide TIV again.
tiv_scaled = df_xsec['TIV'] / 1000
df_xsec = df_xsec.assign(TIV=tiv_scaled).round(1)


In [114]:
# Write the final table (indexed by ID) next to the input spreadsheets.
output_path = os.path.join(data_dir, 'GENFI_DEMON_SPREADSHEET.xlsx')
df_xsec.to_excel(output_path)