# LOAD GENETICS

In [None]:
## GENETICS

### Load files

# Variant table for the Paris cohort (tab-separated, UTF-8).
Paris_var = pd.read_csv(
    os.path.join(
        data_dir,
        'Paris-set0-15.slivar-bed.HC-NDD-dom.pp.pext_LCR_HighQualRareLofMPC2CADD28.3.tsv',
    ),
    sep='\t',
    encoding='utf-8',
)

# WGS table for the Paris cohort (tab-separated).
Paris_WGS = pd.read_csv(
    os.path.join(data_dir, 'Freeze_WGS_set0to15_pedFile_IRM_SubsetForIRManalysis.txt'),
    sep='\t',
)

# WGS table for the EU-AIMS cohort; read with pandas' default separator.
AIMS_WGS = pd.read_csv(os.path.join(data_dir, 'pedigree_LEAP.txt'))

# Variant table for the EU-AIMS cohort (tab-separated, UTF-8).
AIMS_var = pd.read_csv(
    os.path.join(
        data_dir,
        'EUAIMS-set1-5.slivar-bed.HC-NDD-dom.pp.pext_LCR_HighQualRareLofMPC2CADD28.3.tsv',
    ),
    sep='\t',
    encoding='utf-8',
)

# WGS rows of both cohorts stacked together:
#   - EU-AIMS rows whose PSC2_participant_id appears in df_all.participant_id
#   - Paris rows selected by indexing with the Structural_MRI column
#     (presumably a boolean flag - TODO confirm)
_aims_wgs_in_mri = AIMS_WGS[AIMS_WGS.PSC2_participant_id.isin(df_all.participant_id)]
_paris_wgs_with_mri = Paris_WGS[Paris_WGS.Structural_MRI]
WGS_both = pd.concat([_aims_wgs_in_mri, _paris_wgs_with_mri])

# PGS quantile table covering the Paris and LEAP samples (tab-separated).
PGS_quantile = pd.read_csv(
    os.path.join(data_dir, 'Paris-LEAP.PGS.quantiles.tsv'),
    sep='\t',
)

### Filters

# Keep only the EU-AIMS WGS rows whose Ancestry value is exactly 'Europe'.
# NOTE(review): WGS_both is assembled from AIMS_WGS before this filter runs,
# so it is not restricted by ancestry - confirm that this is intended.
AIMS_WGS = AIMS_WGS[AIMS_WGS['Ancestry'] == 'Europe']

### Merge DataFrames IRM with WGS

# Derive an integer participant_id for Paris_WGS from BIDS_participant_ID.
#
# The ID is the second '_'-separated field of the BIDS label. Rows with a
# missing label receive the sentinel -1.
#
# Missing values are detected with pd.isna rather than `is not np.nan`: the
# identity test only recognises the np.nan singleton and lets None, pd.NA or
# any other NaN object fall through to the string split, where it either
# raises IndexError or produces a bogus ID.
# A label without '_' still raises IndexError, so malformed IDs are not
# silently accepted.
participant_id_l = [
    -1 if pd.isna(bids_id) else str(bids_id).split('_')[1]
    for bids_id in Paris_WGS.BIDS_participant_ID
]
Paris_WGS['participant_id'] = participant_id_l
Paris_WGS['participant_id'] = Paris_WGS['participant_id'].astype(int)

# Keep only the Paris WGS rows whose participant_id is present in df_all
# (presumably this also drops the -1 sentinel rows - verify that df_all never
# contains -1).
Paris_WGS = Paris_WGS[Paris_WGS.participant_id.isin(df_all.participant_id)]


# Rename the EU-AIMS identifier columns to the names the later merges key on
# ('participant_id' and 'Barcode_curated').
_aims_column_names = {
    'PSC2_participant_id': 'participant_id',
    'barcode': 'Barcode_curated',
}
AIMS_WGS = AIMS_WGS.rename(columns=_aims_column_names)

# Keep only the EU-AIMS WGS rows whose participant_id is present in df_all.
AIMS_WGS = AIMS_WGS.loc[AIMS_WGS['participant_id'].isin(df_all['participant_id'])]

# MRI + WGS, EU-AIMS: inner join on participant_id, WGS columns first.
IRM_WGS_AIMS = pd.merge(AIMS_WGS, df_all, on='participant_id')

# MRI + WGS, Paris: inner join on participant_id, df_all columns first.
IRM_WGS_paris = pd.merge(df_all, Paris_WGS, on='participant_id')

# MRI + WGS for both cohorts, EU-AIMS rows followed by Paris rows.
# The row index is not reset, so index labels can repeat across the cohorts.
df_all_IRM_genet = pd.concat([IRM_WGS_AIMS, IRM_WGS_paris])

## Count variants in IRM+WGS

# Stack the variant tables of both cohorts and count variant rows per sample.
var_both = pd.concat([Paris_var, AIMS_var])
count_sampleId = var_both.groupby('sample_id').size().to_frame('Count_sampleId').reset_index()

# Nvariants = number of rows in var_both whose sample_id equals the
# participant's Barcode_curated, or 0 when the barcode has no variant row.
#
# This is a single vectorised lookup through count_sampleId. It replaces a
# per-row loop that rebuilt list(var_both.sample_id) and rescanned both frames
# on every iteration (quadratic), and that never created the column when
# df_all_IRM_genet was empty.
_variant_counts = count_sampleId.set_index('sample_id')['Count_sampleId']
_barcodes = df_all_IRM_genet['Barcode_curated']
df_all_IRM_genet['Nvariants'] = (
    _barcodes.map(_variant_counts)
    .fillna(0)
    # A missing barcode cannot be matched to any sample: leave it as NaN
    # instead of reporting a count of 0.
    .where(_barcodes.notna())
    # Always float, whether or not any NaN is present.
    .astype(float)
    # Assign by position: the concatenated frame can carry repeated index
    # labels, so label alignment must be avoided.
    .to_numpy()
)
         


### Merge DataFrames IRM + WGS + Variants


# The variant tables identify samples by 'sample_id'; rename it so they can be
# joined to the MRI+WGS tables on 'Barcode_curated'.
_sample_to_barcode = {'sample_id': 'Barcode_curated'}

# Variants + MRI/WGS, Paris. This is a LEFT join: every row of IRM_WGS_paris
# is kept, and participants without any variant get NaN in the variant
# columns. A participant with several variants yields one row per variant.
# NOTE(review): if only participants with at least one variant are wanted,
# this needs how='inner' - confirm the intent.
Paris_var_WGS = pd.merge(
    IRM_WGS_paris,
    Paris_var.rename(columns=_sample_to_barcode),
    how='left',
    on='Barcode_curated',
)

# Variants + MRI/WGS, EU-AIMS. Same LEFT join as for Paris.
AIMS_var_WGS = pd.merge(
    IRM_WGS_AIMS,
    AIMS_var.rename(columns=_sample_to_barcode),
    how='left',
    on='Barcode_curated',
)

# Both cohorts stacked, Paris rows first.
all_IRM_var_WGS = pd.concat([Paris_var_WGS, AIMS_var_WGS])

# Inner join of the MRI+WGS table with the per-variant table on the barcode.
# Column names present on both sides keep their name for the left table and
# get the '_y' suffix for the right one.
df_all_IRM_WGS_var = pd.merge(
    df_all_IRM_genet,
    all_IRM_var_WGS,
    on='Barcode_curated',
    suffixes=('', '_y'),
)

# Bare expression: shows the table when run as the last line of a notebook cell.
df_all_IRM_WGS_var

### Merge IRM + WGS + PGS

# PGS + MRI/WGS: rename the PGS sample column 'IID' to 'Barcode_curated' and
# inner-join on it, PGS columns first. Column names present on both sides keep
# their name for the PGS table and get the '_y' suffix for df_all_IRM_genet.
df_IRM_WGS_PGS = pd.merge(
    PGS_quantile.rename(columns={'IID': 'Barcode_curated'}),
    df_all_IRM_genet,
    on='Barcode_curated',
    suffixes=('', '_y'),
)