# 1 - Genome Wide Association Study

In [2]:
import math
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats.mstats import winsorize

## Introduction

During this step of the workflow we will compute the SNP-PD associations, using imaging information to improve the phenotypic associations.

During the Individual View step, we will focus on finding candidate SNP biomarkers by integrating genetic and neuroimaging data. During this phase, we search for SNP-disease associations accounting for the phenotypic information carried by each imaging measure separately, and by integrating the single results obtained, in order to retrieve comprehensive potential SNP biomarkers for each imaging type considered. 

Thus, we will merge the results obtained using DaTSCAN features in a single results dataset, and those retrieved by using MRI data in another results summary.

To combine the results previously obtained, we use TATES (Van der Sluis *et al*., 2013), which combines the *P*-values in a single-trait-based statistical significance, by correcting for correlation among the imaging features, and evaluating if at least one of the analyzed traits could be associated with a SNP.

In [5]:
# Input data folders, given relative to the notebook's working directory
genotyping_path = "../../data/genotyping/"
imaging_path = "../../data/imaging/"
patdocs_path = "../../data/patient_docs/"
# Per-modality imaging folders nested under imaging_path
datscan_path = os.path.join(imaging_path, "DaTSCAN")
mri_path = os.path.join(imaging_path, "MRI")

## GWAS with DaTSCAN features

### Exploring normalized DaTSCAN data

During the previous steps of the workflow we normalized the DaTSCAN data with rank based inverse normal transformation (r-INT).

As previously discussed, we normalized the DaTSCAN data because the measurements fell in different ranges of values. To normalize our data we applied r-INT, a normalization procedure that is less susceptible to outlier values.

In [4]:
# Read the normalized DaTSCAN table stored under datscan_path and preview it
datscan = pd.read_csv(os.path.join(datscan_path, "DATScan_Analysis_eu_fv_norm.csv"))
datscan.head(10)

Unnamed: 0,PATNO,EVENT_ID,SCAN_DATE,CAUDATE_R,CAUDATE_L,PUTAMEN_R,PUTAMEN_L
0,3000,SC,2011-01-20,0.932467,1.368655,1.786562,1.448689
1,3001,U01,2011-06-23,-0.313764,-0.507784,-0.313764,-0.648172
2,3002,U01,2011-06-28,0.835925,1.656152,0.032656,0.696547
3,3004,U01,2011-08-24,2.971827,2.971827,2.383049,1.882275
4,3006,SC,2011-08-02,0.038596,-0.188099,-0.408846,-2.499599
5,3008,SC,2011-08-30,1.957186,1.407545,0.941676,1.023443
6,3011,SC,2011-07-07,1.600299,1.848464,2.095619,1.611072
7,3012,SC,2011-10-11,-0.221442,-0.276558,-2.02157,-0.434802
8,3016,SC,2012-01-17,1.510772,1.558962,1.611072,1.262883
9,3018,SC,2012-02-29,-0.360897,-1.018445,-0.823362,-1.730834


### Constructing Phenotype and Covariate files

Before finding statistically significant SNP-phenotype associations with PLINK we need to define two files which are used by the tool to compute the linear model, used during SNP-trait associations:

- phenotype file

- covariate file

The phenotype file contains the phenotypic information we want to add to the model, in our study we add subjects enrolment category (HC or PD) and the imaging feature values.

The covariate file contains all the confounders used to adjust the linear model. In our study we used as covariates the subjects' age, gender and the first 10 principal components of the SNP relatedness matrix.

Let's begin by creating the phenotype file.

In [23]:
# Common prefix of the per-feature phenotype files written further below
pheno_fn = os.path.join(genotyping_path, "phenotype")

# Subjects' enrolment status table
status = pd.read_csv(os.path.join(patdocs_path, "Patient_Status.csv"))

# PLINK .fam file (space separated, no header row) listing the subjects
# considered in our study
eu_pats_ds = pd.read_csv(
    os.path.join(genotyping_path, "PPMI_eu_woswedd_ds.fam"), sep=" ", header=None
)

# Keep only the status rows whose PATNO appears in the first .fam column and
# renumber the remaining rows from 0
status_eu_ds = status[status["PATNO"].isin(eu_pats_ds.iloc[:, 0])].reset_index(drop=True)
status_eu_ds.head(10)

Unnamed: 0,PATNO,RECRUITMENT_CAT,IMAGING_CAT,ENROLL_DATE,ENROLL_CAT,ENROLL_STATUS,DESCRP_CAT,STATUS_DATE
0,3000,HC,HC,02/2011,HC,Enrolled,,02/2011
1,3001,PD,PD,03/2011,PD,Enrolled,,03/2011
2,3002,PD,PD,03/2011,PD,Enrolled,,03/2011
3,3004,HC,HC,04/2011,HC,Enrolled,,04/2011
4,3006,PD,PD,05/2011,PD,Withdrew,,10/2013
5,3008,HC,HC,06/2011,HC,Enrolled,,06/2011
6,3011,HC,HC,07/2011,HC,Withdrew,,07/2014
7,3012,PD,PD,11/2011,PD,Enrolled,,11/2011
8,3016,HC,HC,02/2012,HC,Enrolled,,02/2012
9,3018,PD,PD,04/2012,PD,Enrolled,,04/2012


In [41]:
# Join enrolment status and DaTSCAN values on the subject ID, keeping only the
# ID, the enrolment category and the four uptake features
phenotype = status_eu_ds.merge(datscan, on="PATNO").loc[
    :, ["PATNO", "ENROLL_CAT", "CAUDATE_R", "CAUDATE_L", "PUTAMEN_R", "PUTAMEN_L"]
]
phenotype.head(10)

Unnamed: 0,PATNO,ENROLL_CAT,CAUDATE_R,CAUDATE_L,PUTAMEN_R,PUTAMEN_L
0,3000,HC,0.932467,1.368655,1.786562,1.448689
1,3001,PD,-0.313764,-0.507784,-0.313764,-0.648172
2,3002,PD,0.835925,1.656152,0.032656,0.696547
3,3004,HC,2.971827,2.971827,2.383049,1.882275
4,3006,PD,0.038596,-0.188099,-0.408846,-2.499599
5,3008,HC,1.957186,1.407545,0.941676,1.023443
6,3011,HC,1.600299,1.848464,2.095619,1.611072
7,3012,PD,-0.221442,-0.276558,-2.02157,-0.434802
8,3016,HC,1.510772,1.558962,1.611072,1.262883
9,3018,PD,-0.360897,-1.018445,-0.823362,-1.730834


We can now write the phenotype files. For each imaging feature we create a TXT file containing the subjects' FID, IID, enrollment category and the DaTSCAN uptake values for the considered feature.

In [43]:
# One phenotype file per DaTSCAN feature, with columns FID, IID, enrolment
# category and feature value. PATNO is selected twice because it is used as
# both FID and IID.
datscan_features = ["CAUDATE_R", "CAUDATE_L", "PUTAMEN_R", "PUTAMEN_L"]
for feature in datscan_features:
    phenotype.loc[:, ["PATNO", "PATNO", "ENROLL_CAT", feature]].to_csv(
        f"{pheno_fn}_{feature}.txt",
        header=False,
        index=False,
        sep=" ",
        # Write missing values as an explicit token: an empty field would
        # shift the columns of a space-delimited file
        na_rep="NA",
    )

Let's now create the covariate file.

In [51]:
# Screening/demographics table
demo = pd.read_csv(os.path.join(patdocs_path, "Screening___Demographics.csv"))
# Restrict to the subjects listed in the .fam file, order them by PATNO and
# renumber the rows from 0
demo = (
    demo[demo["PATNO"].isin(eu_pats_ds.iloc[:, 0])]
    .sort_values("PATNO")
    .reset_index(drop=True)
)
demo.head(10)

Unnamed: 0,REC_ID,F_STATUS,PATNO,EVENT_ID,PAG_NAME,SIGNCNST,CONSNTDT,APPRDX,CURRENT_APPRDX,P3GRP,...,PRJENRDT,REFERRAL,DECLINED,RSNDEC,EXCLUDED,RSNEXC,ORIG_ENTRY,LAST_UPDATE,QUERY,SITE_APRV
0,269535301,V,3000,CONSENT,SCREEN,1.0,01/2011,2.0,2.0,,...,02/2011,31.0,,,,,01/2011,2011-01-19 15:44:58.0,,01/2011
1,274783501,V,3001,CONSENT,SCREEN,1.0,02/2011,1.0,1.0,,...,03/2011,60.0,,,,,02/2011,2011-02-10 13:43:30.0,,03/2011
2,278717701,S,3002,CONSENT,SCREEN,1.0,03/2011,1.0,1.0,,...,03/2011,1.0,,,,,03/2011,2020-04-01 17:33:42.0,,03/2011
3,281159801,V,3004,CONSENT,SCREEN,1.0,03/2011,2.0,2.0,,...,04/2011,99.0,,,,,03/2011,2011-03-30 14:43:13.0,,03/2011
4,283722401,S,3006,CONSENT,SCREEN,1.0,03/2011,1.0,1.0,,...,04/2011,1.0,,,,,03/2011,2020-04-22 09:14:39.0,,03/2011
5,289807201,V,3008,CONSENT,SCREEN,1.0,05/2011,2.0,2.0,,...,06/2011,4.0,,,,,05/2011,2013-10-31 14:28:09.0,,05/2011
6,302607101,V,3011,CONSENT,SCREEN,1.0,06/2011,2.0,2.0,,...,07/2011,1.0,,,,,06/2011,2011-07-22 16:39:28.0,,06/2011
7,316309601,V,3012,CONSENT,SCREEN,1.0,10/2011,1.0,1.0,,...,11/2011,1.0,,,,,10/2011,2011-10-12 13:48:43.0,,10/2011
8,329167601,V,3016,CONSENT,SCREEN,1.0,01/2012,2.0,2.0,,...,02/2012,4.0,,,,,01/2012,2012-01-12 13:19:24.0,,01/2012
9,336629801,V,3018,CONSENT,SCREEN,1.0,02/2012,1.0,1.0,,...,03/2012,60.0,,,,,03/2012,2012-05-17 12:12:53.0,,03/2012


Let's now compute the subjects age at the DaTSCAN visit time.

In [59]:
def compute_age(df):
    """Return the scan year minus the birth year for one subject row.

    Parameters
    ----------
    df : mapping-like row (e.g. a pandas Series)
        Must provide ``SCAN_DATE`` as a string whose first ``-``-separated
        field is the year, and ``BIRTHDT`` as a value convertible to ``int``.

    Returns
    -------
    int
        Difference between the two years; months and days are ignored.
    """
    year_of_scan, _, _ = df["SCAN_DATE"].partition("-")
    return int(year_of_scan) - int(df["BIRTHDT"])

# Attach each subject's scan date to the demographics
x = demo.merge(datscan, on=["PATNO"])
# Key the computed ages by PATNO instead of relying on the row positions of
# the merged frame: an inner merge drops subjects without a scan and repeats
# subjects with several scans, which would silently shift ages onto the
# wrong rows of `demo`. For repeated subjects the first merged row is kept.
age_by_patno = (
    x.assign(AGE=x.apply(compute_age, axis=1))
    .drop_duplicates(subset="PATNO")
    .set_index("PATNO")["AGE"]
)
# Subjects without any scan get a missing AGE rather than someone else's age
demo["AGE"] = demo["PATNO"].map(age_by_patno)
demo.loc[:, ["PATNO", "AGE"]].head(n=10)  # display subjects age

Unnamed: 0,PATNO,AGE
0,3000,70
1,3001,65
2,3002,68
3,3004,60
4,3006,58
5,3008,82
6,3011,32
7,3012,58
8,3016,58
9,3018,61


Since we removed three subjects from those of step 2 of preprocessing phase we must recompute the PCs of the relatedness matrix in order to have consistent data with our current dataset.

As we did before, to compute the first 20 PCs we use PLINK PCA.

In [18]:
# Common prefix of the PLINK fileset of the subjects considered in our study
ppmi_eu_woswedd_ds_fn = os.path.join(genotyping_path, "PPMI_eu_woswedd_ds")

# Compute the first 20 principal components; --out reuses the input prefix, so
# the results are written next to the input fileset
!plink --bfile {ppmi_eu_woswedd_ds_fn} --pca 20 --out {ppmi_eu_woswedd_ds_fn}

PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to ../../data/genotyping/PPMI_eu_woswedd_ds.log.
Options in effect:
  --bfile ../../data/genotyping/PPMI_eu_woswedd_ds
  --out ../../data/genotyping/PPMI_eu_woswedd_ds
  --pca 20

16384 MB RAM detected; reserving 8192 MB for main workspace.
128812 variants loaded from .bim file.
422 people (286 males, 136 females) loaded from .fam.
Using up to 15 threads (change this with --threads).
Before main variant filters, 422 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.996652.
128812 variants and 422 people pass filters and QC.
Note: No phenotypes present.
Relationship matrix calculation complete.


In [20]:
# PLINK eigenvector file: space separated, no header row
ppmi_pca = pd.read_csv(f"{ppmi_eu_woswedd_ds_fn}.eigenvec", sep=" ", header=None)

# Name the columns: FID, IID, then PC1 ... PC20
cnames = ["FID", "IID"] + [f"PC{i}" for i in range(1, 21)]
ppmi_pca.columns = cnames
ppmi_pca.head(10)

Unnamed: 0,FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
0,3000,3000,0.023104,0.001445,0.038672,0.065804,-0.007233,-0.032171,0.041329,0.020136,...,0.006711,0.038946,0.035519,0.011342,0.0455,0.09911,-0.092471,-0.061807,0.035368,0.053554
1,3001,3001,0.01577,0.014693,-0.003507,-0.048157,-0.043188,-0.063182,0.035549,-0.039994,...,0.044206,-0.014498,-0.015049,-0.024854,-0.049211,0.032961,-0.047794,0.025384,-0.064395,-0.009126
2,3002,3002,-0.082304,-0.035871,-0.021713,0.012172,0.028592,0.001123,-0.01088,-0.00113,...,-0.027085,0.010237,-0.010473,-0.079551,0.055008,0.058719,0.037635,0.031731,-0.082463,-0.020639
3,3004,3004,0.023939,0.032296,-0.007026,-0.002677,-0.049808,-0.047968,0.072133,0.032667,...,-0.040209,0.027052,0.050784,0.055534,0.093572,0.027401,0.033852,-0.096813,-0.046014,-0.011014
4,3006,3006,0.004569,0.059148,0.101874,-0.011916,-0.001887,-0.047472,-0.001574,-0.023827,...,-0.004358,0.020863,0.022857,-0.004821,-0.017148,0.013909,-0.035708,-0.088609,-0.034012,-0.001674
5,3008,3008,0.013419,-0.003194,-0.049605,0.04923,-0.048325,0.045633,-0.042648,-0.003362,...,-0.000836,-0.126883,-0.051652,0.123715,-0.005408,-0.006208,-0.008207,-0.028819,-0.04982,-0.130877
6,3011,3011,-0.045717,0.02436,0.029712,0.028696,0.027011,-0.002224,-0.007908,0.000338,...,-0.01415,-0.027093,-0.024393,0.062394,-0.010077,-0.001595,-0.008321,-0.038271,0.0273,-0.005139
7,3012,3012,-0.006437,-0.015327,0.014593,-0.004998,-0.07254,0.063095,-0.023632,0.02453,...,0.014611,-0.027948,-0.02946,0.006143,-0.015118,-0.03922,0.006339,0.019902,0.009799,0.032107
8,3016,3016,0.039748,0.001799,-0.08467,0.02899,0.043786,-0.000456,-0.016404,-0.028868,...,0.089533,-0.086117,-0.060762,-0.038292,0.039845,-0.07336,0.022928,0.034004,-0.010412,-0.056705
9,3018,3018,0.016829,0.001896,0.067707,-0.08017,0.063767,0.062882,0.020254,-0.058311,...,0.016343,0.060883,-0.033268,0.099699,-0.080082,-0.032918,0.050229,-0.007243,-0.075981,0.017879


Let's now merge PC values to demographic data. 

In [65]:
covariate_fn = os.path.join(genotyping_path, "covariate.txt")

# Join the principal components with the demographics of the same subject
covariate = ppmi_pca.merge(demo, left_on=["IID"], right_on=["PATNO"])
# solve 0s in demographics: take the sex code from the .fam file instead.
# In a header-less .fam frame column 1 is the IID and column 4 the sex code.
# The lookup is keyed on IID rather than on row position, so a subject dropped
# by the merge above cannot shift the codes onto the wrong rows.
sex_by_iid = eu_pats_ds.drop_duplicates(subset=1).set_index(1)[4]
covariate["GENDER"] = covariate["IID"].map(sex_by_iid)
covariate.head(n=10)

Unnamed: 0,FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,REFERRAL,DECLINED,RSNDEC,EXCLUDED,RSNEXC,ORIG_ENTRY,LAST_UPDATE,QUERY,SITE_APRV,AGE
0,3000,3000,0.023104,0.001445,0.038672,0.065804,-0.007233,-0.032171,0.041329,0.020136,...,31.0,,,,,01/2011,2011-01-19 15:44:58.0,,01/2011,70
1,3001,3001,0.01577,0.014693,-0.003507,-0.048157,-0.043188,-0.063182,0.035549,-0.039994,...,60.0,,,,,02/2011,2011-02-10 13:43:30.0,,03/2011,65
2,3002,3002,-0.082304,-0.035871,-0.021713,0.012172,0.028592,0.001123,-0.01088,-0.00113,...,1.0,,,,,03/2011,2020-04-01 17:33:42.0,,03/2011,68
3,3004,3004,0.023939,0.032296,-0.007026,-0.002677,-0.049808,-0.047968,0.072133,0.032667,...,99.0,,,,,03/2011,2011-03-30 14:43:13.0,,03/2011,60
4,3006,3006,0.004569,0.059148,0.101874,-0.011916,-0.001887,-0.047472,-0.001574,-0.023827,...,1.0,,,,,03/2011,2020-04-22 09:14:39.0,,03/2011,58
5,3008,3008,0.013419,-0.003194,-0.049605,0.04923,-0.048325,0.045633,-0.042648,-0.003362,...,4.0,,,,,05/2011,2013-10-31 14:28:09.0,,05/2011,82
6,3011,3011,-0.045717,0.02436,0.029712,0.028696,0.027011,-0.002224,-0.007908,0.000338,...,1.0,,,,,06/2011,2011-07-22 16:39:28.0,,06/2011,32
7,3012,3012,-0.006437,-0.015327,0.014593,-0.004998,-0.07254,0.063095,-0.023632,0.02453,...,1.0,,,,,10/2011,2011-10-12 13:48:43.0,,10/2011,58
8,3016,3016,0.039748,0.001799,-0.08467,0.02899,0.043786,-0.000456,-0.016404,-0.028868,...,4.0,,,,,01/2012,2012-01-12 13:19:24.0,,01/2012,58
9,3018,3018,0.016829,0.001896,0.067707,-0.08017,0.063767,0.062882,0.020254,-0.058311,...,60.0,,,,,03/2012,2012-05-17 12:12:53.0,,03/2012,61


And store the first 10 PCs and subjects age and gender in the covariate file, named ```covariate.txt```

In [66]:
# Columns of the covariate file: subject IDs, first 10 PCs, age and gender
cnames = ["FID", "IID"] + [f"PC{i}" for i in range(1, 11)] + ["AGE", "GENDER"]
covariate.loc[:, cnames].to_csv(
    covariate_fn,
    index=False,
    sep="\t",
    # Write missing values as "NA" (the token the later cells of this notebook
    # also use) instead of an empty field, which would leave a gap in the row
    na_rep="NA",
)

### Searching SNP-phenotype associations

In [48]:
# Demographics of all screened subjects
ppmi_info = pd.read_csv('../PPMI_data/Info files/Screening___Demographics.csv')

# PLINK eigenvectors: FID, IID, then PC1 ... PC10
pca = pd.read_table('Data/QC/White_no_swedd.eigenvec', sep=' ', header=None)
cnames = ['FID', 'IID'] + ["PC" + str(i) for i in range(1, 11)]
pca.columns = cnames

info_pca = pca.merge(ppmi_info, left_on=['FID'], right_on=['PATNO'])

# Translate the diagnosis code into a label (1 -> "PD", 2 -> "HC").
# A per-row mapping keeps the result aligned with the frame: any other or
# missing code becomes a missing label, whereas appending to a list only for
# codes 1 and 2 produced a shorter list and a failing column assignment.
info_pca['APPRDX2'] = info_pca['APPRDX'].map({1: "PD", 2: "HC"})
info_pca

Unnamed: 0,FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,REFERRAL,DECLINED,RSNDEC,EXCLUDED,RSNEXC,ORIG_ENTRY,LAST_UPDATE,QUERY,SITE_APRV,APPRDX2
0,3000,3000,0.026492,0.000678,0.035338,0.064825,0.000766,-0.041875,0.033775,0.013332,...,31.0,,,,,01/2011,2011-01-19 12:44:58.0,,01/2011,HC
1,3001,3001,0.017766,0.017305,0.000688,-0.045234,-0.037821,-0.066630,0.010062,-0.054224,...,60.0,,,,,02/2011,2011-02-10 10:43:30.0,,03/2011,PD
2,3002,3002,-0.084263,-0.034856,-0.023204,0.006116,0.027577,0.008214,-0.006264,0.008593,...,1.0,,,,,03/2011,2011-03-04 11:04:58.0,,03/2011,PD
3,3004,3004,0.024563,0.033477,-0.003634,-0.005703,-0.042231,-0.059694,0.067412,0.017923,...,99.0,,,,,03/2011,2011-03-30 11:43:13.0,,03/2011,HC
4,3006,3006,0.008511,0.056539,0.101538,0.001351,0.006814,-0.047180,-0.012988,-0.033577,...,1.0,,,,,03/2011,2011-03-31 07:21:03.0,,03/2011,PD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,4124,4124,0.008077,0.029449,0.040843,-0.013005,0.074609,0.068977,0.010669,0.081488,...,1.0,,,,,02/2013,2013-02-22 12:57:46.0,,02/2013,PD
432,4125,4125,-0.019947,0.029816,-0.017732,-0.017475,-0.082603,0.015329,0.025788,0.053534,...,1.0,,,,,02/2013,2013-12-10 12:36:00.0,,05/2013,PD
433,4126,4126,-0.009423,0.001673,-0.037664,0.060811,-0.006292,0.034639,0.107867,0.040532,...,1.0,,,,,03/2013,2013-03-27 08:35:42.0,,03/2013,PD
434,4136,4136,0.032763,-0.016046,-0.064461,-0.015332,0.030023,0.123020,0.032174,0.084627,...,99.0,,,,,03/2013,2013-03-06 07:09:02.0,,06/2013,PD


In [49]:
# Attach the imaging features to the PCs + demographics of each subject.
# NOTE(review): `images` is not defined in any cell visible in this notebook,
# so this cell fails on a fresh kernel -- TODO confirm where it is loaded
# (judging by the output it carries PATNO and the four DaTSCAN features).
info_pheno = info_pca.merge(images, left_on=['FID'], right_on=['PATNO'])
info_pheno

Unnamed: 0,FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,ORIG_ENTRY,LAST_UPDATE,QUERY,SITE_APRV,APPRDX2,PATNO_y,CAUDATE_R,CAUDATE_L,PUTAMEN_R,PUTAMEN_L
0,3000,3000,0.026492,0.000678,0.035338,0.064825,0.000766,-0.041875,0.033775,0.013332,...,01/2011,2011-01-19 12:44:58.0,,01/2011,HC,3000,2.99,3.43,2.328620,1.973532
1,3001,3001,0.017766,0.017305,0.000688,-0.045234,-0.037821,-0.066630,0.010062,-0.054224,...,02/2011,2011-02-10 10:43:30.0,,03/2011,PD,3001,1.56,1.12,-0.246956,-0.796382
2,3002,3002,-0.084263,-0.034856,-0.023204,0.006116,0.027577,0.008214,-0.006264,0.008593,...,03/2011,2011-03-04 11:04:58.0,,03/2011,PD,3002,2.13,2.28,0.255480,0.339528
3,3004,3004,0.024563,0.033477,-0.003634,-0.005703,-0.042231,-0.059694,0.067412,0.017923,...,03/2011,2011-03-30 11:43:13.0,,03/2011,HC,3004,5.09,5.30,2.949289,2.400680
4,3006,3006,0.008511,0.056539,0.101538,0.001351,0.006814,-0.047180,-0.012988,-0.033577,...,03/2011,2011-03-31 07:21:03.0,,03/2011,PD,3006,2.28,2.12,0.175997,-2.400680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,4124,4124,0.008077,0.029449,0.040843,-0.013005,0.074609,0.068977,0.010669,0.081488,...,02/2013,2013-02-22 12:57:46.0,,02/2013,PD,4124,1.95,1.59,-0.273727,-0.053985
429,4125,4125,-0.019947,0.029816,-0.017732,-0.017475,-0.082603,0.015329,0.025788,0.053534,...,02/2013,2013-12-10 12:36:00.0,,05/2013,PD,4125,2.03,2.47,0.391926,1.005487
430,4126,4126,-0.009423,0.001673,-0.037664,0.060811,-0.006292,0.034639,0.107867,0.040532,...,03/2013,2013-03-27 08:35:42.0,,03/2013,PD,4126,1.54,1.88,-0.219232,-1.158726
431,4136,4136,0.032763,-0.016046,-0.064461,-0.015332,0.030023,0.123020,0.032174,0.084627,...,03/2013,2013-03-06 07:09:02.0,,06/2013,PD,4136,0.68,0.57,-1.406477,-1.922162


Since there is a difference of 3 patients between our data and the DaTSCAN data, we remove these 3 subjects to have an accurate analysis.

In [None]:
# Write the first two columns (FID, IID) of the merged table as a
# space-separated, header-less list of subjects to keep
info_pheno.iloc[:,0:2].to_csv("Data/QC/IDs_White_no_swedd.txt", header=False, index=False, sep=" ")
# Restrict the fileset to those subjects and save it as White_2_no_swedd
!plink --bfile Data/QC/White_no_swedd --keep Data/QC/IDs_White_no_swedd.txt --make-bed --out Data/QC/White_2_no_swedd

**To run a GWAS analysis with PLINK we need to create two files: one with the phenotypic (pheno) information and one with the confounders (covariate).**

In [53]:
# Phenotype table: subject IDs, diagnosis label and the four DaTSCAN features,
# saved as a tab-separated file with a header row
pheno = info_pheno[
    ["FID", "IID", "APPRDX2", "CAUDATE_R", "CAUDATE_L", "PUTAMEN_R", "PUTAMEN_L"]
]
pheno.to_csv("GWAS2/Phenotype.txt", index=False, sep="\t")
pheno

Unnamed: 0,FID,IID,APPRDX2,CAUDATE_R,CAUDATE_L,PUTAMEN_R,PUTAMEN_L
0,3000,3000,HC,2.99,3.43,2.328620,1.973532
1,3001,3001,PD,1.56,1.12,-0.246956,-0.796382
2,3002,3002,PD,2.13,2.28,0.255480,0.339528
3,3004,3004,HC,5.09,5.30,2.949289,2.400680
4,3006,3006,PD,2.28,2.12,0.175997,-2.400680
...,...,...,...,...,...,...,...
428,4124,4124,PD,1.95,1.59,-0.273727,-0.053985
429,4125,4125,PD,2.03,2.47,0.391926,1.005487
430,4126,4126,PD,1.54,1.88,-0.219232,-1.158726
431,4136,4136,PD,0.68,0.57,-1.406477,-1.922162


In [54]:
# calculate the age from the year of birth: year of SITE_APRV ("MM/YYYY")
# minus BIRTHDT, computed on whole columns instead of one row at a time
aprv_year = info_pheno["SITE_APRV"].str.split("/").str[1].astype(int)
info_pheno["AGE"] = aprv_year - info_pheno["BIRTHDT"].astype(int)

# Covariate table: subject IDs, age and the first five PCs
cov = info_pheno[["FID", "IID", "AGE", "PC1", "PC2", "PC3", "PC4", "PC5"]]
cov.to_csv('GWAS2/Covariate.txt', sep="\t", index=False)
cov

Unnamed: 0,FID,IID,AGE,PC1,PC2,PC3,PC4,PC5
0,3000,3000,70,0.026492,0.000678,0.035338,0.064825,0.000766
1,3001,3001,65,0.017766,0.017305,0.000688,-0.045234,-0.037821
2,3002,3002,68,-0.084263,-0.034856,-0.023204,0.006116,0.027577
3,3004,3004,60,0.024563,0.033477,-0.003634,-0.005703,-0.042231
4,3006,3006,58,0.008511,0.056539,0.101538,0.001351,0.006814
...,...,...,...,...,...,...,...,...
428,4124,4124,71,0.008077,0.029449,0.040843,-0.013005,0.074609
429,4125,4125,64,-0.019947,0.029816,-0.017732,-0.017475,-0.082603
430,4126,4126,56,-0.009423,0.001673,-0.037664,0.060811,-0.006292
431,4136,4136,56,0.032763,-0.016046,-0.064461,-0.015332,0.030023


In [55]:
# Names of the phenotype columns at positions 3-6 (the imaging features)
col = pheno.columns[3:7].tolist()
col

['CAUDATE_R', 'CAUDATE_L', 'PUTAMEN_R', 'PUTAMEN_L']

Core part of the analysis: the following cell fits a linear model for each phenotype using PLINK. The same command can also be launched with *--all-pheno* to analyse every phenotype in the file in a single run.

In [None]:
for i in col:
    !plink --bfile DATA/QC/White_2_no_swedd --pheno GWAS2/Phenotype.txt --covar GWAS2/Covariate.txt --pheno-name {i} --covar-name AGE,PC1-PC5 --linear sex --adjust --out GWAS2/{i}

We select only the ADD (additive) test because it is the one relevant to a GWAS analysis.

In [57]:
# For each feature keep only the rows of the additive (ADD) test and save
# them as CSV
for i in col:
    # raw string: "\s" inside a normal string literal is an invalid escape
    gwas = pd.read_table("GWAS2/" + i + ".assoc.linear", sep=r"\s+")
    gwas = gwas[gwas.TEST == "ADD"]
    gwas.to_csv('GWAS2/GWAS_DATScan_' + i + '.csv', index=False)

The following script is used to generate the Manhattan plot and the Q-Q plot in a single image. You can edit the script to set your own paths and other settings. If you prefer, you can call R code inside this Python notebook using the cell magic **%%R** at the top of a cell, after loading the extension with **%load_ext rpy2.ipython**.

In [None]:
# Run the external R script that draws the Manhattan and Q-Q plots
!Rscript qqman.R

## GWAS using MRI data

We load the MRI information

In [None]:
# Load the merged PPMI table that carries the MRI measures
MRI = pd.read_csv("../Processed_Images/PPMIMERGE_20200419.csv")
# Collapse the table to one row per subject.
# NOTE(review): GroupBy.first() skips missing values column by column by
# default, so a subject's row may combine values coming from different
# original rows -- confirm this is intended (as opposed to taking each
# subject's first row as-is).
MRI_unique = MRI.groupby('PATNO').first().reset_index()

In [None]:
# PLINK eigenvectors of the filtered fileset: FID, IID, then PC1 ... PC10
pca = pd.read_table('Data/QC/White_2_no_swedd.eigenvec', sep=' ', header=None)
cnames = ['FID', 'IID'] + ["PC" + str(i) for i in range(1, 11)]
pca.columns = cnames

info_pca = pca.merge(ppmi_info, left_on=['FID'], right_on=['PATNO'])

# Translate the diagnosis code into a label (1 -> "PD", 2 -> "HC").
# A per-row mapping keeps the result aligned with the frame: any other or
# missing code becomes a missing label, whereas appending to a list only for
# codes 1 and 2 produced a shorter list and a failing column assignment.
info_pca['APPRDX2'] = info_pca['APPRDX'].map({1: "PD", 2: "HC"})

# Attach the per-subject MRI measures
info_pheno = info_pca.merge(MRI_unique, left_on=['FID'], right_on=['PATNO'])

# Age = year of SITE_APRV ("MM/YYYY") minus the birth year. BIRTHDT_x is the
# demographics copy of the column (suffixed by the merge above).
info_pheno["AGE"] = (
    info_pheno["SITE_APRV"].str.split("/").str[1].astype(int)
    - info_pheno["BIRTHDT_x"].astype(int)
)

In [None]:
# Phenotype table: subject IDs, diagnosis label and the four parahippocampal
# MRI features. An explicit copy is taken because the following cell modifies
# this table in place.
pheno = info_pheno[
    [
        "FID",
        "IID",
        "APPRDX2",
        "rh_parahippocampal_area",
        "lh_parahippocampal_area",
        "rh_parahippocampal_volume",
        "lh_parahippocampal_volume",
    ]
].copy()

**N.B.**
The phenotypes derived from the MRI data are quite sensitive to outliers, which can easily be produced during MRI processing. We want to convert the raw measures into something more robust (an alternative would be to exclude the affected subjects). A solution is to **winsorize** the data.

In [None]:
# Work on an independent copy so that the assignments below never act on a
# view of the frame `pheno` was sliced from
pheno = pheno.copy()

# Clip the lowest and highest 5% of each MRI feature (columns 3-6)
for i in range(3, 7):
    feature = pheno.columns[i]
    observed = pheno[feature].notna()
    if observed.any():
        # Winsorize the observed values only: with its default NaN policy
        # `winsorize` counts missing values among the extreme ones, so they
        # can be overwritten with a real value or spread to valid entries
        pheno.loc[observed, feature] = np.asarray(
            winsorize(pheno.loc[observed, feature].to_numpy(), limits=[0.05, 0.05])
        )

# Missing values are written as "NA" at export time, which keeps the numeric
# columns of `pheno` numeric in memory
pheno.to_csv('GWAS2/MRI/Phenotype_Without_SWEDD.txt', sep="\t", index=False, na_rep="NA")

In [None]:
# Covariate table for the MRI analysis: subject IDs, eTIV, age and the first
# five PCs, with missing values replaced by the string "NA"
cov = info_pheno[
    ["FID", "IID", "eTIV", "AGE", "PC1", "PC2", "PC3", "PC4", "PC5"]
].fillna("NA")
cov.to_csv("GWAS2/MRI/Covariate_Without_SWEDD.txt", index=False, sep="\t")

In [None]:
for i in col:
    !plink --bfile DATA/QC/White_2_no_swedd --pheno GWAS2/MRI/Phenotype_Without_SWEDD.txt --covar GWAS2/MRI/Covariate_Without_SWEDD.txt --pheno-name {i} --covar-name eTIV,AGE,PC1-PC5 --linear sex --adjust --out GWAS2/MRI/{i+"_Without_SWEDD"}

In [33]:
# Take the feature names from the MRI phenotype table (columns 3-6) rather
# than from whatever an earlier cell left in `col`
col = list(pheno.columns.values)[3:7]
# For each MRI feature keep only the rows of the additive (ADD) test and save
# them as CSV
for i in col:
    # raw string: "\s" inside a normal string literal is an invalid escape
    gwas = pd.read_table("GWAS2/MRI/" + i + "_Without_SWEDD.assoc.linear", sep=r"\s+")
    gwas = gwas[gwas.TEST == "ADD"]
    gwas.to_csv('GWAS2/MRI/GWAS_MRI_' + i + '_Without_SWEDD.csv', index=False)

In [None]:
# Run the external R plotting script again.
# NOTE(review): this is the same command used after the DaTSCAN analysis;
# presumably qqman.R has to be pointed at the MRI results first -- confirm.
!Rscript qqman.R