# Mining genetic, transcriptomic and imaging data in Parkinson’s disease - 1 

In [None]:
from IPython.display import Image, display
from matplotlib_venn import venn3
from scipy.stats import norm

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import subprocess
import math
import sys
import os

## Introduction

#### Required software

- Plink (v1.9b)

#### Overview
GWA studies on complex diseases focus on the statistical association between SNPs and single phenotypes. However, several studies have shown that one genetic variant can affect different traits (pleiotropy), particularly in complex diseases.<br>
Therefore, considering single phenotypic traits could result in a loss of statistical power in the identification of genetic mechanisms underlying complex diseases. Instead, taking into account multiple correlated phenotypes can improve the discovery of genetic variants, which could influence different traits underlying the onset of the studied complex disease providing new potential biological insights.<br>

Currently there are two main approaches to analyze multivariate phenotypes:
-  computing summary statistics from univariate analysis (namely individual GWAS)
-  providing generalized models combining phenotypic measures used to test for variant-trait associations

Here we apply both of these approaches to our data, showing that exploring data from different perspectives improves our results. In the following we also show how to combine and validate these results using transcriptomic data.

In [None]:
# Directory holding the genotype data used below (the PPMI_merge_final
# fileset passed to plink and its PPMI_merge_final.eigenvec PCA file).
genotyping_path = "data/genotyping/"
# Directory holding the baseline phenotype table (PPMI-baseline_pheno.csv).
pheno_path = "data/pheno/"
# Root directory for everything this notebook writes: plink phenotype and
# covariate files, GWAS results and the TATES working files.
gwasres_path = "resultsGWAS/"

## Let's prepare data for GWAS !

Let's begin by loading our phenotype data.

In [None]:
# Read the baseline phenotype table and preview its first rows.
pheno_csv = os.path.join(pheno_path, "PPMI-baseline_pheno.csv")
pheno = pd.read_csv(pheno_csv)
pheno.head()

In addition to the genome-wide genetic data, plink requires a phenotype file and a file for the confounds. <br>
In python we can generate those quite quickly from our ```pheno``` data frame.

So, let's begin by creating the phenotype file!

In [None]:
# Phenotype files for DaTscan: one plink --pheno file per analysis view.
#   individualView -> the four *_norm caudate/putamen columns, one per trait
#   integratedView -> the single DPS_DaTscan column
# The views are processed in this order on purpose: once the cell has run,
# `pheno_fn` and `pheno_datscan` refer to the integrated-view file and frame.
# NOTE(review): ENROLL_CAT is written as a phenotype column as well, so a plink
# run with --all-pheno will also try to analyse it - confirm this is intended.
datscan_views = [
    ("individualView", ["CAUDATE_R_norm", "CAUDATE_L_norm", "PUTAMEN_R_norm", "PUTAMEN_L_norm"]),
    ("integratedView", ["DPS_DaTscan"]),
]
for view, traits in datscan_views:
    pheno_fn = os.path.join(gwasres_path, view + "/pheno_datscan.txt")
    # to_csv does not create missing folders
    os.makedirs(os.path.dirname(pheno_fn), exist_ok=True)
    pheno_datscan = pheno.loc[:, ["PATNO", "ENROLL_CAT"] + traits].rename({"PATNO": "IID"}, axis=1)
    # plink requires FID + IID for all samples, as the first two columns;
    # the patient number is used for both
    pheno_datscan.insert(0, "FID", pheno_datscan["IID"])
    pheno_datscan.to_csv(
        pheno_fn,
        header=True,
        index=False,
        sep=" ",
        # An empty field disappears in a space-delimited file and shifts the
        # remaining columns; plink 1.9 reads non-numeric values such as NA
        # as missing phenotypes.
        na_rep="NA"
    )

In [None]:
# Phenotype files for MRI: one plink --pheno file per analysis view.
#   individualView -> the six parahippocampal *_norm columns, one per trait
#   integratedView -> the single DPS_MRI column
# The views are processed in this order on purpose: once the cell has run,
# `pheno_fn` and `pheno_mri` refer to the integrated-view file and frame.
# NOTE(review): ENROLL_CAT is written as a phenotype column as well, so a plink
# run with --all-pheno will also try to analyse it - confirm this is intended.
mri_views = [
    (
        "individualView",
        [
            "rh_parahippocampal_area_norm",
            "lh_parahippocampal_area_norm",
            "rh_parahippocampal_volume_norm",
            "lh_parahippocampal_volume_norm",
            "rh_parahippocampal_thickness_norm",
            "lh_parahippocampal_thickness_norm",
        ],
    ),
    ("integratedView", ["DPS_MRI"]),
]
for view, traits in mri_views:
    pheno_fn = os.path.join(gwasres_path, view + "/pheno_mri.txt")
    # to_csv does not create missing folders
    os.makedirs(os.path.dirname(pheno_fn), exist_ok=True)
    pheno_mri = pheno.loc[:, ["PATNO", "ENROLL_CAT"] + traits].rename({"PATNO": "IID"}, axis=1)
    # plink requires FID + IID for all samples, as the first two columns;
    # the patient number is used for both
    pheno_mri.insert(0, "FID", pheno_mri["IID"])
    pheno_mri.to_csv(
        pheno_fn,
        header=True,
        index=False,
        sep=" ",
        # An empty field disappears in a space-delimited file and shifts the
        # remaining columns; plink 1.9 reads non-numeric values such as NA
        # as missing phenotypes.
        na_rep="NA"
    )

And create the covariate file...

In [None]:
# Covariate file for plink: age category, gender and years of education,
# plus the principal components stored in the .eigenvec file.
covariate_fn = os.path.join(gwasres_path, "covariate.txt")
covariate = pheno.loc[:, ["PATNO", "age_cat", "gen", "educ"]]
# add relatedness matrix PCs
ppmi_ceu_pca = pd.read_csv(
    os.path.join(genotyping_path, "PPMI_merge_final.eigenvec"),
    sep=r"\s+",  # raw string: "\s" is an invalid escape in a plain literal
    header=None
)
pc_names = ["PC" + str(i) for i in range(1, 21)]
cnames = ["FID", "IID"] + pc_names
# raises if the file does not hold exactly FID, IID and 20 PC columns
ppmi_ceu_pca.columns = cnames
# default inner join: only samples present in both tables are kept
covariate = covariate.merge(ppmi_ceu_pca, left_on="PATNO", right_on="FID").drop(["PATNO"], axis=1)
# put FID and IID on front (selected by name rather than by position)
cols = ["FID", "IID", "age_cat", "gen", "educ"] + pc_names
covariate = covariate[cols]
covariate.to_csv(
    covariate_fn,
    header=True,
    index=False,
    sep=" "
)

Now, finally we have everything to run our GWAS!

## Individual View

*Individual View* focuses on finding potential genetic biomarkers by interpolating genotyping and neuroimaging data, considering phenotypic traits individually.<br> 
In other words, we apply to our data the first GWAS approach previously described!

Let's perform GWAS analysis with Plink on DaTscan!

In [None]:
pheno_datscan_fn = os.path.join(gwasres_path, "individualView/pheno_datscan.txt")
!plink --bfile {os.path.join(genotyping_path, "PPMI_merge_final")} --pheno {pheno_datscan_fn} --all-pheno --covar {covariate_fn} --covar-name age_cat,educ,PC1-PC5 --linear hide-covar sex --out {os.path.join(gwasres_path, "individualView/DaTscan_results")} 

And on MRI phenotypic data...

In [None]:
pheno_mri_fn = os.path.join(gwasres_path, "individualView/pheno_mri.txt")
!plink --bfile {os.path.join(genotyping_path, "PPMI_merge_final")} --pheno {pheno_mri_fn} --all-pheno --covar {covariate_fn} --covar-name age_cat,educ,PC1-PC5 --linear hide-covar sex --out {os.path.join(gwasres_path, "individualView/MRI_results")} 

In [None]:
# Preview the strongest associations for one DaTscan trait (CAUDATE_R_norm):
# read plink's whitespace-aligned result table and sort by ascending p-value.
caudate_r_assoc_fn = os.path.join(
    gwasres_path, "individualView/DaTscan_results.CAUDATE_R_norm.assoc.linear"
)
(
    pd.read_csv(caudate_r_assoc_fn, sep=r"\s+")  # raw string: "\s" is an invalid escape otherwise
    .sort_values("P", ascending=True)
    .reset_index(drop=True)
    .head()
)

Let's now combine our multivariate GWAS results into a single trait-based summary statistic with TATES!<br>
As input TATES requires:
- a file named ```defdims``` with execution parameters
- a file containing the phenotypes correlation matrix 
- a file storing the single trait P-values

**NB** TATES requires all input files to be in the same directory as the TATES executable!

These files can be easily created with Python. Let's begin with DaTscan!

In [None]:
tateswd = os.path.join(gwasres_path, "individualView/")

# compute correlation matrix
corrmat_fn = "corrmat_datscan"
datscan = np.array(pheno_datscan.iloc[:,[3,4,5,6]])
corrmat_datascan = np.corrcoef(datscan, rowvar=False)  # observation are on rows
corrmat_datascan = np.float32(corrmat_datascan)  # make sure that values are float32
pd.DataFrame(corrmat_datascan)\
.to_csv(
    os.path.join(tateswd, corrmat_fn),
    header=False,
    index=False,
    sep=" "
)
# pvals file
pvals_fn = "pvals_datscan"
caudate_r = pd.read_csv(os.path.join(gwasres_path, "individualView/DaTscan_results.CAUDATE_R_norm.assoc.linear"), sep="\s+")
caudate_l = pd.read_csv(os.path.join(gwasres_path, "individualView/DaTscan_results.CAUDATE_L_norm.assoc.linear"), sep="\s+")
putamen_r = pd.read_csv(os.path.join(gwasres_path, "individualView/DaTscan_results.PUTAMEN_R_norm.assoc.linear"), sep="\s+")
putamen_l = pd.read_csv(os.path.join(gwasres_path, "individualView/DaTscan_results.PUTAMEN_L_norm.assoc.linear"), sep="\s+")
pd.DataFrame(
    {
        0:caudate_r.iloc[:,0],  # chromosome
        1:caudate_r.iloc[:,1],  # SNP name
        2:caudate_r.iloc[:,8],  # caudate_r pvals
        3:caudate_l.iloc[:,8],  # caudate_l pvals
        4:putamen_r.iloc[:,8],  # putamen_r pvals
        5:putamen_l.iloc[:,8]   # putamen_l pvals
    }
)\
.dropna()\
.to_csv(
   os.path.join(tateswd, pvals_fn),
    header=False,
    index=False,
    sep=" "
)
# defdims file
pheno_num = 4  # 4 phenotypes (DaTscan)
corrmat_type = "full"  # full correlation matrix
snp_num = pd.read_csv(os.path.join(tateswd, pvals_fn), header=None, sep=" ").shape[0]
tatesres_fn = "DaTscan_tates_results"
towrite = "{0} {1}\n{2}\n{3}\n{4}\n{5}".format(
    pheno_num,      # number of phenotype
    snp_num,        # number of snps
    corrmat_type,   # correlation matrix type
    pvals_fn,       # pvals file
    corrmat_fn,     # correlation matrix file
    tatesres_fn
)
try:
    with open(os.path.join(tateswd, "defdims"), mode="w+") as outfile:
        outfile.write(towrite)
except Exception as e:
    raise e
finally:
    outfile.close()

# run TATES from command line
cwd = os.getcwd()
os.chdir(tateswd)
! ./tates
os.chdir(cwd)

And let's do the same with MRI data...

In [None]:
# Preview the MRI phenotype frame as last assigned by the MRI phenotype cell,
# i.e. the integrated-view columns FID, IID, ENROLL_CAT and DPS_MRI.
pheno_mri.head()

In [None]:
tateswd = os.path.join(gwasres_path, "individualView/")

# compute correlation matrix
corrmat_fn = "corrmat_mri"
mri = np.array(pheno_mri.iloc[:,[3,4,5,6,7,8]])
corrmat_mri = np.corrcoef(mri, rowvar=False)  # observation are on rows
corrmat_mri = np.float32(corrmat_mri)  # make sure that values are float32
pd.DataFrame(corrmat_mri)\
.to_csv(
    os.path.join(tateswd, corrmat_fn),
    header=False,
    index=False,
    sep=" "
)
# pvals file
pvals_fn = "pvals_mri"
parahippo_area_r = pd.read_csv(os.path.join(gwasres_path, "individualView/MRI_results.rh_parahippocampal_area_norm.assoc.linear"), sep="\s+")
parahippo_area_l = pd.read_csv(os.path.join(gwasres_path, "individualView/MRI_results.lh_parahippocampal_area_norm.assoc.linear"), sep="\s+")
parahippo_vol_r = pd.read_csv(os.path.join(gwasres_path, "individualView/MRI_results.rh_parahippocampal_volume_norm.assoc.linear"), sep="\s+")
parahippo_vol_l = pd.read_csv(os.path.join(gwasres_path, "individualView/MRI_results.lh_parahippocampal_volume_norm.assoc.linear"), sep="\s+")
parahippo_thick_r = pd.read_csv(os.path.join(gwasres_path, "individualView/MRI_results.rh_parahippocampal_thickness_norm.assoc.linear"), sep="\s+")
parahippo_thick_l = pd.read_csv(os.path.join(gwasres_path, "individualView/MRI_results.lh_parahippocampal_thickness_norm.assoc.linear"), sep="\s+")
pd.DataFrame(
    {
        0:parahippo_area_r.iloc[:,0],  # chromosome
        1:parahippo_area_r.iloc[:,1],  # SNP name
        2:parahippo_area_r.iloc[:,8],  # parahippo_area_r pvals
        3:parahippo_area_l.iloc[:,8],  # parahippo_area_l pvals
        4:parahippo_vol_r.iloc[:,8],   # parahippo_volume_r pvals
        5:parahippo_vol_l.iloc[:,8],   # parahippo_volume_l pvals
        6:parahippo_thick_r.iloc[:,8], # parahippo_thickness_r pvals
        7:parahippo_thick_l.iloc[:,8]  # parahippo_thickness_l pvals
    }
)\
.dropna()\
.to_csv(
   os.path.join(tateswd, pvals_fn),
    header=False,
    index=False,
    sep=" "
)
# defdims file
pheno_num = 6  # 6 phenotypes (MRI)
corrmat_type = "full"  # full correlation matrix
snp_num = pd.read_csv(os.path.join(tateswd, pvals_fn), header=None, sep=" ").shape[0]
tatesres_fn = "MRI_tates_results"
towrite = "{0} {1}\n{2}\n{3}\n{4}\n{5}".format(
    pheno_num,      # number of phenotype
    snp_num,        # number of snps
    corrmat_type,   # correlation matrix type
    pvals_fn,       # pvals file
    corrmat_fn,     # correlation matrix file
    tatesres_fn
)
try:
    with open(os.path.join(tateswd, "defdims"), mode="w+") as outfile:
        outfile.write(towrite)
except Exception as e:
    raise e
finally:
    outfile.close()

# run TATES from command line
cwd = os.getcwd()
os.chdir(tateswd)
! ./tates
os.chdir(cwd)

We can now visualize combined GWAS results through Manhattan plots.<br>
To do this we will use the ```qqman``` package, an R library specifically designed to provide publication-quality plots.

In [None]:
os.chdir("src")
!Rscript plotManhattanIndView.R
os.chdir(cwd)

In [None]:
# Show the individual-view DaTscan Manhattan plot
# (PNG presumably written by plotManhattanIndView.R - confirm).
display(Image(os.path.join(gwasres_path, "individualView/DaTscan_Manhattan.png")))

In [None]:
# Show the individual-view MRI Manhattan plot
# (PNG presumably written by plotManhattanIndView.R - confirm).
display(Image(os.path.join(gwasres_path, "individualView/MRI_Manhattan.png")))

## Integrated View

*Integrated View* aims to find associations between genetic variants and phenotypes, using generalized models as phenotypic measures. <br>
Therefore, we search for "SNP-model" associations rather than simple SNP-trait associations.

Let's perform GWAS analyses!

In [None]:
pheno_datscan_fn = os.path.join(gwasres_path, "integratedView/pheno_datscan.txt")
!plink --bfile {os.path.join(genotyping_path, "PPMI_merge_final")} --pheno {pheno_datscan_fn} --all-pheno --covar {covariate_fn} --covar-name age_cat,educ,PC1-PC5 --linear hide-covar sex --out {os.path.join(gwasres_path, "integratedView/DaTscan_results")} 

In [None]:
pheno_mri_fn = os.path.join(gwasres_path, "integratedView/pheno_mri.txt")
!plink --bfile {os.path.join(genotyping_path, "PPMI_merge_final")} --pheno {pheno_mri_fn} --all-pheno --covar {covariate_fn} --covar-name age_cat,educ,PC1-PC5 --linear hide-covar sex --out {os.path.join(gwasres_path, "integratedView/MRI_results")} 

We can now visualize GWAS results.<br>
As before, we will use the ```qqman``` R package.

In [None]:
os.chdir("src")
!Rscript plotManhattanIntView.R
os.chdir(cwd)

In [None]:
# Show the integrated-view DaTscan Manhattan plot
# (PNG presumably written by plotManhattanIntView.R - confirm).
display(Image(os.path.join(gwasres_path, "integratedView/DaTscan_Manhattan.png")))

In [None]:
# Show the integrated-view MRI Manhattan plot
# (PNG presumably written by plotManhattanIntView.R - confirm).
display(Image(os.path.join(gwasres_path, "integratedView/MRI_Manhattan.png")))