# 1-Lifting SNP positions and merging the datasets

In [1]:
from scipy.stats import norm
from matplotlib_venn import venn3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import subprocess
import math
import os
import sys

Set the path to Immuno and NeuroX files (BIM, BED and FAM)

In [3]:
# Directories holding the PLINK filesets (BIM, BED and FAM) of the two arrays.
immuno_path, neurox_path = "data/IMMUNO/", "data/NEUROX/"

Explore the data contained in the two BIMs

In [9]:
# Locate the variant tables (.bim) of the two filesets.
immuno_bim_fn = os.path.join(immuno_path, "IMMUNO.bim")
neurox_bim_fn = os.path.join(neurox_path, "NEUROX.bim")

# Both tables are read the same way: tab-separated and without a header row,
# so the columns receive the integer labels 0-5.
immuno_bim, neurox_bim = (
    pd.read_csv(bim_fn, sep="\t", header=None)
    for bim_fn in (immuno_bim_fn, neurox_bim_fn)
)

In [8]:
# Preview the first ten IMMUNO variants.
immuno_bim.head(10)

Unnamed: 0,0,1,2,3,4,5
0,1,imm_1_898835,0,898835,0,A
1,1,vh_1_1108138,0,1108138,T,C
2,1,vh_1_1110294,0,1110294,A,G
3,1,rs9729550,0,1125105,C,A
4,1,rs1815606,0,1130298,T,G
5,1,rs7515488,0,1153667,T,C
6,1,rs11260562,0,1155173,A,G
7,1,rs6697886,0,1163474,A,G
8,1,1_1168711,0,1168711,A,G
9,1,rs6603785,0,1176365,T,A


In [10]:
# Preview the first ten NEUROX variants.
neurox_bim.head(10)

Unnamed: 0,0,1,2,3,4,5
0,1,NeuroX_PARK7_Pro158del,0,0,0,I
1,1,NeuroX_PINK1_23bp_del_ex7,0,0,0,I
2,1,NeuroX_PINK1_534_535insQ,0,0,0,D
3,1,NeuroX_PINK1_Asp525fs,0,0,0,D
4,1,NeuroX_PINK1_Cys549fs,0,0,0,I
5,1,NeuroX_PINK1_Lys520fs,0,0,0,I
6,1,exm2268640,0,762320,T,C
7,1,exm41,0,861349,0,C
8,1,exm1916089,0,865545,0,0
9,1,exm44,0,865584,0,G


The SNPs in the IMMUNO dataset were mapped on the hg18 genome assembly, while those in NEUROX were mapped on the hg19 assembly. To overcome this problem we must lift the positions of the IMMUNO SNPs to the corresponding hg19 positions. To accomplish this task we use the web-based tool liftOver from UCSC (https://genome.ucsc.edu/cgi-bin/hgLiftOver).

Since liftOver accepts UCSC BED files as input, not BIMs, we convert the IMMUNO BIM into the corresponding UCSC BED file.

In [12]:
# PLINK numeric chromosome codes and the labels used in the BED file
# (23 -> X, 24 -> Y, 25 -> X, 26 -> MT). Codes are matched on the whole value,
# never as substrings, so any other chromosome label passes through untouched.
# NOTE(review): UCSC assemblies name the mitochondrial sequence "chrM"; "MT" is
# kept so the file uploaded to liftOver is unchanged -- confirm which is intended.
plink_chr_labels = {"23": "X", "24": "Y", "25": "X", "26": "MT"}

snp_chr = immuno_bim.iloc[:, 0].astype(str).map(
    lambda code: plink_chr_labels.get(code, code)
)
snp_pos = immuno_bim.iloc[:, 3]
snp_name = immuno_bim.iloc[:, 1]

# One single-base interval per SNP: the BIM position is written as the start
# and position + 1 as the end.
# NOTE(review): UCSC BED starts are 0-based while BIM positions are 1-based; the
# later `plink --update-map ... 2 4` step reads the lifted start column back as
# the new position, so this layout must stay in sync with that step -- confirm
# the convention before changing either side.
bed = pd.DataFrame({
    "chrom": "chr" + snp_chr,
    "chromStart": snp_pos,
    "chromEnd": snp_pos + 1,
    "name": snp_name,
})
print(bed.head(n=5))
print(bed.tail(n=5))

# write the resulting UCSC BED file (tab-separated, no header, no index)
bed.to_csv(
    os.path.join(immuno_path, "IMMUNO_tolift.bed"), sep="\t",
    index=False, header=False
)

      0        3        3             1
0  chr1   898835   898836  imm_1_898835
1  chr1  1108138  1108139  vh_1_1108138
2  chr1  1110294  1110295  vh_1_1110294
3  chr1  1125105  1125106     rs9729550
4  chr1  1130298  1130299     rs1815606
            0          3          3           1
196519   chrX  154889906  154889907   rs3093525
196520   chrX  154889917  154889918   rs3093526
196521   chrX  154892630  154892631   rs2742301
196522   chrX  154907376  154907377   rs2981835
196523  chrMT       3721       3722  MitoA3721G


LiftOver successfully mapped 196,520 of the IMMUNO SNPs, while the conversion failed for 4 of them. We renamed the resulting UCSC BED file (with the lifted positions) IMMUNO_lifted.bed.

In [14]:
# Load the BED file returned by liftOver (tab-separated, no header row).
lifted_bed_fn = os.path.join(immuno_path, "IMMUNO_lifted.bed")
bed_lifted = pd.read_csv(lifted_bed_fn, sep="\t", header=None)

# The fourth column holds the names of the SNPs whose conversion to hg19
# positions was successful; save them one per line.
good_snps = bed_lifted.iloc[:, 3]
good_snps_fn = os.path.join(immuno_path, "good_snps.txt")
good_snps.to_csv(good_snps_fn, index=False, header=False)

As mentioned, 4 SNPs were not converted to hg19 positions, so we need to remove them from the IMMUNO data. To do this we use plink.

In [16]:
# Rewrite the IMMUNO fileset on hg19 coordinates (output prefix: IMMUNO_hg19).
#   --update-map <lifted BED> 2 4 : take the new base-pair position from column 2
#                                   and the variant ID from column 4 of the lifted BED
#   --extract good_snps.txt       : keep only the variants liftOver could convert
#   --make-bed                    : write a new BED/BIM/FAM fileset
!plink --bfile {os.path.join(immuno_path, "IMMUNO")} --update-map {os.path.join(immuno_path, "IMMUNO_lifted.bed")} 2 4 --make-bed --extract {os.path.join(immuno_path, "good_snps.txt")} --out {os.path.join(immuno_path, "IMMUNO_hg19")}

PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to data/IMMUNO/IMMUNO_hg19.log.
Options in effect:
  --bfile data/IMMUNO/IMMUNO
  --extract data/IMMUNO/good_snps.txt
  --make-bed
  --out data/IMMUNO/IMMUNO_hg19
  --update-map data/IMMUNO/IMMUNO_lifted.bed 2 4

16384 MB RAM detected; reserving 8192 MB for main workspace.
196524 variants loaded from .bim file.
523 people (343 males, 180 females) loaded from .fam.
--update-map: 196520 values updated.
--extract: 196520 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 523 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
); many commands treat these as missing.
treat 

Now both the IMMUNO and NEUROX datasets have their SNPs mapped on the hg19 genome assembly. We can therefore merge the two datasets into a single one containing all the SNPs available in the two sets.

In [18]:
# Reload the IMMUNO variant table, this time from the hg19-mapped fileset,
# and preview its first ten rows.
immuno_hg19_bim_fn = os.path.join(immuno_path, "IMMUNO_hg19.bim")
immuno_bim = pd.read_csv(immuno_hg19_bim_fn, sep="\t", header=None)
immuno_bim.head(10)

Unnamed: 0,0,1,2,3,4,5
0,1,imm_1_898835,0,908972,0,A
1,1,vh_1_1108138,0,1118275,T,C
2,1,vh_1_1110294,0,1120431,A,G
3,1,rs9729550,0,1135242,C,A
4,1,rs1815606,0,1140435,T,G
5,1,rs7515488,0,1163804,T,C
6,1,rs11260562,0,1165310,A,G
7,1,rs6697886,0,1173611,A,G
8,1,1_1168711,0,1178848,A,G
9,1,rs6603785,0,1186502,T,A


In [19]:
# Preview the first ten NEUROX variants for comparison with the lifted IMMUNO table.
neurox_bim.head(10)

Unnamed: 0,0,1,2,3,4,5
0,1,NeuroX_PARK7_Pro158del,0,0,0,I
1,1,NeuroX_PINK1_23bp_del_ex7,0,0,0,I
2,1,NeuroX_PINK1_534_535insQ,0,0,0,D
3,1,NeuroX_PINK1_Asp525fs,0,0,0,D
4,1,NeuroX_PINK1_Cys549fs,0,0,0,I
5,1,NeuroX_PINK1_Lys520fs,0,0,0,I
6,1,exm2268640,0,762320,T,C
7,1,exm41,0,861349,0,C
8,1,exm1916089,0,865545,0,0
9,1,exm44,0,865584,0,G


Before merging the SNP data, we must check how many subjects appear in both the NEUROX and IMMUNO datasets. We then keep only the subjects whose data are present in both datasets.

In [22]:
# Locate the sample tables (.fam) of the hg19 IMMUNO fileset and of NEUROX.
immuno_fam_fn = os.path.join(immuno_path, "IMMUNO_hg19.fam")
neurox_fam_fn = os.path.join(neurox_path, "NEUROX.fam")

# Both tables are read the same way: space-separated and without a header row.
immuno_fam, neurox_fam = (
    pd.read_csv(fam_fn, sep=" ", header=None)
    for fam_fn in (immuno_fam_fn, neurox_fam_fn)
)

In [23]:
# Preview the first five IMMUNO samples.
immuno_fam.head(5)

Unnamed: 0,0,1,2,3,4,5
0,3400,3400,0,0,2,-9
1,3401,3401,0,0,2,-9
2,3402,3402,0,0,1,-9
3,3403,3403,0,0,1,-9
4,3404,3404,0,0,2,-9


In [24]:
# Preview the first five NEUROX samples.
neurox_fam.head(5)

Unnamed: 0,0,1,2,3,4,5
0,3527,3527,0,0,1,-9
1,3274,3274,0,0,1,-9
2,3220,3220,0,0,2,-9
3,3467,3467,0,0,1,-9
4,3171,3171,0,0,1,-9


In [39]:
# Individual IDs (second FAM column) of each dataset.
immuno_subj = immuno_fam.iloc[:, 1].tolist()
neurox_subj = neurox_fam.iloc[:, 1].tolist()

# `plink --keep` identifies a sample by the pair (family ID, individual ID),
# i.e. the first two FAM columns, so the datasets are matched on both columns
# instead of assuming that the two IDs are always equal.
immuno_ids = set(zip(immuno_fam.iloc[:, 0], immuno_subj))
neurox_ids = set(zip(neurox_fam.iloc[:, 0], neurox_subj))

# Sorted so the keep file comes out in the same order on every run
# (set iteration order is arbitrary).
common_ids = sorted(
    immuno_ids & neurox_ids,
    key=lambda ids: (str(ids[0]), str(ids[1]))
)

# get the subjects appearing in both IMMUNO and NEUROX data
common_subj = {iid for _, iid in common_ids}

# write the common subjects to a file, one "FID IID" pair per line
with open("data/common_subj.txt", mode="w") as outfile:
    outfile.writelines(
        "{} {}\n".format(fid, iid) for fid, iid in common_ids
    )

The IMMUNO FAM file contains 523 subjects, while the NEUROX FAM file contains 619. Genotyping data are available in both datasets for 520 subjects. Therefore, before merging IMMUNO and NEUROX into a single dataset, we keep only the data belonging to the subjects shared by the two datasets. To do this we use plink.

In [40]:
# Restrict the hg19 IMMUNO fileset to the samples listed in data/common_subj.txt
# (--keep) and write the result as a new fileset with prefix IMMUNO_common.
!plink --bfile {os.path.join(immuno_path, "IMMUNO_hg19")} --keep data/common_subj.txt --make-bed --out {os.path.join(immuno_path, "IMMUNO_common")}

PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to data/IMMUNO/IMMUNO_common.log.
Options in effect:
  --bfile data/IMMUNO/IMMUNO_hg19
  --keep data/common_subj.txt
  --make-bed
  --out data/IMMUNO/IMMUNO_common

16384 MB RAM detected; reserving 8192 MB for main workspace.
196520 variants loaded from .bim file.
523 people (343 males, 180 females) loaded from .fam.
--keep: 520 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 520 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
); many commands treat these as missing.
treat these as missing.
Total genotyping rate in remaining samples is 0.957428.
196520 variants and

In [41]:
# Restrict the NEUROX fileset to the samples listed in data/common_subj.txt
# (--keep) and write the result as a new fileset with prefix NEUROX_common.
!plink --bfile {os.path.join(neurox_path, "NEUROX")} --keep data/common_subj.txt --make-bed --out {os.path.join(neurox_path, "NEUROX_common")}

PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to data/NEUROX/NEUROX_common.log.
Options in effect:
  --bfile data/NEUROX/NEUROX
  --keep data/common_subj.txt
  --make-bed
  --out data/NEUROX/NEUROX_common

16384 MB RAM detected; reserving 8192 MB for main workspace.
267607 variants loaded from .bim file.
619 people (409 males, 210 females) loaded from .fam.
--keep: 520 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 520 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
); many commands treat these as missing.
treat these as missing.
Total genotyping rate in remaining samples is 0.957466.
267607 variants and 520 

The PPMI documentation tells us that the genotyping was performed using custom genotype arrays. Thus, the SNPs are not always identified by an rs ID. Moreover, we want to keep as many SNPs as possible from the two datasets: the SNPs found only in IMMUNO, those found only in NEUROX, and those shared by the two datasets. Because the SNP names are not always in rs ID format, we cannot compare the datasets by name; to find the SNPs that appear only in IMMUNO, we need to work on the genomic coordinates of the SNPs.

In [52]:
def _coord_keys(bim):
    """Build one coordinate key per row of a BIM table.

    Parameters
    ----------
    bim : pandas.DataFrame
        Table read from a .bim file without a header row. The columns used, by
        position, are 0 (chromosome), 3 (base-pair position) and 4, 5 (the two
        allele codes).

    Returns
    -------
    pandas.Series
        Strings of the form "<chr>_<pos>_<pos + 1>_<allele 4>_<allele 5>",
        sharing the index of `bim`.
    """
    position = bim.iloc[:, 3]
    # Every part is cast to str so a column that pandas parsed as numeric
    # cannot break the concatenation.
    parts = [
        bim.iloc[:, 0].astype(str),
        position.astype(str),
        (position + 1).astype(str),
        bim.iloc[:, 4].astype(str),
        bim.iloc[:, 5].astype(str),
    ]
    keys = parts[0]
    for part in parts[1:]:
        keys = keys + "_" + part
    return keys


# the BIM are those containing the common subjects data
immuno_bim = pd.read_csv(
    os.path.join(immuno_path, "IMMUNO_common.bim"), sep="\t",
    header=None
)
neurox_bim = pd.read_csv(
    os.path.join(neurox_path, "NEUROX_common.bim"), sep="\t",
    header=None
)

# identify each entry of the two datasets by its genomic coordinates; the key
# is stored as column 6.
# NOTE(review): the key contains the allele codes in file order, so the same
# site listed with swapped or missing ("0") alleles in the two arrays gets
# different keys -- confirm this is the intended notion of "same SNP".
# immuno_coords / neurox_coords are the same objects as the BIM frames (no
# copy is made), so the BIM frames gain column 6 as well.
immuno_coords = immuno_bim
immuno_coords[6] = _coord_keys(immuno_bim)

neurox_coords = neurox_bim
neurox_coords[6] = _coord_keys(neurox_bim)

In [53]:
# Preview the first ten IMMUNO variants together with their coordinate key (column 6).
immuno_coords.head(10)

Unnamed: 0,0,1,2,3,4,5,6
0,1,imm_1_898835,0,908972,0,A,1_908972_908973_0_A
1,1,vh_1_1108138,0,1118275,T,C,1_1118275_1118276_T_C
2,1,vh_1_1110294,0,1120431,A,G,1_1120431_1120432_A_G
3,1,rs9729550,0,1135242,C,A,1_1135242_1135243_C_A
4,1,rs1815606,0,1140435,T,G,1_1140435_1140436_T_G
5,1,rs7515488,0,1163804,T,C,1_1163804_1163805_T_C
6,1,rs11260562,0,1165310,A,G,1_1165310_1165311_A_G
7,1,rs6697886,0,1173611,A,G,1_1173611_1173612_A_G
8,1,1_1168711,0,1178848,A,G,1_1178848_1178849_A_G
9,1,rs6603785,0,1186502,T,A,1_1186502_1186503_T_A


In [54]:
# Preview the first ten NEUROX variants together with their coordinate key (column 6).
neurox_coords.head(10)

Unnamed: 0,0,1,2,3,4,5,6
0,1,NeuroX_PARK7_Pro158del,0,0,0,I,1_0_1_0_I
1,1,NeuroX_PINK1_23bp_del_ex7,0,0,0,I,1_0_1_0_I
2,1,NeuroX_PINK1_534_535insQ,0,0,0,D,1_0_1_0_D
3,1,NeuroX_PINK1_Asp525fs,0,0,0,D,1_0_1_0_D
4,1,NeuroX_PINK1_Cys549fs,0,0,0,I,1_0_1_0_I
5,1,NeuroX_PINK1_Lys520fs,0,0,0,I,1_0_1_0_I
6,1,exm2268640,0,762320,T,C,1_762320_762321_T_C
7,1,exm41,0,861349,0,C,1_861349_861350_0_C
8,1,exm1916089,0,865545,0,0,1_865545_865546_0_0
9,1,exm44,0,865584,0,G,1_865584_865585_0_G


In [60]:
# Coordinate keys (column 6) present in NEUROX, held in a set for fast lookups.
neurox_coords_set = set(neurox_coords.iloc[:, 6].to_list())

# Pair every IMMUNO SNP name (column 1) with its coordinate key (column 6) and
# keep the names whose key does not occur in NEUROX.
immuno_coords_list = immuno_coords.iloc[:, 6].tolist()
immuno_snp_names = immuno_coords.iloc[:, 1].tolist()
immuno_unique_snps = [
    snp_name
    for snp_name, coord_key in zip(immuno_snp_names, immuno_coords_list)
    if coord_key not in neurox_coords_set
]

# Save the IMMUNO-only SNP names, one per line.
with open("data/onlyIMMUNOsnps.txt", mode="w+") as outfile:
    outfile.writelines(snp_name + "\n" for snp_name in immuno_unique_snps)

Then, from the IMMUNO dataset we keep only the data for the SNPs found in IMMUNO but not in NEUROX. To accomplish this task, we use plink --extract.

In [61]:
# Keep only the variants named in data/onlyIMMUNOsnps.txt (--extract) from the
# IMMUNO_common fileset and write them as a new fileset with prefix IMMUNO_uniquesnps.
!plink --bfile {os.path.join(immuno_path, "IMMUNO_common")} --extract data/onlyIMMUNOsnps.txt --make-bed --out {os.path.join(immuno_path, "IMMUNO_uniquesnps")}

PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to data/IMMUNO/IMMUNO_uniquesnps.log.
Options in effect:
  --bfile data/IMMUNO/IMMUNO_common
  --extract data/onlyIMMUNOsnps.txt
  --make-bed
  --out data/IMMUNO/IMMUNO_uniquesnps

16384 MB RAM detected; reserving 8192 MB for main workspace.
196520 variants loaded from .bim file.
520 people (341 males, 179 females) loaded from .fam.
--extract: 189564 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 520 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
data/IMMUNO/IMMUNO_uniquesnps.hh ); many commands treat these as missing.
treat these as missing.
Total genotyping r

Now we can merge the NEUROX and IMMUNO datasets into a single plink fileset containing the SNPs of both datasets. To do this we again use plink.

In [63]:
# Merge the IMMUNO-only variants (--bmerge IMMUNO_uniquesnps) into the
# NEUROX_common fileset and write the combined fileset with prefix data/PPMI_merged.
!plink --bfile {os.path.join(neurox_path, "NEUROX_common")} --bmerge {os.path.join(immuno_path, "IMMUNO_uniquesnps")} --make-bed --out data/PPMI_merged

PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to data/PPMI_merged.log.
Options in effect:
  --bfile data/NEUROX/NEUROX_common
  --bmerge data/IMMUNO/IMMUNO_uniquesnps
  --make-bed
  --out data/PPMI_merged

16384 MB RAM detected; reserving 8192 MB for main workspace.
520 people loaded from data/NEUROX/NEUROX_common.fam.
520 people to be merged from data/IMMUNO/IMMUNO_uniquesnps.fam.
Of these, 0 are new, while 520 are present in the base dataset.
267607 markers loaded from data/NEUROX/NEUROX_common.bim.
189564 markers to be merged from data/IMMUNO/IMMUNO_uniquesnps.bim.
Of these, 189564 are new, while 0 are present in the base dataset.
the same position.
have the same position.
the same position.
Performing single-pass merge (520 people, 457171 variants).
Merged fileset written to data/PPMI_merged-merge.bed +
data/PPMI_merged-merge.bim + data/PPMI_merged-merge.fam .
457