In [1]:
import os, sys
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from scipy.stats import norm
from matplotlib_venn import venn3
import subprocess

In [2]:
# Directories (relative to this notebook) that hold the two PPMI genotype filesets.
immuno_path, neuro_path = (
    '../PPMI_data/1_Immunochip_SNP_Data',
    '../PPMI_data/2_NeuroX_SNP_Data_for_Original_Cohort',
)

In [3]:
# Load the variant tables (.bim) of both filesets. The files have no header row,
# so the columns are addressed by position: 0, 1, 2, ...

# Set 1: Immunochip
set1_bim_fn = os.path.join(immuno_path, 'IMMUNO.bim')
set1_bim = pd.read_table(set1_bim_fn, header=None)

# Set 2: NeuroX
set2_bim_fn = os.path.join(neuro_path, 'NEUROX.bim')
set2_bim = pd.read_table(set2_bim_fn, header=None)

In [4]:
# Preview the first five Immunochip variants.
set1_bim.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,1,imm_1_898835,0,898835,0,A
1,1,vh_1_1108138,0,1108138,T,C
2,1,vh_1_1110294,0,1110294,A,G
3,1,rs9729550,0,1125105,C,A
4,1,rs1815606,0,1130298,T,G


In [5]:
# Preview the first five NeuroX variants.
set2_bim.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,1,NeuroX_PARK7_Pro158del,0,0,0,I
1,1,NeuroX_PINK1_23bp_del_ex7,0,0,0,I
2,1,NeuroX_PINK1_534_535insQ,0,0,0,D
3,1,NeuroX_PINK1_Asp525fs,0,0,0,D
4,1,NeuroX_PINK1_Cys549fs,0,0,0,I


The Immunochip data is on the hg18 genome assembly, while the NeuroX SNP data is mapped to the hg19 genome assembly, so we need to map the Immunochip positions to hg19 to match NeuroX.

In [6]:
# Export the Immunochip variant positions as a four-column, space-separated,
# BED-like file (chrom, start, end, name) to be used as liftOver input.
tmp_chr = set1_bim.iloc[:, 0]
tmp_pos = set1_bim.iloc[:, 3]
tmp_name = set1_bim.iloc[:, 1]

# Chromosome codes 23-26 (PLINK's numeric encoding) and the letters written
# for them; every other code is kept as-is. Code 25 is written as X.
# NOTE(review): UCSC assemblies usually name the mitochondrion "chrM" rather than
# "chrMT" -- confirm whether liftOver drops the chrMT rows because of this.
special_chr_labels = {'23': 'X', '24': 'Y', '25': 'X', '26': 'MT'}

# Translate each code as a whole value in a single pass. The previous chained
# str.replace calls matched substrings anywhere in the label and depended on the
# order in which the replacements were applied.
chr_codes = [str(code) for code in tmp_chr.values]
tmp_chr_str = ['chr' + special_chr_labels.get(code, code) for code in chr_codes]

# One-base interval [pos, pos + 1) per variant.
# NOTE(review): BED starts are conventionally 0-based, but the .bim position is
# used directly as the start here. The later `--update-map ... 2 4` call reads
# this start column back as the new position, so both places must be changed
# together if this is ever revisited -- confirm intent.
bed = pd.concat([pd.DataFrame(tmp_chr_str), tmp_pos, tmp_pos + 1, tmp_name], axis=1)
print(bed.head())
print(bed.tail())
bed.to_csv(os.path.join(immuno_path, 'IMMUNO_hg18.bed'), sep=" ", index=False, header=False)

      0        3        3             1
0  chr1   898835   898836  imm_1_898835
1  chr1  1108138  1108139  vh_1_1108138
2  chr1  1110294  1110295  vh_1_1110294
3  chr1  1125105  1125106     rs9729550
4  chr1  1130298  1130299     rs1815606
            0          3          3           1
196519   chrX  154889906  154889907   rs3093525
196520   chrX  154889917  154889918   rs3093526
196521   chrX  154892630  154892631   rs2742301
196522   chrX  154907376  154907377   rs2981835
196523  chrMT       3721       3722  MitoA3721G


We use the liftOver tool from the UCSC Genome Browser (https://genome.ucsc.edu/) to get the new positions. Some SNPs will not be converted. The command-line version is only for Linux users.
Let's load the new coordinates.

In [7]:
# Read the lifted-over BED file; it has no header row.
hg19_bed_fn = os.path.join(immuno_path, 'IMMUNO_hg19_ucsc.bed')
new_bed = pd.read_table(hg19_bed_fn, header=None)

# The fourth column holds the names of the SNPs present in the lifted file;
# save them, one per line, so they can be passed to PLINK.
good_snps = new_bed.iloc[:, 3]
good_snps.to_csv(os.path.join(immuno_path, 'good_snps.txt'), header=False, index=False)

In [8]:
!plink --bfile {os.path.join(immuno_path, 'IMMUNO')} --update-map {os.path.join(immuno_path, 'IMMUNO_hg19_ucsc.bed')} 2 4 --make-bed --extract {os.path.join(immuno_path, 'good_snps.txt')} --out Data/IMMUNO_hg19

PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Data/IMMUNO_hg19.log.
Options in effect:
  --bfile ../PPMI_data/1_Immunochip_SNP_Data/IMMUNO
  --extract ../PPMI_data/1_Immunochip_SNP_Data/good_snps.txt
  --make-bed
  --out Data/IMMUNO_hg19
  --update-map ../PPMI_data/1_Immunochip_SNP_Data/IMMUNO_hg19_ucsc.bed 2 4

32768 MB RAM detected; reserving 16384 MB for main workspace.
196524 variants loaded from .bim file.
523 people (343 males, 180 females) loaded from .fam.
--update-map: 196520 values updated.
--extract: 196520 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 523 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596

## Merging the two datasets

In [15]:
# Variant table of the Immunochip fileset after its positions were updated to hg19.
immuno = pd.read_table("Data/IMMUNO_hg19.bim", header=None)

# Work on an independent copy: a plain `neurox = set2_bim` is only an alias, so
# the coordinate column added to `neurox` later would also appear in `set2_bim`
# and make the earlier preview of that frame stale.
neurox = set2_bim.copy()

In [12]:
# Preview the first five Immunochip variants with their updated positions.
immuno.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,1,imm_1_898835,0,908972,0,A
1,1,vh_1_1108138,0,1118275,T,C
2,1,vh_1_1110294,0,1120431,A,G
3,1,rs9729550,0,1135242,C,A
4,1,rs1815606,0,1140435,T,G


In [17]:
# Display the NeuroX variant table (shown truncated in the output below).
neurox

Unnamed: 0,0,1,2,3,4,5
0,1,NeuroX_PARK7_Pro158del,0,0,0,I
1,1,NeuroX_PINK1_23bp_del_ex7,0,0,0,I
2,1,NeuroX_PINK1_534_535insQ,0,0,0,D
3,1,NeuroX_PINK1_Asp525fs,0,0,0,D
4,1,NeuroX_PINK1_Cys549fs,0,0,0,I
...,...,...,...,...,...,...
267602,26,exm2216497,0,15942,0,T
267603,26,exm2216498,0,15946,T,C
267604,26,exm2216499,0,15951,G,A
267605,26,exm2216500,0,15978,T,C


In [19]:
# Build a "<chrom>_<pos>_<pos+1>" key for every Immunochip variant and store it
# as column 6. The key is computed column-wise instead of with a per-row
# `.iloc` loop, which is very slow on a table of this size; the resulting
# strings are the same.
immuno_pos = immuno.iloc[:, 3]
coords = (
    immuno.iloc[:, 0].astype(str)
    + '_' + immuno_pos.astype(str)
    + '_' + (immuno_pos + 1).astype(str)
)

immuno[6] = coords
immuno.head()

Unnamed: 0,0,1,2,3,4,5,6
0,1,imm_1_898835,0,908972,0,A,1_908972_908973
1,1,vh_1_1108138,0,1118275,T,C,1_1118275_1118276
2,1,vh_1_1110294,0,1120431,A,G,1_1120431_1120432
3,1,rs9729550,0,1135242,C,A,1_1135242_1135243
4,1,rs1815606,0,1140435,T,G,1_1140435_1140436


In [21]:
# Build a "<chrom>_<pos>_<pos+1>" key for every NeuroX variant and store it as
# column 6. The key is computed column-wise instead of with a per-row `.iloc`
# loop, which is very slow on a table of this size; the resulting strings are
# the same.
neurox_pos = neurox.iloc[:, 3]
coords = (
    neurox.iloc[:, 0].astype(str)
    + '_' + neurox_pos.astype(str)
    + '_' + (neurox_pos + 1).astype(str)
)

neurox[6] = coords
neurox.head(n=15)

Unnamed: 0,0,1,2,3,4,5,6
0,1,NeuroX_PARK7_Pro158del,0,0,0,I,1_0_1
1,1,NeuroX_PINK1_23bp_del_ex7,0,0,0,I,1_0_1
2,1,NeuroX_PINK1_534_535insQ,0,0,0,D,1_0_1
3,1,NeuroX_PINK1_Asp525fs,0,0,0,D,1_0_1
4,1,NeuroX_PINK1_Cys549fs,0,0,0,I,1_0_1
5,1,NeuroX_PINK1_Lys520fs,0,0,0,I,1_0_1
6,1,exm2268640,0,762320,T,C,1_762320_762321
7,1,exm41,0,861349,0,C,1_861349_861350
8,1,exm1916089,0,865545,0,0,1_865545_865546
9,1,exm44,0,865584,0,G,1_865584_865585


Extract the SNPs that are present in Immunochip but not in NeuroX.

In [29]:
# Save both annotated variant tables as space-separated text files.
immuno.to_csv('Data/IMMUNO_coords.txt', sep=" ", index=False, header=False)
neurox.to_csv('Data/NEUROX_coords.txt', sep=" ", index=False, header=False)

# Keep the Immunochip rows whose last column (the coordinate key) does not occur
# in the last column of the NeuroX table. This is the same anti-join the former
# `awk` one-liner performed on the two files above, done in pandas so the cell
# no longer needs a Unix shell; the rows are written in the same format.
shared_position = immuno.iloc[:, -1].isin(neurox.iloc[:, -1])
immuno[~shared_position].to_csv('Data/onlyIMMUNO.txt', sep=" ", index=False, header=False)

In [30]:
# Load the Immunochip-only rows back into pandas (space-separated, no header row).
only_immuno_fn = 'Data/onlyIMMUNO.txt'
onlyImmuno = pd.read_csv(only_immuno_fn, sep=" ", header=None)
onlyImmuno.head()

Unnamed: 0,0,1,2,3,4,5,6
0,1,imm_1_898835,0,908972,0,A,1_908972_908973
1,1,vh_1_1108138,0,1118275,T,C,1_1118275_1118276
2,1,rs9729550,0,1135242,C,A,1_1135242_1135243
3,1,rs1815606,0,1140435,T,G,1_1140435_1140436
4,1,rs7515488,0,1163804,T,C,1_1163804_1163805


In [35]:
# The second column holds the variant IDs; write them one per line so the list
# can be passed to PLINK.
snp_id_col = 1
onlyImmunoSnps = onlyImmuno.iloc[:, snp_id_col]
onlyImmunoSnps.to_csv('Data/OnlyImmunoSnps.txt', header=False, index=False)

In [36]:
# Keep only the variants listed in Data/OnlyImmunoSnps.txt from the Data/IMMUNO_hg19
# fileset and write them as a new binary fileset with prefix Data/IMMUNO_extracted.
!plink --bfile Data/IMMUNO_hg19 --extract Data/OnlyImmunoSnps.txt --make-bed --out Data/IMMUNO_extracted

PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to IMMUNO_extracted.log.
Options in effect:
  --bfile Data/IMMUNO_hg19
  --extract Data/OnlyImmunoSnps.txt
  --make-bed
  --out IMMUNO_extracted

32768 MB RAM detected; reserving 16384 MB for main workspace.
196520 variants loaded from .bim file.
523 people (343 males, 180 females) loaded from .fam.
--extract: 188126 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 523 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
commands treat these as missing.
treat these as missing.
Total genotyping rate is 0.956509.
188126 variants and 523 people pass filters and QC.
Note: 

In [37]:
# Merge the NeuroX fileset with the extracted Immunochip fileset and write the
# result as a binary fileset with prefix Data/merged_PPMI.
# NOTE(review): the captured log below contains truncated "same position" lines,
# which look like PLINK warnings about variants sharing a position -- confirm
# they are expected.
!plink --bfile {os.path.join(neuro_path, 'NEUROX')} --bmerge Data/IMMUNO_extracted --make-bed --out Data/merged_PPMI

PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Data/merged_PPMI.log.
Options in effect:
  --bfile ../PPMI_data/2_NeuroX_SNP_Data_for_Original_Cohort/NEUROX
  --bmerge Data/IMMUNO_extracted
  --make-bed
  --out Data/merged_PPMI

32768 MB RAM detected; reserving 16384 MB for main workspace.
619 people loaded from
../PPMI_data/2_NeuroX_SNP_Data_for_Original_Cohort/NEUROX.fam.
523 people to be merged from Data/IMMUNO_extracted.fam.
Of these, 3 are new, while 520 are present in the base dataset.
267607 markers loaded from
../PPMI_data/2_NeuroX_SNP_Data_for_Original_Cohort/NEUROX.bim.
188126 markers to be merged from Data/IMMUNO_extracted.bim.
Of these, 188126 are new, while 0 are present in the base dataset.
the same position.
have the same position.
the same position.
Performing single-pass merge (622 people, 455733 variants).
Merged fileset written to Data/merged_PPMI-