# PRS Prep for UKB

In [1]:
# Imports here.
import numpy as np
import pandas as pd
import os
#import statsmodels.api as sm
#import statsmodels.formula.api as smf
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Switch the working directory to the Biowulf directory that contains the UKB
# genetic data. The plink cells below use relative --out prefixes
# (e.g. activity_chr{i}, activity_merged), so their outputs are written relative
# to this directory.
os.chdir('/path/to/UKB/data/bfile')

## Activity scores -- overall activity and sedentary traits included
Source of the SNP table (Table 1 of the paper): https://www.nature.com/articles/s41467-018-07743-4/tables/1

In [3]:
# Read the GWAS hits table from the study and drop the "Sleep duration" rows,
# which leaves the "Overall activity" and "Sedentary" traits.
df = (
    pd.read_csv('/path/to/activity.csv')
    .loc[lambda hits: hits['Trait'] != 'Sleep duration']
)
df

Unnamed: 0,Status,Trait,ID,Ch,SNP,Position,Nearest gene,Allele (effect/other),EAF,Beta,SE,p
0,Novel,Overall activity,1,10,rs564819152,21820650,SKIDA1,A/G,0.679,0.028,0.005,4.2e-09
3,Novel,Sedentary,4,5,rs26579,87985295,MEF2C-AS2,G/C,0.415,0.028,0.005,2.6e-09
4,Novel,Sedentary,5,5,rs25981,106822908,EFNA5,G/C,0.531,0.028,0.005,3e-09
5,Novel,Sedentary,6,3,rs1858242,68527135,LOC105377146,A/G,0.259,0.031,0.005,3.1e-09
6,Novel,Sedentary,7,7,rs34858520,71723883,CALN1,A/G,0.558,0.028,0.005,4.2e-09
7,Known8,Overall activity,8,17,rs2696625,44326864,KANSL1-AS1,A/G,0.77,-0.037,0.005,3.2e-12
8,Known8,Overall activity,9,18,rs59499656,40768309,SYT4,A/T,0.655,-0.028,0.005,1.9e-09


In [4]:
# Score file for plink --score: one row per SNP with the three fields needed
# ("SNP", the effect allele, and "Beta"), space-delimited with a header line.
# This cell only reads and displays the file; no cell shown here writes it.
# NOTE(review): the table above contains both "Overall activity" and
# "Sedentary" hits and their betas are used as-is in one score -- confirm that
# combining the two traits' effect directions this way is intended.
activity_file = pd.read_csv('/path/to/activity_file.txt', delimiter=' ')
activity_file

Unnamed: 0,SNP,Allele (effect/other),Beta
0,rs564819152,A,0.028
1,rs26579,G,0.028
2,rs25981,G,0.028
3,rs1858242,A,0.031
4,rs34858520,A,0.028
5,rs2696625,A,-0.037
6,rs59499656,A,-0.028


In [5]:
# Unique chromosomes that carry at least one of the selected activity SNPs.
# The result is a set, so duplicates collapse and order is not meaningful.
chrom_list = set(df['Ch'])
chrom_list

{3, 5, 7, 10, 17, 18}

In [6]:
# SNP list used with plink --extract: one rsID per line, saved as
# activity_snp_list.txt. This cell only prints the saved file for inspection;
# it does not create it.
!cat /path/to/activity_snp_list.txt

rs564819152
rs26579
rs25981
rs1858242
rs34858520
rs2696625
rs59499656

In [5]:
#Extract SNPs for activity
activity_chroms = ['3', '5', '7', '10', '17', '18']
for i in activity_chroms:
    !plink --bfile /path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/chr{i}.UKBB.EU.filtered --extract /data/levineks/Virus/activity_snp_list.txt --make-bed --out activity_chr{i}

PLINK v1.90b3.36 64-bit (31 Mar 2016)      https://www.cog-genomics.org/plink2
(C) 2005-2016 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to activity_chr3.log.
Options in effect:
  --bfile /data/CARD/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/chr3.UKBB.EU.filtered
  --extract /data/levineks/Virus/activity_snp_list.txt
  --make-bed
  --out activity_chr3

257652 MB RAM detected; reserving 128826 MB for main workspace.
649369 variants loaded from .bim file.
408961 people (187837 males, 221124 females) loaded from .fam.
--extract: 1 variant remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 408961 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.992721.
1 variant and 408961 people pass filters a

In [7]:
# Print one per-chromosome fileset prefix per line. The printed lines are the
# contents to save as activity_merge.txt (the input for plink --merge-list);
# this cell only prints them, it does not write the file.
activity_chroms = ['3', '5', '7', '10', '17', '18']
print(*(f'/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/activity_chr{chrom}' for chrom in activity_chroms), sep='\n')

/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/activity_chr3
/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/activity_chr5
/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/activity_chr7
/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/activity_chr10
/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/activity_chr17
/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/activity_chr18


In [8]:
# Merge the per-chromosome activity filesets into a single fileset.
# --merge-list reads the fileset prefixes saved in activity_merge.txt, and
# --make-bed writes the combined data under the relative prefix
# "activity_merged" (i.e. into the current working directory).
!plink --merge-list /data/levineks/Virus/activity_merge.txt --make-bed --out activity_merged

PLINK v1.90b3.36 64-bit (31 Mar 2016)      https://www.cog-genomics.org/plink2
(C) 2005-2016 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to activity_merged.log.
Options in effect:
  --make-bed
  --merge-list /data/levineks/Virus/activity_merge.txt
  --out activity_merged

257652 MB RAM detected; reserving 128826 MB for main workspace.
Performing single-pass merge (408961 people, 7 variants).
Merged fileset written to activity_merged-merge.bed + activity_merged-merge.bim
+ activity_merged-merge.fam .
7 variants loaded from .bim file.
408961 people (187837 males, 221124 females) loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 408961 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate

In [14]:
# Compute the activity polygenic score for every individual in the merged
# fileset. The score file provides variant ID, effect allele and weight (Beta);
# the `header` modifier makes plink skip the score file's first (header) line.
# Output is written under the --out prefix; the resulting
# activity_prs_test.profile is loaded in the next cell.
!plink --bfile /path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/activity_merged --score /path/to/activity_file.txt header --out /path/to/activity_prs_test

PLINK v1.90b3.36 64-bit (31 Mar 2016)      https://www.cog-genomics.org/plink2
(C) 2005-2016 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /data/levineks/Virus/activity_prs_test.log.
Options in effect:
  --bfile /data/CARD/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/activity_merged
  --out /data/levineks/Virus/activity_prs_test
  --score /data/levineks/Virus/activity_file.txt header

257652 MB RAM detected; reserving 128826 MB for main workspace.
7 variants loaded from .bim file.
408961 people (187837 males, 221124 females) loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 408961 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.983925.
7 variants and 408961 people pass filters and Q

In [8]:
# Load the per-individual scores that plink --score wrote to the .profile file
# (columns: FID, IID, PHENO, CNT, CNT2, SCORE).
# The file is whitespace-delimited, so split on runs of whitespace. The pattern
# is a raw string: '\s' is not a valid escape in a normal string literal and
# Python warns about it.
activity = pd.read_csv('/path/to/activity_prs_test.profile', sep=r'\s+')
activity

Unnamed: 0,FID,IID,PHENO,CNT,CNT2,SCORE
0,1000012,1000012,-9,14,7,-0.003071
1,1000031,1000031,-9,12,9,0.003053
2,1000047,1000047,-9,14,6,0.008000
3,1000050,1000050,-9,12,7,0.007204
4,1000068,1000068,-9,14,8,-0.001286
...,...,...,...,...,...,...
408956,6026129,6026129,-9,14,8,-0.001286
408957,6026131,6026131,-9,14,8,0.003357
408958,6026147,6026147,-9,14,8,0.007786
408959,6026150,6026150,-9,14,8,0.002714


## Diet scores -- protein included
Source of the SNP table: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7767645/

In [9]:
# Read the GWAS top-hits table from the study and keep only the loci whose
# top hit is for "Protein".
df = (
    pd.read_csv('/path/to/diet.csv')
    .loc[lambda hits: hits['Top hit in locus for'] == 'Protein']
)
df

Unnamed: 0,Top hit in locus for,SNPID,CHR,BP,Effect allele,Beta,P-value,Nearest gene
0,Protein,rs780094,2.0,27741237,t,0.018,5.58e-10,GCKR
3,Protein,rs445551,2.0,79697982,a,0.019,1.49e-08,CTNNA2
5,Protein,rs1603978,3.0,25108236,a,0.019,1.35e-10,AC092422.1
8,Protein,rs13146907,4.0,39425248,a,−0.022,1.24e-14,KLB
16,Protein,rs1461729,8.0,9187242,a,0.032,4.09e-12,AC022784.6
19,Protein,rs55872725,16.0,53809123,t,0.018,2.09e-10,FTO
34,Protein,rs838133,19.0,49259529,a,−0.032,4.52e-26,FGF21


In [7]:
# Score file for plink --score, holding the 'SNPID', 'Effect allele' and 'Beta'
# columns, space-delimited with a header line.
# The effect alleles must be UPPER CASE in this file (the study table lists
# them in lower case).
# This cell only reads and displays the file; no cell shown here writes it.
# NOTE(review): the rendered negative Beta values appear with a Unicode minus
# sign rather than an ASCII '-' -- confirm the file on disk uses ASCII '-' so
# the weights parse as numbers.
diet_file = pd.read_csv('/path/to/diet_file.txt', delimiter=' ')
diet_file

Unnamed: 0,SNPID,Effect allele,Beta
0,rs780094,T,0.018
1,rs445551,A,0.019
2,rs1603978,A,0.019
3,rs13146907,A,−0.022
4,rs1461729,A,0.032
5,rs55872725,T,0.018
6,rs838133,A,−0.032


In [10]:
# Unique chromosomes that carry at least one of the selected protein SNPs.
# CHR is stored as float in this table, so the set holds values like 2.0; the
# chromosome lists used for plink below are typed out as strings instead.
chrom_list = set(df['CHR'])
chrom_list

{2.0, 3.0, 4.0, 8.0, 16.0, 19.0}

In [12]:
# SNP list used with plink --extract: one rsID per line, saved as
# diet_snp_list.txt. This cell only prints the saved file for inspection;
# it does not create it.
# NOTE(review): this path has a Virus/ component that the --extract path in the
# diet extraction cell does not -- confirm both refer to the same file.
!cat /path/to/Virus/diet_snp_list.txt

rs780094
rs445551
rs1603978
rs13146907
rs1461729
rs55872725
rs838133

In [13]:
#Extract SNPs for diet
diet_chroms = ['2', '3', '4', '8', '16', '19']
for i in diet_chroms:
    !plink --bfile /path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/chr{i}.UKBB.EU.filtered --extract /path/to/diet_snp_list.txt --make-bed --out diet_chr{i}

PLINK v1.90b3.36 64-bit (31 Mar 2016)      https://www.cog-genomics.org/plink2
(C) 2005-2016 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to diet_chr2.log.
Options in effect:
  --bfile /data/CARD/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/chr2.UKBB.EU.filtered
  --extract /data/levineks/Virus/diet_snp_list.txt
  --make-bed
  --out diet_chr2

257652 MB RAM detected; reserving 128826 MB for main workspace.
764315 variants loaded from .bim file.
408961 people (187837 males, 221124 females) loaded from .fam.
--extract: 2 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 408961 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
2 variants and 408961 people pass filters and QC.
Note: No phenotypes present.
--make-be

In [14]:
# plink reported "Error: No variants remaining after --extract" for chr19, so
# chromosome 19 is left out of the merge list. As a consequence the chr19 hit
# in the protein table (rs838133) does not contribute to the diet score.
# The printed lines are the contents to save as diet_merge.txt (the input for
# plink --merge-list); this cell only prints them, it does not write the file.
diet_chroms = ['2', '3', '4', '8', '16']
print(*(f'/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/diet_chr{chrom}' for chrom in diet_chroms), sep='\n')

/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/diet_chr2
/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/diet_chr3
/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/diet_chr4
/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/diet_chr8
/path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/diet_chr16


In [33]:
# Merge the per-chromosome diet filesets into a single fileset.
# --merge-list reads the fileset prefixes saved in diet_merge.txt, and
# --make-bed writes the combined data under the relative prefix "diet_merged"
# (i.e. into the current working directory).
!plink --merge-list /path/to/diet_merge.txt --make-bed --out diet_merged

PLINK v1.90b3.36 64-bit (31 Mar 2016)      https://www.cog-genomics.org/plink2
(C) 2005-2016 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to diet_merged.log.
Options in effect:
  --make-bed
  --merge-list /data/levineks/Virus/diet_merge.txt
  --out diet_merged

257652 MB RAM detected; reserving 128826 MB for main workspace.
Performing single-pass merge (408961 people, 6 variants).
Merged fileset written to diet_merged-merge.bed + diet_merged-merge.bim +
diet_merged-merge.fam .
6 variants loaded from .bim file.
408961 people (187837 males, 221124 females) loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 408961 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.997928.
6 variants

In [37]:
# Compute the diet (protein) polygenic score for every individual in the merged
# fileset. The score file provides variant ID, effect allele and weight (Beta);
# the `header` modifier makes plink skip the score file's first (header) line.
# The merged fileset holds 6 variants, while the score file still lists 7 SNPs
# (rs838133 on chr19 could not be extracted), so the score is built from 6.
# Output is written under the --out prefix; the resulting
# diet_prs_test.profile is loaded in the next cell.
!plink --bfile /path/to/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/diet_merged --score /path/to/diet_file.txt header --out /path/to/diet_prs_test

PLINK v1.90b3.36 64-bit (31 Mar 2016)      https://www.cog-genomics.org/plink2
(C) 2005-2016 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /data/levineks/Virus/diet_prs_test.log.
Options in effect:
  --bfile /data/CARD/UKBIOBANK/FILTER_IMPUTED_DATA/bfile/diet_merged
  --out /data/levineks/Virus/diet_prs_test
  --score /data/levineks/Virus/diet_file.txt header

257652 MB RAM detected; reserving 128826 MB for main workspace.
6 variants loaded from .bim file.
408961 people (187837 males, 221124 females) loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 408961 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.997928.
6 variants and 408961 people pass filters and QC.
Note: No phen

In [15]:
# Load the per-individual scores that plink --score wrote to the .profile file
# (columns: FID, IID, PHENO, CNT, CNT2, SCORE).
# The file is whitespace-delimited, so split on runs of whitespace. The pattern
# is a raw string: '\s' is not a valid escape in a normal string literal and
# Python warns about it.
diet = pd.read_csv('/path/to/diet_prs_test.profile', sep=r'\s+')
diet

Unnamed: 0,FID,IID,PHENO,CNT,CNT2,SCORE
0,1000012,1000012,-9,12,5,0.004250
1,1000031,1000031,-9,12,3,0.001250
2,1000047,1000047,-9,12,5,0.004333
3,1000050,1000050,-9,12,4,0.006250
4,1000068,1000068,-9,12,4,0.003917
...,...,...,...,...,...,...
408956,6026129,6026129,-9,12,7,0.007500
408957,6026131,6026131,-9,12,4,0.002833
408958,6026147,6026147,-9,12,6,0.005917
408959,6026150,6026150,-9,12,8,0.006750
