# PCA analysis UKBB data

This notebook runs the PCA analysis and generates the corresponding plots for the 200K exome-sequenced UK Biobank samples.

The steps to generate a PCA include removing related individuals, pruning variants in linkage disequilibrium (LD), excluding outlier samples (which can indicate poor genotyping quality or distant relatedness), and restricting the analysis to individuals of homogeneous ancestry.

Pitfalls
1. Some of the PCs may capture LD structure rather than population structure, which decreases the power to detect associations in these high-LD regions.
2. When a new study dataset is projected onto the PCA space computed from a reference dataset, the projected PCs are shrunk toward 0 in the new dataset.
3. PC scores may capture outliers that are due to family structure, population structure or other reasons; it might be beneficial to detect and remove these individuals to maximize the population structure captured by PCA (in the case of removing a few outliers) or to restrict analyses to genetically homogeneous samples.


## PCA analysis pipeline

In [None]:
[global]
# Output directory for generated files (filter_1 writes under {cwd}/cache, the other steps under {cwd})
parameter: cwd = path
# PLINK .bed files for the exome data; filter_2 processes them one at a time
parameter: bedfiles = paths
# PLINK .bim files for the exome data, paired one-to-one with `bedfiles`
parameter: bimfiles = paths
# The .fam file associated with the bed files (read as space-delimited, no header, in filter_1)
parameter: famFile = path 
# The phenotype database to extract info from (read as tab-delimited with a header in filter_1)
parameter: database = path
# For cluster jobs, number of commands to run per job (used as trunk_size by filter_3 and filter_4)
parameter: job_size = 1
# Number of threads (used for the task `cores` and passed to plink via --threads)
parameter: numThreads = 1
# Shell template that loads the PLINK2 cluster module before running the step command;
# used as the bash `template` when the `plink2` executable target does not exist
parameter: plink2_module = '''
module load PLINK/2_x86_64_20180428
echo "Module PLINK2 loaded"
{cmd}
'''
# Same as above for PLINK 1.9; used when the `plink` executable target does not exist
parameter: plink_module = '''
module load PLINK/1.90-beta5.3
echo "Module plink loaded"
{cmd}
'''
# Shell template that loads the EIGENSOFT cluster module; used when `smartpca.perl` is not available
parameter: eigensoft_module = '''
module load EIGENSOFT/7.2.1-foss-2018b
echo "Module Eigensoft v.7.2.1 loaded"
{cmd}
'''
# Software container image used by the filter_1 and filter_2 steps
parameter: container_lmm = 'statisticalgenetics/lmm:1.4'

In [None]:
# Keep only individuals whose self-reported ethnicity (field f.21000) is British, Irish, Other white
# background, prefer not to answer or do not know; individuals with inconsistent answers across visits
# are kept (recoded as 9000) only when none of their answers is outside that list.
# Outputs: a two-column FID/IID keep list (.white_ind) and the fam table merged with the
# resolved ethnicity code (.white_ind.pheno); both are tab-delimited without header.
[filter_1: provides = [f'{cwd}/cache/{famFile:bn}.white_ind']]
output: f'{cwd}/cache/{famFile:bn}.white_ind', f'{cwd}/cache/{famFile:bn}.white_ind.pheno'
task: trunk_workers = 1, walltime = '10h', mem = '40G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: container=container_lmm, expand= "${ }", stderr = f'{_output[0]:nn}.stderr', stdout = f'{_output[0]:nn}.stdout'
    # Load libraries
    library('dplyr')
    # fam file of the exome fileset: space-delimited, no header
    fam <- read.table(${famFile:r}, sep=' ', header=FALSE)
    colnames(fam) <- c("FID","IID","fatherID", "motherID", "sex", "phenotype")
    cat("There are",nrow(fam),"individuals with exomes.\n")
    # This database is the one from June 2020 and contains a subset of variables with the PCs
    bd <- read.table(${database:r}, sep="\t", header=TRUE)
    cat("The size of the full database is",dim(bd),".\n")
    # Assign individual ID column to bd f.eid (first column of the database)
    names(bd)[1] <- "IID"
    # Select the 200K individuals with exomes from the full db
    exomed_IID <- bd[bd$IID %in% fam$IID,]
    cat("The number of selected individuals is",nrow(exomed_IID),".\n")
    # Keep the ID plus the ethnicity columns (one f.21000 column per visit)
    ethnicity <- exomed_IID %>%
          select(IID, starts_with("f.21000"))

    # Collect the non-missing answers of one individual across the 3 visits.
    # x is one row of `ethnicity`; positions 2 to 4 hold the f.21000 answers.
    # Returns a numeric vector of the available answers, or NA when there is none.
    collect_visits <- function(x){
      answers <- x[2:4]
      answers <- answers[!is.na(answers)]
      if (length(answers) == 0) {
        return(NA)
      }
      as.numeric(answers)
    }

    # One entry per individual: all the answers given, or NA
    ethnicity$visit <- apply(ethnicity, 1, collect_visits)
    # Filter out individuals with missing values in ethnicity: 212 ind total
    ethnicity <- ethnicity %>%
      filter(!is.na(visit))
    cat("There are",nrow(ethnicity),"individuals without missing values for ethnicity.\n")
    # Identify the unique available codings in f.21000
    code <- union(union(unique(ethnicity$f.21000.0.0),unique(ethnicity$f.21000.1.0)),unique(ethnicity$f.21000.2.0))
    # Codes to keep white individuals
    useful_code <- c(1001,1002,1003,1,-3,-1)
    # Every other observed code disqualifies an individual with inconsistent answers
    useless_code <- code[!code %in% useful_code]
    # Drop NA with a logical mask: x[-which(is.na(x))] would return an EMPTY vector
    # whenever x contains no NA, silently discarding every unwanted code
    useless_code <- useless_code[!is.na(useless_code)]

    # Resolve the final ethnicity code of one individual.
    # x is one row of `ethnicity`; its `visit` entry holds the answers collected above.
    # Returns the answer itself when all visits agree, NA when the answers differ and at
    # least one is an unwanted code, and 9000 when they differ but are all acceptable.
    resolve_ethnicity <- function(x){
      # [[ ]] works whether the row arrives as a list or as a named atomic vector
      answers <- x[["visit"]]
      distinct_answers <- unique(answers)
      if (length(distinct_answers) == 1) { # only one value available
        result <- distinct_answers
      } else if (any(answers %in% useless_code)) { # inconsistent, with an unwanted combination
        result <- NA
      } else { # inconsistent, but every answer is acceptable
        result <- 9000
      }
      return(result)
    }

    # Apply the above function; NA marks individuals to discard
    ethnicity$new_ethnicity <- apply(ethnicity, 1, resolve_ethnicity)
    # Filter by NA presence
    ethnicity_noNA <- ethnicity %>%
      filter(!is.na(new_ethnicity))
    cat("There are",nrow(ethnicity_noNA),"individuals consistent for f.21000.\n")
    ethnicity_isNA <- ethnicity %>%
      filter(is.na(new_ethnicity))
    cat("There are",nrow(ethnicity_isNA),"individuals inconsistent for f.21000.\n")
    # Keep only white individuals: the accepted codes plus 9000 (acceptable but inconsistent answers)
    white_codes <- c(useful_code, 9000)
    kept <- ethnicity_noNA %>%
        filter(new_ethnicity %in% white_codes)
    white <- kept %>%
        mutate(FID = IID) %>%
        select(FID,IID)
    cat("After excluding non-white ethnic backgrounds, the number of white individuals is",nrow(white),".\n")
    # Write the selected individuals to a txt file
    write.table(white,${_output[0]:r}, sep="\t", row.names=FALSE, col.names=FALSE)
    # Create the phenotype file
    pheno <- kept %>%
        mutate(ethnicity = new_ethnicity) %>%
        select(IID,ethnicity)
    # Merge the two data frames (inner join on IID; IID becomes the first column)
    famfile <- merge(fam, pheno, by="IID", all=FALSE)
    cat("The famfile has ",nrow(famfile),"individuals.\n")
    write.table(famfile,${_output[1]:r}, sep="\t", row.names=FALSE, col.names=FALSE)

In [None]:
# Per input fileset: keep the individuals selected by filter_1 and apply the variant/sample
# quality filters (MAF > 1% by default) to produce a filtered bed/bim/fam for the PCA
[filter_2]
# Minimum minor allele frequency; a value <= 0 disables the filter
parameter: maf_filter = 0.01
# Maximum missingness per variant; a value <= 0 disables the filter
parameter: geno_filter = 0.01
# Maximum missingness per sample; a value <= 0 disables the filter
parameter: mind_filter = 0.01
# Hardy-Weinberg equilibrium p-value threshold; the default 0.0 disables the filter
parameter: hwe_filter = 0.0
input: bedfiles, paired_with=['bimfiles'], group_by=1
depends: f'{cwd}/cache/{famFile:bn}.white_ind'
output: f'{cwd}/{_input:bn}.filtered.bed'
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container_lmm, expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink2').target_exists() else plink2_module
    # Each optional filter expands to an empty string when its threshold is not positive
    plink2 \
      --bed ${_input} \
      --bim ${_input._bimfiles} \
      --fam ${famFile} \
      --keep ${_depends} \
      ${'--maf %s' % maf_filter if maf_filter > 0 else ''} \
      ${'--geno %s' % geno_filter if geno_filter > 0 else ''} \
      ${'--hwe %s' % hwe_filter if hwe_filter > 0 else ''} \
      ${'--mind %s' % mind_filter if mind_filter > 0 else ''} \
      --threads ${numThreads} \
      --make-bed \
      --out ${_output:n}

In [None]:
# Combine every filtered fileset from the previous step into a single bed/bim/fam
# fileset that serves as input to eigensoft
[filter_3]
input: group_by = 'all'
output: bfile_merge = f'{cwd}/{famFile:bn}.filtered.merged.bed'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '60G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink').target_exists() else plink_module
    # The first fileset is the --bfile base; all remaining prefixes (".bed" stripped)
    # are written one per line to the merge list
    echo -e ${' '.join([str(x)[:-4] for x in _input[1:]])} | tr ' ' '\n' > ${_output:n}.merge_list
    plink \
      --bfile ${_input[0]:n} \
      --merge-list ${_output:n}.merge_list \
      --threads ${numThreads} \
      --memory 48000 \
      --make-bed \
      --out ${_output:n}

In [None]:
# LD pruning: window = 50 variants, window shifted every 5 variants, r2 threshold = 0.5.
# Produces the list of retained variants (declared output, *.prun.in) and a pruned
# bed/bim/fam fileset named after it (*.prun.bed/.bim/.fam).
[filter_4]
# Window size passed to --indep-pairwise
parameter: window = 50
# Number of variants by which the window is shifted at each step
parameter: shift = 5
# Pairwise r2 threshold passed to --indep-pairwise
parameter: r2 = 0.5
output: f'{cwd}/{famFile:bn}.filtered.merged.prun.in'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '60G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink').target_exists() else plink_module
    plink \
    --bfile ${_input:n} \
    --indep-pairwise ${window} ${shift} ${r2}  \
    --out ${_output:nn} \
    --threads ${numThreads} \
    --memory 48000

    # plink writes the retained-variant list as <out>.prune.in, which does not match the
    # declared output name (*.prun.in); rename it so the --extract below and the step's
    # declared output both point at an existing file
    mv ${_output:nn}.prune.in ${_output}

    plink \
    --bfile ${_input:n} \
    --extract ${_output} \
    --make-bed \
    --out ${_output:n} \
    --threads ${numThreads} \
    --memory 48000

In [None]:
# Run pca analysis using Eigenstrat: the program supports plink files, here called PACKEDPED format
# smartpca.perl: run PCA on input genotype data (calls smartpca)
# NOTE(review): the input below is the merged fileset written by filter_3, not the LD-pruned
# fileset written by filter_4 -- confirm that running the PCA on unpruned variants is intended.
[pca_1]
# Number of Principal Components to output
parameter: k = int
# Maximum number of iterations for outlier removal. Default 0 turns off outlier removal
parameter: maxiter = 0
# Number of principal components along which to remove outliers during each outlier removal iteration. Default is 10
parameter: topk = 10
# Number of standard deviations which an individual must exceed, along one of topk top principal components, in order to be removed as an outlier. Default is 6
parameter: sigma = 6
input: f'{cwd}/{famFile:bn}.filtered.merged.bed'
output: f'{cwd}/{_input:bn}.pca'
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout',  template = '{cmd}' if executable('smartpca.perl').target_exists() else eigensoft_module
    # Example smartpca parfile, kept for reference only. These lines are commented out
    # because, as bare text, the shell tried to execute each of them as a command.
    #   genotypename: UKB_Caucasians_Subgroup_rs121Equal0_pruned.bed
    #   snpname: UKB_Caucasians_Subgroup_rs121Equal0_pruned.bim
    #   indivname: UKB_Caucasians_Subgroup_rs121Equal0_pruned.fam
    #   outputformat: EIGENSTRAT
    #   fastmode: YES
    #   genotypeoutname: PCAoutput_UKB_Caucasians_Subgroup_rs121Equal0_pruned.geno
    #   snpoutname: PCAoutput_UKB_Caucasians_Subgroup_rs121Equal0_pruned.snp
    #   indivoutname: PCAoutput_UKB_Caucasians_Subgroup_rs121Equal0_pruned.ind
    #   evecoutname: PCAoutput_UKB_Caucasians_Subgroup_rs121Equal0_pruned.evecout
    #   evaloutname: PCAoutput_UKB_Caucasians_Subgroup_rs121Equal0_pruned.evalout

    # Every continuation backslash must be the last character on its line: a trailing
    # space after it ends the command early and the remaining options are lost.
    smartpca.perl \
    -i ${_input} \
    -a ${_input:n}.bim \
    -b ${_input:n}.fam \
    -k ${k} \
    -o ${_output} \
    -p ${_output:n}.plot \
    -e ${_output:n}.eval \
    -l ${_output:n}.log \
    -m ${maxiter} \
    -t ${topk} \
    -s ${sigma}

In [None]:
[flashpca]
# Number of Principal Components to output
parameter: k = int
# NOTE(review): maxiter, topk and sigma are accepted for symmetry with pca_1 but are not
# used by the script below.
# Maximum number of iterations for outlier removal. Default 0 turns off outlier removal
parameter: maxiter = 0
# Number of principal components along which to remove outliers during each outlier removal iteration. Default is 10
parameter: topk = 10
# Number of standard deviations which an individual must exceed, along one of topk top principal components, in order to be removed as an outlier. Default is 6
parameter: sigma = 6
input: f'{cwd}/{famFile:bn}.filtered.merged.bed'
output: f'{cwd}/{_input:bn}.pca'
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    # flashpca() is provided by the flashpcaR package and must be loaded explicitly
    library(flashpcaR)
    # PLINK fileset prefix (path of the .bed file without its extension)
    fn <- ${_input:nr}
    # Prefix shared by all result files; the suffix is appended with paste0() because
    # writing it directly after the quoted path is not valid R
    out_prefix <- ${_output:nr}
    # Compute the number of components requested through the k parameter
    f <- flashpca(fn, ndim=${k}, stand="binom2")
    write.table(f$values, paste0(out_prefix, ".values"), sep=" ", row.names=FALSE, col.names=FALSE)
    write.table(f$vectors, paste0(out_prefix, ".vectors"), sep=" ", row.names=TRUE, col.names=FALSE)
    write.table(f$projection, paste0(out_prefix, ".projection"), sep=" ", row.names=TRUE, col.names=FALSE)
    # NOTE(review): loadings may be empty unless flashpca is asked to compute them -- confirm
    write.table(f$loadings, paste0(out_prefix, ".loadings"), sep=" ", row.names=FALSE, col.names=FALSE)
    write.table(f$scale, paste0(out_prefix, ".scale"), sep=" ", row.names=FALSE, col.names=FALSE)
    # The declared step output (.pca) was otherwise never written, so the step could not
    # produce its target; the projection (per-sample PC scores) is stored there as well.
    # NOTE(review): confirm this is the desired content of the .pca file
    write.table(f$projection, ${_output:r}, sep=" ", row.names=TRUE, col.names=FALSE)