# Principal component analysis

This notebook runs the principal component analysis (PCA) and generates the PC plots for the 200K exome-sequenced samples.

Steps to generate a PCA include removing related individuals, pruning variants in linkage disequilibrium (LD), and excluding outlier samples that can suggest poor genotyping quality or distant relatedness (also restrict to individuals of homogeneous ancestry).

Pitfalls
1. Some of the PCs may capture LD structure rather than population structure, which decreases the power to detect associations in these regions of high LD.
2. When a new study dataset is projected onto the PCA space computed from a reference dataset, the projected PCs are shrunk toward 0 in the new dataset.
3. PC scores may capture outliers that are due to family structure, population structure or other reasons; it might be beneficial to detect and remove these individuals to maximize the population structure captured by PCA (in the case of removing a few outliers) or to restrict analyses to genetically homogeneous samples.


# Command interface

In [2]:
sos run PCA.ipynb -h

usage: sos run PCA.ipynb [workflow_name | -t targets] [options] [workflow_options]
  workflow_name:        Single or combined workflows defined in this script
  targets:              One or more targets to generate
  options:              Single-hyphen sos parameters (see "sos run -h" for details)
  workflow_options:     Double-hyphen workflow-specific parameters

Workflows:
  filter_samples
  filter
  pca
  flashpca

Global Workflow Options:
  --cwd VAL (as path, required)
                        the output directory for generated files
  --genoFile  paths

                        Plink binary files
  --famFile VAL (as path, required)
                        The fam file associated to the bed files
  --database VAL (as path, required)
                        The database to extract info from
  --ethnia-prefix VAL (as str, required)
                        The prefix for the output files
  --phenoFile  path(f'{cwd}/cache/{famFile:bn}.{ethnia_prefix}.pheno')

                        The phenotypic file

## PCA analysis pipeline

In [None]:
[global]
# The output directory for generated files (intermediate files go to its cache/ subdirectory)
parameter: cwd = path
# Plink binary files (.bed); each file is filtered separately in filter_1
parameter: genoFile = paths
# The fam file associated to the bed files; its base name is also used to name the output files
parameter: famFile = path 
# The database to extract info from (read as a tab-delimited table with a header line)
parameter: database = path
# The prefix for the output files; it is also shown in the titles of the PCA plots
parameter: ethnia_prefix = str
# The phenotypic file; by default the file written by filter_samples
parameter: phenoFile = path(f'{cwd}/cache/{famFile:bn}.{ethnia_prefix}.pheno')
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# Number of threads
parameter: numThreads = 1
# Merge the filtered bed files into a single one (filter_2 is skipped when False)
parameter: merge = True
# Shell template that loads the Eigensoft module from the cluster before running the command ({cmd});
# used by smartpca when smartpca.perl is not found
parameter: eigensoft_module = '''
module load EIGENSOFT/7.2.1-foss-2018b
echo "Module Eigensoft v.7.2.1 loaded"
{cmd}
'''
# Software container option
parameter: container_lmm = 'statisticalgenetics/lmm:1.8'

In [None]:
# Filter individuals depending on their ancestry
[filter_samples]
# Codes of the f.21000 ethnicity variable to keep (e.g. 1001, 1002, 1003)
parameter: select_ethnia = [ ]
output: f'{cwd}/cache/{famFile:bn}.{ethnia_prefix}', phenoFile
task: trunk_workers = 1, walltime = '10h', mem = '40G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: container=container_lmm, expand= "${ }", stderr = f'{_output[0]:nn}.stderr', stdout = f'{_output[0]:nn}.stdout'
    # Load libraries
    library('dplyr')
    # Read the fam file of the exomed individuals. The default separator (any
    # whitespace) is used so that both space- and tab-delimited fam files are
    # parsed into the six expected columns.
    fam <- read.table(${famFile:r}, header=FALSE)
    colnames(fam) <- c("FID","IID","fatherID", "motherID", "sex", "phenotype")
    cat("There are",nrow(fam),"individuals with exomes.\n")
    # This database is the one from June 2020 and contains a subset of variables with the PCs
    bd <- read.table(${database:r}, sep="\t", header=TRUE)
    cat("The size of the full database is",dim(bd),".\n")
    # The first column of the database (f.eid) is the individual ID: rename it to IID
    names(bd)[1] <- "IID"
    # Select the 200K individuals with exomes from the full db
    exomed_IID <- bd[bd$IID %in% fam$IID,]
    cat("The number of selected individuals is",nrow(exomed_IID),".\n")
    # Keep the ID and the ethnicity variable (all f.21000.* columns)
    ethnicity <- exomed_IID %>%
          select(IID, starts_with("f.21000"))
    # Recode the answers -3 and -1 as missing
    # NOTE(review): presumably the "prefer not to answer" / "do not know" codes - confirm against the data dictionary
    ethnicity <- ethnicity %>%
          mutate_all(na_if,-3) %>%
          mutate_all(na_if,-1)
    # Gather the answers given at the 3 visits (elements 2 to 4 of a row) into one
    # numeric vector, skipping the missing ones; -99 is returned when no answer
    # is available at all
    f <- function(x) {
      answers <- x[2:4]
      answers <- answers[!is.na(answers)]
      if (length(answers) == 0) {
        return(-99)
      }
      as.numeric(answers)
    }

    # Apply the above function to every row. The answers are always stored as a
    # list column (one numeric vector per individual): apply() would instead
    # simplify the result to a plain vector or a matrix whenever all individuals
    # have the same number of answers, which breaks the steps below.
    ethnicity_matrix <- as.matrix(ethnicity)
    ethnicity$visit <- lapply(
      seq_len(nrow(ethnicity_matrix)),
      function(i) f(ethnicity_matrix[i, ])
    )
    # Filter out individuals with missing values in ethnicity: 212 ind total.
    # Individuals without any answer are coded -99 (never NA) by the function above,
    # so they have to be identified through that code.
    has_answer <- !vapply(ethnicity$visit, function(v) identical(v, -99), logical(1))
    ethnicity <- ethnicity[has_answer, , drop = FALSE]
    cat("There are",nrow(ethnicity),"individuals without missing values for ethnicity.\n")
    # Identify the unique available codings in f.21000
    code<-union(union(unique(ethnicity$f.21000.0.0),unique(ethnicity$f.21000.1.0)),unique(ethnicity$f.21000.2.0))
    # Codes to keep white individuals
    #useful_code<-c(1001,1002,1003,1,-3,-1)
    useful_code<-c(${','.join(['%s ' % x for x in select_ethnia if x is not None])})
    # All the other codes are the ones that are not wanted
    useless_code <- code[!code %in% useful_code]
    # Remove NA from the vector; logical indexing is used because x[-which(is.na(x))]
    # returns an empty vector when x contains no NA
    useless_code <- useless_code[!is.na(useless_code)]
    # Function to get the final code for ethnicity of one individual.
    #   x: one row holding a `visit` entry with all the answers of the individual
    # Returns the answer itself when all the answers agree, 8000 when the answers
    # differ and at least one of them is in `useless_code`, 9000 when the answers
    # differ but none of them is in `useless_code`.
    f <- function(x) {
      # `[[` works both for list rows and for atomic named rows, whereas `$`
      # fails on atomic vectors
      answers <- x[["visit"]]
      distinct_answers <- unique(answers)
      if (length(distinct_answers) == 1) {
        # only one value available
        return(distinct_answers)
      }
      # more than one value available: inconsistent answers
      if (any(answers %in% useless_code)) 8000 else 9000
    }

    # Get the final ethnicity code of every individual. `f` only reads the `visit`
    # entry of a row, so it is called on that entry directly: this avoids coercing
    # the whole data frame to a matrix and always returns one number per individual.
    ethnicity$new_ethnicity <- vapply(
      seq_len(nrow(ethnicity)),
      function(i) f(list(visit = ethnicity$visit[[i]])),
      numeric(1)
    )
    # Filter by NA presence
    # NOTE(review): `f` never returns NA (inconsistent answers are coded 8000/9000),
    # so the second count below is expected to be 0 - confirm the intended report
    ethnicity_noNA<-ethnicity %>%
      filter(!is.na(new_ethnicity))
    cat("There are",nrow(ethnicity_noNA),"individuals consistent for f.21000.\n")
    ethnicity_isNA <- ethnicity %>%
      filter(is.na(new_ethnicity))
    cat("There are",nrow(ethnicity_isNA),"individuals inconsistent for f.21000.\n")
    # keep only the individuals whose final code is one of the requested codes
    kept <- ethnicity_noNA %>%
        filter(new_ethnicity %in% useful_code)
    # A fixed variable name is used (not the prefix itself) so that any prefix
    # works, even one that is not a valid R name
    selected <- kept %>%
        mutate(FID = IID) %>%
        select(FID,IID)
    cat("After excluding ethnic backgrounds different than  ${ethnia_prefix}, the number of individuals is",nrow(selected),".\n")
    # Write the selected individuals (FID, IID; no header) to a txt file
    write.table(selected,${_output[0]:r}, sep="\t", row.names=FALSE, col.names=FALSE)
    # Create the phenotype file
    pheno <- kept %>%
        mutate(ethnicity = new_ethnicity) %>%
        select(IID,ethnicity)
    # Merge the two data frames
    famfile <-merge(fam, pheno, by="IID", all=FALSE)
    # merge() moves the key (IID) to the first column: restore the fam column
    # order (FID, IID, ...) followed by the ethnicity
    famfile <- famfile[, c("FID", "IID", "fatherID", "motherID", "sex", "phenotype", "ethnicity")]
    cat("The famfile has ",nrow(famfile),"individuals.\n")
    write.table(famfile,${_output[1]:r}, sep="\t", row.names=FALSE, col.names=FALSE)

In [None]:
# Filter SNPs with MAF>1% for PCA analysis and keep the selected individuals (each input bed file is processed separately)
[filter_1]
# Minimum minor allele frequency; passed to plink2 --maf when > 0
parameter: maf_filter = 0.01
# Maximum missingness per-variant; passed to plink2 --geno when > 0
parameter: geno_filter = 0.01
# Maximum missingness per-sample; passed to plink2 --mind when > 0
parameter: mind_filter = 0.02
# Hardy-Weinberg threshold; passed to plink2 --hwe when > 0 (the default 0.0 disables the filter)
parameter: hwe_filter = 0.0
input: genoFile, group_by=1
output: f'{cwd}/cache/{_input:bn}.filtered.bed'
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
# NOTE(review): the --keep file has the path of the first output of [filter_samples], but it is not
# declared as input/depends of this step; presumably that step has to be run beforehand - confirm
bash: container=container_lmm, expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    plink2 \
      --bfile ${_input:n} \
      ${('--maf %s' % maf_filter) if maf_filter > 0 else ''} ${('--geno %s' % geno_filter) if geno_filter > 0 else ''} ${('--hwe %s' % hwe_filter) if hwe_filter > 0 else ''} ${('--mind %s' % mind_filter) if mind_filter > 0 else ''} \
      --keep ${cwd}/cache/${famFile:bn}.${ethnia_prefix} \
      --make-bed \
      --threads ${numThreads} \
      --out ${_output:n} 

In [None]:
# Merge all the .bed files into one bed file for input to eigensoft (skipped when --merge is False)
[filter_2:skip=not merge]
input: group_by = 'all'
output: bfile_merge = f'{cwd}/cache/{famFile:bn}.filtered.merged.bed'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '60G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
# The merge list holds one file prefix per line: every input but the first, with the last 4 characters
# (the ".bed" extension) removed; the first input is given to plink through --bfile.
# NOTE(review): with a single input file the merge list contains only an empty line - confirm plink accepts it.
# NOTE(review): --memory is fixed to 48000 regardless of the memory requested in the task line.
bash: container=container_lmm, expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    echo -e ${' '.join([str(x)[:-4] for x in _input[1:]])} | sed 's/ /\n/g' > ${_output:n}.merge_list
    plink \
    --bfile ${_input[0]:n} \
    --merge-list ${_output:n}.merge_list \
    --make-bed \
    --out ${_output:n} \
    --threads ${numThreads} \
    --memory 48000

In [None]:
# LD pruning, by default window=50 SNPs, window shifted every 10 SNPs, r2=0.1
[filter_3]
# Window size passed to plink --indep-pairwise
parameter: window = 50
# Number of SNPs to shift the window at each step
parameter: shift = 10
# Pairwise r2 threshold
parameter: r2 = 0.1
# Outputs: the list of variants to keep (written to cache/) and the pruned bed file (written to cwd)
output: f'{cwd}/cache/{famFile:bn}.filtered.merged.prune.in', f'{cwd}/{famFile:bn}.filtered.merged.prune.bed'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '60G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
# First command: compute the variants to keep; --out receives the first output path without its last two
# extensions, so that the <prefix>.prune.in file written by plink matches the declared output.
# Second command: extract those variants into the pruned bed file.
# NOTE(review): the input is the output of the previous step; --bfile ${_input:n} presumably expects a
# single merged file, so running this step after skipping filter_2 may not work - confirm.
bash: container=container_lmm, expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    plink \
    --bfile ${_input:n} \
    --indep-pairwise ${window} ${shift} ${r2}  \
    --out ${_output[0]:nn} \
    --threads ${numThreads} \
    --memory 48000
    
    plink \
    --bfile ${_input:n} \
    --extract ${_output[0]} \
    --make-bed \
    --out ${_output[1]:n} 

In [None]:
# Run pca analysis using Eigenstrat: the program supports plink files, here called PACKEDPED format
# smartpca.perl: run PCA on input genotype data (calls smartpca)
[smartpca]
# Number of Principal Components to output
parameter: k = int
# Maximum number of iterations for outlier removal. Default 0 turns off outlier removal
parameter: maxiter = 0
# Number of principal components along which to remove outliers during each outlier removal iteration. Default is 10
parameter: topk = 10
# Number of standard deviations which an individual must exceed, along one of topk top principal components, in order to be removed as an outlier. Default is 6
parameter: sigma = 6
# NOTE(review): this path is not an output of any of the filter steps shown here (filter_2 writes
# cache/<fam>.filtered.merged.bed, filter_3 writes <fam>.filtered.merged.prune.bed) - confirm the intended input
input: f'{cwd}/{famFile:bn}.filtered.merged.bed'
# NOTE(review): the .pca output is not written by the commands below while smartpca is commented out
output: f'{cwd}/{_input:bn}.parfile' ,  f'{cwd}/{_input:bn}.pca'
task: trunk_workers = 1, walltime = '24h', mem = '80G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: expand = "${ }", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',  template = '{cmd}' if executable('smartpca.perl').target_exists() else eigensoft_module
  # Create the parfile
  set -e
  # The first line truncates the file (>) so that a parfile left by a previous
  # run is replaced instead of getting a second copy of every key appended
  echo -e "genotypename: ${_input}" > ${_output[0]}
  echo -e "snpname: ${_input:n}.bim" >> ${_output[0]}
  # NOTE(review): the .eigenstrat.fam file is not created by any step shown here - confirm where it comes from
  echo -e "indivname: ${_input:n}.eigenstrat.fam" >> ${_output[0]}
  echo -e "outputformat: EIGENSTRAT" >> ${_output[0]}
  echo -e "fastmode: YES" >> ${_output[0]}
  echo -e "genotypeoutname: ${_input:n}.geno" >> ${_output[0]}
  echo -e "snpoutname: ${_input:n}.snp" >> ${_output[0]}
  echo -e "indivoutname: ${_input:n}.ind" >> ${_output[0]}
  echo -e "evectoutname: ${_input:n}.evectout" >> ${_output[0]}
  echo -e "evaloutname: ${_input:n}.evalout" >> ${_output[0]}
  echo -e "numoutevec: ${k}" >> ${_output[0]}
  echo -e "numoutlieriter: ${maxiter}" >> ${_output[0]}
  echo -e "numoutlierevec: ${topk}" >> ${_output[0]}
  echo -e "outliersigmathresh: ${sigma}" >> ${_output[0]}


bash: expand = "${ }", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout',  template = '{cmd}' if executable('smartpca.perl').target_exists() else eigensoft_module
   # Convert the genotype files as described in the parfile
   convertf -p ${_output[0]}
   #smartpca -p ${_output[0]}

In [None]:
# Run PCA analysis using flashpca
[flashpca]
# Number of Principal Components to output (at least 6 are needed for the plots)
parameter: k = int
# Name of the trait in the phenoFile (format FID, IID, father, mother, sex, pheno, trait)
parameter: trait_name = str
# How to standardize X before PCA
parameter: stand = "binom2"
depends: phenoFile
input: f'{cwd}/{famFile:bn}.filtered.merged.prune.bed'
output: f'{cwd}/{_input[0]:bn}.pca',
        f'{cwd}/{_input[0]:bn}.pc1vpc2.png',
        f'{cwd}/{_input[0]:bn}.pc3vpc4.png',
        f'{cwd}/{_input[0]:bn}.pc5vpc6.png' 
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    # Load required libraries
    library(dplyr)
    library(ggplot2)
    library(flashpcaR)
    # Prefix of the PLINK binary files
    fn <- ${_input:nr}
    # Do the PCA computation
    f <- flashpca(fn, ndim=${k}, stand="${stand}", do_loadings=TRUE, check_geno=TRUE)
    # Save the generated matrices to files
    write.table(f$values,'${_output[0]:n}.values', sep=" ", row.names=FALSE, col.names=FALSE) 
    write.table(f$vectors,'${_output[0]:n}.vectors', sep=" ", row.names=TRUE, col.names=FALSE)
    write.table(f$projection,'${_output[0]:n}.projection', sep=" ", row.names=TRUE, col.names=FALSE)
    write.table(f$loadings,'${_output[0]:n}.loadings', sep=" ", row.names=FALSE, col.names=FALSE)
    write.table(f$scale,'${_output[0]:n}.scale', sep=" ", row.names=FALSE, col.names=FALSE)
    # Use the projection file to generate plot
    pca <- read.table('${_output[0]:n}.projection', sep=" ")
    # The first column holds the row names, the others one PC each: name the
    # PCs from the number of columns actually present instead of assuming 10
    n_pcs <- ncol(pca) - 1
    colnames(pca) <- c("ID", paste0("PC", seq_len(n_pcs)))
    # The sample ID is the part of the row name before the first ':'
    # NOTE(review): presumably row names are "FID:IID", so this is the FID - fine only when FID equals IID; confirm
    pca$IID <- vapply(strsplit(as.character(pca$ID), ':'), "[", character(1), 1)
    # Read fam file with phenotypes
    pheno <- read.table(${phenoFile:r}, sep="\t" )
    colnames(pheno) <- c("FID", "IID", "father", "mother","sex", "pheno", "${trait_name}")
    pca_final <-merge(pheno, pca, by="IID", all=FALSE)

    write.table(pca_final,${_output[0]:r}, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)

    # The plots below use PC1 to PC6
    if (n_pcs < 6) {
      stop("At least 6 principal components are needed for the plots; rerun with k >= 6", call. = FALSE)
    }
    # The trait is mapped to color and shape: it has to be discrete, since
    # ggplot2 cannot map a continuous (numeric) variable to shape
    pca_final[["${trait_name}"]] <- as.factor(pca_final[["${trait_name}"]])

    # Each plot is printed explicitly so that it is drawn on the png device
    # regardless of how the script is evaluated
    png('${_output[1]}', width = 6, height = 4, units='in', res=300)
    print(ggplot(pca_final, aes(x=PC1, y=PC2)) + geom_point(aes(color=${trait_name}, shape=${trait_name}), size=2) + labs(title="PC1 vs PC2 exomed subset ${ethnia_prefix}",x="PC1", y="PC2") + theme_classic())
    dev.off()

    png('${_output[2]}', width = 6, height = 4, units='in', res=300)
    print(ggplot(pca_final, aes(x=PC3, y=PC4)) + geom_point(aes(color=${trait_name}, shape=${trait_name}), size=2) + labs(title="PC3 vs PC4 exomed subset ${ethnia_prefix}", x="PC3", y="PC4") + theme_classic())
    dev.off()

    png('${_output[3]}', width = 6, height = 4, units='in', res=300)
    print(ggplot(pca_final, aes(x=PC5, y=PC6)) + geom_point(aes(color=${trait_name}, shape=${trait_name}), size=2) + labs(title="PC5 vs PC6 exomed subset ${ethnia_prefix}",x="PC5", y="PC6") + theme_classic())
    dev.off()